# 🤖 Обучение RuBERT для классификации фейковых новостей
Этот ноутбук обучает модель DeepPavlov/rubert-base-cased для задачи классификации фейковых новостей и подходит для использования в Telegram-боте или исследовательском проекте.

In [2]:
!pip install transformers torch scikit-learn pandas -q

[0m

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Загрузка и подготовка данных
df = pd.read_csv("Dataset_With_Core_Fake_Claim_Formatted.csv")
df['text'] = df.apply(lambda row: row['CORE_FAKE_CLAIM'] if row['TYPE'] == 'fake_pair' else str(row['REAL_TEXT']).strip(), axis=1)
df['label'] = df['TYPE'].apply(lambda x: 1 if x == 'fake_pair' else 0)

X_train, X_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Токенизация
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = NewsDataset(X_train, y_train, tokenizer)
val_dataset = NewsDataset(X_val, y_val, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Загрузка модели и обучение
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args = TrainingArguments(
    output_dir="./rubert_fakenews",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ClearML Task: created new task id=64fd904f6b044c2d9204c253b7853705
2025-04-22 04:46:04,036 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: http://34.90.169.210:8080/projects/3dd09fb90a3342f2b98e9671493f8798/experiments/64fd904f6b044c2d9204c253b7853705/output/log
CLEARML-SERVER new package available: UPGRADE to v2.0.0 is recommended!
Release Notes:
### Breaking Changes

MongoDB major version was upgraded from v5.x to 6.x.
Please note that if your current ClearML Server version is smaller than v1.17 (where MongoDB v5.x was first used), you'll need to first upgrade to ClearML Server v1.17.
#### Upgrading to ClearML Server v1.17 from a previous version
- If using docker-compose,  use the following docker-compose files:
  * [docker-compose file](https://github.com/allegroai/clearml-server/blob/2976ce69cc91550a3614996e8a8d8cd799af2efd/upgrade/1_17_to_2_0/docker-compose.yml)
  * [docker-compose file foe Windows](https://github.com/allegroai/clearml-serv

Epoch,Training Loss,Validation Loss
1,0.3625,0.116076
2,0.1014,0.344147
3,0.0583,0.02084
4,0.0017,0.125334


2025-04-22 04:46:28,100 - clearml.storage - INFO - Uploading: 2035.59MB to /tmp/model_package.otw7dhii.zip


████████████████▍            61% | 1235.00/2035.59 MB [00:45<00:39, 20.05MB/s]: 

2025-04-22 04:47:13,725 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, http://34.90.169.210:8081/HuggingFace Transformers/Trainer.64fd904f6b044c2d9204c253b7853705/models/checkpoint-69.zip)


██████████████████████████▉ 100% | 2035.00/2035.59 MB [01:11<00:00, 94.76MB/s]: 

2025-04-22 04:47:43,741 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, http://34.90.169.210:8081/HuggingFace Transformers/Trainer.64fd904f6b044c2d9204c253b7853705/models/checkpoint-69.zip)


██████████████████████████▉ 100% | 2035.59/2035.59 MB [01:24<00:00, 24.06MB/s]: 

2025-04-22 04:47:52,716 - clearml.Task - INFO - Completed model upload to http://34.90.169.210:8081/HuggingFace%20Transformers/Trainer.64fd904f6b044c2d9204c253b7853705/models/checkpoint-23.zip





2025-04-22 04:47:53,456 - clearml.storage - INFO - Uploading: 2035.59MB to /tmp/model_package.k51l4wlj.zip


██████████████████████████▉ 100% | 2035.59/2035.59 MB [00:27<00:00, 74.69MB/s]: 

2025-04-22 04:48:20,714 - clearml.Task - INFO - Completed model upload to http://34.90.169.210:8081/HuggingFace%20Transformers/Trainer.64fd904f6b044c2d9204c253b7853705/models/checkpoint-46.zip





2025-04-22 04:48:21,523 - clearml.storage - INFO - Uploading: 2035.60MB to /tmp/model_package.2f6zzzvm.zip


  full_bar = Bar(frac,
████████████████████████████ 100% | 2035.60/2035.6 MB [00:41<00:00, 49.20MB/s]: 
████████████████████████████ 100% | 2035.60/2035.6 MB [00:25<00:00, 80.72MB/s]: 


TrainOutput(global_step=92, training_loss=0.16159479430390764, metrics={'train_runtime': 153.7072, 'train_samples_per_second': 4.658, 'train_steps_per_second': 0.599, 'total_flos': 94193757818880.0, 'train_loss': 0.16159479430390764, 'epoch': 4.0})

In [3]:
def calculate_accuracy(predictions, labels):
    correct = (predictions == labels).sum().item()
    total = len(labels)
    accuracy = correct / total
    return accuracy

# Оценка модели
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids
accuracy = calculate_accuracy(preds, labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


In [4]:
# Сохраняем модель
model.save_pretrained("rubert_fakenews20")
tokenizer.save_pretrained("rubert_fakenews20")

('rubert_fakenews20/tokenizer_config.json',
 'rubert_fakenews20/special_tokens_map.json',
 'rubert_fakenews20/vocab.txt',
 'rubert_fakenews20/added_tokens.json',
 'rubert_fakenews20/tokenizer.json')

In [5]:
### Claculate gpu usage of the model in mb
import torch

def get_gpu_memory_usage(model):
    """
    Calculate the GPU memory usage of a PyTorch model in MB.
    """
    if torch.cuda.is_available():
        gpu_memory = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)
        return gpu_memory
    else:
        return 0
# Calculate the GPU memory usage of the model
gpu_memory_usage = get_gpu_memory_usage(model)

In [6]:
gpu_memory_usage

678.4628982543945