In [1]:
import shutil

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report, f1_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, logging

In [2]:
# Only show error-level messages from the transformers library.
logging.set_verbosity_error()

# Seed PyTorch's global RNG so that classifier-head initialisation, dropout and
# DataLoader shuffling start from the same state on every Restart & Run All.
# NOTE(review): some CUDA kernels are non-deterministic, so results may still
# differ slightly between runs on GPU.
SEED = 42
torch.manual_seed(SEED)

In [3]:
# Load the preprocessed train/test splits (semicolon-separated CSV files).
train_df = pd.read_csv("data/preprocessed/clean_train.csv", sep=";")
test_df = pd.read_csv("data/preprocessed/clean_test.csv", sep=";")

# Fail fast with a readable message instead of an obscure tokenizer or loss
# error later on: both columns are read downstream ("message" by the tokenizer,
# "is_toxic" as the label) and neither may contain missing values.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    missing_columns = {"message", "is_toxic"} - set(split_df.columns)
    assert not missing_columns, (
        f"{split_name} split is missing columns: {sorted(missing_columns)}"
    )
    assert split_df["message"].notna().all(), (
        f"{split_name} split contains rows with a missing message"
    )
    assert split_df["is_toxic"].notna().all(), (
        f"{split_name} split contains rows with a missing label"
    )

# Wrap the frames as Hugging Face datasets so they can be tokenized with .map().
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
# Checkpoint identifier used for both the tokenizer and the classifier.
MODEL_NAME = "roberta-base"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)


def tokenize_function(example, max_length=128):
    """Tokenize the ``"message"`` entry of ``example`` to a fixed length.

    Args:
        example: Mapping with a ``"message"`` entry; its value is handed to the
            module-level ``tokenizer`` unchanged (with ``Dataset.map(...,
            batched=True)`` this is a list of texts).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that was previously hard-coded. With
            ``Dataset.map`` it can be overridden via
            ``fn_kwargs={"max_length": ...}``.

    Returns:
        The object returned by ``tokenizer`` for this call.
    """
    return tokenizer(
        example["message"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


# Tokenize both splits in batches (train first, then test).
train_enc, test_enc = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label column, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "is_toxic"]
for encoded_split in (train_enc, test_enc):
    encoded_split.set_format("torch", columns=torch_columns)


Map:   0%|          | 0/12711 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

In [5]:
# Two-class sequence classifier built on the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Используем устройство:", device)

# Move the model's parameters to the selected device.
model.to(device)

# Confirm where the parameters actually live after the move.
first_parameter = next(model.parameters())
print("Модель на устройстве:", first_parameter.device)

Используем устройство: cuda
Модель на устройстве: cuda:0


In [6]:
# Batches of 8 examples; only the training loader shuffles its data.
train_loader = DataLoader(dataset=train_enc, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_enc, batch_size=8)

# Cross-entropy over the two output logits, optimised with AdamW.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=1e-5)

In [7]:
# Delete any checkpoint directory left over from a previous run before training
# writes a new one. ignore_errors=True makes this a no-op when the directory
# does not exist. (`shutil` is imported in the notebook's import cell.)
shutil.rmtree("models/roberta-toxic", ignore_errors=True)

In [8]:
# Fine-tune the classifier, then write the model and tokenizer to disk.
# NOTE(review): the output recorded under this cell lists six epochs, so it was
# produced with a different `epochs` value than the 3 set here — re-run the
# notebook to refresh the outputs and the saved checkpoint.
epochs = 3
model.train()
for epoch in range(epochs):
    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    total_loss = 0
    for batch in loop:
        # Everything except the label column is a model input.
        labels = batch["is_toxic"].to(device)
        inputs = {
            name: tensor.to(device)
            for name, tensor in batch.items()
            if name != "is_toxic"
        }

        optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar.
        step_loss = loss.item()
        total_loss += step_loss
        loop.set_postfix(loss=step_loss)

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Средняя потеря за эпоху {epoch + 1}: {avg_loss:.4f}")

# Save the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("models/roberta-toxic")
print("\nМодель и токенизатор сохранены в 'models/roberta-toxic'")

Epoch 1:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 1: 0.2822


Epoch 2:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 2: 0.1856


Epoch 3:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 3: 0.1681


Epoch 4:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 4: 0.2428


Epoch 5:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 5: 0.2367


Epoch 6:   0%|          | 0/1589 [00:00<?, ?it/s]

Средняя потеря за эпоху 6: 0.1590

Модель и токенизатор сохранены в 'models/roberta-toxic'


In [9]:
# Sanity check: reload the saved checkpoint and inspect what came back.
m = AutoModelForSequenceClassification.from_pretrained("models/roberta-toxic")

model_type = type(m)
classifier_head = m.classifier
print("Тип модели:", model_type)
print("Параметры классификатора:", classifier_head)

Тип модели: <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'>
Параметры классификатора: RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)


In [13]:
# Evaluate the checkpoint saved on disk (not the in-memory training model).
model = AutoModelForSequenceClassification.from_pretrained("models/roberta-toxic")
tokenizer = AutoTokenizer.from_pretrained("models/roberta-toxic")
model.to(device)

# Switch to evaluation mode and collect predictions/labels as flat lists.
model.eval()
preds, labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Оценка"):
        inputs = {k: v.to(device) for k, v in batch.items() if k != "is_toxic"}
        # The labels are only compared on the host, so they are not copied to
        # `device` and back as they were before.
        y_true = batch["is_toxic"]

        outputs = model(**inputs)
        # Predicted class = index of the largest logit in each row.
        y_pred = torch.argmax(outputs.logits, dim=1)

        preds.extend(y_pred.cpu().numpy())
        labels.extend(y_true.cpu().numpy())

Оценка:   0%|          | 0/29 [00:00<?, ?it/s]

In [14]:
# Per-class precision/recall/F1 on the test predictions, two decimal places.
print("\n=== Отчёт по метрикам ===")
metrics_report = classification_report(labels, preds, digits=2)
print(metrics_report)


=== Отчёт по метрикам ===
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       119
           1       0.90      0.95      0.92       109

    accuracy                           0.93       228
   macro avg       0.93      0.93      0.93       228
weighted avg       0.93      0.93      0.93       228



In [15]:
# Single F1 figure for the test set, rounded to two decimals.
# NOTE(review): f1_score is called with its defaults, which should mean binary
# F1 for label 1 — confirm against the scikit-learn documentation.
f1_value = f1_score(labels, preds)
print("F1-score:", round(f1_value, 2))

F1-score: 0.92
