# In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import Dataset

class RuWBDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label.

    Tokenization happens lazily in ``__getitem__``, so only the raw texts and
    labels are kept in memory.

    Args:
        texts: Indexable, sized collection of raw strings.
        labels: Indexable, sized collection of integer class ids aligned
            with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``. It must
            return a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors; these are expected to carry a leading batch dimension
            of size 1, which is squeezed away.
        max_length: Length requested from the tokenizer for padding and
            truncation. Defaults to 512, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # A mismatch would otherwise only surface as an IndexError deep inside
        # training, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of tensors.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` (both with the
            leading batch dimension removed) and ``"labels"`` (a scalar
            ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" is requested per single text, so drop the
            # size-1 batch axis before handing the sample to the collator.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Load the three dataset splits; each CSV is read for its 'rating' and 'text' columns.
train_df = pd.read_csv('/content/drive/MyDrive/RuWB_train_168000.csv')
val_df = pd.read_csv('/content/drive/MyDrive/RuWB_val_21000.csv')
test_df = pd.read_csv('/content/drive/MyDrive/RuWB_test_21000.csv')

# Ratings 1 / 3 / 5 become the contiguous class ids 0 / 1 / 2.
label_mapping = {1: 0, 3: 1, 5: 2}
for _split_name, _split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    _mapped = _split_df['rating'].map(label_mapping)
    # Series.map yields NaN for any rating missing from label_mapping. Fail
    # fast here instead of letting NaN be cast to a garbage integer label later.
    _unmapped = _mapped.isna()
    if _unmapped.any():
        raise ValueError(
            f"{_split_name} split contains ratings outside {list(label_mapping)}: "
            f"{_split_df.loc[_unmapped, 'rating'].unique().tolist()}"
        )
    _split_df['label'] = _mapped.astype('int64')

# Plain Python lists of texts and NumPy arrays of class ids, per split.
train_texts = train_df['text'].tolist()
y_train = train_df['label'].values
val_texts = val_df['text'].tolist()
y_val = val_df['label'].values
test_texts = test_df['text'].tolist()
y_test = test_df['label'].values

# Tokenizer and classifier are loaded from the same checkpoint id.
model_name = "ai-forever/ruBert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=3 matches the three class ids produced by label_mapping.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Use CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# One dataset per split, all sharing the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    RuWBDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, y_train),
        (val_texts, y_val),
        (test_texts, y_test),
    )
)

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``;
            ``logits`` must support ``.argmax(axis=-1)``.

    Returns:
        A dict with the single key ``"macro_f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = scores.argmax(axis=-1)
    return {"macro_f1": f1_score(gold, predicted, average='macro')}

# Hyper-parameters and bookkeeping for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation and checkpointing use the same "epoch" schedule so the best
    # checkpoint can be restored once training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Same key that compute_metrics returns.
    metric_for_best_model="macro_f1",
)

# Bind the model, data and metric function together, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Validation split: predicted class = arg-max over axis 1 of the predictions.
val_predictions = trainer.predict(val_dataset)
y_val_pred = val_predictions.predictions.argmax(axis=1)
val_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print("Macro F1 (Validation):", round(val_macro_f1 * 100, 2))

# Test split: per-class report first, then macro F1 as a percentage.
test_predictions = trainer.predict(test_dataset)
y_test_pred = test_predictions.predictions.argmax(axis=1)
print(classification_report(y_test, y_test_pred))
test_macro_f1 = f1_score(y_test, y_test_pred, average='macro')
print("Macro F1 (Test):", round(test_macro_f1 * 100, 2))
