In [8]:
import pandas as pd
import numpy as np
import torch

In [9]:
# Load the training reviews and leave out the 'usefull' / 'unusefull' columns,
# which nothing later in this notebook reads; then preview the frame.
df = pd.read_csv('train.csv').drop(columns=['usefull', 'unusefull'])
df

Unnamed: 0,review_text,product_name,category,stars
0,"Заказывали в подарок, коробка пришла не первой...",Ультразвуковой увлажнитель воздуха,Бытовая техника,5
1,товар пришёл раньше срока спасибо!!!,Анаболический комплекс,Спортивное питание,5
2,"Красивый,не шумный,но работает только от сети....",Робот для мойки окон,Бытовая техника,3
3,"как поет в своих песнях Раут ""хороший клоун - ...",Предтренировочный комплекс,Спортивное питание,5
4,"Идеально подошла по размеру, все работают",Умная дверная ручка,Бытовая техника,5
...,...,...,...,...
11693,Все целое и сухое. Доставка в срок. Состав хор...,Удобрение для сливы,Сад и огород,5
11694,Хорошо упакованы,Тапочки,Одежда,5
11695,"телефон супер,четыре дня держит заряд в постоя...",Смартфон,Электроника,5
11696,Прекрасно моет Окна и плитку.,Робот мойщик окон,Бытовая техника,5


In [10]:
# Summary statistics of the numeric columns, used here to inspect how the
# star ratings are distributed.
df.describe()

Unnamed: 0,stars
count,11698.0
mean,4.389041
std,1.230628
min,1.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, ClassLabel, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from transformers import EarlyStoppingCallback


# 1. Hold out 20% of the reviews for validation, stratified on the star rating
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    stratify=df['stars'],
    random_state=42,
)

# 2. Create HuggingFace datasets (row index reset so each split starts at 0)
dataset = {
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (('train', df_train), ('validation', df_val))
}

# 3. Compute class weights for imbalance
# Count training examples per class id (stars 1..5 -> ids 0..4).
class_counts = np.bincount(df_train['stars'] - 1, minlength=5)
# Inverse-frequency weights. Counts are clamped to at least 1 so that a class
# missing from the training split gets a finite weight; a raw 1/0 would give
# inf, and the normalization below would then turn every weight into NaN.
class_weights = 1.0 / np.maximum(class_counts, 1)
# Rescale so the weights sum to the number of classes (mean weight of 1).
class_weights = class_weights / class_weights.sum() * len(class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# 4. Prepare labels and tokenization
# Class ids 0..4 are named after the star ratings "1".."5".
labels = ClassLabel(num_classes=5, names=list(map(str, range(1, 6))))

def preprocess(example):
    """Attach an integer ``label`` to one example, derived from its ``stars`` value.

    The rating is converted to a string and looked up with ``str2int`` on the
    module-level ``labels`` object. The example is modified in place and
    returned.
    """
    star_name = str(example['stars'])
    class_id = labels.str2int(star_name)
    example['label'] = class_id
    return example

# Add the integer class id to every example of both splits.
dataset = {name: subset.map(preprocess) for name, subset in dataset.items()}

# Tokenizer loaded from the same checkpoint name as the model fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
def tokenize_fn(batch):
    """Build one tagged text per row of a batch and tokenize the whole batch.

    Each row's product name, review text and category are joined into a single
    string with ``[PRODUCT]`` / ``[REVIEW]`` / ``[CATEGORY]`` markers. The
    strings are passed to the module-level ``tokenizer`` with truncation and
    padding to a fixed length of 128.
    """
    rows = zip(batch['product_name'], batch['review_text'], batch['category'])
    prompts = []
    for product, review, category in rows:
        prompts.append(f"[PRODUCT] {product} [REVIEW] {review} [CATEGORY] {category}")
    return tokenizer(prompts, truncation=True, padding='max_length', max_length=128)

# Tokenize both splits, then discard the raw text / rating columns that were
# only needed to build the model inputs and the label.
dataset = {
    name: subset.map(tokenize_fn, batched=True).remove_columns(
        ['product_name', 'review_text', 'category', 'stars']
    )
    for name, subset in dataset.items()
}

# 5. Load model
# The classification head is created fresh with 5 outputs. The star ratings
# are stored as label names in the config so that the saved checkpoint reports
# "1".."5" instead of the generic LABEL_0..LABEL_4 when used for inference.
model = AutoModelForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=5,
    id2label={class_id: str(class_id + 1) for class_id in range(5)},
    label2id={str(class_id + 1): class_id for class_id in range(5)},
)

# 6. Custom Trainer to apply class weights in loss
torch.cuda.empty_cache()
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; it may be a wrapped module.
            inputs: Batch dict. Its ``"labels"`` entry is removed before the
                forward pass so the loss is computed only here.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by the Trainer; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Device and class count are read from the logits, not from `model`:
        # a wrapped model (e.g. nn.DataParallel) exposes neither `.device`
        # nor `.config`, which would raise AttributeError here.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 7. Training arguments
training_args = TrainingArguments(
    output_dir='output/rubert-finetuned',
    # Upper bound only; the EarlyStoppingCallback passed to the trainer is
    # expected to stop the run much earlier.
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy='epoch',
    save_strategy='epoch',
    # A checkpoint is written every epoch; cap how many are kept so a long run
    # does not fill the disk. The best checkpoint is still retained because
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    # Macro F1 is a score, so higher is better (stated explicitly).
    greater_is_better=True,
)

# 8. Metrics
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    macro_f1 = f1_score(references, predicted, average='macro')
    return {'accuracy': accuracy, 'macro_f1': macro_f1}

# 9. Initialize trainer and train
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    # `tokenizer=` is deprecated on Trainer (it triggers a FutureWarning and is
    # scheduled for removal); `processing_class=` is the replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 5 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()
# load_best_model_at_end is set in the training arguments, so the weights
# saved here are those of the best checkpoint, not of the last epoch.
trainer.save_model('models/rubert-finetuned')

Map: 100%|██████████| 9358/9358 [00:00<00:00, 42536.54 examples/s]
Map: 100%|██████████| 2340/2340 [00:00<00:00, 43333.03 examples/s]
Map: 100%|██████████| 9358/9358 [00:00<00:00, 18076.19 examples/s]
Map: 100%|██████████| 2340/2340 [00:00<00:00, 18000.02 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.2538,1.202211,0.732906,0.418208
2,1.0525,1.252298,0.775641,0.478588
3,0.9708,1.494214,0.468376,0.337801
4,0.831,1.823864,0.749573,0.42825
5,0.7641,1.874616,0.773077,0.468438
6,0.4854,2.40309,0.77735,0.466722
7,0.458,2.640055,0.794872,0.461404


In [13]:
# Load the test reviews, dropping the same two columns that were dropped from
# the training data, and wrap the frame as a HuggingFace dataset.
df_test = pd.read_csv('test.csv').drop(columns=['usefull', 'unusefull'])
dataset_test = Dataset.from_pandas(df_test)

def tokenize_test(batch):
    """Tokenize a batch of test rows exactly as the training rows were tokenized.

    Delegates to ``tokenize_fn`` (defined in the training cell) instead of
    repeating its body, so the text template and tokenizer settings used at
    prediction time cannot drift away from those used for training.
    """
    return tokenize_fn(batch)


# Tokenize the test set, then strip every column that is not a model input.
dataset_test = dataset_test.map(tokenize_test, batched=True)
dataset_test = dataset_test.remove_columns(
    [name for name in dataset_test.column_names if name not in tokenizer.model_input_names]
)

# Predict class ids and translate them back to integer star ratings via `labels`.
test_outputs = trainer.predict(dataset_test)
preds = np.argmax(test_outputs.predictions, axis=-1)
y_pred_test = [int(labels.int2str(int(class_id))) for class_id in preds]

# Write the submission file: one predicted rating per test `_id`.
df_submission = pd.DataFrame({'_id': df_test['_id'], 'stars': y_pred_test})
df_submission.to_csv('submission.csv', index=False)

Map: 100%|██████████| 957/957 [00:00<00:00, 18765.10 examples/s]
