In [160]:
import pandas as pd
import numpy as np
import torch

In [161]:
RANDOM_STATE = 42
TRAIN = False
DEMONSTRATION = True

MODEL = 'DeepPavlov/rubert-base-cased'
OUTPUT_PATH = f'output/{MODEL}-finetuned'
MODEL_PATH = f'models/{MODEL}-finetuned'

In [162]:
df = pd.read_csv('train.csv')
df.drop(columns=['usefull', 'unusefull'], inplace=True)
df

Unnamed: 0,review_text,product_name,category,stars
0,"Заказывали в подарок, коробка пришла не первой...",Ультразвуковой увлажнитель воздуха,Бытовая техника,5
1,товар пришёл раньше срока спасибо!!!,Анаболический комплекс,Спортивное питание,5
2,"Красивый,не шумный,но работает только от сети....",Робот для мойки окон,Бытовая техника,3
3,"как поет в своих песнях Раут ""хороший клоун - ...",Предтренировочный комплекс,Спортивное питание,5
4,"Идеально подошла по размеру, все работают",Умная дверная ручка,Бытовая техника,5
...,...,...,...,...
11693,Все целое и сухое. Доставка в срок. Состав хор...,Удобрение для сливы,Сад и огород,5
11694,Хорошо упакованы,Тапочки,Одежда,5
11695,"телефон супер,четыре дня держит заряд в постоя...",Смартфон,Электроника,5
11696,Прекрасно моет Окна и плитку.,Робот мойщик окон,Бытовая техника,5


In [163]:
df.describe()

Unnamed: 0,stars
count,11698.0
mean,4.389041
std,1.230628
min,1.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [164]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

df_train, df_val = train_test_split(
    df, test_size=0.2, random_state=RANDOM_STATE, stratify=df['stars']
)

dataset = {
    'train': Dataset.from_pandas(df_train.reset_index(drop=True)),
    'validation': Dataset.from_pandas(df_val.reset_index(drop=True))
}

In [165]:
import torch.nn as nn
from transformers import Trainer

class WeightedCELossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(model.device))
        loss = loss_fct(logits, labels)
        
        return (loss, outputs) if return_outputs else loss

In [166]:
from datasets import ClassLabel

class_counts = np.bincount(df_train['stars'] - 1, minlength=5)
class_weights = (1.0 / class_counts)
class_weights = class_weights / class_weights.sum() * len(class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float)

labels = ClassLabel(num_classes=5, names=[str(i) for i in range(1,6)])

In [167]:
def preprocess(example):
    example['label'] = labels.str2int(str(example['stars']))
    return example

In [168]:
def tokenize_fn(batch):
    texts = [
        (f"[PRODUCT] {pn} "
         f"[REVIEW] {rt} "
         f"[CATEGORY] {cat} ")
        for pn, rt, cat in zip(batch['product_name'], batch['review_text'], batch['category'])
    ]
    
    return tokenizer(texts, truncation=True, padding='max_length', max_length=256)

In [169]:
dataset = {split: ds.map(tokenize_fn, batched=True) for split, ds in dataset.items()}
dataset = {
    split: ds.remove_columns(['product_name', 'review_text', 'category', 'stars'])
    for split, ds in dataset.items()
}

Map: 100%|██████████| 9358/9358 [00:00<00:00, 10197.62 examples/s]
Map: 100%|██████████| 2340/2340 [00:00<00:00, 12352.32 examples/s]


In [170]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='macro'),
    }

In [171]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, ClassLabel, Dataset

if TRAIN:
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=5)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    dataset = {split: ds.map(preprocess) for split, ds in dataset.items()}
    
    training_args = TrainingArguments(
        output_dir=OUTPUT_PATH,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_total_limit=2,
        num_train_epochs=30,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_ratio=0.1,
        weight_decay=0.01,
        learning_rate=2e-5,
        logging_steps=25,
        seed=RANDOM_STATE,
        fp16=torch.cuda.is_available(),
    )
    
    trainer = WeightedCELossTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
        class_weights=class_weights,
    )
    
    trainer.train()
    trainer.save_model(MODEL_PATH)

In [172]:
if DEMONSTRATION:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
    trainer = Trainer(model=model, tokenizer=tokenizer)
    
    df_test = pd.read_csv('test.csv')
    df_test.drop(columns=['usefull', 'unusefull'], inplace=True)
    dataset_test = Dataset.from_pandas(df_test)
    dataset_test = dataset_test.map(tokenize_fn, batched=True)
    dataset_test = dataset_test.remove_columns(
        [col for col in dataset_test.column_names if col not in tokenizer.model_input_names]
    )
    
    test_outputs = trainer.predict(dataset_test)
    preds = np.argmax(test_outputs.predictions, axis=-1)
    y_pred_test = [int(labels.int2str(int(label))) for label in preds]
    
    df_submission = pd.DataFrame({
        '_id': df_test['_id'],
        'stars': y_pred_test
    })
    df_submission.to_csv('submission.csv', index=False)

  trainer = Trainer(model=model, tokenizer=tokenizer)
Map: 100%|██████████| 957/957 [00:00<00:00, 12258.05 examples/s]
