In [23]:
import evaluate
import numpy as np
import pandas as pd

from datasets import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline,
)

#### Переменные

In [24]:
# --- Model ---
# Hugging Face Hub id of the pretrained checkpoint that gets fine-tuned.
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"

# --- Data ---
# Pipe-delimited CSV holding the labelled texts.
DATASET_NAME = "./datasets/ru-plus.csv"
# Fraction of rows held out from training (later halved into test and validation).
TEST_SIZE = 0.2

# --- Outputs ---
# Directory the Trainer writes its logs and checkpoints to.
OUTPUT_LOG_NAME = "./output/sbert_plus"
# Directory the final tokenizer and model are saved to.
SAVE_DIRECTORY = "./models/sbert_plus"

#### Загружаем данные

In [25]:
# Read the pipe-delimited file, rename its two columns to the names the rest of
# the notebook expects, and make sure the label column holds integers.
# NOTE(review): read_csv treats the first line as a header by default, and that
# header is then replaced by the names below — confirm the file really starts
# with a header row, otherwise the first sample is silently dropped.
df = (
    pd.read_csv(DATASET_NAME, sep="|")
    .set_axis(["text", "label"], axis="columns")
    .astype({"label": int})
)

#### Разбиваем данные на train/valid/test и конвертируем в Dataset

In [26]:
# Fixed seed so the train/validation/test partition — and therefore every
# metric reported below — is identical on each Restart & Run All.
SPLIT_SEED = 42

# Step 1: hold out TEST_SIZE of the rows; step 2: halve that hold-out pool into
# a test part and a validation part.
# NOTE(review): the splits are not stratified by label; consider
# `stratify=df["label"]` if the classes are imbalanced — confirm every class has
# enough rows first.
train_df, test_valid = train_test_split(
    df, test_size=TEST_SIZE, shuffle=True, random_state=SPLIT_SEED
)
# `test` stays a pandas DataFrame because the evaluation cell reads its columns
# directly; `test_ds` is the Dataset view of the same rows.
test, valid_df = train_test_split(test_valid, test_size=0.5, random_state=SPLIT_SEED)

train = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test)
valid = Dataset.from_pandas(valid_df)

#### Выполняем предобработку текста

In [27]:
# Every text is padded or truncated to exactly this many tokens.
MAX_SEQ_LENGTH = 64

# Create the tokenizer before the function that uses it so the cell reads top-down.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry. With ``batched=True`` (as used
            below) this is a list of strings; a single string also works.

    Returns:
        The tokenizer output for the text(s), padded and truncated to
        MAX_SEQ_LENGTH tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


# batched=True passes whole batches of rows to the tokenizer instead of calling
# it once per row. Because every sequence is padded to the same fixed length,
# the resulting columns are the same as in per-row mode.
tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)
tokenized_valid = valid.map(tokenize_function, batched=True)

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

#### Загружаем предобученную модель

In [28]:
# Load the pretrained checkpoint with a sequence-classification head sized for
# three labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Задаем параметры обучения

In [29]:
# Evaluate, log and checkpoint once per epoch; the best checkpoint is reloaded
# when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir=OUTPUT_LOG_NAME,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy='epoch',
    # Cap the number of checkpoints kept on disk: without a limit one checkpoint
    # per epoch (up to num_train_epochs) accumulates. The best checkpoint is
    # still retained for load_best_model_at_end.
    save_total_limit=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=32,
    load_best_model_at_end=True,
    # Spell out the library defaults that apply when load_best_model_at_end is
    # set: the "best" model (and the early-stopping criterion) is the one with
    # the lowest validation loss, not the highest F1.
    metric_for_best_model="loss",
    greater_is_better=False,
    # Library default, stated explicitly so the run is visibly seeded.
    seed=42,
    report_to="none"
)

#### Определяем, как считать метрику

In [30]:
# F1 metric object; compute_metrics calls its .compute() on every evaluation.
metric = evaluate.load(path="f1")

#### Выполняем обучение

In [31]:
def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)`` as unpacked below.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        against the reference labels with ``average='macro'``.
    """
    class_scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(class_scores, axis=-1)
    f1_result = metric.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )
    return f1_result

# Early stopping with a minimum-improvement threshold of 1e-4; all other
# callback settings are left at the library defaults.
early_stopper = EarlyStoppingCallback(early_stopping_threshold=1e-4)

# Collect the Trainer configuration in one place: train on the tokenized
# training split and evaluate on the tokenized validation split.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
    callbacks=[early_stopper],
)
trainer = Trainer(**trainer_kwargs)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,F1
1,0.8449,0.446377,0.965899
2,0.3032,0.18673,0.965899
3,0.074,0.022907,1.0
4,0.0149,0.011545,1.0
5,0.0059,0.005149,1.0
6,0.0035,0.003616,1.0
7,0.0026,0.003005,1.0
8,0.0022,0.002604,1.0
9,0.0019,0.002261,1.0
10,0.0017,0.002043,1.0


TrainOutput(global_step=156, training_loss=0.0968607859686017, metrics={'train_runtime': 123.6648, 'train_samples_per_second': 47.095, 'train_steps_per_second': 3.105, 'total_flos': 275619631961856.0, 'train_loss': 0.0968607859686017, 'epoch': 13.0})

#### Сохраняем модель

In [32]:
# Write the tokenizer first, then the model, into the same directory.
for artifact in (tokenizer, model):
    artifact.save_pretrained(SAVE_DIRECTORY)

#### Считаем F1-score на тестовом датасете

In [33]:
# f1_score and pipeline are imported in the notebook's top import cell.

MODEL_TASK = "sentiment-analysis"
# Must match the max_length that tokenize_function applied to the training data,
# so the model sees test texts cut the same way as during training.
EVAL_MAX_LENGTH = 64

# Maps the label strings returned by the pipeline back to integer class ids.
LABEL_MAP = {
    'LABEL_0': 0,
    'LABEL_1': 1,
    'LABEL_2': 2,
}


def map_func(el):
    """Return the integer class id for one pipeline result dict (via LABEL_MAP)."""
    return LABEL_MAP[el['label']]


classifier = pipeline(MODEL_TASK, model=model, tokenizer=tokenizer)
# `test` is the pandas DataFrame produced by the split cell.
texts = test['text'].tolist()
labels = test['label'].tolist()

# Truncate at inference as well; without it, longer texts are fed to the model
# uncut, unlike anything seen in training.
results = classifier(texts, truncation=True, max_length=EVAL_MAX_LENGTH)

# Convert every prediction once and reuse it for both the error report and F1.
mapped_results = [map_func(result) for result in results]

counter = 0
for text, label, result, predicted in zip(texts, labels, results, mapped_results):
    if predicted != label:
        counter += 1
        print(f"Текст: {text}")
        print(f"Предсказано: {predicted}, Значение: {label} Оценка: {result['score']}")
        print()

print(f"Всего ошибочно: {counter}")

f1 = f1_score(labels, mapped_results, average='macro')
print(f"F1 Score: {f1}")

Всего ошибочно: 0
F1 Score: 1.0
