In [1]:
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer)



#### Переменные

In [2]:
DATASET_NAME = "./datasets/ru.csv"
TEST_SIZE = 0.2
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
SAVE_DIRECTORY = "./models/sbert_pretrained"
OUTPUT_LOG_NAME = "./output/sbert_pretrained"

#### Загружаем данные

In [3]:
df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ["text", "label"]
df['label'] = df['label'].astype(int)

#### Конвертируем датасет в Dataset

In [4]:
train, test_valid = train_test_split(df, test_size=TEST_SIZE, shuffle=True)
train = Dataset.from_pandas(train)
test, valid = train_test_split(test_valid, test_size=0.5)
test = Dataset.from_pandas(test)
valid = Dataset.from_pandas(valid)

#### Выполняем предобработку текста

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=4)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_train = train.map(tokenize_function)
tokenized_test = test.map(tokenize_function)
tokenized_valid = valid.map(tokenize_function)

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

#### Загружаем предобученную модель

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Задаем параметры обучения

In [7]:
training_args = TrainingArguments(
    output_dir=OUTPUT_LOG_NAME,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to="none"
)

#### Определяем как считать метрику

In [8]:
metric = evaluate.load("f1")

#### Выполняем обучение

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,F1
1,0.7343,0.221322,0.93266
2,0.2032,0.097992,0.93266
3,0.1096,0.005209,1.0
4,0.2748,0.168124,0.93266
5,0.1513,0.29454,0.93266


TrainOutput(global_step=105, training_loss=0.2946480728331066, metrics={'train_runtime': 40.6635, 'train_samples_per_second': 15.37, 'train_steps_per_second': 2.582, 'total_flos': 4550461485000.0, 'train_loss': 0.2946480728331066, 'epoch': 5.0})

In [10]:
print(trainer.predict(tokenized_test))

PredictionOutput(predictions=array([[-3.1638935,  3.905433 , -1.2453672],
       [ 5.0375886, -2.0865383, -2.56602  ],
       [ 4.9885073, -2.0727909, -2.5221233],
       [ 4.8645515, -2.205911 , -2.343811 ],
       [-3.1247156,  3.9036365, -1.2490503],
       [-3.233235 ,  3.8105006, -1.0932819],
       [-3.1535063,  3.9004333, -1.2308774],
       [-3.0366454,  3.940231 , -1.3659871],
       [-2.4993548, -2.4095638,  4.670753 ],
       [ 5.008894 , -2.1995285, -2.3664968],
       [ 5.0589895, -2.1490304, -2.5028868],
       [ 5.0353804, -2.0209274, -2.5901031],
       [-3.2306428,  3.6455564, -0.8535343],
       [-3.1238456,  3.896061 , -1.2523072],
       [ 5.090787 , -2.110083 , -2.492352 ],
       [-2.4041572, -2.2186928,  4.595579 ]], dtype=float32), label_ids=array([1, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 2]), metrics={'test_loss': 0.004128485918045044, 'test_f1': 1.0, 'test_runtime': 0.1027, 'test_samples_per_second': 155.757, 'test_steps_per_second': 29.205})


#### Сохраняем модель

In [11]:
tokenizer.save_pretrained(SAVE_DIRECTORY)
model.save_pretrained(SAVE_DIRECTORY)