In [1]:
import evaluate
import pandas as pd
import numpy as np

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback)

DATASET_NAME = "./datasets/ru-plus.csv"
TEST_SIZE = 0.2
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
SAVE_DIRECTORY = "./models/sbert_plus_multi"
OUTPUT_LOG_NAME = "./output/sbert_plus_multi"

METRICS_THRESHOLD = 0.9

df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ["text", "label"]
df_label_0 = df[df["label"] == 0]
df_label_1 = df[df["label"] == 1]
multi_class = []
for i in range(50):
    row_0 = df_label_0.sample(n=1, random_state=np.random.RandomState())
    row_1 = df_label_1.sample(n=1, random_state=np.random.RandomState())
    text = f"{row_0['text'].values[0]}, {row_1['text'].values[0].lower()}"
    multi_class.append({'text': text, 'label': 3})

df = pd.concat([df, pd.DataFrame(multi_class)], ignore_index=True)
df['label'] = df['label'].map({0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1], 3: [1, 1, 0]})

In [2]:
train, test_valid = train_test_split(df, test_size=TEST_SIZE, shuffle=True)
train = Dataset.from_pandas(train)
test, valid = train_test_split(test_valid, test_size=0.5)
test_ds = Dataset.from_pandas(test)
valid = Dataset.from_pandas(valid)

In [3]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_train = train.map(tokenize_function)
tokenized_test = test_ds.map(tokenize_function)
tokenized_valid = valid.map(tokenize_function)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
training_args = TrainingArguments(
    output_dir=OUTPUT_LOG_NAME,
    learning_rate=1e-5,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=32,
    load_best_model_at_end=True,
    report_to="none"
)

In [6]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def predict(val):
    return (val > METRICS_THRESHOLD).astype(int).reshape(-1)

def compute_metrics(eval_pred):
   predictions, labels = eval_pred
   return clf_metrics.compute(predictions=predict(predictions), references=labels.astype(int).reshape(-1))

In [7]:
early_stopper = EarlyStoppingCallback(early_stopping_threshold=0.0001)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
    callbacks=[early_stopper]
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6032,0.50874,0.595238,0.0,0.0,0.0
2,0.4332,0.372445,0.714286,0.454545,1.0,0.294118
3,0.3126,0.273071,0.857143,0.785714,1.0,0.647059
4,0.2207,0.197641,0.940476,0.920635,1.0,0.852941
5,0.1488,0.139121,0.964286,0.953846,1.0,0.911765
6,0.1023,0.101198,0.988095,0.985075,1.0,0.970588
7,0.0728,0.079651,0.988095,0.985075,1.0,0.970588
8,0.0572,0.065287,0.988095,0.985075,1.0,0.970588
9,0.0467,0.049374,1.0,1.0,1.0,1.0
10,0.0387,0.044807,0.988095,0.985075,1.0,0.970588


TrainOutput(global_step=434, training_loss=0.07815941866092418, metrics={'train_runtime': 266.4872, 'train_samples_per_second': 26.658, 'train_steps_per_second': 1.681, 'total_flos': 1603393328116224.0, 'train_loss': 0.07815941866092418, 'epoch': 31.0})

In [8]:
tokenizer.save_pretrained(SAVE_DIRECTORY)
model.save_pretrained(SAVE_DIRECTORY)