In [1]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Установка необходимых библиотек:
!pip install scikit-learn
!pip install numpy
!pip install datasets
!pip install pandas
!pip install transformers[torch]
!pip install razdel
!pip install evaluate



In [3]:
import pandas as pd
import numpy as np
import random
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from functools import partial
from razdel import tokenize

In [4]:
# Фиксирование рандома, чтобы результат был более менее воспроизводим
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False


seed_all(1234)

In [5]:
# Загрузка данных
data = pd.read_csv("in_domain_train.csv", usecols=['sentence', 'acceptable'])
test_data = pd.read_csv("in_domain_dev.csv")
test_data = test_data[['sentence', 'acceptable']]

train_data, val_data = train_test_split(data, test_size=0.2)
train_data = train_data.reset_index()[['sentence', 'acceptable']]
val_data = val_data.reset_index()[['sentence', 'acceptable']]

In [6]:
train_data

Unnamed: 0,sentence,acceptable
0,Приближался нечеловеческие рев и топот.,0
1,В котором часу завтра утром начинается конфере...,1
2,Она сидела на диване рядом с мужем.,1
3,"Это были студенты, сами сведущие в эскимосском...",1
4,"При всем том я оптимист и думаю, что мой конфл...",1
5,По окончанию института Григорий отправился на ...,0
6,Геологической партии не было на базе.,1
7,"Спортсмены, приехавшие на чемпионат, надеялись...",0
8,Между ними началось нечто неожиданное.,1
9,В случае если не оказать своевременно медицинс...,1


In [7]:
# Загружаем нужную модель
model_checkpoint = "sberbank-ai/ruT5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    model = model.to("cuda")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [47]:
POS_LABEL = "yes"
NEG_LABEL = "no"

def preprocess_examples(examples, tokenizer):
    result = tokenizer(examples["sentence"], padding=False)

    if "acceptable" in examples:
        label_sequences = []
        for label in examples["acceptable"]:
            if label == 1:
                target_sequence = POS_LABEL
            elif label == 0:
                target_sequence = NEG_LABEL
            else:
                raise ValueError("Unknown class label")
            label_sequences.append(target_sequence)

    else:
        # a hack to avoid the "You have to specify either decoder_input_ids or decoder_inputs_embeds" error
        # for test data
        label_sequences = ["" for _ in examples["sentence"]]

    result["labels"] = tokenizer(label_sequences, padding=False)["input_ids"]
    result["length"] = [len(list(tokenize(sentence))) for sentence in examples["sentence"]]
    return result

In [50]:
ACCURACY = load("accuracy", keep_in_memory=True)
MCC = load("matthews_correlation", keep_in_memory=True)


def compute_metrics(p, tokenizer):
    string_preds = tokenizer.batch_decode(p.predictions, skip_special_tokens=True)
    int_preds = [1 if prediction == POS_LABEL else 0 for prediction in string_preds]

    labels = np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id)
    string_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    int_labels = []

    for string_label in string_labels:
        if string_label == POS_LABEL:
            int_labels.append(1)
        elif string_label == NEG_LABEL or string_label == "":  # second case accounts for test data
            int_labels.append(0)
        else:
            raise ValueError()

    acc_result = ACCURACY.compute(predictions=int_preds, references=int_labels)
    mcc_result = MCC.compute(predictions=int_preds, references=int_labels)

    result = {"accuracy": acc_result["accuracy"], "mcc": mcc_result["matthews_correlation"]}

    return result


In [51]:
train, val, test = map(Dataset.from_pandas, (train_data, val_data, test_data))
data = DatasetDict(train=train, val=val, test=test)

tokenized_dataset = data.map(
    partial(preprocess_examples, tokenizer=tokenizer),
    batched=True,
    remove_columns=["sentence"],
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [52]:
# Параметры, которые будут использоваться для обучения
training_args = Seq2SeqTrainingArguments(
    output_dir='./fine-tuning_RuT5_results',  #Выходной каталог
    overwrite_output_dir=True,
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-3,
    weight_decay=1e-4,
    num_train_epochs=3,
    lr_scheduler_type="constant",
    save_strategy="epoch",
    save_total_limit=1,
    seed=10,
    fp16=True,
    dataloader_num_workers=4,
    group_by_length=True,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_mcc",
    optim="adafactor",
    predict_with_generate=True,

)

In [53]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = Seq2SeqTrainer(model=model,
                         args=training_args,
                         train_dataset=tokenized_dataset['train'],
                         eval_dataset=tokenized_dataset['val'],
                         compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
                         data_collator=data_collator,
                         )

In [54]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Mcc
1,No log,1.123655,0.2,0.0
2,No log,0.775712,0.8,0.0
3,No log,1.058987,0.2,0.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [55]:
# Проверяем метрики
print("train", train_result.metrics)

train {'train_runtime': 65.9786, 'train_samples_per_second': 0.455, 'train_steps_per_second': 0.136, 'total_flos': 2562536448000.0, 'train_loss': 3.0727820926242404, 'epoch': 3.0}


In [56]:
val_predictions = trainer.predict(
    test_dataset=trainer.eval_dataset,
    metric_key_prefix="test",
    max_length=10
)
print("val", val_predictions.metrics)


val {'test_loss': 1.3397712707519531, 'test_accuracy': 0.2, 'test_mcc': 0.0, 'test_runtime': 2.7002, 'test_samples_per_second': 3.703, 'test_steps_per_second': 1.111}


In [14]:
# Сохранение обученной модели
model_path = './fine-tune-RuT5'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./fine-tune-RuT5/tokenizer_config.json',
 './fine-tune-RuT5/special_tokens_map.json',
 './fine-tune-RuT5/spiece.model',
 './fine-tune-RuT5/added_tokens.json')

In [60]:
# Написание функции для получения предикта
def get_prediction():
    test_pred = trainer.predict(tokenized_dataset['test'])
    labels = np.argmax(test_pred.predictions, axis=-1)
    return labels

pred = get_prediction()

In [61]:
# Проверка полученного результата
print(classification_report(test['acceptable'], pred))
print(f1_score(test['acceptable'], pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      1.00      0.95         9

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10

0.9473684210526315


[1, 0, 1, 1, 1, 1, 1, 1, 1, 1]