In [16]:
import warnings

# Ignore every warning of category UserWarning from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

In [17]:
# Установка необходимых библиотек:
!pip install scikit-learn
!pip install numpy
!pip install datasets
!pip install pandas
!pip install transformers[torch]
!pip install razdel
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [18]:
import pandas as pd
import numpy as np
import random
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from functools import partial
from razdel import tokenize

In [19]:
# Fix all random seeds so that results are as reproducible as possible.
def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also puts cuDNN into deterministic mode and disables its autotuner, so
    that repeated runs do not pick different convolution algorithms.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
    # benchmark=True lets cuDNN choose kernels by timing them, which can vary
    # between runs; deterministic=True restricts it to deterministic kernels.
    # These are plain flags, so setting them without a GPU is harmless.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_all(1234)

In [20]:
# Load the data
# NOTE(review): absolute, machine-specific path; consider making it configurable.
BASE_DIR = "/home/alexandr/PycharmProjects/NLP_homework/homework3"
data = pd.read_csv("in_domain_train.csv", usecols=['sentence', 'acceptable'])
test_data = pd.read_csv("in_domain_dev.csv")[['sentence', 'acceptable']]

# An explicit random_state keeps the train/validation split identical on every
# run, independent of how much global NumPy randomness was consumed before
# this cell (e.g. when cells are re-run or executed out of order).
train_data, val_data = train_test_split(data, test_size=0.2, random_state=1234)
# drop=True discards the shuffled original index instead of adding it as a column.
train_data = train_data.reset_index(drop=True)[['sentence', 'acceptable']]
val_data = val_data.reset_index(drop=True)[['sentence', 'acceptable']]

# Новый раздел

In [14]:
# Display the training split as the cell output.
train_data

Unnamed: 0,sentence,acceptable
0,Приближался нечеловеческие рев и топот.,0
1,В котором часу завтра утром начинается конфере...,1
2,Она сидела на диване рядом с мужем.,1
3,"Это были студенты, сами сведущие в эскимосском...",1
4,"При всем том я оптимист и думаю, что мой конфл...",1
...,...,...
6290,Она хочет поговорить с каким-нибудь хорошим сп...,1
6291,"При расставании я целовал три раза чудесные, с...",1
6292,"Парк в старом русле реки Турии, раскинувшиеся ...",0
6293,Страны НАТО хотят взять с Ирана обязательство ...,0


In [22]:
# Load the pretrained model and its tokenizer
model_checkpoint = "sberbank-ai/ruT5-large"
# No `num_labels` argument: the model is a text-to-text generator and the
# class is produced as the generated string "0"/"1", not by a classification head.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# Move the model to the GPU when one is available, otherwise keep it on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

pytorch_model.bin:  50%|####9     | 1.47G/2.95G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [23]:
# Preprocessing function applied to batches of examples
def preprocess_examples(examples, tokenizer):
    """Tokenize a batch of sentences and build text targets for the model.

    Args:
        examples: Batch mapping with a "sentence" list and, optionally, an
            "acceptable" list of 0/1 labels.
        tokenizer: Callable tokenizer; called on a list of strings and
            expected to return a mapping that contains "input_ids".

    Returns:
        The tokenizer output for the sentences, extended with:
        "labels" - token ids of the label rendered as text ("0" / "1", or an
        empty string when the batch has no "acceptable" field);
        "length" - number of razdel tokens in each sentence.

    Raises:
        ValueError: If a label is neither 0 nor 1.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, padding=False)

    if "acceptable" not in examples:
        # Unlabelled data still needs targets; empty strings avoid the
        # "You have to specify either decoder_input_ids or decoder_inputs_embeds" error.
        targets = ["" for _ in sentences]
    else:
        targets = []
        for label in examples["acceptable"]:
            if label not in [0, 1]:
                raise ValueError("Unknown class label")
            targets.append(str(label))

    encoded["labels"] = tokenizer(targets, padding=False)["input_ids"]
    encoded["length"] = [len(list(tokenize(text))) for text in sentences]
    return encoded

In [24]:

# Wrap each pandas split in a `Dataset` and gather them into one DatasetDict.
train = Dataset.from_pandas(train_data)
val = Dataset.from_pandas(val_data)
test = Dataset.from_pandas(test_data)
data = DatasetDict(train=train, val=val, test=test)

# Tokenize every split in batches; the raw "sentence" column is dropped afterwards.
tokenized_dataset = data.map(
    partial(preprocess_examples, tokenizer=tokenizer),
    remove_columns=["sentence"],
    batched=True,
)

Map:   0%|          | 0/6295 [00:00<?, ? examples/s]

Map:   0%|          | 0/1574 [00:00<?, ? examples/s]

Map:   0%|          | 0/983 [00:00<?, ? examples/s]

In [18]:
# Metric computation for per-class scores (argmax over axis 1 gives the class).
# NOTE(review): a later cell defines another `compute_metrics` with a different
# signature; once that cell runs, this definition is shadowed.
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 from per-class scores.

    Args:
        p: Pair `(scores, labels)`; the predicted class is the argmax of
            `scores` along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred),
        "recall": recall_score(y_true=labels, y_pred=pred),
        "f1": f1_score(y_true=labels, y_pred=pred),
    }

In [25]:
ACCURACY = load("accuracy", keep_in_memory=True)
MCC = load("matthews_correlation", keep_in_memory=True)


# Metric computation for generated (text) predictions
def compute_metrics(p, tokenizer):
    """Compute accuracy and Matthews correlation from generated token ids.

    Args:
        p: Evaluation result exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 marking padding).
        tokenizer: Tokenizer used to decode ids back to text; must provide
            `batch_decode` and `pad_token_id`.

    Returns:
        Dict with the keys "accuracy" and "mcc".

    Raises:
        ValueError: If a decoded reference label is neither "0" nor "1".
    """
    # -100 is not a valid token id and cannot be decoded, so replace it with
    # the pad id first. Predictions are handled the same way as labels because
    # gathered generations may also be padded with -100.
    pred_ids = np.where(p.predictions != -100, p.predictions, tokenizer.pad_token_id)
    string_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # The model generates free text: anything other than "1" is counted as
    # class 0 instead of crashing on int("...") for an unexpected output.
    int_preds = [1 if prediction.strip() == "1" else 0 for prediction in string_preds]

    label_ids = np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id)
    string_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    int_labels = []
    for string_label in string_labels:
        normalized = string_label.strip()
        # Decoded labels are strings, so they must be compared with "0"/"1".
        if normalized not in ("0", "1"):
            raise ValueError(f"Unknown reference label: {string_label!r}")
        int_labels.append(int(normalized))

    acc_result = ACCURACY.compute(predictions=int_preds, references=int_labels)
    mcc_result = MCC.compute(predictions=int_preds, references=int_labels)

    return {"accuracy": acc_result["accuracy"], "mcc": mcc_result["matthews_correlation"]}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

In [32]:
# Training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir='./fine-tuning_RuT5_results',  # Output directory
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=16,  # Per-device batch size during training
    per_device_eval_batch_size=8,  # Per-device batch size during evaluation
    weight_decay=5e-2,  # Weight decay
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    learning_rate=2e-5,  # Learning rate
    eval_strategy='epoch',  # Evaluate after every epoch (a step-based schedule is also possible)
    logging_strategy='epoch',  # Log after every epoch
    save_strategy='epoch',  # Save a checkpoint after every epoch
    gradient_accumulation_steps=16,  # Accumulate gradients over 16 batches per optimizer step
    save_total_limit=1,
    # fp16 mixed precision needs a GPU; enabling it unconditionally makes the
    # arguments fail validation on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    group_by_length=True,
    report_to="none",
    metric_for_best_model="eval_mcc",  # Select the best checkpoint by validation MCC
    optim="adafactor",
    predict_with_generate=True,  # Metrics are computed on generated sequences
)

In [33]:

# Collator for the tokenized seq2seq examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
trainer = Seq2SeqTrainer(model=model,
                         args=training_args,
                         train_dataset=tokenized_dataset['train'],
                         eval_dataset=tokenized_dataset['val'],
                         # The trainer calls compute_metrics with a single
                         # argument, so the tokenizer has to be bound here.
                         compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
                         data_collator=data_collator,
                         )

In [31]:
# Run fine-tuning with the trainer configured above.
trainer.train()

InductorError: CppCompileError: C++ compile error

Command:
g++ /tmp/torchinductor_alexandr/mh/cmhvw2g25ue5wt335rhkxurdtvuworah5ayxwbe6vgbffpjgkmue.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX2 -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -fopenmp -I/home/alexandr/miniconda3/include/python3.12 -I/home/alexandr/miniconda3/lib/python3.12/site-packages/torch/include -I/home/alexandr/miniconda3/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -mavx2 -mfma -mf16c -D_GLIBCXX_USE_CXX11_ABI=1 -ltorch -ltorch_cpu -ltorch_python -lgomp -L/home/alexandr/miniconda3/lib -L/home/alexandr/miniconda3/lib/python3.12/site-packages/torch/lib -o /tmp/torchinductor_alexandr/mh/cmhvw2g25ue5wt335rhkxurdtvuworah5ayxwbe6vgbffpjgkmue.so

Output:
/tmp/torchinductor_alexandr/mh/cmhvw2g25ue5wt335rhkxurdtvuworah5ayxwbe6vgbffpjgkmue.cpp: In function 'void kernel(const int64_t*, const float*, const float*, const float*, const float*, const float*, const float*, float*, float*)':
/tmp/torchinductor_alexandr/mh/cmhvw2g25ue5wt335rhkxurdtvuworah5ayxwbe6vgbffpjgkmue.cpp:29:23: error: redeclaration of 'float tmp_acc0_arr [8]'
   29 |                 float tmp_acc0_arr[8];
      |                       ^~~~~~~~~~~~
/tmp/torchinductor_alexandr/mh/cmhvw2g25ue5wt335rhkxurdtvuworah5ayxwbe6vgbffpjgkmue.cpp:17:23: note: 'float tmp_acc0_arr [8]' previously declared here
   17 |                 float tmp_acc0_arr[8];
      |                       ^~~~~~~~~~~~


In [None]:
# Evaluate on the validation split (val_data) and report accuracy
eval_results = trainer.evaluate()
val_accuracy = eval_results["eval_accuracy"]
print(f'Точность в наборе для проверки: {val_accuracy:.3f}')

In [None]:
# Helper that produces label predictions for the test split
def get_prediction():
    """Predict on the test split and decode the output into 0/1 labels.

    The training arguments enable `predict_with_generate`, so `predictions`
    holds generated token ids rather than per-class scores; an argmax over
    them does not yield a class. The ids are decoded to text and mapped to
    integer labels instead.

    Returns:
        np.ndarray of ints, one per test example: 1 when the generated text
        is "1", otherwise 0.
    """
    test_pred = trainer.predict(tokenized_dataset['test'])
    # -100 is not a valid token id; replace it with the pad id before decoding.
    token_ids = np.where(test_pred.predictions != -100,
                         test_pred.predictions,
                         tokenizer.pad_token_id)
    decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    return np.array([1 if text.strip() == "1" else 0 for text in decoded])


pred = get_prediction()

In [None]:
# Compare the predictions with the gold labels of the test set.
# The metrics need the label column, not the whole tokenized Dataset object.
y_true = test_data['acceptable']
print(classification_report(y_true, pred))
print(f1_score(y_true, pred))

In [None]:
# Persist the fine-tuned model together with its tokenizer in one directory
model_path = f"{BASE_DIR}/fine-tune-RuBert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Copy of the test frame with the predictions attached as a "pred" column
data_pred = test_data.assign(pred=pred)
data_pred

Результат получился неплохим (F1 = 0.863); данную модель можно улучшить, подбирая гиперпараметры.