# RuT5

Скачаем и используем [ai-forever/ruT5-base](https://huggingface.co/ai-forever/ruT5-base/tree/main)

«…Обучите и протестируйте модель RuT5 на данной задаче (пример finetun’а можете найти здесь https://github.com/RussianNLP/RuCoLA/blob/main/baselines/finetune_t5.py)…»

Импорты библиотек

In [1]:
import os
from functools import partial
from pathlib import Path

import pandas as pd
import numpy as np
from datasets import load_metric, Dataset, DatasetDict
from razdel import tokenize
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5Tokenizer,
    T5ForConditionalGeneration,
)

import warnings
warnings.filterwarnings("ignore")

Зададим глобальные параметры.

In [2]:
# Location of the pretrained ruT5-base checkpoint; train() passes it to
# T5ForConditionalGeneration.from_pretrained().
PATH = r'H:\Инструменты\Windows\GPT or another LLM\ruT5-base 2021'
# Metric objects used by compute_metrics().
ACCURACY = load_metric("accuracy", keep_in_memory=True)
MCC = load_metric("matthews_correlation", keep_in_memory=True)
# Target strings for the two classes: preprocess_data() maps label 1 to
# POS_LABEL and label 0 to NEG_LABEL.
POS_LABEL = "грамотно"
NEG_LABEL = "неграмотно"

# The CSV splits are resolved relative to the current working directory.
CURRENT_DIR = Path('.')
DATA_DIR = CURRENT_DIR
TRAIN_FILE = DATA_DIR / "in_domain_train_subset.csv"
IN_DOMAIN_DEV_FILE = DATA_DIR / "in_domain_validation_subset.csv"
TEST_FILE = DATA_DIR / "in_domain_test.csv"

# Training setup: train() runs once per (learning rate, weight decay) pair
# and uses BATCH_SIZE as both the per-device train and eval batch size.
N_EPOCHS = 10
LR_VALUES = (1e-4,) # 1e-3)
DECAY_VALUES = (0,) # 1e-4)
BATCH_SIZE = 128

Необходимые определения.

In [3]:
def compute_metrics(p, tokenizer):
    """Decode predicted and reference label strings and score them.

    Args:
        p: Evaluation output exposing ``predictions`` and ``label_ids`` as
            arrays of token ids. ``-100`` entries are treated as padding.
        tokenizer: Tokenizer used to decode both arrays back to text.

    Returns:
        dict: ``{"accuracy": ..., "mcc": ...}`` computed over the binary
        labels (1 for POS_LABEL, 0 otherwise).

    Raises:
        ValueError: If a decoded reference is neither POS_LABEL, NEG_LABEL
            nor the empty string.
    """
    # -100 is not a vocabulary id, so swap it for the pad token before
    # decoding. This is defensive: the labels below get the same treatment.
    pred_ids = np.where(p.predictions != -100, p.predictions, tokenizer.pad_token_id)
    string_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # Any output that is not exactly POS_LABEL counts as the negative class.
    int_preds = [1 if prediction == POS_LABEL else 0 for prediction in string_preds]

    labels = np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id)
    string_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    int_labels = []

    for string_label in string_labels:
        if string_label == POS_LABEL:
            int_labels.append(1)
        elif string_label in (NEG_LABEL, ""):
            # "" is the placeholder target preprocess_data() emits for
            # unlabeled (test) data; it is scored as the negative class.
            int_labels.append(0)
        else:
            raise ValueError(f"Unexpected reference label: {string_label!r}")

    acc_result = ACCURACY.compute(predictions=int_preds, references=int_labels)
    mcc_result = MCC.compute(predictions=int_preds, references=int_labels)

    return {"accuracy": acc_result["accuracy"], "mcc": mcc_result["matthews_correlation"]}


def preprocess_data(sentences, tokenizer):
    """Tokenize a batch of sentences and build the text targets for T5.

    Args:
        sentences: Batch mapping column names to lists of values. Must
            contain ``"sentence"``; ``"acceptable"`` (0/1 labels) is optional.
        tokenizer: Tokenizer applied to both the inputs and the targets.

    Returns:
        The tokenizer output for the sentences, extended with:
        ``"labels"`` - token ids of the target string for each sentence, and
        ``"length"`` - the number of razdel tokens in each sentence
        (presumably consumed by ``group_by_length=True`` - TODO confirm).

    Raises:
        ValueError: If an ``"acceptable"`` value is neither 0 nor 1.
    """
    texts = sentences["sentence"]
    result = tokenizer(texts, padding=False)

    if "acceptable" in sentences:
        label_sequences = []
        for label in sentences["acceptable"]:
            if label == 1:
                target_sequence = POS_LABEL
            elif label == 0:
                target_sequence = NEG_LABEL
            else:
                raise ValueError("Unknown class label")
            label_sequences.append(target_sequence)
    else:
        # a hack to avoid the "You have to specify either decoder_input_ids or decoder_inputs_embeds" error
        # for test data.
        # One placeholder per sentence: iterating the batch itself would
        # yield its column names and give the wrong number of labels.
        label_sequences = ["" for _ in texts]

    result["labels"] = tokenizer(label_sequences, padding=False)["input_ids"]
    result["length"] = [len(list(tokenize(sentence))) for sentence in texts]
    return result


def read_splits(*, as_datasets):
    """Load the train, dev and test CSV files.

    Args:
        as_datasets: Keyword-only. If truthy, wrap the splits in a
            ``DatasetDict`` with keys ``"train"``, ``"dev"`` and ``"test"``;
            otherwise return the raw data frames.

    Returns:
        A ``DatasetDict`` or a ``(train_df, dev_df, test_df)`` tuple.
    """
    csv_paths = (TRAIN_FILE, IN_DOMAIN_DEV_FILE, TEST_FILE)
    # Only the in-domain validation file is used as the dev split.
    frames = tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

    if not as_datasets:
        return frames

    split_names = ("train", "dev", "test")
    return DatasetDict(
        {name: Dataset.from_pandas(frame) for name, frame in zip(split_names, frames)}
    )


def train(tokenizer, data_collator, tokenized_splits):
    """Fine-tune ruT5 for every (learning rate, weight decay) pair and score it on dev.

    For each combination of LR_VALUES x DECAY_VALUES a fresh model is loaded
    from PATH, trained for N_EPOCHS, and evaluated on the ``"dev"`` split.
    The per-run dev metrics are saved to ``results_agg/T5_dev.npy`` as an
    array of shape ``(len(LR_VALUES), len(DECAY_VALUES), 2)`` whose last
    axis is ``(accuracy, mcc)``.

    Args:
        tokenizer: Tokenizer handed to the trainer and to compute_metrics().
        data_collator: Collator used to batch the tokenized examples.
        tokenized_splits: Mapping with at least ``"train"`` and ``"dev"``
            tokenized datasets.

    Returns:
        The ``Seq2SeqTrainer`` of the last run in the grid.
    """
    # Last axis: (accuracy, mcc) of the dev split for that grid point.
    dev_metrics_per_run = np.empty((len(LR_VALUES), len(DECAY_VALUES), 2))

    for i, learning_rate in enumerate(LR_VALUES):
        for j, weight_decay in enumerate(DECAY_VALUES):
            # Every run starts from the same pretrained weights.
            model = T5ForConditionalGeneration.from_pretrained(PATH)

            run_base_dir = f"ruT5_{learning_rate}_{weight_decay}_{BATCH_SIZE}"

            training_args = Seq2SeqTrainingArguments(
                output_dir=f"checkpoints/{run_base_dir}",
                overwrite_output_dir=True,
                evaluation_strategy="epoch",
                per_device_train_batch_size=BATCH_SIZE,
                per_device_eval_batch_size=BATCH_SIZE,
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                num_train_epochs=N_EPOCHS,
                lr_scheduler_type="constant",
                save_strategy="epoch",
                save_total_limit=1,
                seed=1,
                fp16=True,
                dataloader_num_workers=4,
                group_by_length=True,
                report_to="none",
                load_best_model_at_end=True,
                metric_for_best_model="eval_mcc",
                optim="adafactor",
                predict_with_generate=True,
            )

            trainer = Seq2SeqTrainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_splits["train"],
                eval_dataset=tokenized_splits["dev"],
                compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
                tokenizer=tokenizer,
                data_collator=data_collator,
            )

            train_result = trainer.train()
            print(f"{run_base_dir}")
            print("train", train_result.metrics)

            os.makedirs(f"results/{run_base_dir}", exist_ok=True)

            dev_predictions = trainer.predict(
                test_dataset=tokenized_splits["dev"], metric_key_prefix="validation", max_length=10
            )
            print("dev", dev_predictions.metrics)
            # Store both metrics. Assigning a 1-tuple here would broadcast
            # the MCC into both slots and silently lose the accuracy.
            dev_metrics_per_run[i, j] = (
                dev_predictions.metrics["validation_accuracy"],
                dev_predictions.metrics["validation_mcc"],
            )

            # NOTE: checkpoints under checkpoints/{run_base_dir} are left on disk.

    os.makedirs("results_agg", exist_ok=True)
    np.save("results_agg/T5_dev.npy", dev_metrics_per_run)

    return trainer

## Загрузим tokenizer

In [4]:
# The tokenizer is loaded by its model id, while the model weights are
# loaded from PATH inside train().
tokenizer = T5Tokenizer.from_pretrained("ai-forever/ruT5-base")
# Optional: store a copy of the tokenizer at PATH.
#tokenizer.save_pretrained(PATH)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Загрузим и подготовим наборы данных

In [5]:
# Load the train / dev / test CSV files as a DatasetDict.
splits_data = read_splits(as_datasets=True)

In [6]:
# Tokenize every split in batches; the raw "sentence" column is dropped
# afterwards, while the columns added by preprocess_data() are kept.
tokenized_splits_data = splits_data.map(
    partial(preprocess_data, tokenizer=tokenizer),
    batched=True,
    remove_columns=["sentence"],
)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# preprocess_data() tokenizes with padding=False, so padding is left to the
# collator, configured here with pad_to_multiple_of=8.
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)

## Тренировка модели (fine-tune)

In [10]:
# Run the fine-tuning grid; `trainer` is the trainer of the last run.
trainer = train(tokenizer, data_collator, tokenized_splits_data)

  0%|          | 0/550 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.24565842747688293, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 21.8884, 'eval_samples_per_second': 43.174, 'eval_steps_per_second': 0.365, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.22860047221183777, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 22.585, 'eval_samples_per_second': 41.842, 'eval_steps_per_second': 0.354, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.23818540573120117, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 21.9671, 'eval_samples_per_second': 43.019, 'eval_steps_per_second': 0.364, 'epoch': 3.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.23778873682022095, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 21.9997, 'eval_samples_per_second': 42.955, 'eval_steps_per_second': 0.364, 'epoch': 4.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.22185108065605164, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 22.1836, 'eval_samples_per_second': 42.599, 'eval_steps_per_second': 0.361, 'epoch': 5.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.25726646184921265, 'eval_accuracy': 0.744973544973545, 'eval_mcc': 0.0, 'eval_runtime': 22.1499, 'eval_samples_per_second': 42.664, 'eval_steps_per_second': 0.361, 'epoch': 6.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.22308388352394104, 'eval_accuracy': 0.7439153439153439, 'eval_mcc': 0.21379903267059114, 'eval_runtime': 22.4142, 'eval_samples_per_second': 42.161, 'eval_steps_per_second': 0.357, 'epoch': 7.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.2908986210823059, 'eval_accuracy': 0.762962962962963, 'eval_mcc': 0.22472282307392621, 'eval_runtime': 22.5869, 'eval_samples_per_second': 41.838, 'eval_steps_per_second': 0.354, 'epoch': 8.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.28922784328460693, 'eval_accuracy': 0.762962962962963, 'eval_mcc': 0.22472282307392621, 'eval_runtime': 22.2184, 'eval_samples_per_second': 42.532, 'eval_steps_per_second': 0.36, 'epoch': 9.0}
{'loss': 0.4504, 'grad_norm': 0.44199422001838684, 'learning_rate': 0.0001, 'epoch': 9.09}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.2202126681804657, 'eval_accuracy': 0.7481481481481481, 'eval_mcc': 0.2587823105823372, 'eval_runtime': 22.1754, 'eval_samples_per_second': 42.615, 'eval_steps_per_second': 0.361, 'epoch': 10.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 547.8976, 'train_samples_per_second': 126.374, 'train_steps_per_second': 1.004, 'train_loss': 0.42816899386319246, 'epoch': 10.0}
ruT5_0.0001_0_128
train {'train_runtime': 547.8976, 'train_samples_per_second': 126.374, 'train_steps_per_second': 1.004, 'total_flos': 2080047914680320.0, 'train_loss': 0.42816899386319246, 'epoch': 10.0}


  0%|          | 0/8 [00:00<?, ?it/s]

dev {'validation_loss': 0.22064456343650818, 'validation_accuracy': 0.7481481481481481, 'validation_mcc': 0.2587823105823372, 'validation_runtime': 21.7138, 'validation_samples_per_second': 43.521, 'validation_steps_per_second': 0.368}


## Проверим результат на тестовом наборе данных

In [11]:
# Generate predictions for the test split and report its metrics.
predictions = trainer.predict(test_dataset=tokenized_splits_data["test"], max_length=10)
print("test", predictions.metrics)

# -100 is not a vocabulary id; replace it with the pad token before decoding
# (defensive, in case the returned ids carry -100 padding).
pred_ids = np.where(
    predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id
)
string_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

# Any output that is not exactly POS_LABEL counts as the negative class.
int_preds = np.asarray([1 if prediction == POS_LABEL else 0 for prediction in string_preds])

# Do not rely on train() having created the output directory.
os.makedirs("results", exist_ok=True)
np.save("results/preds.npy", int_preds)


  0%|          | 0/8 [00:00<?, ?it/s]

test {'test_loss': 0.20538544654846191, 'test_accuracy': 0.7721261444557477, 'test_mcc': 0.3281330304508942, 'test_runtime': 21.8893, 'test_samples_per_second': 44.908, 'test_steps_per_second': 0.365}
