## Дообучение модели на датасете tico-2019


## 1. Парсинг файла и создание датасета

#### Из корпуса параллельных текстов был загружен JSON-файл, на основе которого создан датасет для дообучения модели

In [1]:
!pip install datasets



In [2]:
import pandas as pd

splits = {'train': 'tico19_train.json', 'test': 'tico19_test.json'}
df = pd.read_json("hf://datasets/glazzova/tico19_en_ru/" + splits["train"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from datasets import Dataset, DatasetDict

# Key every sentence pair by its DataFrame index label plus one, as a string.
data_dict = {
    str(index + 1): {'en': record['en'], 'ru': record['ru']}
    for index, record in df.iterrows()
}

# Reshape into the {"id", "translation"} records used for translation datasets.
formatted_data = [
    {'id': pair_id, 'translation': pair}
    for pair_id, pair in data_dict.items()
]

# Wrap the records in a Hugging Face Dataset exposed as a single "train" split.
dataset = Dataset.from_list(formatted_data)
dataset_dict = DatasetDict({'train': dataset})

# Show the resulting splits, features and row counts.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 2763
    })
})


In [4]:
# Inspect the first record to see what the dataset entries look like
dataset_dict['train'][0]

{'id': '1',
 'translation': {'en': 'It stated that much was yet to be discovered about COVID-19 , and that Australia would emphasize border control and communication in its response to the pandemic .',
  'ru': 'В этой связи говорится о том , что многое еще предстоит выяснить о COVID-19 , и что Австралия будет уделять особое внимание пограничному контролю и коммуникациям в ситуации с угрозой .'}}

In [5]:
# Split the dataset into training (90%) and test subsets; the seed is fixed
# so the split is reproducible
split_datasets = dataset_dict["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 2486
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 277
    })
})

In [6]:
split_datasets["validation"] = split_datasets.pop("test")

In [7]:
split_datasets["train"][3]["translation"]

{'en': 'In this letter , we answered one comment on our guideline and provided the newest diagnostic criteria of “ suspected case ” and “ confirmed case ” according to the latest Diagnosis and Treatment Guidelines for COVID-19 ( seventh version ) that issued by the National Health Committee of the People ’ s Republic of China .',
 'ru': 'В этом письме мы ответили на один комментарий относительно наших рекомендаций и приводим новейшие диагностические критерии для « случаев подозрения на заболевание » и « подтвержденных случаев » в соответствии с документом « Рекомендации по диагностике и терапии для COVID-19 » ( седьмая версия ) , выпущенным Национальным комитетом по здравоохранению КНР .'}

## 2. Токенизация

In [8]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
translator = pipeline("translation", model=model_checkpoint)

Device set to use cuda:0


In [9]:
translator(
    "Even plasma from recovered patients was proposed to be used for treatment ."
)

[{'translation_text': 'Было предложено использовать для лечения даже плазму из рекуперированных пациентов.'}]

In [10]:
translator(
    "and are you having a runny nose ?"
)

[{'translation_text': 'А у тебя нытик?'}]

In [11]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [12]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
ru_sentence = split_datasets["train"][1]["translation"]["ru"]

inputs = tokenizer(en_sentence, text_target=ru_sentence)
inputs

{'input_ids': [86, 5732, 1747, 10, 2324, 5, 7010, 21, 26851, 7245, 44237, 8, 21, 20459, 27661, 44237, 1179, 211, 2083, 10, 318, 53132, 20, 6575, 10888, 1200, 5171, 21, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [6716, 350, 5915, 29646, 57, 21, 20391, 3972, 2151, 1369, 7, 21, 6301, 4702, 2151, 1369, 6, 17657, 26, 20165, 70, 6279, 21103, 95, 17177, 21, 2, 343, 19539, 1483, 10759, 6, 226, 16, 6575, 10888, 1200, 5171, 21, 3, 0]}

In [13]:
wrong_targets = tokenizer(ru_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁', 'П', 'р', 'е', 'д', 'в', 'а', 'р', 'и', 'т', 'е', 'л', 'ь', 'н', 'о', 'е', '▁', 'и', 'с', 'с', 'л', 'е', 'д', 'о', 'в', 'а', 'н', 'и', 'е', '▁', 'к', 'о', 'м', 'б', 'и', 'н', 'и', 'р', 'о', 'в', 'а', 'н', 'н', 'о', 'г', 'о', '▁', 'л', 'о', 'п', 'и', 'н', 'а', 'в', 'и', 'р', 'а', '▁', 'и', '▁', 'р', 'и', 'т', 'о', 'н', 'а', 'в', 'и', 'р', 'а', '▁', 'в', '▁', 'К', 'и', 'т', 'а', 'е', '▁', 'н', 'е', '▁', 'в', 'ы', 'я', 'в', 'и', 'л', 'о', '▁', 'н', 'и', 'к', 'а', 'к', 'о', 'г', 'о', '▁', 'э', 'ф', 'ф', 'е', 'к', 'т', 'а', '▁', 'у', '▁', 'п', 'а', 'ц', 'и', 'е', 'н', 'т', 'о', 'в', '▁', ',', '▁', 'г', 'о', 'с', 'п', 'и', 'т', 'а', 'л', 'и', 'з', 'и', 'р', 'о', 'в', 'а', 'н', 'н', 'ы', 'х', '▁', 'в', '▁', 'с', 'в', 'я', 'з', 'и', '▁', 'с', '▁CO', 'VI', 'D', '-19', '▁', '.', '</s>']
['▁Предварительн', 'ое', '▁исследование', '▁комбинированн', 'ого', '▁', 'лоп', 'ина', 'ви', 'ра', '▁и', '▁', 'рит', 'она', 'ви', 'ра', '▁в', '▁Китае', '▁не', '▁выявил', 'о', '▁никакого', '▁эффекта', '▁у', '

In [14]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English -> Russian translation pairs.

    Args:
        examples: A batch whose ``"translation"`` entry is a sequence of
            dicts with an ``"en"`` (source) and a ``"ru"`` (target) string.

    Returns:
        The tokenizer output for the batch. The English sentences are the
        model inputs and the Russian sentences are passed as ``text_target``
        so that they are encoded as labels. Both sides are truncated to the
        notebook-level ``max_length``.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["ru"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

In [15]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/2486 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

## 3. Обучение модели на новом наборе данных

In [16]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [17]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [19]:
batch["labels"]

tensor([[ 6716,   350,  5915, 29646,    57,    21, 20391,  3972,  2151,  1369,
             7,    21,  6301,  4702,  2151,  1369,     6, 17657,    26, 20165,
            70,  6279, 21103,    95, 17177,    21,     2,   343, 19539,  1483,
         10759,     6,   226,    16,  6575, 10888,  1200,  5171,    21,     3,
             0],
        [ 8051,  1513,  2882, 26818,    46, 29834,    46,    21,     2,  6575,
         10888,  1200,  5171,   306, 14972, 10604,  8765, 21365,    42,  4421,
         10997,  1234, 12756, 10610,    52,  1276,  6157, 17177,    21,     3,
             0,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100]])

## 4. Метрики оценки качества перевода

In [20]:
!pip install sacrebleu evaluate



In [21]:
import evaluate
metric = evaluate.load("sacrebleu")

In [22]:
predictions = [
    "А у тебя нытик?"
]
references = [
    [
        "А нет ли у тебя насморка?"
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 17.030578356760866,
 'counts': [4, 1, 0, 0],
 'totals': [5, 4, 3, 2],
 'precisions': [80.0, 25.0, 16.666666666666668, 12.5],
 'bp': 0.6703200460356393,
 'sys_len': 5,
 'ref_len': 7}

#### Оценка BLEU обычно находится в диапазоне от 0 до 100, где 100 означает идеальное совпадение с эталонным текстом. Точные значения, считающиеся хорошими или плохими, могут различаться в зависимости от области применения и требований задачи.



## 5. Подготовка выходных текстов модели для расчёта метрики качества перевода

In [23]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a batch of evaluation predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            passed by the trainer. ``predictions`` may itself be a tuple, in
            which case only its first element is used.

    Returns:
        A dict with a single ``"bleu"`` entry holding the corpus-level
        SacreBLEU score.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker that the tokenizer cannot decode.
    # It is always present in padded labels, and it can also show up in
    # predictions when batches of different lengths are concatenated, so
    # both arrays are sanitised before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: the metric expects a list of references
    # per prediction, hence the one-element lists for the labels.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

## 6. Аутентификация в сервисе Hugging Face для загрузки результатов работы с моделью в Model Hub

In [24]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
# pip install accelerate -U

Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers[torch]
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Coll

In [25]:
pip install transformers[torch]



In [26]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for fine-tuning the translation model.
args = Seq2SeqTrainingArguments(
    # Output directory (a plain string: no f-string is needed because
    # nothing is interpolated).
    "AminHumara/med_translation_model",
    eval_strategy="no",  # no automatic evaluation; it is run manually
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,  # evaluate on generated text (needed for BLEU)
    fp16=True,
    push_to_hub=True,
)

In [28]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from huggingface_hub import create_repo
create_repo("AminHumara/med_translation_model", repo_type="model")

RepoUrl('https://huggingface.co/AminHumara/med_translation_model', endpoint='https://huggingface.co', repo_type='model', repo_id='AminHumara/med_translation_model')

In [30]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, the training arguments, the tokenized
# splits, the padding collator and the BLEU metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Seq2SeqTrainer.__init__ and triggers a
    # FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [61]:
trainer.evaluate(max_length=max_length)



{'eval_loss': 1.1242144107818604,
 'eval_model_preparation_time': 0.0032,
 'eval_bleu': 8.758387664824042,
 'eval_runtime': 41.4825,
 'eval_samples_per_second': 6.678,
 'eval_steps_per_second': 0.121,
 'epoch': 3.0}

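#### The baseline BLEU score of the pretrained model should be read from this cell. Note that the output shown above (BLEU 8.76, `'epoch': 3.0`, execution count 61) was captured when the cell was re-run after training, so it does not reflect the originally reported baseline of 26. To reproduce the baseline, run this cell before `trainer.train()`.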

In [33]:
trainer.train()

Step,Training Loss




TrainOutput(global_step=234, training_loss=1.2440285967965412, metrics={'train_runtime': 81.4193, 'train_samples_per_second': 91.6, 'train_steps_per_second': 2.874, 'total_flos': 181049578094592.0, 'train_loss': 1.2440285967965412, 'epoch': 3.0})

In [34]:
trainer.evaluate(max_length=max_length)



{'eval_loss': 1.1242144107818604,
 'eval_model_preparation_time': 0.0032,
 'eval_bleu': 8.758387664824042,
 'eval_runtime': 40.9605,
 'eval_samples_per_second': 6.763,
 'eval_steps_per_second': 0.122,
 'epoch': 3.0}

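That would be a nearly 5-point improvement, which is not bad. (Note: the two evaluation outputs shown in this notebook are identical, because the pre-training evaluation cell was re-run after training, so the size of the improvement cannot be verified from the outputs above.)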

In [35]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1740470975.c4fe3923ab5f.4942.0:   0%|          | 0.00/6.99k [00:00<?, ?B/s]

events.out.tfevents.1740471222.c4fe3923ab5f.4942.1:   0%|          | 0.00/473 [00:00<?, ?B/s]

RuntimeError: Error while uploading 'runs/Feb25_08-07-13_c4fe3923ab5f/events.out.tfevents.1740470975.c4fe3923ab5f.4942.0' to the Hub.

## 7. Подготовка к обучению

In [36]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [37]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [38]:
# `transformers.AdamW` is deprecated and no longer available in recent
# releases of the library; PyTorch's own implementation is the replacement.
from torch.optim import AdamW

# eps and weight_decay are passed explicitly to keep the defaults of the
# deprecated transformers optimizer (PyTorch defaults to 1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [39]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [40]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [41]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "med_translation_model"
repo_name = get_full_repo_name(model_name)
repo_name

'AminHumara/med_translation_model'

In [42]:
output_dir = "med_translation_model"
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/AminHumara/med_translation_model into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/291M [00:00<?, ?B/s]

Download file runs/Feb25_08-07-13_c4fe3923ab5f/events.out.tfevents.1740470975.c4fe3923ab5f.4942.0: 100%|######…

Download file training_args.bin: 100%|##########| 5.37k/5.37k [00:00<?, ?B/s]

Download file source.spm:   4%|4         | 32.0k/784k [00:00<?, ?B/s]

Download file target.spm:   3%|3         | 32.0k/1.03M [00:00<?, ?B/s]

Clean file runs/Feb25_08-07-13_c4fe3923ab5f/events.out.tfevents.1740470975.c4fe3923ab5f.4942.0:  15%|#5       …

Clean file training_args.bin:  19%|#8        | 1.00k/5.37k [00:00<?, ?B/s]

Clean file source.spm:   0%|          | 1.00k/784k [00:00<?, ?B/s]

Clean file target.spm:   0%|          | 1.00k/1.03M [00:00<?, ?B/s]

Clean file model.safetensors:   0%|          | 1.00k/291M [00:00<?, ?B/s]

In [43]:
def postprocess(predictions, labels):
    """Decode generated and reference token ids into stripped strings.

    Args:
        predictions: Tensor of generated token ids.
        labels: Tensor of reference token ids, padded with -100.

    Returns:
        A ``(decoded_preds, decoded_labels)`` pair: a list of prediction
        strings and a list of one-element lists of reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 marks padded label positions and cannot be decoded, so swap it
    # for the tokenizer's pad token before decoding.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Trim surrounding whitespace; each reference is wrapped in its own list.
    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

In [44]:
from tqdm.auto import tqdm
import torch

# Custom training loop: for every epoch, train on the whole training set,
# compute BLEU on the validation set, then save and push the model.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            # generate() is called on the unwrapped model
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                # Reuse the notebook-level limit instead of a hard-coded 128
                # so generation stays in sync with preprocessing.
                max_length=max_length,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        # Decode to text and accumulate the batch in the metric
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute the score over everything accumulated during this epoch
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process saves the tokenizer and pushes to the Hub
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False: the push is requested without blocking the loop
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/933 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


  0%|          | 0/35 [00:00<?, ?it/s]



epoch 0, BLEU score: 14.47


  0%|          | 0/35 [00:00<?, ?it/s]



epoch 1, BLEU score: 14.76


  0%|          | 0/35 [00:00<?, ?it/s]



epoch 2, BLEU score: 15.57


## 8. Использование модели

In [59]:
# Replace this with your own checkpoint
model_checkpoint = "AminHumara/med_translation_model"
translator = pipeline("translation", model=model_checkpoint)

Device set to use cuda:0


In [60]:
translator("i have a little cold and a cough")

[{'translation_text': 'У меня немного простуда и кашель'}]

In [58]:
translator("My symptoms are not very clear, but it might be coronavirus.")

[{'translation_text': 'Мои симптомы не очень ясны, но это может быть коронавирус'}]

In [54]:
translator('Do you feel shortness of breath?')

[{'translation_text': 'Чувствуешь ли ты нехватку дыхания?'}]

In [52]:
translator('Even plasma from recovered patients was proposed to be used for treatment .')

[{'translation_text': 'Было предложено использовать для лечения даже плазму выздоровевших пациентов .'}]