In [31]:
import evaluate
import numpy as np

import torch
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding, 
    EarlyStoppingCallback, 
    Seq2SeqTrainingArguments,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    AutoTokenizer
)
import os
from transformers.integrations import TensorBoardCallback

import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
metric = evaluate.load("sacrebleu")

In [33]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", cache_dir="/home/jovyan/work/homeworks/diploma/nllb-train/nllb-200")
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", 
    src_lang="kaz_Cyrl", 
    tgt_lang="rus_Cyrl",
    cache_dir="/home/jovyan/work/homeworks/diploma/nllb-train/nllb-200",
)

In [34]:
dataset = load_dataset("issai/kazparc", "kazparc", cache_dir="/home/jovyan/work/homeworks/diploma/nllb-train/data")

In [35]:
dataset["train"] = dataset["train"].filter(lambda example: example["pair"] == "kk_ru")
dataset["validation"] = dataset["validation"].filter(lambda example: example["pair"] == "kk_ru")
dataset["test"] = dataset["test"].filter(lambda example: example["pair"] == "kk_ru")

In [36]:
train_df = pd.DataFrame(dataset['train'])
valid_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

In [37]:
train_df_cleaned = train_df.dropna(subset=['source_lang']).drop_duplicates(subset=['source_lang'])
valid_df_cleaned = valid_df.dropna(subset=['source_lang']).drop_duplicates(subset=['source_lang'])
test_df_cleaned = test_df.dropna(subset=['source_lang']).drop_duplicates(subset=['source_lang'])

In [38]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df_cleaned)
valid_dataset = Dataset.from_pandas(valid_df_cleaned)
test_dataset = Dataset.from_pandas(test_df_cleaned)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [39]:
dataset["train"] = dataset["train"].remove_columns("__index_level_0__")
dataset["validation"] = dataset["validation"].remove_columns("__index_level_0__")

In [40]:
MAX_LENGTH = 256

def tokenize_dataset(example, max_length=MAX_LENGTH):
    encodings = tokenizer(
        example['source_lang'], 
        truncation=True, 
        padding="max_length", 
        max_length=max_length,
        text_target=example['target_lang'],
    )
    return encodings

dataset = dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 286943/286943 [00:50<00:00, 5715.83 examples/s]
Map: 100%|██████████| 72413/72413 [00:12<00:00, 5782.16 examples/s]
Map: 100%|██████████| 4750/4750 [00:00<00:00, 5785.03 examples/s]


In [41]:
tensorboard_callback = TensorBoardCallback()

In [42]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [23]:
model_args = Seq2SeqTrainingArguments(
    output_dir="./output_dir",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    logging_steps=1000,
    evaluation_strategy="steps",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.02,
    save_total_limit=1,
    predict_with_generate=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=model_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    callbacks=[tensorboard_callback],
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
trainer.evaluate()

{'eval_loss': 12.790704727172852,
 'eval_model_preparation_time': 0.0059,
 'eval_bleu': 25.311,
 'eval_gen_len': 26.8629,
 'eval_runtime': 300.6643,
 'eval_samples_per_second': 15.798,
 'eval_steps_per_second': 0.988}

In [29]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("/home/jovyan/work/homeworks/diploma/nllb-train/output_dir/checkpoint-15500")

In [30]:
trainer = Seq2SeqTrainer(
    model=finetuned_model,
    tokenizer=tokenizer,
    args=model_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    callbacks=[tensorboard_callback],
    compute_metrics=compute_metrics,
)

trainer.evaluate()

  trainer = Seq2SeqTrainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 4.813709259033203,
 'eval_model_preparation_time': 0.0058,
 'eval_bleu': 35.727,
 'eval_gen_len': 27.9617,
 'eval_runtime': 302.7484,
 'eval_samples_per_second': 15.69,
 'eval_steps_per_second': 0.981}

In [43]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("/home/jovyan/work/homeworks/diploma/nllb-train/output_dir/checkpoint-26901")

In [44]:
model_args = Seq2SeqTrainingArguments(
    output_dir="./output_dir",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    logging_steps=1000,
    evaluation_strategy="steps",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.02,
    save_total_limit=1,
    predict_with_generate=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=finetuned_model,
    tokenizer=tokenizer,
    args=model_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    callbacks=[tensorboard_callback],
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [45]:
trainer.evaluate()

{'eval_loss': 4.492841720581055,
 'eval_model_preparation_time': 0.0104,
 'eval_bleu': 35.9129,
 'eval_gen_len': 27.9634,
 'eval_runtime': 300.5336,
 'eval_samples_per_second': 15.805,
 'eval_steps_per_second': 0.988}