In [None]:
! pip install sacrebleu
! pip install sacremoses

In [None]:
import pandas as pd
import numpy as np 
import transformers
import os 
from datasets import load_dataset, load_metric
import datasets
from transformers import AutoTokenizer
from transformers import MarianMTModel, MarianTokenizer
from sacrebleu import sentence_bleu
from functools import partial
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Keep Weights & Biases reporting out of this run.
os.environ["WANDB_DISABLED"] = "true"

# `transformers` is imported before this cell runs, so its logger has most likely
# already picked its default level; set the level explicitly for this process and
# keep the environment variable so that child processes inherit it.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
transformers.logging.set_verbosity_error()

In [None]:
def load_models(language_pair: tuple[str, str], cache_path: "str | None" = None) -> dict:
    """Load a pretrained MarianMT tokenizer and model for one language pair.

    :param language_pair: ``(src, tgt)`` language codes used to build the
        ``Helsinki-NLP/opus-mt-{src}-{tgt}`` checkpoint name
    :type language_pair: tuple[str, str]
    :param cache_path: directory forwarded to ``from_pretrained`` as ``cache_dir``;
        ``None`` (the default) leaves the library's default cache location in use
    :type cache_path: str | None
    :return: dictionary with the keys ``"tokenizer"`` and ``"model"``
    :rtype: dict
    :raises ValueError: if ``language_pair`` does not unpack into exactly two items
    """
    # Build the checkpoint name from the source and target language codes.
    src, tgt = language_pair
    model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"

    # Both objects come from the same checkpoint and share the same cache directory.
    tokenizer = MarianTokenizer.from_pretrained(model_name, cache_dir=cache_path)
    model = MarianMTModel.from_pretrained(model_name, cache_dir=cache_path)

    return {"tokenizer": tokenizer, "model": model}

In [None]:
# Language pair for this run; the source and target codes are split out of it.
language_pair = 'en_sv'
src, tgt = language_pair.split('_')

# Read the three pre-split parquet files for that pair.
train = pd.read_parquet(f'/kaggle/input/en-da-dataset/train/{language_pair}.parquet')
valid = pd.read_parquet(f'/kaggle/input/en-da-dataset/valid/{language_pair}.parquet')
test = pd.read_parquet(f'/kaggle/input/en-da-dataset/test/{language_pair}.parquet')

# Sampling as many rows as the frame holds amounts to a seeded shuffle of the training set.
sample_size = len(train)
train_sample = train.sample(n=sample_size, random_state=12)

# Wrap each split's `translation` records in a Hugging Face dataset.
data = datasets.DatasetDict({
    'train': datasets.Dataset.from_list(train_sample['translation'].tolist(), split='train'),
    'valid': datasets.Dataset.from_list(valid['translation'].tolist(), split='valid'),
    'test': datasets.Dataset.from_list(test['translation'].tolist(), split='test'),
})

In [None]:
# Hub checkpoint name for the selected source/target language pair.
model_checkpoint = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
# Tokenizer loaded from that checkpoint; reused for preprocessing, collation and the trainer.
tokenizer_en_sv = MarianTokenizer.from_pretrained(model_checkpoint)


In [None]:
# 98th percentile of the per-row `.str.len()` of the source and target columns.
# NOTE(review): these are string lengths (presumably characters), whereas the
# max_input_length / max_target_length settings are applied to token sequences —
# confirm the two are being compared as intended.
print(f"P98 for source text: {train['source_text'].str.len().quantile(0.98)}")
print(f"P98 for target text: {train['target_text'].str.len().quantile(0.98)}")

In [None]:
# Maximum tokenized length kept for source and target sequences (longer ones are truncated).
max_input_length = 256
max_target_length = 256


def preprocess_function(sentence, tokenizer, src, tgt, max_input_length, max_target_length):
    """Tokenize a batch of translation pairs into model inputs and labels.

    :param sentence: batch with a ``"translation"`` entry holding one mapping per
        example, each keyed by language code
    :param tokenizer: callable tokenizer that accepts ``text_target``
    :param src: key of the source-language text in each pair
    :param tgt: key of the target-language text in each pair
    :param max_input_length: truncation length for the source side
    :param max_target_length: truncation length for the target side
    :return: the tokenized source batch with the target ids stored under ``"labels"``
    """
    pairs = sentence["translation"]
    inputs = [pair[src] for pair in pairs]
    targets = [pair[tgt] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes in target mode; it replaces the deprecated
    # `as_target_tokenizer()` context manager. Source and target are tokenized in
    # separate calls so that each side keeps its own length limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Bind everything except the batch itself, so `map` only has to pass the examples.
preprocess_en_sv = partial(
    preprocess_function,
    tokenizer=tokenizer_en_sv,
    src=src,
    tgt=tgt,
    max_input_length=max_input_length,
    max_target_length=max_target_length,
)
# The callable was originally bound under the name `preprocess_es_sv`; keep that
# name available so any cell still using it continues to work.
preprocess_es_sv = preprocess_en_sv

# Tokenize every split in batches.
tokenized_datasets = data.map(preprocess_en_sv, batched=True)

## Training

In [None]:
# Load the seq2seq model weights from the same checkpoint as the tokenizer; this is the model that gets fine-tuned.
model_en_sv = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Fine-tuning hyperparameters.
batch_size = 16
train_epochs = 3
learning_rate = 2e-5

# The output directory is named after the last path component of the checkpoint.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{src}-{tgt}",
    num_train_epochs=train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
)

# Collator used by the trainer to assemble batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer_en_sv, model=model_en_sv)

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each label in its own reference list.

    :param preds: decoded prediction strings
    :param labels: decoded reference strings
    :return: ``(preds, labels)`` where ``labels`` is a list of one-element lists
    """
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds, tokenizer=None):
    """Compute corpus BLEU and the mean generated length for an evaluation pass.

    :param eval_preds: ``(predictions, labels)`` pair of token-id arrays;
        ``predictions`` may itself be a tuple whose first item holds the ids
    :param tokenizer: tokenizer used for decoding; defaults to the notebook's
        ``tokenizer_en_sv``
    :return: dictionary with ``"bleu"`` and ``"gen_len"``, rounded to 4 decimals
    :rtype: dict
    """
    # Only `sentence_bleu` is imported at file level, so the corpus scorer is imported here.
    from sacrebleu import corpus_bleu

    if tokenizer is None:
        tokenizer = tokenizer_en_sv

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded; replace it with the pad id.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # sacrebleu expects one list per reference stream, not one list per sentence.
    reference_streams = [[refs[0] for refs in decoded_labels]]
    bleu = corpus_bleu(decoded_preds, reference_streams)

    result = {"bleu": bleu.score}
    # Generated length = number of non-padding tokens in each prediction.
    result["gen_len"] = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, tokenized splits and collator into the trainer.
# No metric callback is registered (compute_metrics=None).
trainer = Seq2SeqTrainer(
    model=model_en_sv,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer_en_sv,
    compute_metrics=None,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Persist the fine-tuned model; the evaluation section reloads it from this same directory.
# NOTE(review): the tokenizer is later loaded from this directory too, so this call is
# presumably expected to write the tokenizer files as well — confirm.
trainer.save_model(f'/kaggle/working/finetuned-mt-{src}-{tgt}')

### Compute BLEU on finetuned model

In [None]:
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reload the fine-tuned tokenizer and model from the directory written by `trainer.save_model`.
tuned_model_name = f'/kaggle/working/finetuned-mt-{src}-{tgt}'
tuned_tokenizer = MarianTokenizer.from_pretrained(tuned_model_name)
tuned_model = MarianMTModel.from_pretrained(tuned_model_name).to(device)

# Trailing semicolon keeps the cell from echoing the full module tree as its output.
tuned_model.eval();

Using the test dataset, let's compare the BLEU scores of the pretrained model and our fine-tuned model.

In [None]:
# NOTE: this rebinds `batch_size` (16 in the training-arguments cell) to 32 for inference;
# re-running the training cell after this one would pick up the new value.
batch_size=32
# Work on a copy of the test frame and pull the source sentences out as a plain list.
test_sample = test.copy()
test_source = test_sample['source_text'].tolist()

In [None]:
def translate_text(src_text, batch_size, tokenizer, model):
    """Translate a list of sentences in fixed-size batches.

    Batches are tokenized, generated and decoded one at a time, so only a single
    batch of tensors is held on the device at any moment.

    :param src_text: source sentences to translate
    :type src_text: list[str]
    :param batch_size: number of sentences per batch
    :type batch_size: int
    :param tokenizer: tokenizer matching ``model``
    :param model: seq2seq model exposing ``generate`` and ``device``
    :return: decoded translations in the same order as ``src_text``
        (an empty list for empty input)
    :rtype: list[str]
    """
    decoded_translations = []
    for start in range(0, len(src_text), batch_size):
        batch = tokenizer(
            src_text[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            # Truncate over-long inputs to the tokenizer's maximum length.
            truncation=True,
        ).to(model.device)  # follow the model rather than relying on a global `device`

        generated = model.generate(**batch)
        decoded_translations.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return decoded_translations

In [None]:
# Translate every test source sentence with the fine-tuned model.
finetuned_results = translate_text(test_source, batch_size, tuned_tokenizer, tuned_model)

In [None]:
# New frame: the test sample plus the fine-tuned model's translations.
test_finetuned = test_sample.assign(prediction=finetuned_results)

# Sentence-level BLEU of each prediction against its single reference.
test_finetuned['bleu'] = test_finetuned.apply(
    lambda row: sentence_bleu(row['prediction'], [row['target_text']]).score,
    axis=1,
)

In [None]:
# This is the mean of the per-sentence BLEU scores computed above, not a corpus-level BLEU.
print('BLEU score on finetuned model: ', test_finetuned['bleu'].mean())

### Compute BLEU score on pretrained model

In [None]:
# Load the untouched base checkpoint for the same language pair as the baseline.
en_sv_pretrained_model = load_models((src, tgt))
model_pretr_en_sv = en_sv_pretrained_model['model'].to(device)
tokenizer_pretr_en_sv = en_sv_pretrained_model['tokenizer']

In [None]:
# Translate the same test sentences with the pretrained (not fine-tuned) model.
pretrained_translations = translate_text(test_source, batch_size, 
                                         tokenizer_pretr_en_sv, model_pretr_en_sv)

In [None]:
# Store the baseline translations next to the fine-tuned ones.
test_finetuned['pretrained_predictions'] = pretrained_translations

# Sentence-level BLEU of each baseline translation against its single reference.
test_finetuned['pretrained_bleu'] = test_finetuned.apply(
    lambda row: sentence_bleu(row['pretrained_predictions'], [row['target_text']]).score,
    axis=1,
)

In [None]:
# Mean of the per-sentence BLEU scores of the pretrained model, labeled like the fine-tuned result.
print('BLEU score on pretrained model: ', test_finetuned['pretrained_bleu'].mean())

In [None]:
# Write the test frame, including both models' predictions and per-sentence BLEU columns, to CSV.
test_finetuned.to_csv('en_sv_finetuned_results.csv', index=False)

In [None]:
!zip -r file.zip /kaggle/working/finetuned-mt-en-sv

In [None]:
# Display a link to the archive created by the zip cell so it can be downloaded.
from IPython.display import FileLink
FileLink(r'file.zip')

Future work: train for more epochs. Possibly subsample the training data?