Originally found here: https://github.com/ImperialNLP/NLPLabs/blob/c724834960345085690802233966682bc3321723/lab06/lab06_solutions.ipynb

In [None]:
# Read the GPU id(s) to use from standard input; the value stays a string and
# is later written to the CUDA_VISIBLE_DEVICES environment variable.
gpu_id = input()

In [None]:
import os

In [None]:
 # Expose only the device id(s) entered above to CUDA.
 # NOTE(review): this line carries a stray leading space; IPython appears to
 # tolerate it, but a plain Python script would reject it -- confirm and dedent.
 os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id

In [None]:
import numpy as np
import pandas as pd 

In [None]:
# Load the parallel corpus into a dataframe; later cells split it into test
# and training subsets.
data = pd.read_csv("../data_test/training_data.csv") 

In [None]:
# Source and target language codes. They are used to build the pretrained
# model name; presumably they also match column names in `data` (see the
# commented-out probe below) -- confirm against the CSV header.
src_lan = 'en'
trg_lan = 'fr'
# data[src_lan].tolist()

In [None]:
class Multi30K:
    """A dataset wrapper for Multi30K sentence pairs held in a dataframe.

    Items are plain integer indices (``dataset[i]`` returns ``i``); the actual
    text lookup and tokenization happen in :meth:`collate_fn`, which receives
    the list of indices that make up one batch.

    Args:
        tokenizer: Object providing ``prepare_seq2seq_batch`` (for example a
            ``MarianTokenizer``).
        src_lan: Name of the dataframe column holding the source sentences.
        trg_lan: Name of the dataframe column holding the target sentences.
        df: Dataframe with one sentence pair per row.

    Raises:
        ValueError: If ``src_lan`` or ``trg_lan`` is not a column of ``df``.
    """

    def __init__(self, tokenizer, src_lan, trg_lan, df):
        # Fail early with a clear message instead of a bare KeyError later on.
        missing = [lan for lan in (src_lan, trg_lan) if lan not in df.columns]
        if missing:
            raise ValueError(
                f"language column(s) {missing} not found in dataframe "
                f"columns {list(df.columns)}"
            )

        self.tokenizer = tokenizer
        self.src_lan = src_lan
        self.trg_lan = trg_lan
        self.src_sents, self.trg_sents = self.read_sentences(df)

    def read_sentences(self, df):
        """Return ``(source_sentences, target_sentences)`` as two lists.

        The sentences are read from ``df`` itself -- not from any notebook-level
        dataframe -- so that separate train and test wrappers really hold
        separate data.
        """
        src_sents = df[self.src_lan].tolist()
        trg_sents = df[self.trg_lan].tolist()
        return src_sents, trg_sents

    def collate_fn(self, idx):
        """Tokenize the sentence pairs at positions ``idx`` into one batch.

        Args:
            idx: Iterable of integer positions, as produced by ``__getitem__``.

        Returns:
            Whatever ``tokenizer.prepare_seq2seq_batch`` returns for the
            selected source/target texts (PyTorch tensors are requested,
            truncated to 128 tokens on both sides).
        """
        src_texts = [self.src_sents[i] for i in idx]
        trg_texts = [self.trg_sents[i] for i in idx]

        output = self.tokenizer.prepare_seq2seq_batch(src_texts=src_texts,
                                                      tgt_texts=trg_texts,
                                                      max_length=128,
                                                      max_target_length=128,
                                                      return_tensors='pt',
                                                      truncation=True)
        return output

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_sents)

    def __getitem__(self, idx):
        """Return the index itself; ``collate_fn`` resolves it to text."""
        return idx

    def __repr__(self):
        return (
            f"{type(self).__name__}(src_lan={self.src_lan!r}, "
            f"trg_lan={self.trg_lan!r}, size={len(self)})"
        )

In [None]:
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
# Build the Helsinki-NLP checkpoint name for the configured language pair,
# show it, and load the corresponding pretrained model.
pretrainedModelName = f'Helsinki-NLP/opus-mt-{src_lan}-{trg_lan}'
print(pretrainedModelName)
model = MarianMTModel.from_pretrained(pretrainedModelName)

In [None]:
# Size of the held-out test split: 10% of the rows, truncated to an integer.
num_test = int(0.1 * len(data))
print(num_test)

In [None]:
from random import choices

In [None]:
# Draw the held-out row positions WITHOUT replacement. `random.choices` samples
# with replacement, which puts duplicate rows into the test split and removes
# fewer than `num_test` distinct rows from the training split.
from random import sample

ids_test = sample(range(len(data)), k=num_test)

In [None]:
# Select the held-out rows by integer position, then display them.
data_test = data.iloc[ids_test]
data_test

In [None]:
# Everything that was not held out becomes the training split.
# NOTE(review): `drop` removes rows by index label whereas `ids_test` holds
# positions; the two presumably coincide because `data` has the default
# integer index from `read_csv` -- confirm if the index is ever changed.
data_train = data.drop(ids_test)
data_train

In [None]:
def main_mt():
    """Fine-tune the pretrained Marian model on the training split and save it.

    Uses the notebook-level names ``pretrainedModelName``, ``src_lan``,
    ``trg_lan`` and ``data_train``. Trainer output goes to
    ``./data/experiment/mt``; the final model is saved under
    ``'./data/models/custom' + pretrainedModelName``.
    """

    ## QUESTION 5 ##

    mt_tokenizer = MarianTokenizer.from_pretrained(pretrainedModelName)
    # Use the configured language pair (not hard-coded codes) so the dataset
    # columns always match the model selected by `pretrainedModelName`.
    mt_dataset = Multi30K(mt_tokenizer, src_lan, trg_lan, data_train)

    model = MarianMTModel.from_pretrained(pretrainedModelName)

    training_args = TrainingArguments(
        output_dir='./data/experiment/mt',
        learning_rate=0.00005,
        logging_steps=5000,
        save_steps=10000,
        num_train_epochs=1,
        per_device_train_batch_size=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=mt_dataset,
        # The dataset yields bare indices; its collate_fn turns a list of
        # indices into a tokenized batch.
        data_collator=mt_dataset.collate_fn,
    )

    trainer.train()

    ## When you have already trained your model and want to start from a
    ## checkpoint, pass its path instead (checkpoints live under output_dir):
    # trainer.train("./data/experiment/mt/checkpoint-40000")

    trainer.save_model('./data/models/custom' + pretrainedModelName)

In [None]:
# Run the fine-tuning defined above; this trains for one epoch and saves the model.
main_mt()

In [None]:
import sacrebleu

def evaluate_mt(model, mt_tokenizer, mt_test_dataset):
    """Return the mean per-sentence BLEU score of ``model`` on a test set.

    Every source sentence is translated on its own, scored against its single
    reference with ``sacrebleu.corpus_bleu`` and the scores are averaged.

    Args:
        model: Translation model providing ``eval()`` and ``generate()``.
        mt_tokenizer: Tokenizer providing ``prepare_seq2seq_batch`` and
            ``decode``.
        mt_test_dataset: Object exposing parallel ``src_sents`` and
            ``trg_sents`` lists.

    Returns:
        The average of the per-sentence BLEU scores (``nan`` for an empty
        dataset, as computed by ``np.average``).
    """
    model.eval()

    scores = []
    for src_text, reference in zip(mt_test_dataset.src_sents,
                                   mt_test_dataset.trg_sents):
        # The tokenizer API takes a list of texts; wrap the single sentence so
        # it is never mistaken for a sequence of characters.
        batch = mt_tokenizer.prepare_seq2seq_batch([src_text], return_tensors="pt")
        translated = model.generate(**batch)
        hypotheses = [mt_tokenizer.decode(t, skip_special_tokens=True) for t in translated]

        # sacreBLEU expects a list of hypotheses and a list of reference
        # streams (each stream itself a list of strings), not a bare string.
        scores.append(sacrebleu.corpus_bleu(hypotheses, [[reference]], force=True).score)

    return np.average(np.asarray(scores))

In [None]:
# Reload the fine-tuned weights written by main_mt() and score them on the
# held-out split.
model = MarianMTModel.from_pretrained('./data/models/custom' + pretrainedModelName)

mt_tokenizer = MarianTokenizer.from_pretrained(pretrainedModelName)
# Use the configured language pair rather than hard-coded codes so the dataset
# columns stay in step with the model named by `pretrainedModelName`.
mt_test_dataset = Multi30K(mt_tokenizer, src_lan, trg_lan, data_test)

bleu = evaluate_mt(model, mt_tokenizer, mt_test_dataset)

print(bleu)

In [None]:
def compare_mt(model, mt_tokenizer, mt_test_dataset):
    """Print source, reference and model translation for every test sentence.

    Args:
        model: Translation model providing ``eval()`` and ``generate()``.
        mt_tokenizer: Tokenizer providing ``prepare_seq2seq_batch`` and
            ``decode``.
        mt_test_dataset: Object exposing parallel ``src_sents`` and
            ``trg_sents`` lists.
    """
    model.eval()

    for src_text, targ_text_origin in zip(mt_test_dataset.src_sents,
                                          mt_test_dataset.trg_sents):
        # The tokenizer API takes a list of texts; wrap the single sentence so
        # it is never mistaken for a sequence of characters.
        batch = mt_tokenizer.prepare_seq2seq_batch([src_text], return_tensors="pt")
        translated = model.generate(**batch)
        translated_text = [mt_tokenizer.decode(t, skip_special_tokens=True) for t in translated]

        print('source : ' + str(src_text))
        print('original : ' + str(targ_text_origin))
        print('translated : ' + str(translated_text))


In [None]:
# Print source / reference / translation triples for the whole test split.
compare_mt(model,mt_tokenizer, mt_test_dataset)

In [None]:
# Display the test dataset object as the cell output.
mt_test_dataset