# MarianLM Evaluation

## Importing modules

In [None]:
# Install the third-party packages used by this notebook.
# `transformers` and `evaluate` are imported in the next cell; `sentencepiece`
# is not imported directly here — presumably required by the Marian tokenizer
# (TODO confirm).
!pip install transformers
!pip install evaluate
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import random
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import MarianTokenizer, MarianMTModel
from transformers import BartModel
import torch
from torch import nn
from torch import optim
from torch.utils.data import random_split
import random
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import evaluate
from transformers import T5Model, BartModel
import torch
from torch import nn
import torch.nn.functional as F
from transformers import T5Model
from transformers.utils import logging
logger = logging.get_logger(__name__)

## Downloading the dataset

In [None]:
!wget http://www.manythings.org/anki/ita-eng.zip
!unzip ita-eng.zip
!rm ita-eng.zip
!mkdir dataset
!mv ita.txt dataset

--2023-03-26 12:56:51--  http://www.manythings.org/anki/ita-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7981351 (7.6M) [application/zip]
Saving to: ‘ita-eng.zip’


2023-03-26 12:56:52 (13.1 MB/s) - ‘ita-eng.zip’ saved [7981351/7981351]

Archive:  ita-eng.zip
  inflating: ita.txt                 
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: _about.txt              
mkdir: cannot create directory ‘dataset’: File exists


## Defining some settings

In [None]:
# Root directory for every path used by the notebook.
DIR_PATH = "."
DATASET_PATH = os.path.join(DIR_PATH, "./dataset")
IMAGE_PATH = os.path.join(DIR_PATH, "./images")
CHECKPOINT_DIR = os.path.join(DIR_PATH, "./checkpoints")

# Create the output directories; exist_ok keeps the cell re-runnable
# (a plain `mkdir` fails when the directory is already there).
os.makedirs(IMAGE_PATH, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

mkdir: cannot create directory ‘images’: File exists
mkdir: cannot create directory ‘checkpoints’: File exists


## Defining utilities

In [None]:
def count_parameters(model):
    """Print the number of parameters of ``model`` that require gradients."""
    trainable_sizes = [
        param.numel() for param in model.parameters() if param.requires_grad
    ]
    n_params = sum(trainable_sizes)
    print(f'The model has {n_params} trainable parameters')


def plot_curves(curve_1, label_1, curve_2=None, label_2=None, fig_name="figure", show=False):
    """Plot one or two labelled curves, save the figure, then clear it.

    ``curve_2`` is drawn only when it is not ``None``. The figure is saved to
    ``fig_name``, optionally displayed when ``show`` is true, and the current
    pyplot figure is cleared afterwards so later plots start empty.
    """
    series = [(curve_1, label_1)]
    if curve_2 is not None:
        series.append((curve_2, label_2))

    for values, name in series:
        plt.plot(values, label=name)

    plt.legend()
    plt.savefig(f"{fig_name}")

    if show:
        plt.show()

    plt.clf()

    
def plot_attention_mask(attention_mask, source_tokens, target_tokens, pad_token="[PAD]"):
    """Show an attention map with source tokens on top and target tokens on the left.

    Args:
        attention_mask: attention weights; axis 1 is squeezed away and the
            result is indexed as ``[:, :n_source]`` — assumes a
            (target_len, 1, source_len) layout, TODO confirm with callers.
        source_tokens: list of source token strings (x-axis labels).
        target_tokens: list of target token strings (y-axis labels).
        pad_token: padding token string; source tokens from its first
            occurrence onwards are not plotted. Defaults to ``"[PAD]"`` for
            backward compatibility; pass the tokenizer's own pad token when it
            differs.
    """
    # Keep only the source positions before the first padding token.
    try:
        n_source = source_tokens.index(pad_token)
    except ValueError:
        n_source = len(source_tokens)
    source_tokens = source_tokens[:n_source]

    attention_mask = attention_mask.squeeze(1)
    attention_mask = attention_mask[:, :n_source]

    plt.xticks(ticks=list(range(len(source_tokens))), labels=source_tokens, rotation=45)
    # Put the source-token labels above the image instead of below it.
    plt.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False)
    plt.yticks(ticks=list(range(len(target_tokens))), labels=target_tokens)
    plt.imshow(attention_mask, cmap='gray', vmin=0, vmax=1)
    plt.show()

## Definition of the dataset class

In [None]:
class AnkiDataset(Dataset):
    """Dataset of (source, target) sentence pairs read from a tab-separated file.

    Each item is a pair ``(src, dst)`` of tokenizer outputs, padded/truncated
    to ``src_max_length`` / ``dst_max_length``, with the leading batch axis
    added by the tokenizer removed.

    Args:
        data_path: path of the tab-separated text file.
        tokenizer_src: callable used to tokenize the first column.
        tokenizer_dst: callable used to tokenize the second column.
        src_max_length: padded length of source sequences.
        dst_max_length: padded length of target sequences.
        prefix: when true, prepend ``"translate English to Italian: "`` to
            every source sentence.
        subsample: when true, keep only a random fraction ``frac`` of the pairs.
        frac: fraction of pairs kept when ``subsample`` is true.
        seed: seed of the private random generator used for subsampling.
    """

    def __init__(self,
                 data_path,
                 tokenizer_src,
                 tokenizer_dst,
                 src_max_length,
                 dst_max_length,
                 prefix=False,
                 subsample=False,
                 frac=1.0,
                 seed=42
                ) -> None:
        super().__init__()
        self.tokenizer_src = tokenizer_src
        self.tokenizer_dst = tokenizer_dst
        self.src_max_length = src_max_length
        self.dst_max_length = dst_max_length
        self.seed = seed
        self.frac = frac
        self.subsample = subsample
        self.prefix = prefix
        # NOTE: the global `random` module is intentionally NOT reseeded here,
        # so building a dataset does not override seeds set by the caller.
        self.data = self.get_data(data_path)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized ``(src, dst)`` pair stored at ``index``."""
        src, dst = self.data[index]

        if self.prefix:
            src = f"translate English to Italian: {src}"

        src = self._encode(self.tokenizer_src, src, self.src_max_length)
        dst = self._encode(self.tokenizer_dst, dst, self.dst_max_length)

        return (src, dst)

    @staticmethod
    def _encode(tokenizer, text, max_length):
        """Tokenize ``text`` to a fixed length and drop the leading batch axis."""
        encoded = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors='pt',
        )
        # Each encoding is unbatched over its OWN keys: the two tokenizers are
        # not guaranteed to return the same set of fields.
        for key in list(encoded.keys()):
            encoded[key] = encoded[key][0]
        return encoded

    def get_data(self, data_path="./../dataset/ita.txt"):
        """Read the dataset file and return a list of sentence pairs.

        Every line is split on tabs and its first two columns are kept as a
        ``(source, target)`` tuple; lines with fewer than two columns (e.g.
        blank lines) are skipped. When ``self.subsample`` is true, a random
        sample of ``int(len(pairs) * self.frac)`` pairs is returned, drawn
        with a generator seeded by ``self.seed``.
        """
        sentences = []
        with open(data_path, "r", encoding="utf-8") as dataset:
            for line in dataset:
                fields = line.rstrip("\r\n").split("\t")
                if len(fields) < 2:
                    continue
                sentences.append((fields[0], fields[1]))

        if self.subsample:
            k = int(len(sentences) * self.frac)
            # A private generator yields the same sample as seeding the global
            # one, without touching global random state.
            sentences = random.Random(self.seed).sample(sentences, k)

        return sentences

## Defining the trainer superclass

In [None]:
class Trainer:
    """Train a sequence-to-sequence model and score its translations with BLEU.

    ``config`` must provide ``device``, ``seed``, ``batch_size``,
    ``max_epochs``, ``src_max_length`` and ``dst_max_length``. Optional keys:
    ``model_name`` (used in checkpoint and figure file names; defaults to the
    lower-cased model class name) and ``lr`` (Adam learning rate, default 0.1).
    """

    def __init__(self, model, src_tokenizer, dst_tokenizer, config) -> None:
        """Move ``model`` to the configured device and build loss, optimizer and metric."""
        self.device = config["device"]
        self.model = model.to(self.device)
        self.src_tokenizer = src_tokenizer
        self.dst_tokenizer = dst_tokenizer
        self.config = config

        # Padding positions of the target are excluded from the loss.
        pad_token = dst_tokenizer.pad_token
        pad_token_idx = dst_tokenizer.convert_tokens_to_ids([pad_token])[0]
        self.pad_token_idx = pad_token_idx
        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)

        learning_rate = config["lr"] if "lr" in config else 0.1
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.best_epoch = 0

        self.metric = evaluate.load("bleu")

        if "model_name" in config:
            self.model_name = config["model_name"]
        else:
            self.model_name = self.model.__class__.__name__.lower()

    def set_seeds(self, seed):
        """Seed the torch, ``random`` and numpy random generators."""
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    def get_data_loader(self, batch_size, val_split=0.2, test_split=0.1):
        """Build train/validation/test loaders over a random split of the dataset.

        ``val_split`` and ``test_split`` are fractions of the whole dataset;
        the training set receives the remaining samples.
        """
        data_set = AnkiDataset(
            f"{DATASET_PATH}/ita.txt",
            self.src_tokenizer,
            self.dst_tokenizer,
            self.config["src_max_length"],
            self.config["dst_max_length"]
        )

        n = len(data_set)
        val_size = int(n * val_split)
        test_size = int(n * test_split)
        train_size = n - val_size - test_size

        train_set, val_set, test_set = random_split(data_set, [train_size, val_size, test_size])

        train_loader = DataLoader(train_set, batch_size=batch_size)
        val_loader = DataLoader(val_set, batch_size=batch_size)
        test_loader = DataLoader(test_set, batch_size=batch_size)

        return train_loader, val_loader, test_loader

    def generate_learning_curvers(self, train_losses, val_losses):
        """Save loss plots, both over all epochs and truncated at the best epoch."""
        best_train = train_losses[:self.best_epoch]
        best_val = val_losses[:self.best_epoch]

        plot_curves(
            curve_1=train_losses,
            curve_2=val_losses,
            label_1="Train loss",
            label_2="Validation loss",
            fig_name=f"{IMAGE_PATH}/loss_model_{self.model_name}"
        )

        plot_curves(
            curve_1=best_train,
            curve_2=best_val,
            label_1="Train loss",
            label_2="Validation loss",
            fig_name=f"{IMAGE_PATH}/best_loss_model_{self.model_name}"
        )

        plot_curves(
            curve_1=train_losses,
            label_1="Train loss",
            fig_name=f"{IMAGE_PATH}/train_loss_model_{self.model_name}"
        )

        plot_curves(
            curve_1=best_train,
            label_1="Train loss",
            fig_name=f"{IMAGE_PATH}/best_train_loss_model_{self.model_name}"
        )

        plot_curves(
            curve_1=val_losses,
            label_1="Val loss",
            fig_name=f"{IMAGE_PATH}/val_loss_model_{self.model_name}"
        )

        plot_curves(
            curve_1=best_val,
            label_1="Val loss",
            fig_name=f"{IMAGE_PATH}/best_val_loss_model_{self.model_name}"
        )

    def train(self, generate_fun):
        """Train the model, report the test loss, then print BLEU on every split.

        ``generate_fun`` receives a dict with ``input_ids`` and
        ``attention_mask`` and must return generated token ids that the target
        tokenizer can decode.
        """
        seed = self.config["seed"]
        self.set_seeds(seed)

        batch_size = self.config["batch_size"]

        train_loader, val_loader, test_loader = self.get_data_loader(batch_size, 0.2, 0.1)

        self.train_loop(train_loader, val_loader)
        self.model.eval()
        test_loss = self.test_step(test_loader)
        print("Evaluating model on the test set")
        print(f"Test loss: {test_loss}")

        # evaluate bleu score
        train_score = self.metric_evaluation(train_loader, generate_fun)
        val_score = self.metric_evaluation(val_loader, generate_fun)
        test_score = self.metric_evaluation(test_loader, generate_fun)

        print(f"Average train set BLEU score: {train_score}")
        print(f"Average validation set BLEU score: {val_score}")
        print(f"Average test set BLEU score: {test_score}")

    def train_loop(self, train_loader, val_loader):
        """Run ``config["max_epochs"]`` epochs, keeping only the best checkpoint.

        After every epoch the validation loss is computed; when it improves,
        the model weights are saved and the previously best checkpoint file is
        deleted. Learning curves are written once training ends.
        """
        epochs = self.config["max_epochs"]

        train_losses = []
        val_losses = []

        best_val_loss = float("inf")
        best_loss_epoch = None

        for epoch in range(1, epochs + 1):
            self.model.train()
            print(f"Training epoch {epoch}/{epochs}")
            train_loss = self.train_step(train_loader, epoch)
            self.model.eval()
            print(f"Validation epoch {epoch}/{epochs}")
            val_loss = self.val_step(val_loader, epoch)

            if val_loss < best_val_loss:
                if best_loss_epoch is not None:
                    stale_checkpoint = self._checkpoint_path(best_loss_epoch)
                    if os.path.isfile(stale_checkpoint):
                        os.remove(stale_checkpoint)
                best_val_loss = val_loss
                best_loss_epoch = epoch
                torch.save(self.model.state_dict(), self._checkpoint_path(epoch))

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            print(f"Epoch {epoch} train loss: {train_loss}, val_loss: {val_loss}")

        self.best_epoch = best_loss_epoch

        self.generate_learning_curvers(train_losses, val_losses)

    def train_step(self, train_loader, epoch):
        """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
        total_loss = 0
        n = len(train_loader)

        with tqdm(total=n) as pbar:
            for step, batch in enumerate(train_loader):
                self.optimizer.zero_grad()

                loss = self._batch_loss(batch)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

                if (step + 1) % 50 == 0:
                    print(f"\nEpoch {epoch}, samples {step+1}/{n} train loss: {total_loss/(step+1)}")

                pbar.update(1)

        return total_loss / n

    def val_step(self, val_loader, epoch):
        """Return the mean batch loss over ``val_loader`` (no weight updates)."""
        return self._mean_loss(val_loader, epoch=epoch, log_label="val")

    def test_step(self, test_loader):
        """Load the best checkpoint and return the mean batch loss over ``test_loader``.

        Raises whatever ``torch.load`` raises when the checkpoint of
        ``self.best_epoch`` does not exist.
        """
        checkpoint = self._checkpoint_path(self.best_epoch)
        self.model.load_state_dict(torch.load(checkpoint, map_location=self.device))
        return self._mean_loss(test_loader)

    def metric_evaluation(self, data_loader, generate_fun):
        """Return the mean sentence-level BLEU (scaled by 100) over ``data_loader``.

        The checkpoint of ``self.best_epoch`` is loaded when its file exists;
        otherwise the model is evaluated with its current weights. Empty
        predictions score 0. Returns 0.0 when the loader yields no sentences.
        """
        model_path = self._checkpoint_path(self.best_epoch)

        if os.path.isfile(model_path):
            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()

        score = 0
        n = 0

        # Generation needs no gradients.
        with torch.no_grad(), tqdm(total=len(data_loader)) as pbar:
            for batch in data_loader:
                inputs, targets = batch
                inputs = inputs.to(self.device)

                gen_inputs = {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }

                output = generate_fun(gen_inputs)
                pred_sentences = self.dst_tokenizer.batch_decode(output, skip_special_tokens=True)
                target_sentences = self.dst_tokenizer.batch_decode(targets.input_ids, skip_special_tokens=True)

                for pred, targ in zip(pred_sentences, target_sentences):
                    # The metric is not called on an empty prediction; such a
                    # sentence simply scores 0.
                    if pred.replace(" ", "") != "":
                        result = self.metric.compute(predictions=[pred], references=[targ])
                        score += result["bleu"]
                    else:
                        score += 0.0
                    n += 1

                pbar.update(1)

        if n == 0:
            return 0.0
        return score / n * 100

    def _checkpoint_path(self, epoch):
        """Return the checkpoint file path used for ``epoch``."""
        return f"{CHECKPOINT_DIR}/model_{self.model_name}_{epoch}_checkpoint.pt"

    def _batch_loss(self, batch):
        """Return the teacher-forced cross-entropy loss of one ``(inputs, targets)`` batch.

        The target ids are fed as decoder inputs, so the logits at position
        ``t`` are scored against the target token at position ``t + 1``. The
        shift is applied along the sequence axis (axis 1) of the batch-first
        tensors; padding targets are ignored by the criterion.
        """
        inputs, targets = batch
        inputs = inputs.to(self.device)
        targets = targets.to(self.device)

        target_ids = targets.input_ids

        output = self.model(
            input_ids=inputs.input_ids,
            # Sources are padded to a fixed length: mask the padding positions.
            attention_mask=inputs.attention_mask,
            decoder_input_ids=target_ids,
        )
        logits = output.logits
        logits_dim = logits.shape[-1]

        # reshape (not view): slicing the sequence axis yields a non-contiguous tensor.
        shifted_logits = logits[:, :-1].reshape(-1, logits_dim)
        shifted_targets = target_ids[:, 1:].reshape(-1)

        return self.criterion(shifted_logits, shifted_targets)

    def _mean_loss(self, data_loader, epoch=None, log_label=None):
        """Return the mean batch loss over ``data_loader`` without tracking gradients.

        When ``log_label`` is given, a running average is printed every 50 batches.
        """
        total_loss = 0
        n = len(data_loader)

        with torch.no_grad(), tqdm(total=n) as pbar:
            for step, batch in enumerate(data_loader):
                total_loss += self._batch_loss(batch).item()

                if log_label is not None and (step + 1) % 50 == 0:
                    print(f"\nEpoch {epoch}, samples {step+1}/{n} {log_label} loss: {total_loss/(step+1)}")

                pbar.update(1)

        return total_loss / n

In [None]:
class Evaluator(Trainer):
    """Trainer variant that performs no training and only reports BLEU scores."""

    def __init__(self, model, src_tokenizer, dst_tokenizer, config) -> None:
        super().__init__(model, src_tokenizer, dst_tokenizer, config)

    def train(self, generate_fun):
        """Seed the generators, split the data and print BLEU for every split.

        Despite its name (kept for interface compatibility with ``Trainer``),
        this method does not update the model weights.
        """
        self.set_seeds(self.config["seed"])

        loaders = self.get_data_loader(self.config["batch_size"], 0.2, 0.1)
        self.model.eval()

        # Splits are scored in order: train, validation, test.
        train_score, val_score, test_score = [
            self.metric_evaluation(loader, generate_fun) for loader in loaders
        ]

        print(f"Average train set BLEU score: {train_score}")
        print(f"Average validation set BLEU score: {val_score}")
        print(f"Average test set BLEU score: {test_score}")

In [None]:
# Identifier of the pretrained checkpoint; reused later to load the matching tokenizer.
model_name = "Helsinki-NLP/opus-mt-tc-big-en-it"
# Load the pretrained weights for that checkpoint.
model = MarianMTModel.from_pretrained(model_name)

## Define and evaluate the model

In [None]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = {
    "src_max_length": 183,
    "dst_max_length": 208,
    # NOTE(review): the vocabulary sizes and hidden dims below are not read by
    # any code visible in this notebook — presumably leftovers; confirm before removing.
    "src_vocab_size": 31102,
    "dst_vocab_size": 28996,
    "enc_hidden_dim": 8,
    "dec_hidden_dim": 8,
    "max_epochs": 1,
    "batch_size": 8,
    "seed": 7,
    "device": device,
}


# The same tokenizer serves both the source and the target language.
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Named `evaluator` rather than `eval` so the builtin `eval` is not shadowed.
evaluator = Evaluator(model, tokenizer, tokenizer, config)


def generate_fun(x):
    """Generate translations for a dict of tokenized inputs with default settings."""
    return model.generate(**x)


evaluator.train(generate_fun)