# Evaluation of FineTuned Model vs Base MarianMT Model

In this notebook we will compare translations using the English-Spanish Kaggle Dataset.

In [11]:
import pandas as pd 
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader
import tqdm as tqdm
from evaluate import load
from transformers import MarianMTModel, MarianTokenizer, MarianConfig

In [26]:
data_file = pd.read_csv('../data/English-Spanish-Kaggle.csv')

Removing duplicate phrases from dataset

In [71]:
df_clean = data_file.drop_duplicates(subset=['english'], keep='first', ignore_index=True)

In [72]:
df_clean = df_clean[100000:len(df_clean) - 2800].reset_index()

## Import Models

In [45]:
base_model_name = 'Helsinki-NLP/opus-mt-en-es'
base_model = MarianMTModel.from_pretrained(base_model_name)
tokenizer = MarianTokenizer.from_pretrained(base_model_name)



In [47]:
state_dict = torch.load('fine_tuned_en_es.bin', map_location=torch.device('cpu'))
config = MarianConfig.from_json_file('config.json')
ft_model = MarianMTModel(config=config)
ft_model.load_state_dict(state_dict)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

## Evaluation

Here we will 

In [78]:
def eval_model(model, device, df, bertscore):
    model.eval()
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for row in tqdm.tqdm(range(len(df))):
            
            # Grab phrases from data frame
            en_phrase = df['english'][row]
            es_phrase = df['spanish'][row]
            
            # Grab tokenized version of phrases and generate translation
            en_phrase = tokenizer([en_phrase], return_tensors='pt')
            en_ids = model.generate(**en_phrase)
            
            # Grab tokenized versions of reference phrases
            es_phrase = tokenizer([es_phrase], return_tensors='pt')
            es_ids = model.generate(**es_phrase)
            
            # Return the untokenized reference and prediction
            predictions = tokenizer.batch_decode(en_ids, skip_special_tokens=True)[0]
            references = tokenizer.batch_decode(es_ids, skip_special_tokens=True)[0]
            
            all_predictions.append(predictions)
            all_references.append(references)
    
    return bertscore.compute(predictions=all_predictions, references=all_references, device=device, lang='es')

In [79]:
bertscore = load("bertscore")
ft_model_eval = eval_model(ft_model, torch.device('mps'), df_clean, bertscore=bertscore)
base_model_eval = eval_model(base_model, torch.device('mps'), df_clean, bertscore=bertscore)


100%|██████████| 104/104 [00:49<00:00,  2.11it/s]
100%|██████████| 104/104 [00:49<00:00,  2.08it/s]


In [85]:
np.mean(ft_model_eval['f1']), np.mean(base_model_eval['f1'])

(np.float64(0.815087792965082), np.float64(0.815087792965082))