# Fine Tuning MarianMT Model

In [1]:
import pandas as pd 
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader
import tqdm as tqdm
from evaluate import load
from transformers import MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In this notebook we will walk through fine tuning the pre-existing MarianMT english to spanish model using the OpenSubtitles english and spanish text.

## Preprocessing Data

First we will parse the data and a sentence in a row, with each column representing the spanish version and english version. We will then split the data appropriately. We will export as a csv for later usage

### Initial Importing of Data

In [None]:
# Grabbing English sentences
with open('../data/en-es/OpenSubtitles.en-es.en') as en_text:
    english_sent = [line.strip() for line in en_text]

In [None]:
with open('../data/en-es/OpenSubtitles.en-es.es') as es_text:
    spanish_sent = [line.strip() for line in es_text]

In [None]:
# Assert they are the same size
len(english_sent), len(spanish_sent)

In [None]:
sentences = pd.DataFrame({
    'english': english_sent,
    'spanish': spanish_sent
})

In [None]:
sentences.to_csv('en-es_Full_Dataset.csv')

In [None]:
sentences

### Importing Data from csv file

In [5]:
sentences = pd.read_csv('./1millsentences.csv')

### Splitting Data

We will split the data into 3 different sets. A test, validation, and test set. Each set will have english and spanish sentences. 

We will reserve 70% of our data for training, 10% for validation, and 20% for testing. 

In [6]:
# Preparing Training Data
train_X = sentences['english'][:int(len(sentences) * .7)] # English only for input
train_y = sentences['spanish'][:int(len(sentences) * .7)] # Spanish for target

# Preparing Testing Data
test_X = sentences['english'][int(len(sentences) * .7) : int(len(sentences) * .9)]
test_y = sentences['spanish'][int(len(sentences) * .7) : int(len(sentences) * .9)]

# Preparing Validation Data
val_X = sentences['english'][int(len(sentences) * .9) : int(len(sentences))]
val_y = sentences['spanish'][int(len(sentences) * .9) : int(len(sentences))]


In [7]:
# Creating sample size to test code

# Preparing Training Data
train_X = sentences_sample['english'][:int(len(sentences_sample) * .7)] # English only for input
train_y = sentences_sample['spanish'][:int(len(sentences_sample) * .7)] # Spanish for target

# Preparing Testing Data
test_X = sentences_sample['english'][int(len(sentences_sample) * .7) : int(len(sentences_sample) * .9)]
test_y = sentences_sample['spanish'][int(len(sentences_sample) * .7) : int(len(sentences_sample) * .9)]

# Preparing Validation Data
val_X = sentences_sample['english'][int(len(sentences_sample) * .9) : int(len(sentences_sample))]
val_y = sentences_sample['spanish'][int(len(sentences_sample) * .9) : int(len(sentences_sample))]

NameError: name 'sentences_sample' is not defined

### Preparing Data

Here we are converting our data into a datatype that is acceptable for the DataLoader class. The DataLoader class loads a certain amount of data to input to the model based on the batch size during training.

First we need to tokenize our inputs so we can represent our text as a numerical value

In [8]:
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)



In [9]:
# Converting our data into a torch tensor
train_X = np.array(train_X)
train_y = np.array(train_y)

test_X = np.array(test_X)
test_y = np.array(test_y)

val_X = np.array(val_X)
val_y = np.array(val_y)

In [10]:
class TextDataset(torch.utils.data.Dataset):

    def __init__(self, X, y, tokenizer):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []
        
        # Creating a dataset of inputs to model, and their outputs
        for data_X, data_y in zip(X, y):
            
            # Handles if the input file has Non-string like objects
            if isinstance(data_X, str) and isinstance(data_y, str):
                
                # Tokenize text
                inputs = tokenizer(data_X, max_length=128, padding='max_length', truncation=True)
                with tokenizer.as_target_tokenizer():
                    targets = tokenizer(data_y, max_length=128, padding='max_length', truncation=True)
                
                # Add tokenized text to our data sets
                self.input_ids.append(inputs['input_ids'])
                self.target_ids.append(targets['input_ids'])
        
        # Convert to tensors
        self.input_ids = torch.tensor(self.input_ids)
        self.target_ids = torch.tensor(self.target_ids)
        
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return {
            'input': self.input_ids[index],
            'target': self.target_ids[index]
        }

In [11]:
train_dataset = TextDataset(train_X, train_y, tokenizer)
test_dataset = TextDataset(test_X, test_y, tokenizer)
val_dataset = TextDataset(val_X, val_y, tokenizer)



Creating a dataset based on our sentences

In [12]:
batch_size = 16

# Create DataLoaders 
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## Fine-Tuning

We will be fine-tuning the MarianMT with our data

In [None]:
model_name = 'Helsinki-NLP/opus-mt-en-es'
model = MarianMTModel.from_pretrained(model_name)
base_model = MarianMTModel.from_pretrained(model_name)

In [None]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    print ("MPS device not found.")
model.to(device)    
base_model.to(device)

In [None]:
def train(model, training_loader, optimizer):
    model.train()
    total_loss = 0
    for data in tqdm.tqdm(training_loader):
        ids = data['input'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, labels=targets)
        
        loss = outputs.loss
        loss.backward()
        
        optimizer.zero_grad()
        optimizer.step()
        
        total_loss += loss.item()
        
        
    return total_loss / len(training_loader)

In [None]:
def test(model, test_loader, tokenizer, bertscore):
    model.eval()
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for data in tqdm.tqdm(test_loader):
            targets = data['target'].to(device, dtype = torch.long)
            ids = data['input'].to(device, dtype = torch.long)
            generated_ids = model.generate(input_ids = ids)
            
            predictions = tokenizer.batch_decode(generated_ids, kip_special_tokens=True)[0]
            references = tokenizer.batch_decode(targets, skip_special_tokens=True)[0]
            
            all_predictions.append(predictions)
            all_references.append(references)
        results = bertscore.compute(predictions=all_predictions, references=all_references, lang='es', device=device)
    return results['f1'], all_predictions, all_references

In [None]:
optim = torch.optim.AdamW(model.parameters(), lr=.0001)
NUM_EPOCH = 3
bertscore = load("bertscore")

for epoch in range(NUM_EPOCH):
    loss = train(model, train_loader, optim)
    
    f1_score = val(model, val_loader, tokenizer, bertscore)
    print(f'Epoch: {epoch+1} \nBERTScore F1: {f1_score}\nTraining Loss: {loss}')

## Evaluate

Due to computational resources we will have to evalutate on a smaller dataset. See the Evaluate.ipynb for testing both models

In [None]:
f1_score_based_model = test(base_model, test_loader, tokenizer, bertscore)
f1_score_finetuned = test(model, test_loader, tokenizer, bertscore)

In [None]:
np.mean(f1_score_based_model), np.mean(f1_score_finetuned)