# Fine Tuning MarianMT Model

In [None]:
import pandas as pd 
import numpy as np 
import torch
from torch.utils.data import TensorDataset, DataLoader
import tqdm as tqdm
from evaluate import load
from transformers import MarianMTModel, MarianTokenizer

In this notebook we will walk through fine tuning the pre-existing MarianMT english to spanish model using the OpenSubtitles english and spanish text.

## Preprocessing Data

First we will parse the data and a sentence in a row, with each column representing the spanish version and english version. We will then split the data appropriately. We will export as a csv for later usage

### Initial Importing of Data

In [None]:
# Grabbing English Sentences
with open('../data/en-es/OpenSubtitles.en-es.en') as en_text:
    english_sent = [line.strip() for line in en_text]

In [None]:
with open('../data/en-es/OpenSubtitles.en-es.es') as es_text:
    spanish_sent = [line.strip() for line in es_text]

In [None]:
# Assert they are the same size
len(english_sent), len(spanish_sent)

In [None]:
sentences = pd.DataFrame({
    'english': english_sent,
    'spanish': spanish_sent
})

In [None]:
sentences.to_csv('en-es_Full_Dataset.csv')

In [None]:
sentences

### Importing Data from csv file

In [None]:
sentences = pd.read_csv('../data/en-es_Full_Dataset.csv')

### Splitting Data

We will split the data into 3 different sets. A test, validation, and test set. Each set will have english and spanish sentences. 

We will reserve 70% of our data for training, 10% for validation, and 20% for testing. 

In [None]:
# Preparing Training Data
train_X = sentences['english'][:int(len(sentences) * .7)] # English only for input
train_y = sentences['spanish'][:int(len(sentences) * .7)] # Spanish for target

# Preparing Testing Data
test_X = sentences['english'][int(len(sentences) * .7) : int(len(sentences) * .9)]
test_y = sentences['spanish'][int(len(sentences) * .7) : int(len(sentences) * .9)]

# Preparing Validation Data
val_X = sentences['english'][int(len(sentences) * .9) : int(len(sentences))]
val_y = sentences['spanish'][int(len(sentences) * .9) : int(len(sentences))]


### Preparing Data

Here we are converting our data into a datatype that is acceptable for the DataLoader class. The DataLoader class loads a certain amount of data to input to the model based on the batch size during training.

First we need to tokenize our inputs so we can represent our text as a numerical value

In [None]:
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)

This class extends the Dataset class. What it does is returns the tokenized english sentence and spanish sentence for model inputs.

In [99]:
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        
    def __len__(self):
        return len(self.text)
        
    def __getitem__(self, index):
        text = self.text[index]
        target = self.targets[index]
        
        # Tokenize source text
        inputs = self.tokenizer(
            text,
            truncation=True,
            return_tensors="pt",
            padding="max_length",
            max_length=128  # Adjust as needed
        )
        
        # Tokenize target text
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                target,
                truncation=True,
                return_tensors="pt",
                padding="max_length",
                max_length=128  # Adjust as needed
            )
        
        # Remove batch dimension created by return_tensors="pt"
        source = inputs['input_ids'].squeeze()
        targets = labels['input_ids'].squeeze()
        
        return {
            'ids': source,
            'targets': targets
        }

In [100]:
train_inputs = tokenizer(train_X[0], text_target=train_y[0], truncation=True)
train_inputs

{'input_ids': [37116, 35, 52, 806, 4452, 10165, 127, 67, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [35010, 12, 52, 806, 4452, 10165, 127, 67, 0]}

In [101]:
# Converting our data into a torch tensor
train_X = np.array(train_X)
train_y = np.array(train_y)

test_X = np.array(test_X)
test_y = np.array(test_y)

val_X = np.array(val_X)
val_y = np.array(val_y)

Creating a dataset based on our sentences

In [102]:
train_data = MultiLabelDataset(train_X, train_y, tokenizer)
test_data = MultiLabelDataset(test_X, test_y, tokenizer)
val_data = MultiLabelDataset(val_X, val_y, tokenizer)

In [103]:
batch_size = 16

# Create DataLoaders 
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)
val_loader = DataLoader(val_data, batch_size=batch_size)

## Fine-Tuning

We will be fine-tuning the MarianMT with our data

In [104]:
model_name = 'Helsinki-NLP/opus-mt-en-es'
model = MarianMTModel.from_pretrained(model_name)

In [105]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    print ("MPS device not found.")
model.to(device)    

tensor([1.], device='mps:0')


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [106]:
def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm.tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        optimizer.zero_grad()
        outputs = model(input_ids=ids, decoder_input_ids=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    return loss

In [107]:
def val(model, val_loader, tokenizer, bertscore):
    model.eval()
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for data in tqdm.tqdm(val_loader):
            targets = data['targets'].to(device, dtype = torch.long)
            ids = data['ids'].to(device, dtype = torch.long)
            generated_ids = model.generate(input_ids = ids)
            
            predictions = tokenizer.batch_decode(generated_ids)
            references = tokenizer.batch_decode(targets)
            
            all_predictions.append(predictions)
            all_references.append(references)
        results = bertscore.compute(predictions=all_predictions, references=all_references, lang='es')
    return results['f1']
        

In [108]:
optim = torch.optim.AdamW(model.parameters(), lr=.00002)
NUM_EPOCH = 3
bertscore = load("bertscore")

for epoch in range(NUM_EPOCH):
    loss = train(model, train_loader, optim)
    print(f'Epoch: {epoch+1}, Loss: {loss}')
    f1_score = val(model, val_loader, tokenizer, bertscore)
    print(f'Epoch: {epoch+1}, BERTScore F1: {f1_score}')

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  0%|          | 0/4614857 [00:05<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'backward'

## Evaluate