# Imports

We first define the set of imports that we will use to train the model and import the dataset files.

In [22]:
import torch
import torch.nn as nn
import json # for loading the json file

# Loading dataset imports
from torch.utils.data import DataLoader # for creating the dataloader

# Training imports
from Transformer_model import build_transformer # the model
from torch.utils.tensorboard import SummaryWriter  # for logging during training
from tqdm import tqdm # for the progress bar during training

# For saving the model and checkpoints during training
import os

# For calculating the BLEU score 
from nltk.translate.bleu_score import corpus_bleu

# Hyperparameters

We define the hyperparameters that we will use to train the model. These are defined in the `hyperparameters.json` file.

In [23]:
# Load the JSON files
def load_json(json_file):
    with open(json_file) as json_data:
        d = json.load(json_data)
        return d
    
hyperparameters = load_json('hyperparameters.json')

# Load the hyperparameters
learning_rates = hyperparameters["learning_rates"]
epochs = hyperparameters["num_epochs"]

# Import English to Italian Datasets

 We first load the english to italian translation datasets we created by runnning the `Preprocessing.ipynb` file (training, validation and test datasets). We also import the vocabulary dictionaries for both the source and the target languages (also saved from running the `Preprocessing.ipynb` file).

In [24]:
header_file = load_json('header.json')

We import the datasets using the paths defined in the header file.

In [25]:
# Get the datasets path from the header file
en_it_dataset_path = header_file['en-it-save-path']

# Load the datasets
en_it_train = torch.load(en_it_dataset_path + 'train_ds.pt')
en_it_val = torch.load(en_it_dataset_path + 'val_ds.pt')
en_it_test = torch.load(en_it_dataset_path + 'test_ds.pt')

# Load the vocabularies from the header file
en_it_source_vocab = torch.load(en_it_dataset_path + 'source_vocab.pt')
en_it_target_vocab = torch.load(en_it_dataset_path + 'target_vocab.pt')

# Print the size of the dataset as a sanity check
print('Size of training dataset: ', len(en_it_train))
print('Size of validation dataset: ', len(en_it_val))
print('Size of test dataset: ', len(en_it_test))

Size of training dataset:  15999
Size of validation dataset:  2001
Size of test dataset:  2000


We create the dataloaders for the training, validation and test datasets we just imported. We use the dataloaders to create batches of data that we will use to train the model. The batch size is defined in the `hyperparameters.json` file.

In [26]:
# Create dataloaders for the datasets. The batch size is specified in the hyperparameters file
en_it_train_dl = DataLoader(en_it_train, batch_size=hyperparameters["batch_size"], shuffle=True)
en_it_val_dl = DataLoader(en_it_val, batch_size=1, shuffle=False) # batch size is 1 for validation
en_it_test_dl = DataLoader(en_it_test, batch_size=1, shuffle=False) # batch size is 1 for testing

# Import the English to Spanish Datasets

 We then load the english to spanish translation datasets that was created using the same method as the english to italian datasets. We also import the vocabulary dictionaries for both the source and the target languages. 

In [27]:
# Get the datasets path from the header file
en_es_dataset_path = header_file['en-es-save-path']

# Load the datasets
en_es_train = torch.load(en_es_dataset_path + 'train_ds.pt')
en_es_val = torch.load(en_es_dataset_path + 'val_ds.pt')
en_es_test = torch.load(en_es_dataset_path + 'test_ds.pt')

# Load the vocabularies from the header file
en_es_source_vocab = torch.load(en_es_dataset_path + 'source_vocab.pt')
en_es_target_vocab = torch.load(en_es_dataset_path + 'target_vocab.pt')

# Print the size of the dataset as a sanity check
print('Size of training dataset: ', len(en_es_train))
print('Size of validation dataset: ', len(en_es_val))
print('Size of test dataset: ', len(en_es_test))

Size of training dataset:  15999
Size of validation dataset:  2001
Size of test dataset:  2000


We create the dataloaders for the training, validation and test datasets we just imported. We use the dataloaders to create batches of data that we will use to train the model. The batch size is defined in the `hyperparameters.json` file.

In [28]:
# Create dataloaders for the datasets. The batch size is specified in the hyperparameters file
en_es_train_dl = DataLoader(en_es_train, batch_size=hyperparameters["batch_size"], shuffle=True)
en_es_val_dl = DataLoader(en_es_val, batch_size=1, shuffle=False) # batch size is 1 for validation
en_es_test_dl = DataLoader(en_es_test, batch_size=1, shuffle=False) # batch size is 1 for testing

# Training functions

We set the device to be used for training. We use the GPU if it is available, otherwise we use the CPU.

In [29]:
# Select device: cuda, mps or cpu
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Device:', device)

Device: mps


In [30]:
def causal_mask(seq_len):
    """
    Causal mask: each word in the decoder can only look at previous words
    This is done to prevent the decoder from looking at future words.
    """
    # Create a matrix of size seq_len x seq_len
    # Fill the upper triangle with 0s and lower triangle with 1s
    # This is done to prevent the decoder from looking at future words
    return torch.tril(torch.ones((1, seq_len, seq_len), dtype=torch.int64))

In [31]:
def decode_tokens(token_ids, vocab):
    """
    Decode a list of token IDs back to a sentence using the vocabulary.
    """
    # Create a reverse vocabulary
    reverse_vocab = {id: word for word, id in vocab.items()}

    # Decode the token IDs to words
    words = [reverse_vocab.get(id, "[UNK]") for id in token_ids]

    return ' '.join(words)

In [32]:
def greedy_decode(model, source, encoder_mask, trg_vocab, sos_idx, eos_idx, max_len, device):    
    # Precompute the encoder output and reuse it for every token we get from the decoder
    encoder_output = model.encode(source, encoder_mask)

    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1,1).fill_(sos_idx).type_as(source).to(device)
    
    while True:
        # Break if the decoder input size is equal to the max length (which is set in the header file)
        if decoder_input.size(1) == max_len:
            break
        
        # Create a mask to prevent the decoder from looking at future words
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_mask).to(device)

        # Get the decoder output
        decoder_output = model.decode(decoder_input, encoder_output, encoder_mask, decoder_mask)     
        
        # Get the last predicted token
        output = model.output(decoder_output[:, -1])

        # Get the token with the max probability (greedy search)
        _, next_word = torch.max(output, dim=1)

        # Concatenate the predicted token to the decoder input as the next input for the decoder
        decoder_input = torch.cat([decoder_input, torch.empty(1,1).type_as(source).fill_(next_word.item()).to(device)], dim=1)
        
        # Break if the decoder predicted the end of sentence token
        if next_word == eos_idx:
            break

    # Remove the batch dimension
    decoder_input = decoder_input.squeeze(0)
    
    # Convert the decoded sentence to a list of token IDs 
    decoder_input = decoder_input.detach().cpu().numpy()

    # Remove the sos token from the decoded sentence
    decoder_input = decoder_input[1:]

    return decode_tokens(decoder_input, trg_vocab)

In [33]:
def train(model, lr, epochs, train_dl, val_dl, target_vocab, dataset_name):
    # Define the tensorboard writer
    writer = SummaryWriter() 

    # Define the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-8)

    # Define the loss function
    # Ignore the padding token, which has index 3 in the vocabulary (see function build_vocab in Preprocessing.ipynb file)
    loss_fn = nn.CrossEntropyLoss(ignore_index=3).to(device)

    # Define the checkpoint directory
    checkpoint = os.path.join(dataset_name+"checkpoints", f"lr_{lr}")
    # Create the directory if it does not exist
    os.makedirs(checkpoint, exist_ok=True)

    # Find the latest checkpoint
    latest_epoch = -1
    latest_checkpoint_path = None
    for fname in os.listdir(checkpoint):
        if fname.startswith('epoch_') and fname.endswith('.pth'):
            epoch_num = int(fname.split('_')[1].split('.')[0])
            if epoch_num > latest_epoch:
                latest_epoch = epoch_num
                latest_checkpoint_path = os.path.join(checkpoint, fname)

    if latest_checkpoint_path:
        ckpt = torch.load(latest_checkpoint_path)
        model.load_state_dict(ckpt['model_state'])
        optimizer.load_state_dict(ckpt['optimizer_state'])
        start_epoch = ckpt['epoch'] + 1
        if start_epoch < epochs:
            print(f"Resuming training from epoch {start_epoch}")
        else:
            print(f"Training already finished for parameter learning rate={lr}")
            return model
    else:
        start_epoch = 0

    step = 0 # for logging the loss

    for epoch in range (start_epoch, epochs):
        # Empty the cache to avoid memory overflow
        torch.mps.empty_cache()        
        # Set the model to train mode
        model.train()

        # Create the progress bar 
        iter = tqdm(train_dl, desc=f'Epoch {epoch}')

        # Iterate over the batches
        for batch in iter:
            # Get the tensors from the batch
            encoder_input = batch['encoder_input'].to(device)    # size (batch_size, seq_len)
            decoder_input = batch['decoder_input'].to(device)    # size (batch_size, seq_len)
            encoder_mask = batch['encoder_mask'].to(device)      # size (batch_size, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device)      # size (batch_size, 1, seq_len, seq_len)
            label = batch['label'].to(device)                    # size (batch_size, seq_len)

            # Run the tensors through the model
            encoder_output = model.encode(encoder_input, encoder_mask)                                   
            decoder_output = model.decode(decoder_input, encoder_output, encoder_mask, decoder_mask)     
            output = model.output(decoder_output)                                                        # size (batch_size, seq_len, trg_vocab_size)

            # Calculate the loss
            # Flatten the output and label tensors to size (batch_size * seq_len, trg_vocab_size)
            train_loss = loss_fn(output.view(-1, len(target_vocab)), label.view(-1))
            iter.set_postfix(loss=train_loss.item()) # print the loss
            writer.add_scalar('Training Loss/Step', train_loss.item(), step) # log the loss
            writer.flush() 

            # Backpropagation
            train_loss.backward()    

            # Update the parameters
            optimizer.step()
            optimizer.zero_grad()

            step += 1
        
        print("Evaluating the model on the validation dataset")
        # Evaluate the model on the validation dataset
        model.eval()
        val_loss = 0

        # Disable gradient calculation
        with torch.no_grad():
            for batch in val_dl:
                # Get the tensors from the batch
                encoder_input = batch['encoder_input'].to(device)    
                decoder_input = batch['decoder_input'].to(device)    
                encoder_mask = batch['encoder_mask'].to(device)      
                decoder_mask = batch['decoder_mask'].to(device)      
                label = batch['label'].to(device)                    

                # Run the tensors through the model
                encoder_output = model.encode(encoder_input, encoder_mask)                                   
                decoder_output = model.decode(decoder_input, encoder_output, encoder_mask, decoder_mask)     
                output = model.output(decoder_output)                                                        # size (batch_size, seq_len, trg_vocab_size)
                
                # Calculate the loss
                # Flatten the output and label tensors to size (batch_size * seq_len, trg_vocab_size)
                val_loss += loss_fn(output.view(-1, len(target_vocab)), label.view(-1)).item()

        # Log the validation loss
        val_loss /= len(val_dl)
        writer.add_scalar('Validation Loss/Epoch', val_loss, epoch)
        writer.flush()

        # Save the model after each epoch
        epoch_checkpoint_path = os.path.join(str(checkpoint), f"epoch_{epoch}_train_loss_{round(train_loss.item(), 2)}_val_loss_{round(val_loss, 2)}.pth")
        torch.save({
            'epoch': epoch,
            'model_state': model.state_dict(),
            'optimizer_state': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss
        }, epoch_checkpoint_path)

        print(f"Checkpoint for epoch {epoch} saved")
        
    writer.close()

    return model

In [34]:
def compute_BLEU_score(model, val_dl, target_vocab, subset_size=100):
    """
    Function to compute the BLEU score on the validation dataset. 
    """
    model.eval()
    # Initialize the lists that will contain the references and the outputs for each sentence when computing the BLEU score
    references = []
    outputs = []
    count = 0
    # Disable gradient calculation
    with torch.no_grad():
        for batch in val_dl:
            if count >= subset_size:
                break
            # Get the tensors from the batch
            encoder_input = batch['encoder_input'].to(device)    
            encoder_mask = batch['encoder_mask'].to(device)      
            
            # Generate the translation
            translation = greedy_decode(
                model=model,
                source=encoder_input,
                encoder_mask=encoder_mask,
                trg_vocab = target_vocab,
                sos_idx=1,
                eos_idx=2,
                max_len=header_file['max_seq_len'],
                device=device
            )
            
            count += len(batch['encoder_input'])
            
        # Add the generated translation and the reference translation to the lists
        outputs.append(translation)
        references.append(batch['trg'])  # Assuming this is available in your validation DataLoader

    # Compute the BLEU score and return it
    return corpus_bleu(references, outputs)


# Train the English to Italian Model

In [35]:
for lr in learning_rates:
    # Define the model
    model = build_transformer(
                        len(en_it_source_vocab),                    # size of the source vocabulary    
                        len(en_it_target_vocab),                    # size of the target vocabulary
                        src_seq_len= header_file["max_seq_len"],    # defined in the header file 
                        trg_seq_len= header_file["max_seq_len"],    # defined in the header file
                        d_model = 512,                              # based on the paper
                        N = 3,                                      # number of encoder and decoder layers (we use the same number of layers for both encoder and decoder)
                        h = 8,                                      # number of heads (we use the same number of heads for both encoder and decoder)                                          
                        dropout = 0.1,                              # based on the paper
                        d_ff = 2048                                 # based on the paper
                        ).to(device)    
    print('Learning rate:', lr)
    model = train(model, lr, epochs, en_it_train_dl, en_it_val_dl, en_it_target_vocab, "en_it_")

Learning rate: 0.0001


Epoch 0: 100%|██████████| 1000/1000 [09:25<00:00,  1.77it/s, loss=5.24]


Evaluating the model on the validation dataset
Checkpoint for epoch 0 saved


Epoch 1: 100%|██████████| 1000/1000 [08:39<00:00,  1.93it/s, loss=4.6]


Evaluating the model on the validation dataset
Checkpoint for epoch 1 saved


Epoch 2: 100%|██████████| 1000/1000 [08:34<00:00,  1.94it/s, loss=4.16]


Evaluating the model on the validation dataset
Checkpoint for epoch 2 saved


Epoch 3: 100%|██████████| 1000/1000 [08:32<00:00,  1.95it/s, loss=3.72]


Evaluating the model on the validation dataset
Checkpoint for epoch 3 saved


Epoch 4: 100%|██████████| 1000/1000 [08:33<00:00,  1.95it/s, loss=3.13]


Evaluating the model on the validation dataset
Checkpoint for epoch 4 saved


Epoch 5: 100%|██████████| 1000/1000 [08:28<00:00,  1.97it/s, loss=2.56]


Evaluating the model on the validation dataset
Checkpoint for epoch 5 saved


Epoch 6: 100%|██████████| 1000/1000 [08:30<00:00,  1.96it/s, loss=2.81]


Evaluating the model on the validation dataset
Checkpoint for epoch 6 saved


Epoch 7: 100%|██████████| 1000/1000 [08:31<00:00,  1.95it/s, loss=2.38]


Evaluating the model on the validation dataset
Checkpoint for epoch 7 saved


Epoch 8: 100%|██████████| 1000/1000 [08:28<00:00,  1.97it/s, loss=2.05]


Evaluating the model on the validation dataset
Checkpoint for epoch 8 saved


Epoch 9: 100%|██████████| 1000/1000 [08:25<00:00,  1.98it/s, loss=1.66]


Evaluating the model on the validation dataset
Checkpoint for epoch 9 saved
Learning rate: 0.001


Epoch 0: 100%|██████████| 1000/1000 [08:27<00:00,  1.97it/s, loss=5.34]


Evaluating the model on the validation dataset
Checkpoint for epoch 0 saved


Epoch 1: 100%|██████████| 1000/1000 [08:21<00:00,  2.00it/s, loss=5.05]


Evaluating the model on the validation dataset
Checkpoint for epoch 1 saved


Epoch 2: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=4.71]


Evaluating the model on the validation dataset
Checkpoint for epoch 2 saved


Epoch 3: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=4.54]


Evaluating the model on the validation dataset
Checkpoint for epoch 3 saved


Epoch 4: 100%|██████████| 1000/1000 [08:26<00:00,  1.97it/s, loss=3.93]


Evaluating the model on the validation dataset
Checkpoint for epoch 4 saved


Epoch 5: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=4.15]


Evaluating the model on the validation dataset
Checkpoint for epoch 5 saved


Epoch 6: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=3.4]


Evaluating the model on the validation dataset
Checkpoint for epoch 6 saved


Epoch 7: 100%|██████████| 1000/1000 [08:22<00:00,  1.99it/s, loss=3.31]


Evaluating the model on the validation dataset
Checkpoint for epoch 7 saved


Epoch 8: 100%|██████████| 1000/1000 [08:19<00:00,  2.00it/s, loss=2.9]


Evaluating the model on the validation dataset
Checkpoint for epoch 8 saved


Epoch 9: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=2.5]


Evaluating the model on the validation dataset
Checkpoint for epoch 9 saved


# Train the English to Spanish Model

In [36]:
for lr in learning_rates:
    # Define the model
    model = build_transformer(
                        len(en_es_source_vocab),                    # size of the source vocabulary    
                        len(en_es_target_vocab),                    # size of the target vocabulary
                        src_seq_len= header_file["max_seq_len"],    # defined in the header file 
                        trg_seq_len= header_file["max_seq_len"],    # defined in the header file
                        d_model = 512,                              # based on the paper
                        N = 3,                                      # number of encoder and decoder layers (we use the same number of layers for both encoder and decoder)
                        h = 8,                                      # number of heads (we use the same number of heads for both encoder and decoder)                                          
                        dropout = 0.1,                              # based on the paper
                        d_ff = 2048                                 # based on the paper
                        ).to(device)    
    print('Learning rate:', lr)
    model = train(model, lr, epochs, en_es_train_dl, en_es_val_dl, en_es_target_vocab, "en_es_")

Learning rate: 0.0001


Epoch 0: 100%|██████████| 1000/1000 [08:22<00:00,  1.99it/s, loss=5.05]


Evaluating the model on the validation dataset
Checkpoint for epoch 0 saved


Epoch 1: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=3.83]


Evaluating the model on the validation dataset
Checkpoint for epoch 1 saved


Epoch 2: 100%|██████████| 1000/1000 [08:15<00:00,  2.02it/s, loss=3.68]


Evaluating the model on the validation dataset
Checkpoint for epoch 2 saved


Epoch 3: 100%|██████████| 1000/1000 [08:12<00:00,  2.03it/s, loss=3.55]


Evaluating the model on the validation dataset
Checkpoint for epoch 3 saved


Epoch 4: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=2.84]


Evaluating the model on the validation dataset
Checkpoint for epoch 4 saved


Epoch 5: 100%|██████████| 1000/1000 [08:14<00:00,  2.02it/s, loss=2.51]


Evaluating the model on the validation dataset
Checkpoint for epoch 5 saved


Epoch 6: 100%|██████████| 1000/1000 [08:15<00:00,  2.02it/s, loss=2.22]


Evaluating the model on the validation dataset
Checkpoint for epoch 6 saved


Epoch 7: 100%|██████████| 1000/1000 [08:18<00:00,  2.01it/s, loss=2.11]


Evaluating the model on the validation dataset
Checkpoint for epoch 7 saved


Epoch 8: 100%|██████████| 1000/1000 [08:14<00:00,  2.02it/s, loss=2.14]


Evaluating the model on the validation dataset
Checkpoint for epoch 8 saved


Epoch 9: 100%|██████████| 1000/1000 [08:13<00:00,  2.03it/s, loss=1.39]


Evaluating the model on the validation dataset
Checkpoint for epoch 9 saved
Learning rate: 0.001


Epoch 0: 100%|██████████| 1000/1000 [08:12<00:00,  2.03it/s, loss=4.86]


Evaluating the model on the validation dataset
Checkpoint for epoch 0 saved


Epoch 1: 100%|██████████| 1000/1000 [08:14<00:00,  2.02it/s, loss=4.73]


Evaluating the model on the validation dataset
Checkpoint for epoch 1 saved


Epoch 2: 100%|██████████| 1000/1000 [08:10<00:00,  2.04it/s, loss=4.11]


Evaluating the model on the validation dataset
Checkpoint for epoch 2 saved


Epoch 3: 100%|██████████| 1000/1000 [08:09<00:00,  2.04it/s, loss=4.07]


Evaluating the model on the validation dataset
Checkpoint for epoch 3 saved


Epoch 4: 100%|██████████| 1000/1000 [08:12<00:00,  2.03it/s, loss=3.46]


Evaluating the model on the validation dataset
Checkpoint for epoch 4 saved


Epoch 5: 100%|██████████| 1000/1000 [08:10<00:00,  2.04it/s, loss=3.14]


Evaluating the model on the validation dataset
Checkpoint for epoch 5 saved


Epoch 6: 100%|██████████| 1000/1000 [08:10<00:00,  2.04it/s, loss=2.8]


Evaluating the model on the validation dataset
Checkpoint for epoch 6 saved


Epoch 7: 100%|██████████| 1000/1000 [08:11<00:00,  2.03it/s, loss=2.61]


Evaluating the model on the validation dataset
Checkpoint for epoch 7 saved


Epoch 8: 100%|██████████| 1000/1000 [08:11<00:00,  2.03it/s, loss=2.26]


Evaluating the model on the validation dataset
Checkpoint for epoch 8 saved


Epoch 9: 100%|██████████| 1000/1000 [08:10<00:00,  2.04it/s, loss=1.35]


Evaluating the model on the validation dataset
Checkpoint for epoch 9 saved


# Evaluate the models on the validation sets to pick the best model

In [37]:
# Create dictionaries to store the models for each learning rate
en_it_models = {}
en_es_models = {}

# Load the last epochs of the models for each learning rate in their corresponding dictionaries
for lr in learning_rates:
    model = build_transformer(
                    len(en_it_source_vocab),                    # size of the source vocabulary    
                    len(en_it_target_vocab),                    # size of the target vocabulary
                    src_seq_len= header_file["max_seq_len"],    # defined in the header file 
                    trg_seq_len= header_file["max_seq_len"],    # defined in the header file
                    d_model = 512,                              # based on the paper
                    N = 3,                                      # number of encoder and decoder layers (we use the same number of layers for both encoder and decoder)
                    h = 8,                                      # number of heads (we use the same number of heads for both encoder and decoder)                                          
                    dropout = 0.1,                              # based on the paper
                    d_ff = 2048                                 # based on the paper
                    ).to(device) 
        
    en_it_checkpoint = os.path.join("en_it_checkpoints", f"lr_{lr}")    
    latest_epoch = -1
    for fname in os.listdir(en_it_checkpoint):
        if fname.startswith('epoch_') and fname.endswith('.pth'):
            epoch_num = int(fname.split('_')[1].split('.')[0])
            if epoch_num > latest_epoch:
                latest_epoch = epoch_num
                latest_checkpoint_path = os.path.join(en_it_checkpoint, fname)

    if latest_checkpoint_path:
        ckpt = torch.load(latest_checkpoint_path)
        model.load_state_dict(ckpt['model_state'])
        en_it_models[lr] = model
        print(f"English Italian Model for learning rate {lr} and epoch {latest_epoch} loaded")

for lr in learning_rates:
    model = build_transformer(
                len(en_es_source_vocab),                    # size of the source vocabulary    
                len(en_es_target_vocab),                    # size of the target vocabulary
                src_seq_len= header_file["max_seq_len"],    # defined in the header file 
                trg_seq_len= header_file["max_seq_len"],    # defined in the header file
                d_model = 512,                              # based on the paper
                N = 3,                                      # number of encoder and decoder layers (we use the same number of layers for both encoder and decoder)
                h = 8,                                      # number of heads (we use the same number of heads for both encoder and decoder)                                          
                dropout = 0.1,                              # based on the paper
                d_ff = 2048                                 # based on the paper
                ).to(device) 
    
    en_es_checkpoint = os.path.join("en_es_checkpoints", f"lr_{lr}")    
    latest_epoch = -1
    for fname in os.listdir(en_es_checkpoint):
        if fname.startswith('epoch_') and fname.endswith('.pth'):
            epoch_num = int(fname.split('_')[1].split('.')[0])
            if epoch_num > latest_epoch:
                latest_epoch = epoch_num
                latest_checkpoint_path = os.path.join(en_es_checkpoint, fname)

    if latest_checkpoint_path:
        ckpt = torch.load(latest_checkpoint_path)
        model.load_state_dict(ckpt['model_state'])
        en_es_models[lr] = model
        print(f"English Spanish Model for learning rate {lr} and epoch {latest_epoch} loaded")

English Italian Model for learning rate 0.0001 and epoch 9 loaded
English Italian Model for learning rate 0.001 and epoch 9 loaded
English Spanish Model for learning rate 0.0001 and epoch 9 loaded
English Spanish Model for learning rate 0.001 and epoch 9 loaded


In [38]:
best_en_it_model = None
best_BLEU = -1
best_en_it_lr = None
# Compare the BLEU scores on the models trained with different learning rates
print('BLEU scores on the english-italian models trained with different learning rates:')
for lr in learning_rates:
    print('Learning rate:', lr)

    # Compute the BLEU score on the validation dataset
    BLEU = compute_BLEU_score(en_it_models[lr], en_it_val_dl, en_it_target_vocab)
    print('BLEU score:', BLEU)
    if BLEU > best_BLEU:
        best_BLEU = BLEU
        best_en_it_model = en_it_models[lr]
        best_en_it_lr = lr

BLEU scores on the english-italian models trained with different learning rates:
Learning rate: 0.0001
BLEU score: 0.03795893975135081
Learning rate: 0.001
BLEU score: 0.036117260153593664


In [39]:
best_en_es_model = None
best_BLEU = -1
best_en_es_lr = None
# Compare the BLEU scores on the models trained with different learning rates
print('BLEU scores on the english-spanish models trained with different learning rates:')
for lr in learning_rates:
    print('Learning rate:', lr)
    # Compute the BLEU score on the validation dataset
    BLEU = compute_BLEU_score(en_es_models[lr], en_es_val_dl, en_es_target_vocab)
    print('BLEU score:', BLEU)
    if BLEU > best_BLEU:
        best_BLEU = BLEU
        best_en_es_model = en_es_models[lr]
        best_en_es_lr = lr

BLEU scores on the english-spanish models trained with different learning rates:
Learning rate: 0.0001
BLEU score: 0.08944820372220456
Learning rate: 0.001
BLEU score: 0.008682228073600117


# Fine tune the English to Spanish Model on the English to Italian Dataset

In [40]:
# Assuming `english_spanish_model` is your pre-trained English-Spanish model.
# Initialize a new model for English-Italian with the new target vocabulary size.
model = build_transformer(
                    len(en_it_source_vocab),                    # size of the source vocabulary    
                    len(en_it_target_vocab),                    # size of the target vocabulary
                    src_seq_len= header_file["max_seq_len"],    # defined in the header file 
                    trg_seq_len= header_file["max_seq_len"],    # defined in the header file
                    d_model = 512,                              # based on the paper
                    N = 3,                                      # number of encoder and decoder layers (we use the same number of layers for both encoder and decoder)
                    h = 8,                                      # number of heads (we use the same number of heads for both encoder and decoder)                                          
                    dropout = 0.1,                              # based on the paper
                    d_ff = 2048                                 # based on the paper
                    ).to(device)  

# Print the initial weights of some layers as a sanity check
print("Initial Weights (Model Before Transfer):")
print("Encoder Layer 0, First Parameter Mean and Std:", torch.mean(model.encoder.n_layers[0].self_attention_block.W_Q.weight).item(), torch.std(model.encoder.n_layers[0].self_attention_block.W_Q.weight).item())
print("Decoder Layer 0, First Parameter Mean and Std:", torch.mean(model.decoder.n_layers[0].self_attention_block.W_Q.weight).item(), torch.std(model.decoder.n_layers[0].self_attention_block.W_Q.weight).item())

# Transfer weights from the English-Spanish model to the new English-Italian model (only the encoder and decoder layers).
# We will exclude the final linear layer's weights since the vocabulary size is different.
# Transfer encoder weights.
for it_src, it_tgt in zip(best_en_es_model.encoder.n_layers.parameters(), model.encoder.n_layers.parameters()):
    it_tgt.data.copy_(it_src.data)

# Transfer decoder weights, except for the last linear layer.
for it_src, it_tgt in zip(best_en_es_model.decoder.n_layers.parameters(), model.decoder.n_layers.parameters()):
    if it_tgt.size() == it_src.size():
        it_tgt.data.copy_(it_src.data)

# Reinitialize the final linear layer's weights
model.linear_layer.proj.weight.data.normal_(mean=0.0, std=0.02)
model.linear_layer.proj.bias.data.zero_()

# Print the initial weights of some layers as a sanity check
print("Transferred Weights (Model After Transfer):")
print("Encoder Layer 0, First Parameter Mean and Std:", torch.mean(model.encoder.n_layers[0].self_attention_block.W_Q.weight).item(), torch.std(model.encoder.n_layers[0].self_attention_block.W_Q.weight).item())
print("Decoder Layer 0, First Parameter Mean and Std:", torch.mean(model.decoder.n_layers[0].self_attention_block.W_Q.weight).item(), torch.std(model.decoder.n_layers[0].self_attention_block.W_Q.weight).item())


Initial Weights (Model Before Transfer):
Encoder Layer 0, First Parameter Mean and Std: -0.00012798531679436564 0.044176071882247925
Decoder Layer 0, First Parameter Mean and Std: -0.00019926796085201204 0.04417480155825615
Transferred Weights (Model After Transfer):
Encoder Layer 0, First Parameter Mean and Std: -3.552586349542253e-05 0.04578001797199249
Decoder Layer 0, First Parameter Mean and Std: -5.2119379688519984e-05 0.04419827088713646


In [41]:
fine_tuned_en_es_model = train(model, best_en_es_lr, epochs, en_it_train_dl, en_it_val_dl, en_it_target_vocab, "fine_tuned_en_es_")

Epoch 0: 100%|██████████| 1000/1000 [08:32<00:00,  1.95it/s, loss=4.76]


Evaluating the model on the validation dataset
Checkpoint for epoch 0 saved


Epoch 1: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=3.76]


Evaluating the model on the validation dataset
Checkpoint for epoch 1 saved


Epoch 2: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=3.45]


Evaluating the model on the validation dataset
Checkpoint for epoch 2 saved


Epoch 3: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s, loss=2.71]


Evaluating the model on the validation dataset
Checkpoint for epoch 3 saved


Epoch 4: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=2.19]


Evaluating the model on the validation dataset
Checkpoint for epoch 4 saved


Epoch 5: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=2.12]


Evaluating the model on the validation dataset
Checkpoint for epoch 5 saved


Epoch 6: 100%|██████████| 1000/1000 [08:19<00:00,  2.00it/s, loss=1.52]


Evaluating the model on the validation dataset
Checkpoint for epoch 6 saved


Epoch 7: 100%|██████████| 1000/1000 [08:18<00:00,  2.01it/s, loss=1.4] 


Evaluating the model on the validation dataset
Checkpoint for epoch 7 saved


Epoch 8: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=1.29]


Evaluating the model on the validation dataset
Checkpoint for epoch 8 saved


Epoch 9: 100%|██████████| 1000/1000 [08:17<00:00,  2.01it/s, loss=1.3] 


Evaluating the model on the validation dataset
Checkpoint for epoch 9 saved


# Compare the BLEU scores on the test sets

In [42]:
# Create a dictionnary to save the BLEU score results of the fine-tuned model and the original english to italian model
BLEU_scores = {}

In [43]:
# Compute the BLEU score of the best english to italian model on the test dataset
BLEU = compute_BLEU_score(best_en_it_model, en_it_test_dl, en_it_target_vocab, len(en_it_test))
print('BLEU score on the test dataset of the English-Italian model is:', BLEU)
BLEU_scores['en_it'] = BLEU

BLEU score on the test dataset of the English-Italian model is: 0.02300914814791133


In [44]:
# Compute the BLEU score of the fine tuned English-Spanish model on the English-Italian task
BLEU = compute_BLEU_score(fine_tuned_en_es_model, en_it_test_dl, en_it_target_vocab, len(en_it_test))
print('BLEU score on the test dataset of the fine tuned English-Spanish model on the English-Italian task is:', BLEU)
BLEU_scores['fine_tuned_en_es_'] = BLEU

BLEU score on the test dataset of the fine tuned English-Spanish model on the English-Italian task is: 0.042610063363542866


In [45]:
# Save the dicitonnary containing the BLEU scores in a json file
with open('./results/BLEU_scores.json', 'w') as fp:
    json.dump(BLEU_scores, fp)

In [46]:
# Save the fine tuned model and the original english to italian model in the results folder
torch.save(best_en_it_model, './results/best_en_it_model.pt')
torch.save(fine_tuned_en_es_model, './results/fine_tuned_en_es_model.pt')