# Imports

In [1]:
import torch
import torch.nn as nn
import math

# Loading dataset imports
from torch.utils.data import Dataset, DataLoader # for creating the dataloader
import json # for loading the json file
from TranslationDataset import TranslationDataset # the custom dataset class


# Training imports
from Transformer_model import Transformer, build_transformer # the model
from torch.utils.tensorboard import SummaryWriter  # for logging during training
from tqdm import tqdm # for the progress bar during training

[nltk_data] Downloading package punkt to /Users/enzobenoit-
[nltk_data]     jeannin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import English to Italian Datasets

 We first load the english to italian translation datasets we created by runnning the Preprocessing.ipynb file (training, validation, test). We also import the vocabulary dictionaries for both languages (also saved from running the Preprocessing.ipynb file).

In [2]:
# Load the JSON header file
def load_json_header(json_file):
    with open(json_file) as json_data:
        d = json.load(json_data)
        return d

config = load_json_header('config.json')

In [3]:
# Load the datasets
# Get the dataset path from the config file
en_it_dataset_path = config['en-it-save-path']

# Load the dataset
en_it_train = torch.load(en_it_dataset_path + 'train_ds.pt')
en_it_val = torch.load(en_it_dataset_path + 'val_ds.pt')
en_it_test = torch.load(en_it_dataset_path + 'test_ds.pt')

# Load the vocabularies from the config file
source_vocab = torch.load(en_it_dataset_path + 'source_vocab.pt')
target_vocab = torch.load(en_it_dataset_path + 'target_vocab.pt')

# Print the size of the dataset
print('Size of training dataset: ', len(en_it_train))
print('Size of validation dataset: ', len(en_it_val))
print('Size of test dataset: ', len(en_it_test))

Size of training dataset:  1527292
Size of validation dataset:  190912
Size of test dataset:  190912


In [4]:
# Create dataloaders
train_dl = DataLoader(en_it_train, batch_size=config["batch_size"], shuffle=True)
val_dl = DataLoader(en_it_val, batch_size=1, shuffle=False)
test_dl = DataLoader(en_it_test, batch_size=config["batch_size"], shuffle=False)

# Import Transformer Model

In [5]:
# Select device: cuda, mps or cpu
# if torch.backends.mps.is_available():
#     device = torch.device('mps')
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Device:', device)

Device: mps


# Model training functions

In [6]:
# def train(hyperparams):
#     # Define the model
#     model = build_transformer(
#                         len(source_vocab),
#                         len(target_vocab),
#                         src_seq_len= config["max_seq_len"],
#                         trg_seq_len= config["max_seq_len"],
#                         d_model = 512,
#                         N = 1,
#                         h = 4,
#                         dropout = 0.1,
#                         d_ff = 2048).to(device)
#     writer = SummaryWriter()

#     # Define the hyperparameters from the given dictionary
#     lr = hyperparams['lr']
#     epochs = hyperparams['epochs']

#     # Define the adam optimizer
#     optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['lr'], eps=1e-9)

#     # Define the loss function
#     # Ignore the padding token, which has index 3 in the vocabulary (see function build_vocab in Preprocessing.ipynb file)
#     loss_fn = nn.CrossEntropyLoss(ignore_index=3, label_smoothing=0.1).to(device)

#     step = 0 # for logging the loss

#     for epoch in range (epochs):
#         torch.mps.empty_cache() # empty the cache
#         model.train()
#         iter = tqdm(train_dl, desc=f'Epoch {epoch}')
#         for batch in iter:
#             encoder_input = batch['encoder_input'].to(device) # size (batch_size, seq_len)
#             decoder_input = batch['decoder_input'].to(device) # size (batch_size, seq_len)
#             encoder_mask = batch['encoder_mask'].to(device) # size (batch_size, 1, 1, seq_len)
#             decoder_mask = batch['decoder_mask'].to(device) # size (batch_size, 1, seq_len, seq_len)
#             label = batch['label'].to(device) # size (batch_size, seq_len)

#             # Run the tensors through the model
#             encoder_output = model.encode(encoder_input, encoder_mask)  # size (batch_size, seq_len, d_model)
#             decoder_output = model.decode(decoder_input, encoder_output, encoder_mask, decoder_mask) # size (batch_size, seq_len, d_model)
#             output = model.output(decoder_output) # size (batch_size, seq_len, trg_vocab_size)

#             # Calculate the loss
#             # Flatten the output and label tensors to size (batch_size * seq_len, trg_vocab_size)
#             loss = loss_fn(output.view(-1, len(target_vocab)), label.view(-1))
#             iter.set_postfix(loss=loss.item()) # print the loss
#             writer.add_scalar('Loss/Step', loss.item(), step) # log the loss
#             writer.flush()

#             # Backpropagation
#             loss.backward()    

#             # Update the parameters
#             optimizer.step()
#             optimizer.zero_grad()

#             step += 1

In [7]:
# hyperparameters = {
#     'lr': 0.0001,
#     'epochs': 1
# }

# train(hyperparameters)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.trg_tok_emb = nn.Embedding(trg_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)
        self.generator = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg, src_mask, trg_mask, src_padding_mask, trg_padding_mask, memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        trg_emb = self.positional_encoding(self.trg_tok_emb(trg))
        outs = self.transformer(src_emb, trg_emb, src_mask=src_mask, tgt_mask=trg_mask, src_key_padding_mask=src_padding_mask, tgt_key_padding_mask=trg_padding_mask, memory_key_padding_mask=memory_key_padding_mask)
        return self.generator(outs)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=200):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [9]:
# Hyperparameters
src_vocab_size = len(source_vocab) # Size of the source vocabulary
trg_vocab_size = len(target_vocab) # Size of the target vocabulary
d_model = 512        # Embedding dimension
nhead = 8            # Number of heads in multi-head attention
num_encoder_layers = 1
num_decoder_layers = 1
dim_feedforward = 2048
dropout = 0.1

# Instantiate the model
transformer_model = TransformerModel(src_vocab_size, trg_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout).to(device)

opt = torch.optim.SGD(transformer_model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss(ignore_index=3)



In [10]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def train_epoch(model, train_loader, optimizer, criterion, src_pad_idx, trg_pad_idx, device):
    model.train()
    epoch_loss = 0

    for batch in train_loader:
        src = batch["encoder_input"].to(device)
        trg = batch["decoder_input"].to(device)

        src_mask = generate_square_subsequent_mask(src.size(0)).to(device)
        trg_mask = generate_square_subsequent_mask(trg.size(0)).to(device)
        src_padding_mask = (src == src_pad_idx).transpose(0, 1).to(device)
        trg_padding_mask = (trg == trg_pad_idx).transpose(0, 1).to(device)
        memory_key_padding_mask = src_padding_mask.clone()

        optimizer.zero_grad()

        output = model(src, trg, src_mask, trg_mask, src_padding_mask, trg_padding_mask, memory_key_padding_mask)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)

def evaluate(model, val_loader, criterion, src_pad_idx, trg_pad_idx, device):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            src = batch["encoder_input"].to(device)
            trg = batch["decoder_input"].to(device)

            src_mask = generate_square_subsequent_mask(src.size(0)).to(device)
            trg_mask = generate_square_subsequent_mask(trg.size(0)).to(device)
            src_padding_mask = (src == src_pad_idx).transpose(0, 1).to(device)
            trg_padding_mask = (trg == trg_pad_idx).transpose(0, 1).to(device)
            memory_key_padding_mask = src_padding_mask.clone()

            output = model(src, trg, src_mask, trg_mask, src_padding_mask, trg_padding_mask, memory_key_padding_mask)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].contiguous().view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(val_loader)

# Hyperparameters
src_pad_idx = 3 # The index of the padding token in the source vocabulary
trg_pad_idx = 3 # The index of the padding token in the target vocabulary
N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train_epoch(transformer_model, train_dl, opt, loss_fn, src_pad_idx, trg_pad_idx, device)
    val_loss = evaluate(transformer_model, val_dl, loss_fn, src_pad_idx, trg_pad_idx, device)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')



RuntimeError: MPS backend out of memory (MPS allocated: 27.57 GB, other allocations: 1.26 GB, max allowed: 36.27 GB). Tried to allocate 11.58 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).