# Imports 

In [2]:
# IMPORTS
from PreProcessing import Preprocessor # for preprocessing the data
import json # for reading the config file
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.nn import Transformer # for the model
from torch.utils.data import DataLoader, TensorDataset # for creating the dataloader
# from Transformer import Transformer # for the model
from tqdm import tqdm # for the progress bar when training

from torch.utils.tensorboard import SummaryWriter # for tensorboard

# Preprocessing

We first load the JSON header file where we store paths, parameters, etc.

In [4]:
# Load the JSON header file
def load_json_header(json_file):
    """Read a JSON file and return its parsed content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The deserialized document, i.e. whatever ``json.load`` produces for
        the file's top-level value (a ``dict`` for a JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform locale, which can mis-decode non-ASCII paths or
    # values stored in the config (e.g. on Windows).
    with open(json_file, encoding='utf-8') as json_data:
        return json.load(json_data)

# Parse 'config.json' (a relative path, so it is resolved against the current
# working directory). The result is indexed by string keys further down to
# obtain the dataset file paths.
config = load_json_header('config.json')

## Load the dataset

In [5]:
# File locations of the English and Italian sides of the corpus, as listed
# in the JSON config.
en_file_path = config['en-it-dataset-english']
it_file_path = config['en-it-dataset-italian']

# The Preprocessor instance is kept in a global because later cells read its
# source/target tokenizers.
preprocessor = Preprocessor()

# preprocess() hands back six values, unpacked here as three English splits
# followed by three Italian splits.
(
    en_train, en_val, en_test,
    it_train, it_val, it_test,
) = preprocessor.preprocess(en_file_path, it_file_path)

As a sanity check, we print the lengths of the preprocessed dataset splits.

In [6]:
# Report the number of examples in every split, one line per split.
split_sizes = (
    ("English Train: ", en_train),
    ("English Validation: ", en_val),
    ("English Test: ", en_test),
    ("Italian Train: ", it_train),
    ("Italian Validation: ", it_val),
    ("Italian Test: ", it_test),
)
for label, split in split_sizes:
    print(label, len(split))

English Train:  1520698
English Validation:  190087
English Test:  190088
Italian Train:  1520698
Italian Validation:  190087
Italian Test:  190088


# Model Classes

In [8]:
# Vocabulary sizes: the number of entries in each tokenizer's word_index,
# plus one.
SRC_VOCAB_SIZE = len(preprocessor.source_tokenizer.word_index) + 1  # +1 for padding token (presumably id 0 -- confirm in Preprocessor)
TRG_VOCAB_SIZE = len(preprocessor.target_tokenizer.word_index) + 1

# Hyperparameters
LEARNING_RATE = 0.001  # learning rate handed to Adam below
BATCH_SIZE = 64  # batch size used when the DataLoaders are built
EPOCH = 5  # number of passes of the training loop over the training data
# NOTE(review): src_vocab_size / trg_vocab_size are assigned but not used in
# this cell. The nn.Transformer built below is given no vocabulary size, and
# per the PyTorch docs its forward() expects inputs whose last dimension is
# d_model (i.e. already-embedded sequences). Confirm where token ids are meant
# to be embedded and where decoder outputs are projected to the vocabulary.
src_vocab_size = SRC_VOCAB_SIZE
trg_vocab_size = TRG_VOCAB_SIZE
embedding_size = 512  # passed as d_model
nhead = 8  # number of attention heads
dim_feedforward = 2048  # width of the feed-forward sub-layers
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1


# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


# Instantiate the Transformer model.
# batch_first is not passed, so the library default applies -- NOTE(review):
# confirm it matches the layout of the batches fed to the model.
transformer_model = nn.Transformer(
    d_model=embedding_size,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout
)

# Define the loss function and the optimizer.
# ignore_index=0: targets equal to 0 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer_model.parameters(), lr=LEARNING_RATE)

# Move model to the device. Being the last expression of the cell, the value
# returned by .to() (the module itself) is what the notebook displays.
transformer_model.to(device)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, o

In [None]:
# # Assuming these are the sizes of your vocabularies
# SRC_VOCAB_SIZE = len(preprocessor.source_tokenizer.word_index) + 1  # +1 for padding token
# TRG_VOCAB_SIZE = len(preprocessor.target_tokenizer.word_index) + 1


# EMB_DIM = 512
# N_ENCODER_LAYERS = 3
# N_DECODER_LAYERS = 3
# ENC_DROPOUT = 0.1
# MAX_LENGTH = 100 # Maximum length of the sentence (used for positional encoding)
# FORW_EXP = 4 # Forward expansion
# SRC_PAD_IDX = 0 # Padding token for the source language
# NUM_HEADS = 8 # Number of heads for the multi-head attention

# # TensorBoard for logging
# writer = SummaryWriter('runs/loss_plot')
# step = 0



In [None]:
# Pair each English array with the Italian array of the same split.
# TensorDataset indexes both tensors along their first dimension, so item i is
# the (English, Italian) pair stored at row i, and the DataLoader stacks
# BATCH_SIZE such items along dimension 0 of every batch.
# NOTE(review): torch.from_numpy keeps the NumPy dtype; class-index targets for
# nn.CrossEntropyLoss are expected to be int64 -- confirm what the
# preprocessor emits.
train_data = TensorDataset(torch.from_numpy(en_train), torch.from_numpy(it_train))
val_data = TensorDataset(torch.from_numpy(en_val), torch.from_numpy(it_val))
test_data = TensorDataset(torch.from_numpy(en_test), torch.from_numpy(it_test))

# Only the training split is reshuffled every epoch. Validation and test
# batches are served in dataset order so evaluation runs are reproducible and
# predictions can be matched back to their source rows.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
import torch
import math
from torch.nn import Transformer
import torch.optim as optim
from tqdm import tqdm

# Assuming you have already created and configured the Transformer model, as well as your optimizer and criterion.

def create_src_mask(src, pad_idx):
    """Flag the non-padding entries of ``src``.

    Args:
        src: Tensor that is compared element-wise against ``pad_idx``.
        pad_idx: Value that marks padding in ``src``.

    Returns:
        Boolean tensor that is ``True`` wherever ``src`` differs from
        ``pad_idx``, with a size-1 axis inserted in front of the last axis
        (a 2-D ``(a, b)`` input yields shape ``(a, 1, b)``).
    """
    not_padding = src.ne(pad_idx)
    # Indexing with None adds the singleton axis just before the last one.
    return not_padding[..., None, :]

def create_trg_mask(trg, pad_idx):
    """Combine a padding mask and a causal (lower-triangular) mask for ``trg``.

    Args:
        trg: Tensor whose last axis runs over sequence positions, e.g. shape
            ``(batch, trg_len)``.
        pad_idx: Value that marks padding in ``trg``.

    Returns:
        Boolean tensor of shape ``(..., trg_len, trg_len)``. Entry
        ``[..., i, j]`` is ``True`` when ``j <= i`` and ``trg[..., j]`` is not
        padding, i.e. ``True`` marks a position that may be looked at.

    NOTE(review): torch.nn.Transformer documents the opposite convention for
    boolean masks (``True`` = not allowed to attend); confirm which convention
    the consuming model expects before passing this mask on.
    """
    # (..., 1, trg_len): broadcast over the query axis added by the causal mask.
    trg_pad_mask = (trg != pad_idx).unsqueeze(-2)
    # The sequence length is the LAST axis -- the same axis the padding mask
    # is aligned to. Using axis 0 here only works for square inputs.
    trg_len = trg.shape[-1]
    # Build the causal mask on the tensor's own device rather than on a
    # module-level global, so the helper works wherever ``trg`` lives.
    trg_sub_mask = torch.tril(
        torch.ones((trg_len, trg_len), device=trg.device)
    ).bool()
    trg_mask = trg_pad_mask & trg_sub_mask
    return trg_mask

def train(model, iterator, optimizer, criterion, pad_idx, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module invoked as
            ``model(src, trg[:-1], src_mask=..., tgt_mask=...)``; it is put
            into training mode first.
        iterator: Sized iterable yielding ``(src, trg)`` tensor pairs.
        optimizer: Optimizer whose gradients are zeroed before, and stepped
            after, every batch.
        criterion: Loss callable applied to the flattened outputs and targets.
        pad_idx: Padding value forwarded to the mask helpers.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-batch loss values divided by ``len(iterator)``.

    Relies on the module-level ``device`` for tensor placement.

    NOTE(review), to be confirmed against the model and data layout:
      * ``trg[:-1]`` / ``trg[1:]`` slice axis 0. Batches produced by a
        DataLoader over a TensorDataset carry the batch on axis 0, so these
        slices look like they drop a sample rather than shift by one token.
      * ``trg_mask`` is built from the full ``trg`` while the model receives
        ``trg[:-1]``, so the mask and decoder input sizes differ.
      * torch.nn.Transformer documents ``src_mask`` / ``tgt_mask`` as
        position-by-position masks where a boolean ``True`` means "not
        allowed to attend", and expects inputs whose last dimension is
        d_model; the masks and raw batches passed here may not match that.
    """
    model.train()
    epoch_loss = 0

    # The enumerate index is unused; tqdm gets total= because enumerate has no len().
    for _, (src, trg) in tqdm(enumerate(iterator), total=len(iterator)):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        src_mask = create_src_mask(src, pad_idx)
        trg_mask = create_trg_mask(trg, pad_idx)

        # The model is fed trg without its last axis-0 entry ...
        output = model(src, trg[:-1], src_mask=src_mask, tgt_mask=trg_mask)

        # Collapse every leading dimension so each row holds one prediction.
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        # ... and is scored against trg without its first axis-0 entry.
        trg = trg[1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Rescale gradients in place so their total norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # .item() turns the loss into a plain Python number before accumulating.
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Settings for the training loop
PAD_IDX = 0  # padding index forwarded to train(); adjust to match the dataset
CLIP = 1  # gradient-norm ceiling forwarded to train()

# One call to train() per epoch; report the mean loss and its exponential
# (perplexity) after each pass.
for epoch in range(EPOCH):
    train_loss = train(
        model=transformer_model,
        iterator=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        pad_idx=PAD_IDX,
        clip=CLIP,
    )
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

In [None]:
# model = Transformer(
#     src_vocab_size=SRC_VOCAB_SIZE,
#     trg_vocab_size=TRG_VOCAB_SIZE,
#     src_pad_idx=SRC_PAD_IDX,
#     embedding_size=EMB_DIM,
#     num_encoder_layers=N_ENCODER_LAYERS,
#     num_decoder_layers=N_DECODER_LAYERS,
#     forward_expansion=FORW_EXP,
#     max_len=MAX_LENGTH,
#     dropout=ENC_DROPOUT,
#     num_heads=NUM_HEADS,
#     device=device
# ).to(device)

# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CrossEntropyLoss(ignore_index = SRC_PAD_IDX)

# save_model = True

