# English-French Translation with Seq2Seq Transformer

In [1]:
import sys

# Add the parent directory to the import search path. The path is relative,
# so it resolves against the kernel's working directory.
# Guarded so that re-running this cell does not keep appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import torch
from model.transformer import TransformerSeq2Seq, TransformerEncoder, TransformerDecoder
from utils.loss import MaskedBCELoss
from utils.train import minibatch_gd

In [3]:
from data.engfra_trans import MTEngFra

## Initialize the dataset

In [4]:
# Settings handed to the MTEngFra dataset constructor.
data_path = "../data"  # directory holding the dataset files

# presumably: sequence length and the train / validation example counts
# (the sample batch printed below has 10 columns) -- TODO confirm against MTEngFra
num_steps, num_train, num_val = 10, 200, 50

In [5]:
# Build the English-French dataset object from the settings defined above.
dataset = MTEngFra(
    data_path,
    num_steps,
    num_train,
    num_val,
)

Dataset already downloaded and extracted.


In [6]:
# Smoke-test the data loader: print the first mini-batch only, then stop.
# Iterating the loader directly avoids a throw-away enumerate() index, which
# would also overwrite IPython's `_` (last output) variable.
for data in dataset.data_loader(batch_size=2):
    print(data)
    break

(tensor([[47, 26,  1,  2,  3,  3,  3,  3,  3,  3],
        [ 0, 50,  1,  2,  3,  3,  3,  3,  3,  3]], dtype=torch.int32), tensor([[ 0, 89,  2,  3,  4,  4,  4,  4,  4,  4],
        [ 0, 89, 11,  3,  4,  4,  4,  4,  4,  4]], dtype=torch.int32), tensor([[15, 89, 18,  2,  3,  4,  4,  4,  4,  4],
        [89, 11,  3,  4,  4,  4,  4,  4,  4,  4]], dtype=torch.int32), tensor([4, 4]))


## Prepare the model, loss, and optimizer

In [7]:
# Hyperparameters for the Transformer (one setting per line)
num_hiddens = 32       # passed to both the encoder and the decoder
ffn_num_hiddens = 64   # passed to both the encoder and the decoder
dropout = 0.1

# Encoder / decoder depth
enc_num_layers = 4
dec_num_layers = 8

# Encoder / decoder attention heads
enc_num_heads = 8
dec_num_heads = 16

# Mini-batch size used for training
batch_size = 32
# Vocabulary sizes are read from the dataset's source and target vocabularies.
src_vocab_size, tgt_vocab_size = len(dataset.src_vocab), len(dataset.tgt_vocab)

# Gradient-descent settings: learning rate for Adam and number of training epochs
lr, num_epochs = 1e-4, 500

In [8]:
# Build the encoder and decoder, then wrap them in the seq2seq model.
encoder = TransformerEncoder(
    src_vocab_size,
    num_hiddens,
    enc_num_heads,
    enc_num_layers,
    ffn_num_hiddens,
    dropout,
)
decoder = TransformerDecoder(
    tgt_vocab_size,
    num_hiddens,
    dec_num_heads,
    dec_num_layers,
    ffn_num_hiddens,
    dropout,
)
# The model and the loss both receive the target vocabulary's '<pad>' entry.
model = TransformerSeq2Seq(encoder, decoder, dataset.tgt_vocab['<pad>'])

# Loss function and optimizer
criterion = MaskedBCELoss(dataset.tgt_vocab['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [9]:
# Check the models
from torchinfo import summary

In [10]:
# Random token-id inputs, used only to trace the model structure.
dummy_batch_size, dummy_seq_length = 2, 5
dummy_x = torch.randint(
    low=0, high=src_vocab_size, size=(dummy_batch_size, dummy_seq_length)
)
dummy_y = torch.randint(
    low=0, high=tgt_vocab_size, size=(dummy_batch_size, dummy_seq_length)
)
# Third model input, one value per batch row
# (presumably valid source lengths -- TODO confirm against TransformerSeq2Seq.forward)
dummy_z = torch.tensor([1, 2])

# Render the layer-by-layer summary on the CPU, four levels deep.
summary(
    model,
    input_data=[dummy_x, dummy_y, dummy_z],
    col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"],
    depth=4,
    device='cpu',
)

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
TransformerSeq2Seq                                      [2, 5]                    [2, 5, 90]                --                        --                        --
├─TransformerEncoder: 1-1                               [2, 5]                    [2, 5, 32]                --                        --                        --
│    └─Embedding: 2-1                                   [2, 5]                    [2, 5, 32]                2,752                     --                        5,504
│    └─PositionalEncoding: 2-2                          [2, 5, 32]                [2, 5, 32]                --                        --                        --
│    └─Sequential: 2-3                                  --                        --                        --                        --                        --
│    │    └─

## Train the model

In [11]:
from time import perf_counter

# Train on the first CUDA GPU when one is available, otherwise on the CPU.
# The device string must be "cuda:0"; a misspelled device type makes
# torch.device() raise as soon as the CUDA branch is taken.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
# Time the full training run with a high-resolution performance counter.
start = perf_counter()
# Hand the model, data, loss, optimizer, device and epoch count to the
# project's mini-batch gradient-descent training helper.
# NOTE(review): the recorded run below stopped during "Epoch [1/500]" with
# "RuntimeError: 0D or 1D target tensor expected, multi-target not supported".
# The cause is presumably in how the criterion / training helper shape the
# targets, which is not visible in this notebook -- TODO investigate.
minibatch_gd(model, dataset, batch_size, criterion, optimizer, device, num_epochs)
end = perf_counter()
# Bare last expression so the notebook displays the elapsed time in seconds.
f'{end-start:.2f}s'

  scores = torch.bmm(queries, keys.transpose(1, 2)) / torch.tensor(math.sqrt(d))  # (batch_size, num_queries, num_keys)
  scores = torch.bmm(queries, keys.transpose(1, 2)) / torch.tensor(math.sqrt(d))  # (batch_size, num_queries, num_keys)


Epoch [1/500]


RuntimeError: 0D or 1D target tensor expected, multi-target not supported