### NMT (Neural Machine Translation)

In this series of notebooks we are going to create a bidirectional NMT model for our application. We are going to use the following notebooks as references for this notebook.

1. [17_Custom_Dataset_and_Translation.ipynb](https://github.com/CrispenGari/pytorch-python/blob/main/09_NLP/03_Sequence_To_Sequence/17_Custom_Dataset_and_Translation.ipynb)
2. [16_Data_Preparation_Translation_Dataset.ipynb](https://github.com/CrispenGari/pytorch-python/blob/main/09_NLP/03_Sequence_To_Sequence/16_Data_Preparation_Translation_Dataset.ipynb)

I will be loading the data from my Google Drive.

In [None]:
# Colab helpers: `drive` mounts Google Drive, `files` is used later for downloads.
from google.colab import drive
from google.colab import files
# Mount Google Drive so the dataset files under /content/drive can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Imports

In [None]:
import torch
from torch import nn
from torch.nn  import functional as F
import spacy, math, random
import numpy as np
from torchtext.legacy import datasets, data
import time, os, json
from prettytable import PrettyTable
from matplotlib import pyplot as plt

In [None]:
# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)
torch.cuda.manual_seed(SEED)
# The flag is spelled `deterministic`; the previous misspelling
# (`deteministic`) never reached the setting that cuDNN actually reads.
torch.backends.cudnn.deterministic = True

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Root folder of the translation corpora on the mounted Drive.
base_path = '/content/drive/My Drive/NLP Data/seq2seq/manythings'
# Folder with the English/Afrikaans split files (train/valid/test, .en and .af).
path_to_files = os.path.join(base_path, "Afrikaans - English")
# List the folder contents as a quick sanity check.
os.listdir(path_to_files)

['afr.txt',
 'train.en',
 'test.en',
 'valid.en',
 'valid.af',
 'test.af',
 'train.af']

### File extensions

In [None]:
# (source, target) file extensions; the order matches fields=(SRC, TRG) used
# when the dataset splits are created.
exts = (".en", ".af")

### Tokenizer models

All the tokenization models that we are going to use can be found [here](https://spacy.io/usage/models); for languages that don't have a tokenization model we are going to create our own tokenizers.

In [None]:
# NOTE(review): the `en_core_web_sm` module is imported here but not referenced
# again in the visible notebook; the pipeline is loaded by name below.
import en_core_web_sm
# English spaCy pipeline; only its tokenizer is used (in tokenize_en).
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def tokenize_af(sent):
  """Tokenize an Afrikaans sentence by splitting it on whitespace.

  No spaCy model is used for Afrikaans here, so the tokens are simply the
  whitespace-separated chunks of `sent`; punctuation stays attached to the
  neighbouring word.

  Args:
    sent: the sentence as a string.

  Returns:
    A list of token strings (empty for an empty or blank sentence).
  """
  # split() without a separator collapses runs of whitespace and ignores
  # leading/trailing blanks, so no empty-string tokens end up in the
  # vocabulary (split(" ") produced '' for every extra space).
  return sent.split()

def tokenize_en(sent):
  """Tokenize an English sentence with the spaCy tokenizer.

  Args:
    sent: the sentence as a string.

  Returns:
    A list with the text of every token produced by `spacy_en.tokenizer`.
  """
  token_texts = []
  for token in spacy_en.tokenizer(sent):
    token_texts.append(token.text)
  return token_texts

### Fields

In [None]:
# Source (English) field: spaCy tokenization, lower-casing, <sos>/<eos> markers,
# and batch-first tensors.
SRC = data.Field(
     tokenize = tokenize_en,
     lower= True,
     init_token = "<sos>",
     eos_token = "<eos>",
     batch_first =True
)
# Target (Afrikaans) field: same settings, but with the whitespace tokenizer.
TRG = data.Field(
    tokenize = tokenize_af,
    lower= True,
    init_token = "<sos>",
    eos_token = "<eos>",
    batch_first =True
)

### Creating dataset

In [None]:
# Build the three splits from <path>/{train,valid,test}{.en,.af}; the .en side
# is processed by SRC and the .af side by TRG.
train_data, valid_data, test_data = datasets.TranslationDataset.splits(
    exts= exts,
    path=path_to_files,
    train='train', validation='valid', test='test',
    fields = (SRC, TRG)
)

In [None]:
# Show the first tokenized training pair (src / trg token lists).
print(vars(train_data.examples[0]))

{'src': ['i', "'m", 'glad', 'you', 'finally', 'made', 'it', 'back', '.'], 'trg': ['ek', 'is', 'bly', 'jy', 'het', 'uiteindelik', 'terug', 'gekom.']}


In [None]:
# Show the first tokenized validation pair.
# NOTE(review): the recorded output shows the same target sentence as the first
# training pair - verify that valid.af is line-aligned with valid.en.
print(vars(valid_data.examples[0]))

{'src': ['i', 'have', 'certain', 'rights', '.'], 'trg': ['ek', 'is', 'bly', 'jy', 'het', 'uiteindelik', 'terug', 'gekom.']}


In [None]:
# Show the first tokenized test pair.
# NOTE(review): the recorded output shows the same target sentence as the first
# training pair - verify that test.af is line-aligned with test.en.
print(vars(test_data.examples[0]))

{'src': ['you', "'re", 'a', 'long', 'way', 'from', 'home', '.'], 'trg': ['ek', 'is', 'bly', 'jy', 'het', 'uiteindelik', 'terug', 'gekom.']}


### Counting examples

In [None]:
from prettytable import PrettyTable
def tabulate(column_names, data):
  """Print a PrettyTable titled "VISUALIZING SETS EXAMPLES".

  Args:
    column_names: header labels; the first column is left-aligned and the
      second one right-aligned (at least two names are required).
    data: iterable of rows, added to the table in order.
  """
  table = PrettyTable(column_names)
  table.title = "VISUALIZING SETS EXAMPLES"
  for position, side in enumerate(('l', 'r')):
    table.align[column_names[position]] = side
  for record in data:
    table.add_row(record)
  print(table)

# Report how many sentence pairs each split contains.
column_names = ["SUBSET", "EXAMPLE(s)"]
row_data = [
        ["training", len(train_data)],
        ['validation', len(valid_data)],
        ['test', len(test_data)]
]
tabulate(column_names, row_data)

+-----------------------------+
|  VISUALIZING SETS EXAMPLES  |
+--------------+--------------+
| SUBSET       |   EXAMPLE(s) |
+--------------+--------------+
| training     |          825 |
| validation   |            9 |
| test         |            9 |
+--------------+--------------+


Our dataset is very small, so we are not going to set `min_freq` to a number greater than 1 when building the vocabulary.

In [None]:
# Build both vocabularies from the training split only; min_freq=1 keeps every
# token that occurs at least once.
SRC.build_vocab(train_data, min_freq=1)
TRG.build_vocab(train_data, min_freq=1)

Saving the dictionary mapping of our SRC and TRG vocabularies to JSON files.

In [None]:
# Number of entries in the source and target token -> index mappings.
len(SRC.vocab.stoi), len(TRG.vocab.stoi)

(1032, 1240)

In [None]:
# Copy the token -> index mappings into plain dicts so they can be serialised.
src = dict(SRC.vocab.stoi)
trg = dict(TRG.vocab.stoi)

# Output files are written to the current working directory.
src_vocab_path = "src_vocab.json"
trg_vocab_path = "trg_vocab.json"

with open(src_vocab_path, "w") as f:
  json.dump(src, f, indent=2)

with open(trg_vocab_path, "w") as f:
  json.dump(trg, f, indent=2)

print("Done")

Done


In [None]:
# Download both vocabulary files through the Colab `files` helper.
files.download(src_vocab_path)
files.download(trg_vocab_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Iterators

In [None]:
BATCH_SIZE = 64 # 128 for languages with good vocab corpus
# Examples are bucketed/sorted by the number of source tokens.
sort_key = lambda x: len(x.src)

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key= sort_key,
    sort_within_batch = True,
    device=device
)

### Model based on (Attention is all you need) 

### Encoder

In [None]:
class Encoder(nn.Module):
  """Transformer encoder: token + learned positional embeddings followed by a
  stack of `EncoderLayer`s.

  Args:
    input_dim: size of the source vocabulary (rows of the token embedding).
    hid_dim: embedding / hidden size used throughout the encoder.
    n_layers: number of stacked `EncoderLayer`s.
    n_heads: attention heads per layer (forwarded to `EncoderLayer`).
    pf_dim: inner size of the position-wise feed-forward block.
    dropout: dropout probability.
    device: device on which the scale constant is initially created; it is
      also forwarded to the layers and kept as `self.device`.
    max_length: number of rows in the positional embedding table; longer
      inputs cannot be embedded.
  """
  def __init__(self, input_dim, hid_dim, n_layers, 
               n_heads, pf_dim, dropout, device, max_length=100):
    super(Encoder, self).__init__()
    self.device = device

    self.tok_embedding = nn.Embedding(input_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    self.layers = nn.ModuleList([
          EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)
    ])
    self.dropout = nn.Dropout(dropout)
    # Registered as a non-persistent buffer: the constant now follows the
    # module through .to()/.cuda()/.cpu() while staying out of state_dict(),
    # so existing checkpoints keep loading unchanged.
    self.register_buffer(
        "scale",
        torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
        persistent=False,
    )

  def forward(self, src, src_mask):
    """Encode a batch of source token indices.

    Args:
      src: LongTensor [batch size, src len] of token indices.
      src_mask: mask passed unchanged to every layer.

    Returns:
      Tensor [batch size, src len, hid dim].
    """
    batch_size = src.shape[0]
    src_len = src.shape[1]
    # Build the position indices directly on the input's device instead of the
    # device captured at construction time, so a moved model keeps working.
    pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
    # Scale token embeddings by sqrt(hid_dim) before adding positions.
    src = self.dropout(
        (self.tok_embedding(src) * self.scale) + self.pos_embedding(pos)
    )
    for layer in self.layers:
      src = layer(src, src_mask)
    return src

### Encoder layer

In [None]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention followed by a position-wise
  feed-forward network, each wrapped in dropout, a residual connection and
  layer normalisation.
  """
  def __init__(self, hid_dim, n_heads, pf_dim,
               dropout, device):
    super().__init__()

    # Sub-modules are created in the same order as before so parameter order
    # and random initialisation are unchanged.
    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)
    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_mask):
    """Apply the block to `src` ([batch size, src len, hid dim]) and return a
    tensor of the same shape."""
    # Self-attention sub-layer (attention weights are not needed here).
    attended, _ = self.self_attention(src, src, src, src_mask)
    after_attention = self.self_attn_layer_norm(src + self.dropout(attended))
    # Feed-forward sub-layer.
    transformed = self.positionwise_feedforward(after_attention)
    return self.ff_layer_norm(after_attention + self.dropout(transformed))

### Multi Head Attention Layer

In [None]:
class MultiHeadAttentionLayer(nn.Module):
  """Multi-head scaled dot-product attention.

  Queries, keys and values are linearly projected, split into `n_heads` heads
  of size `hid_dim // n_heads`, attended per head and merged back through a
  final linear layer.

  Args:
    hid_dim: model width; must be divisible by `n_heads`.
    n_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    device: device on which the scaling constant is initially created.
  """
  def __init__(self, hid_dim, n_heads, dropout, device):
    super(MultiHeadAttentionLayer, self).__init__()

    assert hid_dim % n_heads == 0, "hid_dim must be divisible by n_heads"

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)
    self.dropout = nn.Dropout(dropout)

    # sqrt(head_dim) as a non-persistent buffer: it moves together with the
    # module (.to()/.cuda()) but is not written to state_dict(), so saved
    # checkpoints stay compatible.
    self.register_buffer(
        "scale",
        torch.sqrt(torch.FloatTensor([self.head_dim])).to(device),
        persistent=False,
    )

  def forward(self, query, key, value, mask):
    """Compute attention.

    Args:
      query: [batch size, query len, hid dim]
      key:   [batch size, key len, hid dim]
      value: [batch size, value len, hid dim]
      mask: tensor broadcastable to [batch size, n heads, query len, key len],
        or None; positions where it equals 0 receive (almost) no attention.

    Returns:
      (x, attention) with x = [batch size, query len, hid dim] and
      attention = [batch size, n heads, query len, key len].
    """
    batch_size = query.shape[0]

    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)
    # Q = [batch size, query len, hid dim]
    # K = [batch size, key len, hid dim]
    # V = [batch size, value len, hid dim]

    # Split hid dim into (n heads, head dim) and bring the head axis forward.
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim
               ).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim
               ).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim
               ).permute(0, 2, 1, 3)
    # Q = [batch size, n heads, query len, head dim]
    # K = [batch size, n heads, key len, head dim]
    # V = [batch size, n heads, value len, head dim]

    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
    if mask is not None:
      # A very negative score becomes ~0 after the softmax.
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim = -1)
    # attention = [batch size, n heads, query len, key len]
    x = torch.matmul(self.dropout(attention), V) # x = [batch size, n heads, query len, head dim]
    x = x.permute(0, 2, 1, 3).contiguous()
    # x = [batch size, query len, n heads, head dim]
    x = x.view(batch_size, -1, self.hid_dim)
    # x = [batch size, query len, hid dim]
    x = self.fc_o(x)
    # x = [batch size, query len, hid dim]
    return x, attention

### Positionwise Feed Forward Layer

In [None]:
class PositionwiseFeedforwardLayer(nn.Module):
  """Two-layer feed-forward network applied independently at every position:
  hid_dim -> pf_dim (ReLU + dropout) -> hid_dim.
  """
  def __init__(self, hid_dim, pf_dim, dropout):
    super().__init__()
    self.fc_1 = nn.Linear(hid_dim, pf_dim)
    self.fc_2 = nn.Linear(pf_dim, hid_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map x = [batch size, seq len, hid dim] to a tensor of the same shape."""
    # expand to the inner width: [batch size, seq len, pf dim]
    hidden = torch.relu(self.fc_1(x))
    hidden = self.dropout(hidden)
    # project back: [batch size, seq len, hid dim]
    return self.fc_2(hidden)

### Decoder

In [None]:
class Decoder(nn.Module):
  """Transformer decoder: token + learned positional embeddings, a stack of
  `DecoderLayer`s and a final projection onto the target vocabulary.

  Args:
    output_dim: size of the target vocabulary.
    hid_dim: embedding / hidden size.
    n_layers: number of stacked `DecoderLayer`s.
    n_heads: attention heads per layer (forwarded to `DecoderLayer`).
    pf_dim: inner size of the position-wise feed-forward block.
    dropout: dropout probability.
    device: device on which the scale constant is initially created; it is
      also forwarded to the layers and kept as `self.device`.
    max_length: number of rows in the positional embedding table; longer
      target sequences cannot be embedded.
  """
  def __init__(self,
                output_dim,  hid_dim,   n_layers, 
                 n_heads, pf_dim,  dropout,  device,
                 max_length = 100
               ):
    super(Decoder, self).__init__()

    self.device = device
    self.tok_embedding = nn.Embedding(output_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
    # Non-persistent buffer: follows the module across devices without being
    # added to state_dict(), so existing checkpoints still load.
    self.register_buffer(
        "scale",
        torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
        persistent=False,
    )

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Decode one step of teacher-forced (or partially generated) targets.

    Args:
      trg: [batch size, trg len] token indices.
      enc_src: [batch size, src len, hid dim] encoder output.
      trg_mask: [batch size, 1, trg len, trg len]
      src_mask: [batch size, 1, 1, src len]

    Returns:
      (output, attention): output = [batch size, trg len, output dim];
      attention is the encoder-attention of the last layer,
      [batch size, n heads, trg len, src len], or None when the decoder has
      no layers.
    """
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]
    # Positions are created on the input's device so a moved model keeps working.
    pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
    # pos = [batch size, trg len]
    trg = self.dropout(
        (self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos)
    )
    # trg = [batch size, trg len, hid dim]

    # Defined up front so an empty layer stack does not leave it unbound.
    attention = None
    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)

    # trg = [batch size, trg len, hid dim]
    # attention = [batch size, n heads, trg len, src len]
    output = self.fc_out(trg) # output = [batch size, trg len, output dim]

    return output, attention

### Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
  """One decoder block: masked self-attention, attention over the encoder
  output and a position-wise feed-forward network; every sub-layer is followed
  by dropout, a residual connection and layer normalisation.
  """
  def __init__(self, hid_dim,  n_heads, 
                 pf_dim, dropout, device
               ):
    super().__init__()
    # Creation order is kept as-is so parameter order and random
    # initialisation do not change.
    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)
    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Apply the block.

    Args:
      trg: [batch size, trg len, hid dim]
      enc_src: [batch size, src len, hid dim]
      trg_mask: [batch size, 1, trg len, trg len]
      src_mask: [batch size, 1, 1, src len]

    Returns:
      (trg, attention): trg = [batch size, trg len, hid dim] and the
      encoder-attention weights [batch size, n heads, trg len, src len].
    """
    # 1) masked self-attention over the target sequence
    self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
    state = self.self_attn_layer_norm(trg + self.dropout(self_attended))

    # 2) attention over the encoder output; these weights are returned
    cross_attended, attention = self.encoder_attention(state, enc_src, enc_src, src_mask)
    state = self.enc_attn_layer_norm(state + self.dropout(cross_attended))

    # 3) position-wise feed-forward
    transformed = self.positionwise_feedforward(state)
    state = self.ff_layer_norm(state + self.dropout(transformed))
    return state, attention

### Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
  """Wraps an encoder and a decoder and builds the attention masks they need.

  Args:
    encoder: module called as `encoder(src, src_mask)`.
    decoder: module called as `decoder(trg, enc_src, trg_mask, src_mask)`.
    src_pad_idx: index of the padding token in the source vocabulary.
    trg_pad_idx: index of the padding token in the target vocabulary.
    device: kept as `self.device` for callers; masks are built on the device
      of the tensors they are derived from.
  """
  def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
    super(Seq2Seq, self).__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  def make_src_mask(self, src):
    """Return a bool mask [batch size, 1, 1, src len] that is True on
    non-padding source positions."""
    # src = [batch size, src len]
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # src_mask = [batch size, 1, 1, src len]
    return src_mask

  def make_trg_mask(self, trg):
    """Return a bool mask [batch size, 1, trg len, trg len] combining the
    padding mask with a lower-triangular (no look-ahead) mask."""
    # trg = [batch size, trg len]
    trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
    # trg_pad_mask = [batch size, 1, 1, trg len]
    trg_len = trg.shape[1]
    # Build the triangular mask on the same device as `trg` (rather than the
    # device captured at construction) so the `&` below never mixes devices.
    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len),
                                         device = trg.device)).bool()

    # trg_sub_mask = [trg len, trg len]
    trg_mask = trg_pad_mask & trg_sub_mask
    # trg_mask = [batch size, 1, trg len, trg len]
    return trg_mask

  def forward(self, src, trg):
    """Run encoder and decoder.

    Args:
      src: [batch size, src len] source token indices.
      trg: [batch size, trg len] target token indices.

    Returns:
      (output, attention) exactly as returned by the decoder:
      output = [batch size, trg len, output dim],
      attention = [batch size, n heads, trg len, src len].
    """
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)
    # src_mask = [batch size, 1, 1, src len]
    # trg_mask = [batch size, 1, trg len, trg len]
    enc_src = self.encoder(src, src_mask)
    # enc_src = [batch size, src len, hid dim]
    output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
    return output, attention

### Seq2Seq model instance

In [None]:
# Vocabulary sizes determine the embedding / output projection sizes.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Encoder and decoder share the same hyper-parameters.
HID_DIM = 256
ENC_LAYERS = DEC_LAYERS = 3
ENC_HEADS = DEC_HEADS = 8
ENC_PF_DIM =  DEC_PF_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)
# Show both architectures (encoder via print, decoder as the cell output).
print(enc)
dec

Encoder(
  (tok_embedding): Embedding(1032, 256)
  (pos_embedding): Embedding(100, 256)
  (layers): ModuleList(
    (0): EncoderLayer(
      (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=256, out_features=256, bias=True)
        (fc_k): Linear(in_features=256, out_features=256, bias=True)
        (fc_v): Linear(in_features=256, out_features=256, bias=True)
        (fc_o): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (positionwise_feedforward): PositionwiseFeedforwardLayer(
        (fc_1): Linear(in_features=256, out_features=512, bias=True)
        (fc_2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): E

Decoder(
  (tok_embedding): Embedding(1240, 256)
  (pos_embedding): Embedding(100, 256)
  (layers): ModuleList(
    (0): DecoderLayer(
      (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (enc_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=256, out_features=256, bias=True)
        (fc_k): Linear(in_features=256, out_features=256, bias=True)
        (fc_v): Linear(in_features=256, out_features=256, bias=True)
        (fc_o): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=256, out_features=256, bias=True)
        (fc_k): Linear(in_features=256, out_features=256, bias=True)
        (fc_v): Linear(in_features=256, out_featu

In [None]:
# Padding-token indices: used by Seq2Seq to build the attention masks
# (TRG_PAD_IDX is also the loss `ignore_index` further down).
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Assemble the full model and move its parameters to `device`.
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)
model

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(1032, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

### Model parameters

In [None]:
def count_trainable_params(model):
  """Count the parameters of `model`.

  Returns:
    A tuple `(total, trainable)`: the number of elements over all parameters
    and over the parameters with `requires_grad` set.
  """
  total = 0
  trainable = 0
  for param in model.parameters():
    size = param.numel()
    total += size
    if param.requires_grad:
      trainable += size
  return total, trainable

# Report the total and the trainable parameter counts of the model.
n_params, trainable_params = count_trainable_params(model)
print(f"Total number of paramaters: {n_params:,}\nTotal tainable parameters: {trainable_params:,}")

Total number of paramaters: 4,905,176
Total tainable parameters: 4,905,176


Initialize model weights

In [None]:
def initialize_weights(m):
  """Xavier-uniform initialise the weight of module `m`, in place.

  Meant to be used with `model.apply(initialize_weights)`. Only weights with
  more than one dimension are touched; modules without a `weight`, or whose
  `weight` is None (e.g. normalisation layers created without affine
  parameters), are skipped instead of raising.
  """
  weight = getattr(m, 'weight', None)
  if weight is not None and weight.dim() > 1:
    nn.init.xavier_uniform_(weight.data)

# Apply the initialiser to every sub-module of the model.
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(1032, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

In [None]:
# Note that the learning rate needs to be lower than the default used by Adam or else learning is unstable.

LEARNING_RATE = 0.0005

# Adam over all model parameters with the reduced learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)


In [None]:
# Cross-entropy over target tokens; padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

### Train and evaluation functions

In [None]:
def train(model, iterator, optimizer, criterion, clip):
  """Run one training epoch.

  Args:
    model: called as `model(src, trg_input)` and returning `(output, _)`.
    iterator: sized iterable of batches exposing `.src` and `.trg`.
    optimizer: optimizer stepping the model parameters.
    criterion: loss taking flattened logits and flattened target indices.
    clip: maximum gradient norm passed to `clip_grad_norm_`.

  Returns:
    The mean of the per-batch losses.
  """
  model.train()
  running_loss = 0
  for batch in iterator:
    source = batch.src
    target = batch.trg
    optimizer.zero_grad()
    # The model sees the target without its last token ...
    logits, _ = model(source, target[:, :-1])
    # logits = [batch size, trg len - 1, output dim]
    vocab_size = logits.shape[-1]
    flat_logits = logits.contiguous().view(-1, vocab_size)
    # ... and is scored against the target shifted by one position.
    flat_target = target[:, 1:].contiguous().view(-1)
    # flat_logits = [batch size * (trg len - 1), output dim]
    # flat_target = [batch size * (trg len - 1)]
    batch_loss = criterion(flat_logits, flat_target)
    batch_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    running_loss += batch_loss.item()
  return running_loss / len(iterator)


def evaluate(model, iterator, criterion):
  """Compute the mean per-batch loss without updating the model.

  Args:
    model: called as `model(src, trg_input)` and returning `(output, _)`.
    iterator: sized iterable of batches exposing `.src` and `.trg`.
    criterion: loss taking flattened logits and flattened target indices.

  Returns:
    The mean of the per-batch losses.
  """
  model.eval()
  running_loss = 0
  with torch.no_grad():
    for batch in iterator:
      source = batch.src
      target = batch.trg
      # The model sees the target without its last token ...
      logits, _ = model(source, target[:, :-1])
      # logits = [batch size, trg len - 1, output dim]
      vocab_size = logits.shape[-1]
      flat_logits = logits.contiguous().view(-1, vocab_size)
      # ... and is scored against the target shifted by one position.
      flat_target = target[:, 1:].contiguous().view(-1)
      # flat_logits = [batch size * (trg len - 1), output dim]
      # flat_target = [batch size * (trg len - 1)]
      running_loss += criterion(flat_logits, flat_target).item()
  return running_loss / len(iterator)

### Training the model

In [None]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

def tabulate_training(column_names, data, title):
  """Print a four-column PrettyTable with the given title.

  Args:
    column_names: header labels (at least four); the first column is
      left-aligned, the next three right-aligned.
    data: iterable of rows, added to the table in order.
    title: text shown in the table's title bar.
  """
  table = PrettyTable(column_names)
  table.title = title
  for position, side in enumerate(('l', 'r', 'r', 'r')):
    table.align[column_names[position]] = side
  for record in data:
    table.add_row(record)
  print(table)

### Model Name

In [None]:
# File name under which the best model weights (state_dict) are saved.
MODEL_NAME = "eng-af.pt"

In [None]:
N_EPOCHS = 10
CLIP = 0.1  # maximum gradient norm used by train()
best_valid_loss = float('inf')
column_names = ["SET", "LOSS", "PPL", "ETA"]
print("TRAINING START....")
for epoch in range(N_EPOCHS):
  start = time.time()
  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)
  end = time.time()
  # The title is built before best_valid_loss is updated so it reports whether
  # this epoch's weights are about to be saved.
  title = f"EPOCH: {epoch+1:02}/{N_EPOCHS:02} | {'saving model...' if valid_loss < best_valid_loss else 'not saving...'}" 
  # Keep only the weights with the lowest validation loss seen so far.
  if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), MODEL_NAME)
  # PPL = exp(loss). The validation row now uses valid_loss; it previously
  # repeated the training perplexity.
  rows_data =[
        ["train", f"{train_loss:.3f}", f"{math.exp(train_loss):7.3f}", hms_string(end - start) ],
        ["val", f"{valid_loss:.3f}", f"{math.exp(valid_loss):7.3f}", '' ]
  ]
  tabulate_training(column_names, rows_data, title)

print("TRAINING ENDS....")

TRAINING START....
+--------------------------------------+
|    EPOCH: 01/10 | saving model...    |
+-------+-------+---------+------------+
| SET   |  LOSS |     PPL |        ETA |
+-------+-------+---------+------------+
| train | 5.979 | 394.898 | 0:00:00.87 |
| val   | 5.128 | 394.898 |            |
+-------+-------+---------+------------+
+--------------------------------------+
|    EPOCH: 02/10 | saving model...    |
+-------+-------+---------+------------+
| SET   |  LOSS |     PPL |        ETA |
+-------+-------+---------+------------+
| train | 5.036 | 153.787 | 0:00:00.77 |
| val   | 5.055 | 153.787 |            |
+-------+-------+---------+------------+
+--------------------------------------+
|     EPOCH: 03/10 | not saving...     |
+-------+-------+---------+------------+
| SET   |  LOSS |     PPL |        ETA |
+-------+-------+---------+------------+
| train | 4.544 |  94.082 | 0:00:00.76 |
| val   | 5.248 |  94.082 |            |
+-------+-------+---------+-----------

In [None]:
# Restore the best checkpoint saved during training before measuring test loss.
model.load_state_dict(torch.load(MODEL_NAME))

test_loss = evaluate(model, test_iterator, criterion)
title = "Model Evaluation Summary"
# Perplexity is exp(loss); the ETA column is left empty for this summary.
data_rows = [["Test", f'{test_loss:.3f}', f'{math.exp(test_loss):7.3f}', ""]]

tabulate_training(["SET", "LOSS", "PPL", "ETA"], data_rows, title)

+------------------------------+
|   Model Evaluation Summary   |
+------+-------+---------+-----+
| SET  |  LOSS |     PPL | ETA |
+------+-------+---------+-----+
| Test | 4.702 | 110.218 |     |
+------+-------+---------+-----+


### Model inference

In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily decode a translation of `sentence`.

    Args:
        sentence: raw source text (str) or an already tokenized list of strings.
        src_field: source field; provides `tokenize`, `init_token`, `eos_token`
            and `vocab.stoi`.
        trg_field: target field; provides `init_token`, `eos_token`,
            `vocab.stoi` and `vocab.itos`.
        model: model exposing `make_src_mask`, `make_trg_mask`, `encoder` and
            `decoder`.
        device: device the input tensors are moved to.
        max_len: maximum number of tokens to generate.

    Returns:
        (tokens, attention): the generated target tokens without the initial
        token (the end-of-sentence token is included when it was produced) and
        the attention returned by the last decoder call, or None when no
        decoding step ran (`max_len <= 0`).
    """
    model.eval()
    if isinstance(sentence, str):
        # Tokenize raw text with the source field's own tokenizer and
        # lower-case it, like the pre-tokenized branch below; a plain
        # split(" ") kept punctuation and capitals attached to words.
        tokens = [token.lower() for token in src_field.tokenize(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    src_mask = model.make_src_mask(src_tensor)

    # The source is encoded once and reused for every decoding step.
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    eos_index = trg_field.vocab.stoi[trg_field.eos_token]
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    # Defined up front so max_len <= 0 does not leave it unbound.
    attention = None

    for _ in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        # Greedy choice: most likely token at the last position.
        pred_token = output.argmax(2)[:,-1].item()

        trg_indexes.append(pred_token)

        if pred_token == eos_index:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:], attention

In [None]:
# Translate one sentence from the training split and compare with its reference.
example_idx = 0

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

translation, attention = translate_sentence(src, SRC, TRG, model, device)
print(f'src = {src}')
print(f'trg = {trg}')
print(f'predicted trg = {translation}')

src = ['i', "'m", 'glad', 'you', 'finally', 'made', 'it', 'back', '.']
trg = ['ek', 'is', 'bly', 'jy', 'het', 'uiteindelik', 'terug', 'gekom.']
predicted trg = ['ek', 'ek', 'is', '<eos>']


In [None]:
# Translate one sentence from the test split and compare with its reference.
# NOTE(review): in the recorded output the reference target is much shorter
# than the source sentence - confirm that test.en and test.af are line-aligned.
example_idx = 6

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']
translation, attention = translate_sentence(src, SRC, TRG, model, device)


print(f'src = {src}')
print(f'trg = {trg}')
print(f'predicted trg = {translation}')

src = ['you', 'can', 'do', 'whatever', 'you', 'want', 'to', 'do', ',', 'of', 'course', '.']
trg = ['kan', 'jy', 'my', 'indruk?']
predicted trg = ['jy', 'jy', 'jy', 'jy', 'jy', 'jy', 'jy', 'jy', 'jy', '<eos>']


Downloading the saved model weights

In [None]:
# Download the saved checkpoint file through the Colab `files` helper.
files.download(MODEL_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>