### NMT (Nueral Machine Translation)

In these series of notebooks we are going to do create bidirectional NMT model for our application. We are going to use the following notebooks as reference to this notebook.

1. [17_Custom_Dataset_and_Translation.ipynb](https://github.com/CrispenGari/pytorch-python/blob/main/09_NLP/03_Sequence_To_Sequence/17_Custom_Dataset_and_Translation.ipynb)
2. [16_Data_Preparation_Translation_Dataset.ipynb](https://github.com/CrispenGari/pytorch-python/blob/main/09_NLP/03_Sequence_To_Sequence/16_Data_Preparation_Translation_Dataset.ipynb)
3. [07_Attention_is_all_you_need](https://github.com/CrispenGari/pytorch-python/blob/main/09_NLP/03_Sequence_To_Sequence/07_Attention_is_all_you_need.ipynb)

I will be loading the data from my google drive.

In [5]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Mounted at /content/drive


### Imports

In [1]:
import torch
from torch import nn
from torch.nn  import functional as F
import spacy, math, random
import numpy as np
from torchtext.legacy import datasets, data
import time, os, json
from prettytable import PrettyTable
from matplotlib import pyplot as plt

In [2]:
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deteministic = True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'
)
device

device(type='cuda')

In [64]:
base_path = '/content/drive/My Drive/NLP Data/seq2seq/manythings'
path_to_files = os.path.join(base_path, "French - English")
os.listdir(path_to_files)

['fra.txt',
 'train.en',
 'test.en',
 'valid.en',
 'train.fr',
 'test.fr',
 'valid.fr']

### File extensions

In [65]:
exts = (".fr", ".en")

### Tokenizer models

All the tokenization models that we are going to use are going to be found [here](https://spacy.io/usage/models) but to those languages that doesn't have tokenization models we are going to create our own tokenizers.

In [41]:
import spacy
spacy.cli.download("de_core_news_sm")
import en_core_web_sm
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('de_core_news_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [42]:
def tokenize_fr(sent):
  return [tok.text for tok in spacy_fr.tokenizer(sent)]

def tokenize_en(sent):
  return [tok.text for tok in spacy_en.tokenizer(sent)]

### Fields

In [66]:
SRC = data.Field(
    tokenize = tokenize_fr,
    lower= True,
    init_token = "<sos>",
     eos_token = "<eos>",
     include_lengths =True
)
TRG = data.Field(
    tokenize = tokenize_en,
    lower= True,
    init_token = "<sos>",
     eos_token = "<eos>"
)

### Creating dataset

In [67]:
train_data, valid_data, test_data = datasets.TranslationDataset.splits(
    exts= exts,
    path=path_to_files,
    train='train', validation='valid', test='test',
    fields = (SRC, TRG)
)

In [68]:
print(vars(train_data.examples[0]))

{'src': ['quand', "t'es-tu", 'mise', 'à', 'écrire', 'des', 'chansons', '?'], 'trg': ['when', 'did', 'you', 'start', 'writing', 'songs', '?']}


In [69]:
print(vars(valid_data.examples[0]))

{'src': ['quand', "t'es-tu", 'mise', 'à', 'écrire', 'des', 'chansons', '?'], 'trg': ['i', 'like', 'the', 'way', 'you', 'think', '.']}


In [70]:
print(vars(test_data.examples[0]))

{'src': ['quand', "t'es-tu", 'mise', 'à', 'écrire', 'des', 'chansons', '?'], 'trg': ['she', 'wished', 'she', 'had', 'been', 'born', 'twenty', 'years', 'earlier', '.']}


### Counting examples

In [71]:
from prettytable import PrettyTable
def tabulate(column_names, data):
  table = PrettyTable(column_names)
  table.title= "VISUALIZING SETS EXAMPLES"
  table.align[column_names[0]] = 'l'
  table.align[column_names[1]] = 'r'
  for row in data:
    table.add_row(row)
  print(table)

column_names = ["SUBSET", "EXAMPLE(s)"]
row_data = [
        ["training", len(train_data)],
        ['validation', len(valid_data)],
        ['test', len(test_data)]
]
tabulate(column_names, row_data)

+-----------------------------+
|  VISUALIZING SETS EXAMPLES  |
+--------------+--------------+
| SUBSET       |   EXAMPLE(s) |
+--------------+--------------+
| training     |       186419 |
| validation   |         1884 |
| test         |         1903 |
+--------------+--------------+


Our dataset is very small so we are not going to set the `min_freq` to a number greater than 1 dring building of the vocabulary.

In [72]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

Saving the dictionary maping of our SRC and TRG to a json file.

In [73]:
len(SRC.vocab.stoi), len(TRG.vocab.stoi)

(18950, 10467)

In [74]:
src = dict(SRC.vocab.stoi)
trg = dict(TRG.vocab.stoi)

src_vocab_path = "src_vocab.json"
trg_vocab_path = "trg_vocab.json"

with open(src_vocab_path, "w") as f:
  json.dump(src, f, indent=2)

with open(trg_vocab_path, "w") as f:
  json.dump(trg, f, indent=2)

print("Done")

Done


In [75]:
files.download(src_vocab_path)
files.download(trg_vocab_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Iterators

In [77]:
BATCH_SIZE = 128 # 128 for languages with good vocab corpus
sort_key = lambda x: len(x.src)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key= sort_key,
    sort_within_batch = True
)

### Encoder

In [78]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
    super(Encoder, self).__init__()

    self.embedding = nn.Embedding(input_dim, embedding_dim=emb_dim)
    self.gru = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
    self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_len):
    embedded = self.dropout(self.embedding(src)) # embedded = [src len, batch size, emb dim]
    # need to explicitly put lengths on cpu!
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu'))
    packed_outputs, hidden = self.gru(packed_embedded)
    outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
    return outputs, hidden
    

### Attention layer

In [79]:

class Attention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
    super(Attention, self).__init__()
    self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
    self.v = nn.Linear(dec_hid_dim, 1, bias = False)

  def forward(self, hidden, encoder_outputs, mask):
    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]
    # repeat decoder hidden state src_len times
    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) # energy = [batch size, src len, dec hid dim]
    attention = self.v(energy).squeeze(2) # attention= [batch size, src len]
    attention = attention.masked_fill(mask == 0, -1e10)
    return F.softmax(attention, dim=1)
    

### Decoder

In [80]:
class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
    super(Decoder, self).__init__()
    self.output_dim = output_dim
    self.attention = attention

    self.embedding = nn.Embedding(output_dim, emb_dim)
    self.gru = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
    self.fc = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
        
  def forward(self, input, hidden, encoder_outputs, mask):
    input = input.unsqueeze(0) # input = [1, batch size]
    embedded = self.dropout(self.embedding(input)) # embedded = [1, batch size, emb dim]
    a = self.attention(hidden, encoder_outputs, mask)# a = [batch size, src len]
    a = a.unsqueeze(1) # a = [batch size, 1, src len]
    encoder_outputs = encoder_outputs.permute(1, 0, 2) # encoder_outputs = [batch size, src len, enc hid dim * 2]
    weighted = torch.bmm(a, encoder_outputs) # weighted = [batch size, 1, enc hid dim * 2]
    weighted = weighted.permute(1, 0, 2) # weighted = [1, batch size, enc hid dim * 2]
    rnn_input = torch.cat((embedded, weighted), dim = 2) # rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
    output, hidden = self.gru(rnn_input, hidden.unsqueeze(0))
    
    assert (output == hidden).all()
    embedded = embedded.squeeze(0)
    output = output.squeeze(0)
    weighted = weighted.squeeze(0)

    prediction = self.fc(torch.cat((output, weighted, embedded), dim = 1)) # prediction = [batch size, output dim]
    return prediction, hidden.squeeze(0), a.squeeze(1)
    

### Seq2Seq

In [81]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, src_pad_idx, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
    self.src_pad_idx = src_pad_idx
  
  def create_mask(self, src):
    mask = (src != self.src_pad_idx).permute(1, 0)
    return mask
  def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
    """
    src = [src len, batch size]
    src_len = [batch size]
    trg = [trg len, batch size]
    teacher_forcing_ratio is probability to use teacher forcing
    e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
    """
    trg_len, batch_size = trg.shape
    trg_vocab_size = self.decoder.output_dim
        
    # tensor to store decoder outputs
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
    """
    encoder_outputs is all hidden states of the input sequence, back and forwards
    hidden is the final forward and backward hidden states, passed through a linear layer
    """
    encoder_outputs, hidden = self.encoder(src, src_len)     
    # first input to the decoder is the <sos> tokens
    input = trg[0,:]
    mask = self.create_mask(src) # mask = [batch size, src len]
    for t in range(1, trg_len):
      # insert input token embedding, previous hidden state and all encoder hidden states and mask
      # receive output tensor (predictions) and new hidden state
      output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
      
      # place predictions in a tensor holding predictions for each token
      outputs[t] = output
      
      # decide if we are going to use teacher forcing or not
      teacher_force = random.random() < teacher_forcing_ratio
      
      # get the highest predicted token from our predictions
      top1 = output.argmax(1) 
      
      # if teacher forcing, use actual next token as next input
      # if not, use predicted token
      input = trg[t] if teacher_force else top1
    return outputs

### Seq2Seq model instance

In [98]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = DEC_EMB_DIM = 256
ENC_HID_DIM = DEC_HID_DIM = 128
ENC_DROPOUT = DEC_DROPOUT = 0.5
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18950, 256)
    (gru): GRU(256, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(10467, 256)
    (gru): GRU(512, 128)
    (fc): Linear(in_features=640, out_features=10467, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

### Model parameters

In [99]:
def count_trainable_params(model):
  return sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad)

n_params, trainable_params = count_trainable_params(model)
print(f"Total number of paramaters: {n_params:,}\nTotal tainable parameters: {trainable_params:,}")

Total number of paramaters: 14,865,379
Total tainable parameters: 14,865,379


Initialize model weights

In [100]:
def init_weights(m):
  for name, param in m.named_parameters():
    if 'weight' in name:
        nn.init.normal_(param.data, mean=0, std=0.01)
    else:
        nn.init.constant_(param.data, 0)   
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18950, 256)
    (gru): GRU(256, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(10467, 256)
    (gru): GRU(512, 128)
    (fc): Linear(in_features=640, out_features=10467, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

### Optimizer and Criterion

In [101]:
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX).to(device)

### Train and evaluation functions

In [102]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, src_len = batch.src
        src = src.to(device)
        src_len = src_len.to(device)
        trg = batch.trg
        trg = trg.to(device)
        optimizer.zero_grad()
        output = model(src, src_len, trg)
        """
        trg = [trg len, batch size]
        output = [trg len, batch size, output dim]
        """
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        """
        trg = [(trg len - 1) * batch size]
        output = [(trg len - 1) * batch size, output dim]
        """
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
      for i, batch in enumerate(iterator):
          src, src_len = batch.src
          src = src.to(device)
          src_len = src_len.to(device)
          trg = batch.trg
          trg = trg.to(device)
          optimizer.zero_grad()
          output = model(src, src_len, trg, 0) ## Turn off the teacher forcing ratio.
          """
          trg = [trg len, batch size]
          output = [trg len, batch size, output dim]
          """
          output_dim = output.shape[-1]
          output = output[1:].view(-1, output_dim)
          trg = trg[1:].view(-1)
          """
          trg = [(trg len - 1) * batch size]
          output = [(trg len - 1) * batch size, output dim]
          """
          loss =  criterion(output, trg)
          epoch_loss += loss.item()
    return epoch_loss / len(iterator)

### Training the model

In [103]:
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def tabulate_training(column_names, data, title):
  table = PrettyTable(column_names)
  table.title= title
  table.align[column_names[0]] = 'l'
  table.align[column_names[1]] = 'r'
  table.align[column_names[2]] = 'r'
  table.align[column_names[3]] = 'r'
  for row in data:
    table.add_row(row)
  print(table)

### Model Name

In [104]:
MODEL_NAME = "fr-eng.pt"

In [None]:
N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')
column_names = ["SET", "LOSS", "PPL", "ETA"]
print("TRAINING START....")
for epoch in range(N_EPOCHS):
  start = time.time()
  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)
  end = time.time()
  title = f"EPOCH: {epoch+1:02}/{N_EPOCHS:02} | {'saving model...' if valid_loss < best_valid_loss else 'not saving...'}" 
  if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), MODEL_NAME)
  rows_data =[
        ["train", f"{train_loss:.3f}", f"{math.exp(train_loss):7.3f}", hms_string(end - start) ],
        ["val", f"{valid_loss:.3f}", f"{math.exp(train_loss):7.3f}", '' ]
  ]
  tabulate_training(column_names, rows_data, title)

print("TRAINING ENDS....")

TRAINING START....


In [None]:
model.load_state_dict(torch.load(MODEL_NAME))

test_loss = evaluate(model, test_iterator, criterion)
title = "Model Evaluation Summary"
data_rows = [["Test", f'{test_loss:.3f}', f'{math.exp(test_loss):7.3f}', ""]]

tabulate_training(["SET", "LOSS", "PPL", "ETA"], data_rows, title)

### Model inference

In [None]:
nlp = en_core_web_sm.load()

In [None]:
def translate_sentence(sent, src_field, trg_field, mdoel, device, max_len=50):
  model.eval()

  if isinstance(sent, str):
    tokens = sent.split(" ")
  else:
    tokens = [token.lower() for token in sent]
  
  tokens = [src_field.init_token] + tokens + [src_field.eos_token]
  src_indexes = [src_field.vocab.stoi[token] for token in tokens]
  src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
  src_len = torch.LongTensor([len(src_indexes)])

  with torch.no_grad():
    encoder_outputs, hidden = model.encoder(src_tensor, src_len)

  mask = model.create_mask(src_tensor)
  trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
  attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)

  for i in range(max_len):
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
    with torch.no_grad():
      output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)

    attentions[i] = attention
    pred_token = output.argmax(1).item()
    trg_indexes.append(pred_token)

    if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
       break
  trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
  return trg_tokens[1:], attentions[:len(trg_tokens)-1]

In [None]:
example_idx = 6

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']
translation, attention = translate_sentence(src, SRC, TRG, model, device)


print(f'src = {src}')
print(f'trg = {trg}')
print(f'predicted trg = {translation}')

In [None]:
example_idx = 0
src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']
print(f'src = {src}')
print(f'trg = {trg}')
tokens, attention = translate_sentence(src,  SRC, TRG, model, device)
print(f'pred = {tokens}')


In [None]:
example_idx = 0
src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']
print(f'src = {src}')
print(f'trg = {trg}')
tokens, attention = translate_sentence(src,  SRC, TRG, model, device)
print(f'pred = {tokens}')

Downloading the model name

In [None]:
files.download(MODEL_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### BLEU SCORE

In [None]:
from torchtext.data.metrics import bleu_score
def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    trgs = []
    pred_trgs = []
    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)
        # cut off <eos> token
        pred_trg = pred_trg[:-1]
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    return bleu_score(pred_trgs, trgs)

bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)
print(f'BLEU score = {bleu_score*100:.2f}')

BLEU score = 36.64
