### Machine Translation
Attention was first developed by Bahdanau et al, in [this paper](https://arxiv.org/abs/1409.0473), Neural Machine Translation by Jointly Learning to Align and Translate, and thus is a good starting point in discussing the progression Attention, we should talk about Machine Translation.

In [1]:
# Standard Lib
import math
from time import time

# Loading Bar
from tqdm import tqdm

# Torch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Dataloader Custom Module
from sample_dataloader import get_loaders

In [2]:
gpu = torch.device("cuda:0")

### Data
Dataset from [here](https://pytorch.org/text/stable/_modules/torchtext/datasets/iwslt2016.html), key tokens:
```python
'<unk>' unkown token
'<pad>' padding token
'<bos>' beginning of sentence token
'<eos>' end of sentence token
```

In [3]:
trainset, validset, testset, fr_tokenizer, en_tokenizer, en_vocab, fr_vocab = get_loaders(gpu)

In [4]:
# all these indices are the same for french and english
PAD_IDX = en_vocab['<pad>']
BOS_IDX = en_vocab['<bos>']
EOS_IDX = en_vocab['<eos>']

### Basic RNN Architecture
Lets first solve our machine translation problem with a simple RNN model

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        
        return hidden, cell

In [8]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        
        return prediction, hidden, cell

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, trg_vocab_size):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.trg_vocab_size = trg_vocab_size 
        
    def forward(self, src, trg):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        
        outputs = torch.zeros(trg_len, batch_size, self.trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            outputs[t] = output
            
            # input is the previous token
            top1 = output.argmax(1)
            input = top1
        
        return outputs

In [11]:
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 128
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, gpu, OUTPUT_DIM).to(gpu)

In [12]:
# initialize model weights
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(16292, 64)
    (rnn): LSTM(64, 128, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(20005, 64)
    (rnn): LSTM(64, 128, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=128, out_features=20005, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,366,501 trainable parameters


### Training

In [17]:
def train(model, iterator, optimizer, criterion, clip=1, num_epochs=10):
    model.train()
    
    for _ in range(num_epochs):
        for i, (src, trg) in tqdm(enumerate(iterator), desc="iteration"):

            optimizer.zero_grad()

            output = model(src, trg)

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            loss.backward()

            # clip the gradients to prevent them from exploding (a common issue in RNNs)
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
                    
    return model

In [18]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
model = train(model, trainset, optimizer, criterion)

iteration: 13773it [47:12,  4.86it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]
iteration: 0it [00:00, ?it/s]


### Cosine Similarity Tests

In [22]:
en_word_idx, fr_word_idx = en_vocab['Hello'], fr_vocab['Bonjour']
en_embedding = model.encoder.embedding(torch.IntTensor([en_word_idx]).to(gpu)).cpu()
fr_embedding = model.decoder.embedding(torch.IntTensor([fr_word_idx]).to(gpu)).cpu()
torch.dot(en_embedding[0], fr_embedding[0]) / (en_embedding[0].norm() * fr_embedding[0].norm())

tensor(-0.0580, grad_fn=<DivBackward0>)

### Sentence Samples

In [58]:
def str_2_tensor(str_, tokenizer=en_tokenizer, vocab=en_vocab, device=gpu):
    tokens = [BOS_IDX] + [vocab[token] for token in tokenizer(str_)] + [EOS_IDX]
    return torch.tensor([tokens], dtype=torch.long).to(device)

def tensor_2_str(tensor, vocab=fr_vocab.itos):
    return [vocab[int(token)] for token in tensor]

def sample_translation(inp, trg, model):
    inp_tensor = str_2_tensor(inp)
    trg_tensor = str_2_tensor(trg)
    with torch.no_grad():
        output = model(inp_tensor, trg_tensor)
    chosen_tokens = output[0].max(dim=1).values.cpu().numpy()
    return tensor_2_str(chosen_tokens)

In [59]:
sample_translation("I love cats", "J'aime les chats", model)

['<unk>', '<unk>', '<unk>', '<unk>', '<unk>']

### RNN Bottlenecks
One of the issues with our current VanillaRNN is that at each timestep we are representing all information with a single hidden state. A cool idea would be if at every timestep we assign different weights to the input sequence whilst decoding. 

In [55]:
# Issues
# 1) everything predicted appears to just be a 0 (just unk)
# 2) Dataloader doesnt reset, once a full epoch is done all the others are skipped
# 3) Tokenization takes way too long

# Resolutions
# 1 & 3) Use BPE encoding and preprocess all of the text!
# 2) ??