### Machine Translation
Attention was first introduced by Bahdanau et al. in [this paper](https://arxiv.org/abs/1409.0473), Neural Machine Translation by Jointly Learning to Align and Translate. Because Attention grew out of translation, Machine Translation is a good starting point for discussing how Attention has progressed.

In [1]:
# Standard Lib
import math
from time import time

# Torch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Dataloader Custom Module
from sample_dataloader import get_loaders

In [2]:
# Device that the data loaders and the model are placed on. Use the first CUDA
# device when one is available; otherwise fall back to the CPU so the notebook
# still runs (slowly) on machines without a GPU instead of failing on `.to(gpu)`.
gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Data
Dataset from [here](https://pytorch.org/text/stable/_modules/torchtext/datasets/iwslt2016.html), key tokens:
```python
'<unk>' unknown token
'<pad>' padding token
'<bos>' beginning of sentence token
'<eos>' end of sentence token
```

In [3]:
# Build the train/validation/test iterables together with the tokenizers and
# vocabularies, using the project-local `sample_dataloader.get_loaders` helper.
# The target device is passed in; presumably the batches are placed on it
# (verify in `sample_dataloader`).
# NOTE(review): the tokenizers are unpacked as (fr, en) but the vocabularies as
# (en, fr) - confirm this matches the order `get_loaders` actually returns.
trainset, validset, testset, fr_tokenizer, en_tokenizer, en_vocab, fr_vocab = get_loaders(gpu)

Build Vocabularies: 100%|██████████| 220400/220400 [01:18<00:00, 2803.63it/s]


In [4]:
# The special-token indices are the same for French and English, so each one is
# looked up once, from the English vocabulary (padding, begin- and end-of-sentence).
PAD_IDX, BOS_IDX, EOS_IDX = (en_vocab[token] for token in ('<pad>', '<bos>', '<eos>'))

### Basic RNN Architecture
Let's first solve our machine translation problem with a simple RNN model.

In [5]:
class Encoder(nn.Module):
    """Embed a source sequence and summarise it with a multi-layer LSTM.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Size of each token embedding.
        hid_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` and return the LSTM's final `(hidden, cell)` states.

        `src` holds token indices; because the LSTM is not `batch_first`, it is
        laid out as (src_len, batch). The per-timestep LSTM outputs are
        discarded - only the final states are handed on to the decoder.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state

        return hidden, cell

In [6]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Target vocabulary size (embedding rows and output logits).
        emb_dim: Size of each token embedding.
        hid_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one timestep.

        Args:
            input: Token indices for the current step, one per batch element.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            `(prediction, hidden, cell)`: the (batch, output_dim) scores for the
            next token and the updated LSTM states.
        """
        # The LSTM expects a leading sequence axis, so add one of length 1.
        step_tokens = input.unsqueeze(0)

        step_embedded = self.dropout(self.embedding(step_tokens))
        output, state = self.rnn(step_embedded, (hidden, cell))
        hidden, cell = state

        # Drop the length-1 sequence axis again before projecting to the vocabulary.
        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden, cell

In [7]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Callable mapping `src` to the initial `(hidden, cell)` states.
        decoder: Callable mapping `(input, hidden, cell)` to
            `(prediction, hidden, cell)` for a single timestep.
        device: Device on which the output buffer is allocated.
        trg_vocab_size: Size of the last axis of the decoder's predictions.
    """

    def __init__(self, encoder, decoder, device, trg_vocab_size):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.trg_vocab_size = trg_vocab_size

    def forward(self, src, trg, teacher_forcing_ratio=0.0):
        """Decode `trg.shape[0] - 1` steps and return the per-step scores.

        Args:
            src: Source batch, passed straight to the encoder.
            trg: Target token indices laid out as (trg_len, batch). Row 0 is
                used as the first decoder input.
            teacher_forcing_ratio: Probability, per timestep, of feeding the
                ground-truth token `trg[t]` as the next decoder input instead
                of the model's own best guess. The default of 0.0 always feeds
                the model's own prediction back in.

        Returns:
            Tensor of shape (trg_len, batch, trg_vocab_size). Row 0 is never
            written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]

        # Allocate directly on the target device rather than on the CPU followed
        # by a copy.
        outputs = torch.zeros(
            trg_len, batch_size, self.trg_vocab_size, device=self.device
        )

        hidden, cell = self.encoder(src)

        # The first decoder input is the first row of the target batch.
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)

            outputs[t] = output

            # Only draw a random number when teacher forcing is enabled, so the
            # default path leaves the RNG state untouched.
            use_teacher = (
                teacher_forcing_ratio > 0
                and torch.rand(1).item() < teacher_forcing_ratio
            )

            # Next input: the ground-truth token when teacher forcing, otherwise
            # the model's own highest-scoring prediction.
            decoder_input = trg[t] if use_teacher else output.argmax(1)

        return outputs

In [8]:
# Vocabulary sizes: the encoder embeds English tokens and the decoder embeds and
# predicts French tokens.
# NOTE(review): this assumes the loaders yield (src=English, trg=French) index
# batches - confirm against `get_loaders`; an index outside an embedding table
# fails at runtime.
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)
# Embedding width for the encoder and the decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# LSTM hidden size and depth, shared by the encoder and the decoder because the
# encoder's final states are fed directly into the decoder's LSTM.
HID_DIM = 512
N_LAYERS = 2
# Dropout probability passed to the encoder and the decoder.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Wrap both halves and move all parameters to the chosen device.
model = Seq2Seq(enc, dec, gpu, OUTPUT_DIM).to(gpu)

### Training

In [10]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as `model(src, trg)`; its output is indexed along
            the first axis by timestep and has the vocabulary on the last axis.
        iterator: Sized iterable of `(src, trg)` batches.
        optimizer: Optimizer over `model.parameters()`.
        criterion: Loss called as `criterion(flat_scores, flat_targets)`.
        clip: Maximum gradient norm passed to `clip_grad_norm_`.

    Returns:
        The sum of the batch losses divided by `len(iterator)`.

    Raises:
        ZeroDivisionError: If `iterator` is empty.
    """
    model.train()

    epoch_loss = 0

    for src, trg in iterator:

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]

        # Skip the first timestep of both tensors and flatten the rest so the
        # loss sees one row of scores per target token. `reshape` (rather than
        # `view`) also accepts non-contiguous tensors, e.g. a transposed batch.
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        loss = criterion(output, trg)

        loss.backward()

        # clip the gradients to prevent them from exploding (a common issue in RNNs)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [11]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without updating the model.

    Args:
        model: Module called as `model(src, trg)`; its output is indexed along
            the first axis by timestep and has the vocabulary on the last axis.
        iterator: Sized iterable of `(src, trg)` batches.
        criterion: Loss called as `criterion(flat_scores, flat_targets)`.

    Returns:
        The sum of the batch losses divided by `len(iterator)`.

    Raises:
        ZeroDivisionError: If `iterator` is empty.
    """
    model.eval()
    epoch_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for src, trg in iterator:

            output = model(src, trg)

            output_dim = output.shape[-1]

            # Skip the first timestep of both tensors and flatten the rest.
            # `reshape` (rather than `view`) also accepts non-contiguous
            # tensors, e.g. a transposed batch.
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [12]:
# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(model.parameters())
# Cross-entropy over the target vocabulary; positions whose target is the padding
# index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Lowest validation loss seen so far.
# NOTE(review): nothing in the training loop below updates this value.
best_valid_loss = float('inf')

# Maximum gradient norm handed to `train` for gradient clipping.
CLIP = 1
# Number of passes over the training set.
N_EPOCHS = 10

In [13]:
# Train for N_EPOCHS epochs, reporting wall-clock time, loss and perplexity
# (exp of the mean loss) for the training and validation sets after each epoch.
for epoch in range(N_EPOCHS):

    start_time = time()

    train_loss = train(model, trainset, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, validset, criterion)

    end_time = time()

    # Whole minutes and leftover seconds for this epoch. Computed inline because
    # no `epoch_time` helper is defined or imported anywhere in this notebook.
    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    # Keep track of the best validation loss seen so far.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
%debug

> /home/carter/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py(662)forward()
    660         if batch_sizes is None:
    661             result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 662                               self.dropout, self.training, self.bidirectional, self.batch_first)
    663         else:
    664             result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,

ipdb> u
> /home/carter/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py(889)_call_impl()
    887             result = self._slow_forward(*input, **

ipdb> u
> <ipython-input-6-a4cdb69ec86c>(14)forward()
     12 
     13         embedded = self.dropout(self.embedding(input))
---> 14         output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
     15         prediction = self.fc_out(output.squeeze(0))
     16 

ipdb> u
> /home/carter/anaconda3/lib/python3.

### RNN Bottlenecks
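One of the issues with our current vanilla RNN model is that the decoder only ever sees the source sentence through a single hidden state: the encoder has to squeeze the whole input into it, and every decoding timestep works from that one summary. A better idea is to let the decoder assign different weights to the positions of the input sequence at every timestep while decoding.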