# First Attention Blocks

## Before
In **Transformers 1 - Before Attention** each forward pass went as 
```python
y, s = '<s>', encoder(x)
for _ in range(N):
    y, s = decoder(y, s)
```

## With Attention
**pseudo code**


```python
# Encoding Stage (inp_vec (x) -> hidden_states)
hidden_states, s = encoder(x)

# Decoding Stage(s)

# 1) computing the attention weights
alignment_scores = []
for h_i in hidden_states:
    e_i = f_att(s, h_i)
    alignment_scores.append(e_i)
    
attention_weights = F.softmax(alignment_scores)

# 2) computing the context vector
c = torch.zeros(1, hidden_dim)
for a_i, h_i in zip(attention_weights, hidden_states):
    c += a_i * h_i
    
# 3) decoding
y = '<s>'
y, s = decoder(y, c)
```


## Imports

In [1]:
# Standard Lib
import os
import math
import random
from time import time
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt

# Tokenization
import spacy 

# Loading Bar
from tqdm import tqdm

# Torch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Dataloader Custom Module
from sample_dataloader import get_dataloaders

## Constants

In [2]:
data_root = os.path.join(Path(os.getcwd()).parent.parent.parent, "Datasets/")
gpu = torch.device("cuda:0")

In [3]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

## Datasets

In [4]:
trainset, validset, testset, de_vocab, en_vocab = get_dataloaders(batch_size=128, device=gpu, data_root=data_root)

In [5]:
# all these indices are the same for french and english
PAD_IDX = de_vocab['<pad>']
SOS_IDX = de_vocab['<sos>']
EOS_IDX = de_vocab['<eos>']

## Model Architecture

In [23]:
class Encoder(nn.Module):
    def __init__(self, inp_vocab_len, num_features, num_hidden, num_layers, dropout):
        """
        Inputs:
            (encoder lstm params)
                num_hidden_features: The number of features in the hidden state h
                num_encoder_layers: The number of layers in the lstm
                dropout: the probability of dropout

            (embedding params)
                inp_vocab_len: The size of the dictionary of embeddings
                num_features: The size of each embedding vector

            (regularization)
                dropout
        """
        super().__init__()
        
        self.embedding = nn.Embedding(num_embeddings=inp_vocab_len, 
                                      embedding_dim=num_features)
        self.dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(input_size=num_features, 
                            hidden_size=num_hidden, 
                            num_layers=num_layers,
                            dropout=dropout)
    
    def forward(self, x):
        """
        Input:
            (x) 
                (description): an encoded german sentence
                (shape): [1, number of tokens, batch size]
        
        Output:
            (hidden_states)
                tensor containing the output features (h_t) from the last layer of the LSTM, for each t
            (h_t)
                tensor containing the hidden state for t = seq_len
            (c_t)
                tensor containing the cell state for t = seq_len.
        """
        embedding = self.dropout(self.embedding(x))
        hidden_states, (h_t, c_t) = self.lstm(embedding)
        
        # our lstm is bi-directional
        # so we concat the forward and the backward
        # hidden [-2, :, : ] is the last of the forwards RNN 
        # hidden [-1, :, : ] is the last of the backwards RNN
        h_t = torch.cat((h_t[0], h_t[1]), dim=1)
        
        return hidden_states, (h_t, c_t)

In [7]:
class Attention(nn.Module):
    def __init__(self, decoder_hidden_size, encoder_hidden_size):
        super().__init__()
        
        # why (decoder_hidden_state * 2)?
        #     our lstm is actually bidirectional, so in the previous step 
        #     we concatenate the hidden_state forward of the last layer
        #     to the hidden_state backward of the last layer effectively 
        #     duplicating the number of features
        self.compressed = nn.Linear((decoder_hidden_size * 2), decoder_hidden_size)
        
        self.attn_out = nn.Linear
    
    def forward(self, s, encoder_outputs):
        """
        Input:
            (s)
                description: current decoder hidden state
                
            (encoder_outputs)
                description: tensor containing the output features (h_t) from the last layer of the LSTM, for each t
                
        Method Overview:
            We are trying to assign weights to the encoder outputs!
            to do this we are going to pass a large matrix
            into a linear layer (f_att), then output scalars

            the input for our linear layer should be of the shape
            (batch_size, seq_len, enc_hidden_size + dec_hidden_size)
            
        """
        # get sequence length
        seq_len = encoder_outputs.shape[0]
        
        # convert 
        # s [batch_size, decoder_hidden_state * 2] -> [batch_size, decoder_hidden_state]
        compressed_s = self.compressed(s)
        
        # duplicate decoder hidden state
        # s [batch_size, encoder_hidden_state * 2] -> [seq_len, batch_size, encoder_hidden_state * 2]
        s = s.unsqueeze(1).repeat(1, seq_len, 1)
        
        # reshape encoder outputs to be same as decoder hidden
        # encoder_outputs [seq_len, batch_size, encoder_hidden_state]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        # notice how we are concatenating along the third dimension
        # as expected above
        concatenated_inputs = torch.cat((s, encoder_outputs), dim = 2)
                               
        # note: there is one encoder output for every token in the sequence
        alignment_scores = self.att_out(concatenated_inputs)
        
        # compute attention weights
        return F.softmax(alignment_scores)

In [8]:
class Decoder(nn.Module):
    def __init__(self, trg_vocab_len, num_features, num_hidden, num_layers, dropout, attention):
        super().__init__()
        
        self.num_hidden = num_hidden
        
        self.embedding = nn.Embedding(num_embeddings=trg_vocab_len, 
                                      embedding_dim=num_features)
        self.dropout = nn.Dropout(dropout)
        
        # attention block
        self.attention = attention
        
        # Decoder
        self.lstm = nn.LSTM(input_size=num_features, 
                            hidden_size=num_hidden, 
                            num_layers=num_layers,
                            dropout=dropout)
        # self.fc_out = nn.Linear()
    
    def forward(self, x, s, cell, encoder_outputs):
        """
        Input
            (x)
                (description): the previous token
                (shape): [batch_size]
                
            (s)
                (description): current decoder hidden state
                (shape): [num_layers, batch_size, embedding_dim]
                
            (cell)
                (description): current cell state
                (shape): [num_layers, batch_size, embedding_dim]
                
            (encoder outputs) 
                (description): tensor containing the output features (h_t) from the last layer of the LSTM, for each t
                (shape): [token_seq_len, batch_size, embedding_dim]
        """
        embedding = self.dropout(self.embedding(x))
        
        # embedding [batch_size, num_features] -> [1, batch_size, num_features]
        embedding = embedding.unsqueeze(0)
        
        # attention [batch_size, seq_len, 1]
        attention_weights = self.attention(s, encoder_outputs)
        
        # permute inputs for batch matrix mul
        # attention_weights [batch_size, seq_len, 1] -> [batch_size, 1, seq_len]
        attention_weights = attention_weights.permute(0, 2, 1)
        
        # encoder_outputs [seq_len, batch_size, hidden_size] -> [batch_size, seq_len, hidden_size] 
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        context_vector = torch.bmm(attention_weights, encoder_outputs)
        
        context_vector = context_vector.view(128, self.num_hidden)

        # decoding
        # shape (num_layers * num_directions, batch, hidden_size)
        output, (s, cell) = self.lstm(embedding, (context_vector, cell))
        
        # prediction = self.fc_out(output)
        return output

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init()
    
    def forward(self, x):
        outputs = torch.zeros(trg_len, batch_size, self.trg_vocab_size).to(self.device)
        
        encoder_outputs, s, c_t = self.encoder(src)
        
        # this shoud just be replaced with <s> or whatever our start token is
        x = trg[0,:]
        
        for t in range(1, trg_len):
            prediction, s, cell = self.decoder(x, s, cell, encoder_outputs)
            
            outputs[t] = output
            
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
    
            x = trg[t] if teacher_force else top1
        
        return outputs

### Initialization + Number of Params

In [10]:
inp_vocab_len = len(de_vocab)
trg_vocab_len = len(en_vocab)
num_features = 256
num_hidden = 512
num_layers = 2
dropout = .5

enc = Encoder(inp_vocab_len=inp_vocab_len,
              num_features=num_features,
              num_hidden=num_hidden,
              num_layers=num_layers,
              dropout=dropout).to(gpu)

attn = Attention(decoder_hidden_size=num_hidden, 
                 encoder_hidden_size=num_hidden).to(gpu)

dec = Decoder(trg_vocab_len=trg_vocab_len,
              num_features=num_features,
              num_hidden=num_hidden,
              num_layers=num_layers,
              dropout=dropout,
              attention=attn).to(gpu)
# model = Seq2Seq(enc, dec, gpu, OUTPUT_DIM).to(gpu)

In [11]:
val = next(iter(trainset))
src, trg = val.src, val.trg
encoder_outputs, (h_t, c_t) = enc(val.src)
x = trg[0,:]
output = dec(x, h_t, c_t, encoder_outputs)

RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor

In [22]:
encoder_outputs.shape

torch.Size([30, 128, 512])

In [None]:
# initialize model weights
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

### Training

In [None]:
def train(model, iterator, optimizer, criterion, clip=1, num_epochs=10):
    model.train()
    losses = []
    for _ in range(num_epochs):
        for i, batch in tqdm(enumerate(iterator), desc="iteration"):
            src = batch.src
            trg = batch.trg 
            
            optimizer.zero_grad()

            output = model(src, trg)

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            loss = criterion(output, trg)
            losses.append(loss.item())
            loss.backward()

            # clip the gradients to prevent them from exploding (a common issue in RNNs)
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
                    
    return model, losses

In [None]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
model, losses = train(model, trainset, optimizer, criterion)

### Training

In [None]:
plt.scatter(x=list(range(len(losses))), y=losses)

### Testing

In [None]:
def tensor_2_str(tensor, vocab=de_vocab.itos):
    return " ".join([vocab[int(token)] for token in tensor if vocab[int(token)] not in ['<eos>', '<pad>', '.']])

In [None]:
with torch.no_grad():
    sample = next(iter(trainset))
    src, trg = sample.src, sample.trg
    output = model(src, trg)
    output_tensor = output.argmax(2)[:, 0]
    target_tensor = trg[:, 0]
    
    output = tensor_2_str(output_tensor, en_vocab.itos)
    expected = tensor_2_str(target_tensor, en_vocab.itos)
    N = max(len(output), len(expected)) + len("Expected: ")
    
    print("="*N)
    print("Output: {}".format(output).center(N))
    print("="*N)
    
    print("="*N)
    print("Expected: {}".format(expected).center(N))
    print("="*N)