# Homework 5 Transformer
    

Transformer models took the NLP community by storm by achieving state-of-the-art results in machine translation in the paper [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf). A very good tutorial on this architecture can be found [here](http://jalammar.github.io/illustrated-transformer/). As you can tell from the name, this model is based on attention! It replaces standard recurrent neural networks used for translation with a self-attention-based network. 

The model is an encoder-decoder model, as shown below:


<center><img src="http://nlp.seas.harvard.edu/images/the-annotated-transformer_14_0.png" alt="mlp" align="middle"></center>

The encoder and the decoder each consist of a stack of identical modules repeated N times (N=6 in the original paper).



## Preprocessing

This is basically the same as before, but the Transformer expects batch-first tensors, so `collate_fn` must return `src_batch` and `trg_batch` with shape `[batch size, sent len]` (i.e. transposed compared to the previous notebooks).

In [None]:
import locale

def getpreferredencoding(do_setlocale = True):
    """Replacement for `locale.getpreferredencoding` that always reports UTF-8.

    `do_setlocale` is accepted so the signature matches the function being
    replaced; it is ignored.
    """
    return 'UTF-8'
# Monkey-patch the stdlib function so every later caller sees 'UTF-8'.
# NOTE(review): presumably a workaround for the notebook runtime reporting a
# non-UTF-8 locale, which breaks the `!` shell commands below - confirm.
locale.getpreferredencoding = getpreferredencoding
print(locale.getpreferredencoding())

# Notebook shell commands: install spaCy and the English/German pipelines
# that are later passed to the tokenizers.
! pip install -U spacy -q
! python -m spacy download en_core_web_sm -q
! python -m spacy download de_core_news_sm -q
# Pin torch / torchtext / torchdata to one fixed set of versions.
! pip install torch==1.13.1 torchtext==0.14.1 torchdata==0.5.1 -q

UTF-8
2023-04-22 08:09:47.294498: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-22 08:09:49.529907: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-22 08:09:49.530362: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
20

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Pytorch version is: ", torch.__version__)
print("You are using: ", device)
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torchtext.datasets import multi30k, Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import spacy
import random
import math
import os
import time
from typing import List, Iterable, Tuple 
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
# We'll set the random seeds for deterministic results.
# Both Python's `random` module and PyTorch are seeded with the same value.
SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
# cuDNN is switched off entirely, and the deterministic flag is set as well.
# NOTE(review): disabling cuDNN trades speed for reproducibility - confirm this is intended.
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.deterministic = True

Pytorch version is:  1.13.1+cu117
You are using:  cuda


In [None]:
# The original Multi30k download links are broken
# (issue: https://github.com/pytorch/text/issues/1756), so the three split
# URLs are redirected to a user-hosted mirror of the archives.
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
multi30k.URL["test"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz"

# The mirrored test archive differs from the original one, so the expected
# checksum of the test split has to be replaced as well.
# NOTE(review): the value is 64 hex characters long (the length of a SHA-256
# digest) although the table is named `MD5` - confirm which digest torchtext checks.
multi30k.MD5["test"] = "6d1ca1dba99e2c5dd54cae1226ff11c2551e6ce63527ebb072a1f70f72a5cd36"


# Build vocab
# Source language is German, target language is English. LANG maps each
# language code to its position inside a (source, target) sample tuple.
SRC = 'de'
TRG = 'en'
LANG = {SRC: 0, TRG: 1}
# spaCy pipeline names, listed in the same order as the keys of LANG
# (the loop below zips the two together).
models = ['de_core_news_sm', 'en_core_web_sm']
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
train_iter, valid_iter, test_iter = Multi30k(split=('train', 'valid', 'test'), 
                                             language_pair=(SRC, TRG))
# Per-language tokenizer and vocabulary, filled in by the loop below.
token_transform = {}
vocab_transform = {}

def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the tokenized `language` side of every sample in `data_iter`.

    Each sample is indexed with LANG[language] to pick the source or target
    sentence, which is then passed through that language's tokenizer.
    """
    for data_sample in data_iter:
        yield token_transform[language](data_sample[LANG[language]])

# For each language: create the spaCy tokenizer, then build the vocabulary
# from the training split only. Tokens seen fewer than 2 times are left out,
# and the special symbols are placed first so they get indices 0..3.
for L, model in zip(LANG, models):
  token_transform[L] = get_tokenizer('spacy', language=model)
  vocab_transform[L] = build_vocab_from_iterator(yield_tokens(train_iter, L),
                                                    min_freq=2,
                                                    specials=special_symbols,
                                                    special_first=True)
  # Out-of-vocabulary tokens are mapped to <unk>.
  vocab_transform[L].set_default_index(UNK_IDX)

print(f"Unique tokens in source (de) vocabulary: {len(vocab_transform[SRC])}")
print(f"Unique tokens in target (en) vocabulary: {len(vocab_transform[TRG])}")


Unique tokens in source (de) vocabulary: 8014
Unique tokens in target (en) vocabulary: 6191


In [None]:
def tensor_transform(token_ids: List[int]):
    """Wrap a sequence of token indices with BOS/EOS and return it as a tensor.

    Args:
        token_ids: vocabulary indices of one sentence (may be empty).

    Returns:
        1-D int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Build the tensor in one go with an explicit dtype. Concatenating
    # `torch.tensor(token_ids)` would silently produce a float tensor for an
    # empty sentence, because `torch.tensor([])` defaults to float32.
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)


def collate_fn(batch: List[Tuple[str, str]]):
    """Turn a list of raw sentence pairs into two padded, batch-first tensors.

    Every sentence is stripped of its trailing newline, tokenized, mapped to
    vocabulary indices, wrapped in BOS/EOS and finally padded with PAD_IDX to
    the longest sentence of the batch.

    The pairs are ordered by descending (whitespace-split) source length.
    This ordering is kept from the earlier RNN notebooks; it does not change
    the amount of padding, because the batch is padded to its longest
    sentence regardless of order.

    Args:
        batch: list of (source sentence, target sentence) string pairs, e.g.
            [
              ("This is a source sentence.", "This is a target sentence."),
              ("Another source sentence.", "Another target sentence."),
              ...
            ]

    Returns:
        (src_batch, trg_batch): int64 tensors of shape
        [batch size, src sent len] and [batch size, trg sent len].
        Exactly two tensors are returned because every loop over the
        loaders in this notebook unpacks `src, trg`.
    """
    # Sort on the raw source sentence so no tokenization is needed for the key.
    ordered = sorted(batch, key=lambda pair: len(pair[0].rstrip("\n").split()), reverse=True)

    src_batch, trg_batch = [], []
    for src_sample, trg_sample in ordered:
        # tokenization -> numericalization -> BOS/EOS
        src_ids = vocab_transform[SRC](token_transform[SRC](src_sample.rstrip("\n")))
        trg_ids = vocab_transform[TRG](token_transform[TRG](trg_sample.rstrip("\n")))
        src_batch.append(tensor_transform(src_ids).long())
        trg_batch.append(tensor_transform(trg_ids).long())

    # batch_first=True yields [batch size, sent len] directly (and contiguous),
    # which is the layout the Transformer modules below expect.
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=PAD_IDX)
    trg_batch = pad_sequence(trg_batch, batch_first=True, padding_value=PAD_IDX)

    return src_batch, trg_batch

In [None]:
# One DataLoader per split; all of them batch with `collate_fn`.
# Only the training loader shuffles.
# NOTE(review): drop_last=True is also set for the validation and test
# loaders, so the last incomplete batch of those splits is never evaluated
# - confirm this is intended.
BATCH_SIZE = 128
train_loader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
valid_loader = DataLoader(valid_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)

# Counting requires materializing the iterators, i.e. a full pass over each split.
print(f"Number of training examples: {len(list(train_iter))}")
print(f"Number of validation examples: {len(list(valid_iter))}")
# print(f"Number of testing examples: {len(list(test_iter))}")

Number of training examples: 29001
Number of validation examples: 1015


In [None]:
# Sanity check: fetch a single batch from the training loader, print the
# shapes of its source and target tensors, then stop.
for src, trg in train_loader:
  print(src.shape, trg.shape)
  break 

torch.Size([28, 128]) torch.Size([27, 128])


## self-attention

The encoder consists of a self-attention layer (multi-head attention in the first figure) which is then followed by a feed-forward network. 
<center><img src="http://jalammar.github.io/images/t/encoder_with_tensors_2.png" alt="mlp" align="middle"></center>


The self-attention matrix calculation : 

$$\text{Attention}(Q,K,V)=\text{softmax}(\frac{QK^T}{\sqrt{d_k}} )V$$

Where:

$Q$ represents the query matrix, you want to use every word in the sentence as a query for that keyword and check its relevance for representing that key.

$K$ represents the key matrix, refers to each word in the sentence

$V$ represents the value matrix

$d_k$ is the dimensionality of the key and query vectors

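The dot product of $Q$ and $K$ is divided by the constant $\sqrt{d_k}$ and passed through a softmax, which yields the attention weights. These weights are then multiplied by the matrix $V$, so each output vector is a *weighted* sum of the value vectors, similar to the weighted context vectors from the previous notebooks.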

**TODO** explain what the role of the denominator in the self-attention equation is (check the original paper). 

**Answer:** 

1. mitigating the vanishing or exploding gradient problem during training

2. prevent 'hard' attention, maintain the numerical stability.

  When the dimensionality of the key vectors is large, the dot products between queries and keys can become very large in magnitude. The softmax then saturates: nearly all of the probability mass goes to one position and the gradients become extremely small. This leads to a "hard" attention mechanism in which the model only pays attention to a single word.
    
  By dividing the dot product by $\sqrt{d_k}$, the self-attention mechanism becomes more "soft", allowing the model to pay attention to multiple words in the input sequence.



**TODO** explain what the motivation is behind using multiple heads attention (8 in the original paper). 

**Answer:** 1. Diversity of features 2. larger receptive field 3. increased capacity 4. Parallel computation and faster training.


**TODO**  explain what the benefits of using residual connections (an extra input arrow pointing to the *Add & Norm* module) are here and in neural networks in general. 

**Answer:** Residual connections allow gradients to flow through the network directly, without passing through non-linear activation functions. This prevents the gradients from exploding or vanishing.

**TODO**

1. Modify the size of the Q, K and V matrices to be of size (batch size, n heads, sent len, hid dim // n heads). 

  use `view()` and `permute()` functions.
  
  This line will be the same for each of the three matrices. 

2. Matrix multiply Q and K and scale the output following the equation above. 

3. Matrix multiply attention and V

4. Change the shape of x to match the desired output shape.

  After operations such as `transpose()`, `permute()` or slicing, a tensor is usually no longer stored contiguously in memory, and `view()` generally requires a contiguous tensor. Call `contiguous()` first: it returns a tensor with the same data laid out in one contiguous block of memory, so that the following `view()` can reshape it.


In [None]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected with separate linear layers, split
    into `n_heads` heads of size `hid_dim // n_heads`, attended per head, and
    the concatenated heads are projected back to `hid_dim`.

    Args:
        hid_dim: model (embedding) dimension; must be divisible by `n_heads`.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        device: kept for interface compatibility; the module no longer
            creates any tensor on it.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        # Weight matrices for query, key, and value
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)

        # Final fully connected layer to project the concatenated heads
        self.fc = nn.Linear(hid_dim, hid_dim)
        self.do = nn.Dropout(dropout)

        # sqrt(d_k) from the attention formula: d_k is the per-head dimension
        # of the query/key vectors, not the full model dimension. A plain
        # float is used so the value never lives on the wrong device.
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, x, batch_size):
        """[batch size, sent len, hid dim] -> [batch size, n heads, sent len, head dim]."""
        return x.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask=None):
        """Attend from `query` positions to `key`/`value` positions.

        Args:
            query: [batch size, query len, hid dim]
            key:   [batch size, key len, hid dim]
            value: [batch size, key len, hid dim]
            mask:  optional tensor broadcastable to
                   [batch size, n heads, query len, key len]; entries equal
                   to 0 mark positions that must not be attended to.

        Returns:
            Tensor of shape [batch size, query len, hid dim].
        """
        batch_size = query.shape[0]

        # Project, then split into heads.
        # Q = [batch size, n heads, query len, head dim]; K, V use key len.
        Q = self._split_heads(self.w_q(query), batch_size)
        K = self._split_heads(self.w_k(key), batch_size)
        V = self._split_heads(self.w_v(value), batch_size)

        # energy = [batch size, n heads, query len, key len]
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        # Masked positions get a very negative score, i.e. ~0 after softmax.
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        # attention = [batch size, n heads, query len, key len]
        attention = self.do(F.softmax(energy, dim=-1))

        # Weighted sum of the values, then move the head axis next to the
        # feature axis: [batch size, query len, n heads, head dim].
        # `contiguous()` is required before `view()` because of the permute.
        x = torch.matmul(attention, V).permute(0, 2, 1, 3).contiguous()

        # Concatenate the heads: [batch size, query len, hid dim]
        x = x.view(batch_size, -1, self.hid_dim)

        # Output projection, shape unchanged.
        return self.fc(x)

## Positionwise Feedforward

In [None]:
class PositionwiseFeedforward(nn.Module):
    """Two-layer feed-forward network applied to every position independently.

    Implemented with two kernel-size-1 convolutions: hid_dim -> pf_dim,
    ReLU + dropout, then pf_dim -> hid_dim.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.hid_dim, self.pf_dim = hid_dim, pf_dim

        # Expansion and contraction layers (1x1 convolutions)
        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, kernel_size=1)
        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, kernel_size=1)

        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """Map [batch size, sent len, hid dim] to a tensor of the same shape."""
        # Conv1d wants channels first: [batch size, hid dim, sent len]
        channels_first = x.transpose(1, 2)

        # hidden = [batch size, pf dim, sent len]
        hidden = self.do(F.relu(self.fc_1(channels_first)))

        # back to [batch size, sent len, hid dim]
        return self.fc_2(hidden).transpose(1, 2)

## Encoder

**TODO** 

1. Apply embeddings over the source, scale these embeddings, add positional embeddings and then at the end apply dropout to everything. 



### positional embedding

Words are transformed into embeddings before being input into these modules. 

However, these embeddings are added with positional embeddings, which give the model a notion of relative input position (remember that there is no recurrence model which keeps track sequentially.). 

These positional embeddings consist of sine and cosine functions of different frequencies: $$PE(pos, 2i) = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$   $$PE(pos, 2i+1) = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

Thus, each dimension of the positional encoding corresponds to a sinusoid.

This implementation uses **learned** positional embeddings rather than the fixed sinusoidal positional embeddings proposed in the original Transformer paper.

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a position-wise feed-forward net.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    A single LayerNorm instance (`self.ln`) is shared by both sub-layers.

    `self_attention` and `positionwise_feedforward` are the classes to
    instantiate, called as ``self_attention(hid_dim, n_heads, dropout, device)``
    and ``positionwise_feedforward(hid_dim, pf_dim, dropout)``.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()

        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """
        src = [batch size, src sent len, hid dim]
        src_mask: passed through unchanged to the attention module
                  (Seq2Seq.make_masks builds it as [batch size, 1, 1, src sent len])

        returns a tensor of the same shape as src
        """
        # Sub-layer 1: self-attention + residual connection + layer norm
        attended = self.sa(src, src, src, src_mask)
        src = self.ln(src + self.do(attended))

        # Sub-layer 2: feed-forward + residual connection + layer norm
        transformed = self.pf(src)
        return self.ln(src + self.do(transformed))

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token + learned positional embeddings, then `n_layers` encoder layers.

    Args:
        input_dim: size of the source vocabulary.
        hid_dim: model (embedding) dimension.
        n_layers: number of stacked encoder layers.
        n_heads: number of attention heads per layer.
        pf_dim: inner dimension of the position-wise feed-forward net.
        encoder_layer: class instantiated once per layer, called as
            ``encoder_layer(hid_dim, n_heads, pf_dim, self_attention,
            positionwise_feedforward, dropout, device)``.
        self_attention: attention class forwarded to every layer.
        positionwise_feedforward: feed-forward class forwarded to every layer.
        dropout: dropout probability.
        device: forwarded to the layers and stored; tensors created in
            `forward` follow the device of the input instead.
        max_length: number of learned position embeddings, i.e. the longest
            sequence the encoder accepts (default 1000).
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, encoder_layer, self_attention, positionwise_feedforward, dropout, device, max_length=1000):
        super().__init__()

        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.encoder_layer = encoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        self.max_length = max_length

        # Token embedding and learned position embedding
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        # Create `n_layers` of encoder layers
        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])

        self.do = nn.Dropout(dropout)

        # Token embeddings are multiplied by sqrt(hid_dim) before the
        # positional embeddings are added. Stored as a plain float so it
        # works on whatever device the model is moved to.
        self.scale = math.sqrt(hid_dim)

    def forward(self, src, src_mask):
        """
        params:
        src = [batch size, src sent len]  (token indices)
        src_mask: passed through unchanged to every layer

        return: src = [batch size, src sent len, hid dim]

        raises: ValueError if the sequence is longer than `max_length`
        """
        src_len = src.shape[1]
        if src_len > self.max_length:
            raise ValueError(
                f"sequence length {src_len} exceeds the {self.max_length} available position embeddings")

        # Position indices 0..src_len-1 on the same device as the input.
        # pos = [1, src sent len]; the embedding broadcasts over the batch.
        pos = torch.arange(src_len, device=src.device).unsqueeze(0)

        # Position-aware input: scaled token embedding + position embedding.
        # src = [batch size, src sent len, hid dim]
        src = self.do(self.tok_embedding(src) * self.scale + self.pos_embedding(pos))

        # Pass the input through each encoder layer
        for layer in self.layers:
            src = layer(src, src_mask)

        return src

## Decoder

**TODO** 

1. (same as above, but for the target) -- Apply embeddings over the target, scale these embeddings, add positional embeddings and then at the end apply dropout to everything. 


In [None]:
class DecoderLayer(nn.Module):
    """One decoder block with three sub-layers.

    1. self-attention over the target,
    2. encoder-decoder attention (queries from the target, keys/values from
       the encoder output),
    3. position-wise feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``;
    the same LayerNorm instance (`self.ln`) is reused for all three.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()

        self.ln = nn.LayerNorm(hid_dim)                               # shared layer norm
        self.sa = self_attention(hid_dim, n_heads, dropout, device)   # target self-attention
        self.ea = self_attention(hid_dim, n_heads, dropout, device)   # encoder-decoder attention
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)  # position-wise feed-forward
        self.do = nn.Dropout(dropout)

    def forward(self, trg, src, trg_mask, src_mask):
        """
        params:
        trg: target representations [batch size, trg sent len, hid dim]
        src: encoder output         [batch size, src sent len, hid dim]
        trg_mask: mask handed to the target self-attention
        src_mask: mask handed to the encoder-decoder attention

        return: trg = [batch size, trg sent len, hid dim]
        """
        # 1) self-attention over the target
        self_attended = self.sa(trg, trg, trg, trg_mask)
        trg = self.ln(trg + self.do(self_attended))

        # 2) attend from the target to the encoder output
        cross_attended = self.ea(trg, src, src, src_mask)
        trg = self.ln(trg + self.do(cross_attended))

        # 3) position-wise feed-forward
        transformed = self.pf(trg)
        return self.ln(trg + self.do(transformed))

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: embeddings, `n_layers` decoder layers, and a vocabulary projection.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: model (embedding) dimension.
        n_layers: number of stacked decoder layers.
        n_heads: number of attention heads per layer.
        pf_dim: inner dimension of the position-wise feed-forward net.
        decoder_layer: class instantiated once per layer, called as
            ``decoder_layer(hid_dim, n_heads, pf_dim, self_attention,
            positionwise_feedforward, dropout, device)``.
        self_attention: attention class forwarded to every layer.
        positionwise_feedforward: feed-forward class forwarded to every layer.
        dropout: dropout probability.
        device: forwarded to the layers and stored; tensors created in
            `forward` follow the device of the input instead.
        max_length: number of learned position embeddings, i.e. the longest
            target sequence the decoder accepts (default 1000).
    """

    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, decoder_layer, self_attention, positionwise_feedforward, dropout, device, max_length=1000):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.decoder_layer = decoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        self.max_length = max_length

        # Token embedding and learned position embedding
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList([decoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])

        # Projection from the hidden states to vocabulary logits
        self.fc = nn.Linear(hid_dim, output_dim)

        self.do = nn.Dropout(dropout)

        # Token embeddings are multiplied by sqrt(hid_dim) before the
        # positional embeddings are added. Stored as a plain float so it
        # works on whatever device the model is moved to.
        self.scale = math.sqrt(hid_dim)

    def forward(self, trg, src, trg_mask, src_mask):
        """
        params
        trg = [batch size, trg sent len]               (token indices)
        src = [batch size, src sent len, hid dim]      (encoder output)
        trg_mask: passed through unchanged to every layer
        src_mask: passed through unchanged to every layer

        return
        logits = [batch size, trg sent len, output dim]

        raises: ValueError if the target is longer than `max_length`
        """
        trg_len = trg.shape[1]
        if trg_len > self.max_length:
            raise ValueError(
                f"sequence length {trg_len} exceeds the {self.max_length} available position embeddings")

        # Position indices 0..trg_len-1 on the same device as the input.
        # pos = [1, trg sent len]; the embedding broadcasts over the batch.
        pos = torch.arange(trg_len, device=trg.device).unsqueeze(0)

        # Scaled token embedding + position embedding.
        # trg = [batch size, trg sent len, hid dim]
        trg = self.do(self.tok_embedding(trg) * self.scale + self.pos_embedding(pos))

        # Pass the input through each decoder layer
        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)

        # logits = [batch size, trg sent len, output dim]
        return self.fc(trg)

## Seq2seq

The Seq2seq model itself doesn't change much. Yay modular code. 

In [None]:
class Seq2Seq(nn.Module):
    """Glue module: builds the attention masks and runs encoder then decoder.

    Args:
        encoder: callable as ``encoder(src, src_mask)``.
        decoder: callable as ``decoder(trg, enc_src, trg_mask, src_mask)``.
        pad_idx: vocabulary index of the padding token.
        device: stored for interface compatibility; masks are created on the
            device of the input tensors.
    """

    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device

    def make_masks(self, src, trg):
        """
        Create the masks used by the attention modules. A mask entry that is
        False (0) marks a position that must not be attended to.

        Args:
            src (torch.Tensor): Source sequence tensor of shape [batch size, src sent len]
            trg (torch.Tensor): Target sequence tensor of shape [batch size, trg sent len]

        Returns:
            src_mask (torch.Tensor): bool tensor of shape [batch size, 1, 1, src sent len],
                False at source padding positions.
            trg_mask (torch.Tensor): bool tensor of shape [batch size, 1, trg sent len, trg sent len],
                False for future positions and for every entry in the row of a padded target position.
        """
        # Source mask: True where the token is not padding.
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)

        # Target padding mask, shape [batch size, 1, trg sent len, 1]:
        # it blanks whole rows (queries) that belong to padding tokens.
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)

        # Lower-triangular "subsequent" mask: position i may only look at
        # positions <= i. Built on the input's device and converted to bool
        # so that it combines with the bool padding mask without promotion.
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()

        # Broadcasts to [batch size, 1, trg sent len, trg sent len]
        trg_mask = trg_pad_mask & trg_sub_mask

        return src_mask, trg_mask

    def forward(self, src, trg):
        """
        Run the full encoder-decoder pass.

        Args:
            src (torch.Tensor): Source sequence tensor of shape [batch size, src sent len].
            trg (torch.Tensor): Target sequence tensor of shape [batch size, trg sent len].

        Returns:
            out (torch.Tensor): Output logits tensor of shape [batch size, trg sent len, output dim].
        """
        src_mask, trg_mask = self.make_masks(src, trg)

        # enc_src = [batch size, src sent len, hid dim]
        enc_src = self.encoder(src, src_mask)

        # out = [batch size, trg sent len, output dim]
        return self.decoder(trg, enc_src, trg_mask, src_mask)


## Noam Optimizer

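The paper does not introduce a new optimizer; it pairs Adam with a custom learning-rate schedule (often called the *Noam* schedule). The learning rate increases linearly during the first `warmup` steps and afterwards decays proportionally to the inverse square root of the step number:

$$lrate = d_{model}^{-0.5} \cdot \min(step^{-0.5},\ step \cdot warmup^{-1.5})$$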

In [None]:
class NoamOpt:
    """
    Learning-rate schedule wrapper ("Noam" schedule) around an optimizer.

    On every `step()` the learning rate of all parameter groups of the
    wrapped optimizer is set to

        factor * model_size^(-0.5) * min(step^(-0.5), step * warmup^(-1.5))

    and then the wrapped optimizer performs its own step.

    Args:
        model_size: model dimension used in the formula.
        factor: constant multiplier of the schedule.
        warmup: number of warm-up steps (linear increase up to this step).
        optimizer: the wrapped optimizer; exposed as `self.optimizer`
            (e.g. for calling `zero_grad()`).
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Advance the step counter, update the learning rate, and step the optimizer."
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Return the learning rate for `step` (default: the current step).

        For step <= 0, i.e. before the first call to `step()`, 0.0 is
        returned: it is the limit of the warm-up term and avoids raising
        ZeroDivisionError from ``0 ** -0.5``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

## training

In [None]:

# Vocabulary sizes determine the embedding / output dimensions.
input_dim = len(vocab_transform[SRC])
output_dim = len(vocab_transform[TRG])
# Model hyper-parameters.
hid_dim = 512     # embedding / hidden dimension
n_layers = 6      # number of encoder layers and of decoder layers
n_heads = 8       # attention heads per layer
pf_dim = 2048     # inner dimension of the position-wise feed-forward net
dropout = 0.1

# The layer, attention and feed-forward classes are passed in as arguments,
# so the encoder/decoder instantiate them themselves.
enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim, EncoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)
dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim, DecoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)
model = Seq2Seq(enc, dec, PAD_IDX, device).to(device)
# Adam starts with lr=0; NoamOpt overwrites the learning rate on every step
# (factor 1, 2000 warm-up steps).
optimizer = NoamOpt(hid_dim, 1, 2000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
def count_parameters(model):
    """Return the total number of elements in all parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

# Re-initialize every parameter that has more than one dimension (weight
# matrices, embeddings, conv kernels) with Xavier-uniform values.
# One-dimensional parameters such as biases keep their default init.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
        
# Last expression of the cell: displays the module structure in the notebook.
model 

The model has 55,593,007 trainable parameters


Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(8014, 512)
    (pos_embedding): Embedding(1000, 512)
    (layers): ModuleList(
      (0): EncoderLayer(
        (ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (sa): SelfAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (fc): Linear(in_features=512, out_features=512, bias=True)
          (do): Dropout(p=0.1, inplace=False)
        )
        (pf): PositionwiseFeedforward(
          (fc_1): Conv1d(512, 2048, kernel_size=(1,), stride=(1,))
          (fc_2): Conv1d(2048, 512, kernel_size=(1,), stride=(1,))
          (do): Dropout(p=0.1, inplace=False)
        )
        (do): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (sa): SelfAttention(

In [None]:
def train(model, loader, optimizer, criterion, clip):
    """Train `model` for one pass over `loader` and return the mean batch loss.

    Teacher forcing with a one-token shift is used: the decoder receives the
    target without its last token (`trg[:, :-1]`) and is trained to predict
    the target without its first token (`trg[:, 1:]`). Feeding the full
    target and comparing position t with token t would let the model simply
    copy its own input.

    Args:
        model: called as ``model(src, decoder_input)``; must return logits of
            shape [batch size, decoder input len, output dim] and own at
            least one parameter (used to determine the device).
        loader: iterable of ``(src, trg)`` batches, both batch-first.
        optimizer: wrapper exposing `.optimizer.zero_grad()` and `.step()`
            (e.g. `NoamOpt`).
        criterion: loss taking ``(logits [N, output dim], targets [N])``.
        clip: maximum gradient norm for gradient clipping.

    Returns:
        float: average of the per-batch losses.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.train()
    # Move the data to wherever the model's parameters live.
    model_device = next(model.parameters()).device
    epoch_loss = 0.0
    n_batches = 0

    for src, trg in loader:
        src, trg = src.to(model_device), trg.to(model_device)

        optimizer.optimizer.zero_grad()

        # Decoder input: everything except the last token.
        # output = [batch size, trg sent len - 1, output dim]
        output = model(src, trg[:, :-1])

        # Flatten for the loss; targets are everything except the first token.
        # output = [batch size * (trg sent len - 1), output dim]
        # target = [batch size * (trg sent len - 1)]
        output = output.reshape(-1, output.shape[-1])
        target = trg[:, 1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    # Count batches while iterating instead of re-iterating the whole loader
    # (which would tokenize the entire split a second time).
    if n_batches == 0:
        raise ValueError("loader yielded no batches")
    return epoch_loss / n_batches

In [None]:
def evaluate(model, loader, criterion):
    """Compute the mean batch loss of `model` over `loader` without updating it.

    Uses the same one-token shift as training: the decoder receives
    `trg[:, :-1]` and its output is compared with `trg[:, 1:]`, so a position
    never sees the token it has to predict.

    Args:
        model: called as ``model(src, decoder_input)``; must return logits of
            shape [batch size, decoder input len, output dim] and own at
            least one parameter (used to determine the device).
        loader: iterable of ``(src, trg)`` batches, both batch-first.
        criterion: loss taking ``(logits [N, output dim], targets [N])``.

    Returns:
        float: average of the per-batch losses.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.eval()
    # Move the data to wherever the model's parameters live.
    model_device = next(model.parameters()).device
    epoch_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(model_device), trg.to(model_device)

            # output = [batch size, trg sent len - 1, output dim]
            output = model(src, trg[:, :-1])

            # output = [batch size * (trg sent len - 1), output dim]
            # target = [batch size * (trg sent len - 1)]
            output = output.reshape(-1, output.shape[-1])
            target = trg[:, 1:].reshape(-1)

            epoch_loss += criterion(output, target).item()
            n_batches += 1

    # Count batches while iterating instead of re-iterating the whole loader.
    if n_batches == 0:
        raise ValueError("loader yielded no batches")
    return epoch_loss / n_batches

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds.

    Returns:
        (minutes, seconds) as ints; fractional seconds are truncated.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - (minutes * 60))
    return minutes, seconds

**TODO** Train for 5 epochs

In [None]:
N_EPOCHS = 5
CLIP = 1  # maximum gradient norm passed to train()
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'transformer-seq2seq.pt')

best_valid_loss = float('inf')

# Create the checkpoint directory if needed; exist_ok avoids the separate
# (race-prone) isdir check.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    start_time = time.perf_counter()

    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)

    end_time = time.perf_counter()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    print(f'| Epoch: {epoch+1:03} | Time: {epoch_mins}m {epoch_secs}s| Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

| Epoch: 001 | Time: 2m 45s| Train Loss: 3.382 | Train PPL:  29.425 | Val. Loss: 0.759 | Val. PPL:   2.137 |
| Epoch: 002 | Time: 2m 44s| Train Loss: 0.561 | Train PPL:   1.752 | Val. Loss: 0.184 | Val. PPL:   1.202 |
| Epoch: 003 | Time: 2m 46s| Train Loss: 0.182 | Train PPL:   1.199 | Val. Loss: 0.046 | Val. PPL:   1.047 |
| Epoch: 004 | Time: 2m 44s| Train Loss: 0.057 | Train PPL:   1.058 | Val. Loss: 0.007 | Val. PPL:   1.007 |
| Epoch: 005 | Time: 2m 43s| Train Loss: 0.011 | Train PPL:   1.011 | Val. Loss: 0.000 | Val. PPL:   1.000 |


In [None]:
# Restore the checkpoint with the lowest validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a
# GPU can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
test_loss = evaluate(model, test_loader, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 2.392 | Test PPL:  10.939 |
