In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch import Tensor

import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from timeit import default_timer as timer

import portalocker

%matplotlib inline

In [2]:
print(DEVICE)

cpu


# Lab02 Encoder-decoder model

To start off we will do the rewriting of the tutorial of pyTorch language translation with nn.Transformer and torchtext.


## Language Translation with ``nn.Transformer`` and torchtext

This tutorial shows:
    - How to train a translation model from scratch using Transformer.
    - Use torchtext library to access  [Multi30k](http://www.statmt.org/wmt16/multimodal-task.html#task1)_ dataset to train a German to English translation model.


## Data Sourcing and Processing

[torchtext library](https://pytorch.org/text/stable/)_ has utilities for creating datasets that can be easily
iterated through for the purposes of creating a language translation
model. In this example, we show how to use torchtext's inbuilt datasets,
tokenize a raw text sentence, build vocabulary, and numericalize tokens into tensor. We will use
[Multi30k dataset from torchtext library](https://pytorch.org/text/stable/datasets.html#multi30k)_
that yields a pair of source-target raw sentences.

To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data.




In [3]:
# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

Create source and target language tokenizer. Make sure to install the dependencies.

```python
pip install spacy sacrebleu torchdata -U
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```


In [4]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

2023-05-29 17:44:24.644628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-29 17:44:29.645218: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


## Seq2Seq Network using Transformer

Transformer is a Seq2Seq model introduced in [“Attention is all you
need”](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf)_
paper for solving machine translation tasks.
Below, we will create a Seq2Seq network that uses Transformer. The network
consists of three parts. First part is the embedding layer. This layer converts tensor of input indices
into corresponding tensor of input embeddings. These embedding are further augmented with positional
encodings to provide position information of input tokens to the model. The second part is the
actual [Transformer](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html)_ model.
Finally, the output of the Transformer model is passed through linear layer
that gives unnormalized probabilities for each token in the target language.




In [5]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

During training, we need a subsequent word mask that will prevent the model from looking into
the future words when making predictions. We will also need masks to hide
source and target padding tokens. Below, let's define a function that will take care of both.




In [6]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

Let's now define the parameters of our model and instantiate the same. Below, we also
define our loss function which is the cross-entropy loss and the optimizer used for training.




In [7]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

## Collation

As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings.
We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network
defined previously. Below we define our collate function that converts a batch of raw strings into batch tensors that
can be fed directly into our model.




In [8]:
# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

Let's define training and evaluation loop that will be called for each
epoch.




In [9]:
def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

Now we have all the ingredients to train our model. Let's do it!




In [29]:
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))




Epoch: 1, Train loss: 5.343, Val loss: 4.109, Epoch time = 39.404s
Epoch: 2, Train loss: 3.762, Val loss: 3.312, Epoch time = 37.973s
Epoch: 3, Train loss: 3.161, Val loss: 2.891, Epoch time = 38.304s
Epoch: 4, Train loss: 2.768, Val loss: 2.635, Epoch time = 37.722s
Epoch: 5, Train loss: 2.480, Val loss: 2.450, Epoch time = 37.362s
Epoch: 6, Train loss: 2.249, Val loss: 2.313, Epoch time = 38.036s
Epoch: 7, Train loss: 2.056, Val loss: 2.196, Epoch time = 38.372s
Epoch: 8, Train loss: 1.893, Val loss: 2.106, Epoch time = 39.752s
Epoch: 9, Train loss: 1.757, Val loss: 2.044, Epoch time = 38.533s
Epoch: 10, Train loss: 1.630, Val loss: 1.998, Epoch time = 38.503s
Epoch: 11, Train loss: 1.520, Val loss: 1.963, Epoch time = 38.733s
Epoch: 12, Train loss: 1.418, Val loss: 1.938, Epoch time = 38.531s
Epoch: 13, Train loss: 1.326, Val loss: 1.935, Epoch time = 38.633s
Epoch: 14, Train loss: 1.244, Val loss: 1.934, Epoch time = 39.303s
Epoch: 15, Train loss: 1.173, Val loss: 1.952, Epoch time

In [30]:
# save the model
torch.save(transformer.state_dict(), './transformer.pth')

In [10]:
# load the model
transformer.load_state_dict(torch.load('./transformer.pth', map_location=torch.device(DEVICE)))

<All keys matched successfully>

In [11]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def greedy_translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [12]:
print(greedy_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print(greedy_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster ."))
print(greedy_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt ."))

 A group of people standing in front of an igloo . 
 A man in a blue shirt is standing on a ladder cleaning a window . 
 A young girl in a karate uniform is practicing a stick with a stick . 


## Theoretical questions

Now that the tutorial is complete and that we have our greedy translator let's answer the following questions :

### (1) In the positional encoding, why are we using a combination of sinus and cosinus?
First of all, positional encodings are used to provide the model with information about the relative positions of tokens in the input sequence because Transformer does not have notion of word order or position. With it the Transformer model can capture the sequential relationships between tokens and leverage this information during the self-attention mechanism.

But, encoding using only one type of function (sine or cosine) would result in a biased encoding pattern.

By using both sine and cosine functions, the positional encoding can capture different positional patterns. Both functions vary in a wave-like manner and are shifted in phase. So these two functions together help to encode different positional relationships between tokens.

The high-frequency components emanating from these functions capture small details while the low-frequency components capture larger positional relationships. So, with both functions we create a rich representation of token positions in the input sequence, enabling the transformer model to effectively use positional information during the translation process.

### (2) In the Seq2SeqTransformer class, what is the parameter nhead for?

The parameter nhead stands for the number of attention heads in the Transformer model.

As seen in class the attention mechanism in the Transformer architecture involves computing self-attention, where we capture dependencies and relationships betwwen encoding and decoding. This operation is performed by using the model's hidden layers now called attention heads.

So, the nhead parameter determines the number of attention heads used in both the encoder and decoder hidden layers of the Transformer. Each head are used to capture different types of relationships and patterns inside the model so that we can use all the information we can. Instead of only using the end sequence and have a biased attention mechanism.

As nhead gets bigger it allows for more complex interactions between tokens. But, it comes at a great cost of computational efficiency. We then have to find a balance between these two concepts.

### (3) In the Seq2SeqTransformer class, what is the point of the generator ?

The generator is used as the final layer of the model. It is responsible for generating the output predictions based on the transformed inputs.

In our case it is defined as `nn.Linear(emb_size, tgt_vocab_size)`. It is a linear layer that projects the input tensor from the embedding size to the size of the target vocabulary.

When using the generator, the model obtains logits (raw scores before any activation function) for each word in the target vocabulary. These logits represent the model's predictions for the likelihood of each word being the next token in the translated sequence.

It can be then passed through a softmax function to obtain a probability distribution over the target vocabulary, allowing for the selection of the most likely next word !

### (4) Describe the goal of the create_mask function. Why does it handle differently the source and target masks?

The goal of the `create_mask` function is to generate masks that are used in the attention mechanism of the Transformer model.
By handling the source and target masks differently, it ensures that the attention mechanism in the Transformer model is used correctly for both the source and target sequences.
There are 4 types of masks : 
- The source mask (`src_mask`) allows for finding dependencies between positions in the input sequence
- The target mask (`tgt_mask`) keeps the promise that a token only attends to previous tokens in the target sequence.
- The padding masks (`src_padding_mask` and `tgt_padding_mask`) exclude padding token (`<pad>`) from the attention mechanism to avoid unnecessary computations and maintain the integrity of the model's attention weights.

## Decoding functions

Let's make another program such as the greedy translator but using a <u>top-k sampling with temperature :</u>

In [13]:
def topk_decode(model, src, src_mask, max_len, start_symbol, k=2, temperature=1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])

        # topk
        topk_prob, topk_idx = torch.topk(prob, k=k, dim=1)
        topk_prob = topk_prob.squeeze()
        topk_idx = topk_idx.squeeze()

        # convert topk_prob to float64 to avoid overflow
        topk_prob = topk_prob.type(torch.float64)

        # temperature
        sum_prob = torch.sum(torch.exp(topk_prob / temperature))
        topk_prob = torch.exp(topk_prob / temperature) / sum_prob

        # take one of the topk idx based on prob
        idx_next_word = torch.multinomial(topk_prob, 1)

        next_word = topk_idx[idx_next_word].item()


        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break

    return ys


def topk_translate(model: torch.nn.Module, src_sentence: str, k=2, temperature=1.0):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = topk_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k=k, temperature=temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
    

In [14]:
print(topk_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k=10, temperature=0.1))
print(topk_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster .", k=10, temperature=0.1))
print(topk_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt .", k=10, temperature=0.1))

 A group of people stand in front of an igloo . 
 A man in a blue shirt is standing on a ladder cleaning a window . 
 A young girl in a karate uniform is practicing a stick with a stick . 


And using a <u>top-p sampling with temperature :</u>

In [15]:
def topp_decode(model, src, src_mask, max_len, start_symbol, p=0.75, temperature=1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])

        # topp
        sorted_prob, sorted_idx = torch.sort(prob, descending=True)
        cumsum_prob = torch.cumsum(sorted_prob, dim=1)

        # find the last index where cumsum_prob < p
        idx = torch.where(cumsum_prob < p, cumsum_prob, torch.ones_like(cumsum_prob))
        idx = torch.nonzero(idx == 1.0)

        # if all cumsum_prob > p, then idx is empty
        if len(idx) == 0:
            idx = torch.tensor([[0, 0]])

        # get the last index
        idx = idx[-1, 1]

        # get the top p idx
        top_p_idx = sorted_idx[:, :idx+1]

        # convert top_p_idx to float64 to avoid overflow
        top_p_prob = sorted_prob[:, :idx+1].type(torch.float64)

        top_p_idx = top_p_idx.squeeze()
        top_p_prob = top_p_prob.squeeze()

        # temperature
        sum_prob = torch.sum(torch.exp(top_p_prob / temperature))
        top_p_prob = torch.exp(top_p_prob / temperature) / sum_prob

        # take one of the top p idx based on prob
        idx_next_word = torch.multinomial(top_p_prob, 1)

        next_word = top_p_idx[idx_next_word].item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break

    return ys


def topp_translate(model: torch.nn.Module, src_sentence: str, p=0.75, temperature=1.0):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = topp_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, p=p, temperature=temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [16]:
print(topp_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p=0.75, temperature=0.1))
print(topp_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster .", p=0.75, temperature=0.1))
print(topp_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt .", p=0.75, temperature=0.1))

 A group of people stand in front an igloo . 
 A man in a blue shirt is standing on a ladder cleaning a window . 
 A young girl in a karate uniform is practicing a stick with a stick . 


Ok now that we have 3 different translators let's compare them qualitatively as we play with the k, p and temperature parameters:

In [17]:
k=[2,5,10,20,50]
p=[0.1,0.5,0.75,2]

First of all let's explain those 3 parameters :  
- Temperature  
    The temperature parameter affects the randomness of the generated output during sampling. It scales the logits to adjust the shape of the probability distribution. A higher temperature (>=1.0) increases randomness and encourages more exploration, resulting in diverse output. Conversely, a lower temperature (<=0.5) reduces randomness, leading to more focused and deterministic output.  
  
- Top-k sampling (parameter k)  
    The parameter k determines the size of the "top-k" set. Instead of considering the entire vocabulary, the model only considers the k most likely tokens based on their probabilities.  

- Top-p (nucleus) sampling (parameter p)  
    This method limits the candidate tokens based on a cumulative probability threshold. The parameter p determines the threshold, where tokens with cumulative probabilities higher than p are considered. This allows for dynamic control over the number of candidate tokens and helps to maintain diversity while avoiding extremely unlikely or low-quality outputs.  

 
Now, if we use the temperature equal to 0.1 we will see practically always the same result for the three methods like so : 

In [18]:
print("Good translation : A group of people stand in front of an igloo.")
print("Greedy translate :", greedy_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print("Top-k translate :")
for i in k:
    print("k=",i,":",topk_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k=i, temperature=0.1))
print("Top-p translate :")
for i in p:
    print("p=",i,":",topp_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p=i, temperature=0.1))

Good translation : A group of people stand in front of an igloo.
Greedy translate :  A group of people standing in front of an igloo . 
Top-k translate :
k= 2 :  A group of people stand in front an igloo . 
k= 5 :  A group of people stand in front an igloo . 
k= 10 :  A group of people stand in front an igloo . 
k= 20 :  A group of people standing in front of an igloo . 
k= 50 :  A group of people stand in front of an igloo . 
Top-p translate :
p= 0.1 :  A group of people standing in front of an igloo . 
p= 0.5 :  A group of people standing in front of an igloo . 
p= 0.75 :  A group of people stand in front an igloo . 
p= 2 :  A group of people stand in front an igloo . 


As we can see if we have a low temperature the randomness is drastically reduced and it leaves us with deterministic outputs.  

Let's try with a temperature equal to `1.0` for three different examples:

In [19]:
print("First translate : A group of people stand in front of an igloo.")
print("Greedy translate :", greedy_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print("Top-k translate :")
for i in k:
    print("k=",i,":",topk_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k=i))
print("Top-p translate :")
for i in p:
    print("p=",i,":",topp_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p=i))

First translate : A group of people stand in front of an igloo.
Greedy translate :  A group of people standing in front of an igloo . 
Top-k translate :
k= 2 :  A group of people standing in front of an igloo 
k= 5 :  A group of people are standing in front an abandoned machines . 
k= 10 :  A group of people stand in front of an auditorium . 
k= 20 :  A group of people standing in front of an igloo 
k= 50 :  A group of people in an igloo 
Top-p translate :
p= 0.1 :  A group of people , standing in front of an pane . 
p= 0.5 :  A group of people stand in front of an Ebay too ear . 
p= 0.75 :  A group of people stand in front an interested machines . 
p= 2 :  A group of people are standing in front of an forestry . 


In [20]:
print("Second translate : A man in a blue shirt is standing on a ladder cleaning a window.")
print("Greedy translate :", greedy_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster ."))
print("Top-k translate :")
for i in k:
    print("k=",i,":",topk_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster .", k=i))
print("Top-p translate :")
for i in p:
    print("p=",i,":",topp_translate(transformer, "Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster .", p=i))

Second translate : A man in a blue shirt is standing on a ladder cleaning a window.
Greedy translate :  A man in a blue shirt is standing on a ladder cleaning a window . 
Top-k translate :
k= 2 :  A man wearing a blue shirt stands on a ladder cleaning a window . 
k= 5 :  Man in blue shirt standing up on a ladder cleaning a window . 
k= 10 :  A man in a blue shirt stands on a ladder cleaning a window . 
k= 20 :  A man in a blue shirt is on a ladder cleaning a window . 
k= 50 :  A man in a blue shirt is standing on a ladder washing a window . 
Top-p translate :
p= 0.1 :  A man in a blue shirt stands on a ladder cleaning a window . 
p= 0.5 :  A man in a blue shirt is standing on a ladder cleaning a window . 
p= 0.75 :  A man in a blue shirt is on a ladder cleaning a window . 
p= 2 :  Man in a blue shirt standing on a ladder cleaning a window . 


In [21]:
print("Third translate: A young girl in a karate suit breaks a stick with a kick.")
print("Greedy translate :", greedy_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt ."))
print("Top-k translate :")
for i in k:
    print("k=",i,":",topk_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt .", k=i))
print("Top-p translate :")
for i in p:
    print("p=",i,":",topp_translate(transformer, "Ein junges Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt .", p=i))

Third translate: A young girl in a karate suit breaks a stick with a kick.
Greedy translate :  A young girl in a karate uniform is practicing a stick with a stick . 
Top-k translate :
k= 2 :  A young girl in a karate karate uniform uses a stick . 
k= 5 :  A young girl in a karate uniform is parasailing with a stick . 
k= 10 :  A young girl in a karate uniform is flying with a stick in a kick . 
k= 20 :  A young girl dressed in a karate karate uniform is following with a stick . 
k= 50 :  A young girl in a karate uniform is following a stick with a male kick . 
Top-p translate :
p= 0.1 :  Young girl in a military uniform is filming a stick in a game . 
p= 0.5 :  Young girl in a karate uniform gets excited with a stick in it . 
p= 0.75 :  Young girl in a karate uniform surfs with a stick in a karate . 
p= 2 :  A young girl in a karate uniform uses a stick with a stick . 


We observe that the temperature is letting the translation be more random.  
As we increase the parameter k or p the action in the sentence (end of sentence) becomes different situations that has nothing to do with the real translation. It adds diversity to the translation but in terms of qualitative results the greedy translate is the best output everytime, so keeping lower parameters in the other methods seems to be preferable to have results matching the authentic translation.  

As we are curious let's see what are the results for high paramters and temperature:

In [22]:
print("Good translation : A group of people stand in front of an igloo.")
print("Greedy translate :", greedy_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print("Top-k translate :")
print("k=",100,":",topk_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k=100, temperature=2.0))
print("Top-p translate :")
print("p=",5,":",topp_translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p=5, temperature=2.0))

Good translation : A group of people stand in front of an igloo.
Greedy translate :  A group of people standing in front of an igloo . 
Top-k translate :
k= 100 :  A group of professionally people in progress stand 
Top-p translate :
p= 5 :  Deep manipulating well outside trailer terminals a Thriller Ices bag . 


As expected, the traductions became completely out of context as the randomness and diversity are extremely high.

## Computing BLEU score of the model

In [23]:
# Use sacreBLEU implem to evaluate the model and compare the 3 methods of decoding on the test set
from torchtext.data.metrics import bleu_score
from tqdm import tqdm

test_iter = Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

def pull_cand_ref(data_iter, model):
    dict = {}
    references = []
    candidates_greedy = []
    candidates_topk = []
    candidates_topp = []

    for (src, tgt) in tqdm(data_iter, total=1000):
        candidates_greedy.append(greedy_translate(model, src).split())
        candidates_topk.append(topk_translate(model, src, k=5).split())
        candidates_topp.append(topp_translate(model, src, p=0.75).split())

        references.append([tgt.split()])
    
    dict["greedy"] = candidates_greedy
    dict["topk"] = candidates_topk
    dict["topp"] = candidates_topp
    dict["refs"] = references

    return dict

dict_corpus = pull_cand_ref(test_iter, transformer)

100%|██████████| 1000/1000 [06:05<00:00,  2.74it/s]


In [25]:
# Get BLEU score on greedy decoding
print("BLEU score on greedy decoding :")
print(f"BLEU score = {bleu_score(dict_corpus['greedy'], dict_corpus['refs'])*100:.2f}%")

# Get BLEU score on top-k decoding
print("BLEU score on top-k decoding with k=5 and temperature=1:")
print(f"BLEU score = {bleu_score(dict_corpus['topk'], dict_corpus['refs'])*100:.2f}%")

# Get BLEU score on top-p decoding
print("BLEU score on top-p decoding with p=0.75 and temperature=1:")
print(f"BLEU score = {bleu_score(dict_corpus['topp'], dict_corpus['refs'])*100:.2f}%")

BLEU score on greedy decoding :
BLEU score = 29.15%
BLEU score on top-k decoding with k=5 and temperature=1:
BLEU score = 24.30%
BLEU score on top-p decoding with p=0.75 and temperature=1:
BLEU score = 22.93%


As stated earlier the greedy decoder has a better translating efficiency than the other functions.