In [12]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [13]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")
spacy.cli.download("de_core_news_sm")

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 25.2 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.5.0/de_core_news_sm-3.5.0-py3-none-any.whl (14.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 41.9 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [14]:
import portalocker
from portalocker import *

In [15]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')



def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    """
    Helper function that yields a list of tokens from the given data iterator and language.

    Args:
        data_iter (Iterable): Iterable containing the data samples.
        language (str): Language code specifying the language of the data.

    Returns:
        List[str]: List of tokens extracted from the data samples.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])


# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Build vocabularies for source and target languages
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)


for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    """
    Set the default index of the vocabulary for a specific language.

    Args:
        ln (str): Language code specifying the language of the vocabulary.

    Returns:
        None
    """
    vocab_transform[ln].set_default_index(UNK_IDX)

In [16]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
    """
    Module that adds positional encoding to the token embedding to introduce a notion of word order.

    Args:
        emb_size (int): The embedding size of the tokens.
        dropout (float): The dropout probability.
        maxlen (int, optional): The maximum length of the sequence. Defaults to 5000.
    """
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor) -> Tensor:
        """
        Apply positional encoding to the token embeddings.

        Args:
            token_embedding (Tensor): The token embeddings.

        Returns:
            Tensor: The token embeddings with positional encoding applied.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    """
    Module to convert tensor of input indices into corresponding tensor of token embeddings.

    Args:
        vocab_size (int): The size of the vocabulary.
        emb_size (int): The embedding size.
    """
    def __init__(self, vocab_size: int, emb_size: int):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Convert tensor of input indices into corresponding tensor of token embeddings.

        Args:
            tokens (Tensor): The input indices.

        Returns:
            Tensor: The token embeddings.
        """
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class Seq2SeqTransformer(nn.Module):
    """
    Seq2Seq Transformer model.

    Args:
        num_encoder_layers (int): Number of encoder layers.
        num_decoder_layers (int): Number of decoder layers.
        emb_size (int): The embedding size.
        nhead (int): The number of attention heads.
        src_vocab_size (int): The size of the source vocabulary.
        tgt_vocab_size (int): The size of the target vocabulary.
        dim_feedforward (int, optional): The hidden size of the feedforward layers. Defaults to 512.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
    """
    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
    ):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num

In [17]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """
    Generate a square subsequent mask of shape (sz, sz) for self-attention.

    Args:
        sz (int): The size of the mask.

    Returns:
        Tensor: The square subsequent mask.
    """
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src: Tensor, tgt: Tensor) -> tuple:
    """
    Create masks for source and target sequences.

    Args:
        src (Tensor): The source sequence.
        tgt (Tensor): The target sequence.

    Returns:
        tuple: A tuple containing the source mask, target mask, source padding mask, and target padding mask.
    """
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [18]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [19]:
from torch.nn.utils.rnn import pad_sequence
from typing import List

def sequential_transforms(*transforms: Callable) -> Callable:
    """
    Helper function to club together sequential operations.

    Args:
        *transforms: Sequence of functions to be applied sequentially.

    Returns:
        func: A function that applies the sequence of transformations to the input.
    """
    def func(txt_input: str):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

def tensor_transform(token_ids: List[int]) -> Tensor:
    """
    Function to add BOS/EOS and create tensor for input sequence indices.

    Args:
        token_ids (List[int]): List of token indices.

    Returns:
        Tensor: Tensor containing the token indices with BOS/EOS tokens added.
    """
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln],  # Tokenization
                                               vocab_transform[ln],  # Numericalization
                                               tensor_transform)  # Add BOS/EOS and create tensor

def collate_fn(batch: List[Tuple[str, str]]) -> Tuple[Tensor, Tensor]:
    """
    Function to collate data samples into batch tensors.

    Args:
        batch (List[Tuple[str, str]]): List of tuples containing source and target sentences.

    Returns:
        Tuple[Tensor, Tensor]: A tuple containing the source batch tensor and target batch tensor.
    """
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [20]:
def train_epoch(model, optimizer) -> float:
    """
    Perform one training epoch.

    Args:
        model (Seq2SeqTransformer): The model to train.
        optimizer (torch.optim.Optimizer): The optimizer to use for training.

    Returns:
        float: The average training loss for the epoch.
    """
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model) -> float:
    """
    Perform evaluation on the validation set.

    Args:
        model (Seq2SeqTransformer): The model to evaluate.

    Returns:
        float: The average validation loss.
    """
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [21]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


Epoch: 1, Train loss: 5.344, Val loss: 4.104, Epoch time = 1214.843s
Epoch: 2, Train loss: 3.759, Val loss: 3.309, Epoch time = 1236.724s
Epoch: 3, Train loss: 3.159, Val loss: 2.891, Epoch time = 1225.386s
Epoch: 4, Train loss: 2.768, Val loss: 2.644, Epoch time = 1294.236s
Epoch: 5, Train loss: 2.479, Val loss: 2.443, Epoch time = 1239.040s
Epoch: 6, Train loss: 2.252, Val loss: 2.311, Epoch time = 1249.802s
Epoch: 7, Train loss: 2.062, Val loss: 2.197, Epoch time = 1187.805s
Epoch: 8, Train loss: 1.899, Val loss: 2.109, Epoch time = 1172.538s
Epoch: 9, Train loss: 1.756, Val loss: 2.062, Epoch time = 1195.688s
Epoch: 10, Train loss: 1.636, Val loss: 2.020, Epoch time = 1198.641s
Epoch: 11, Train loss: 1.524, Val loss: 1.972, Epoch time = 1167.054s
Epoch: 12, Train loss: 1.423, Val loss: 1.950, Epoch time = 1179.394s
Epoch: 13, Train loss: 1.333, Val loss: 1.942, Epoch time = 1236.072s
Epoch: 14, Train loss: 1.253, Val loss: 1.922, Epoch time = 1207.063s
Epoch: 15, Train loss: 1.177,

## Questions Théoriques

#### In the positional encoding, why are we using a combination of sinus and cosinus?

L'objectif de sin et de cos est de proposer une répartition cyclique comprise en permanence entre -1 et 1. Si l'on utilise une autre répartition, il serait fort possible que certaines valeurs deviennent excessivement grandes, ce qui n'aurait pas beaucoup d'intérêt. De plus, le fait d'alterner entre les sinus et les cosinus permet de pouvoir utiliser une transformation linéaire sur les vecteurs formés, en obtenant à la fin les mêmes fonctions mais avec un offset, ce qui permet une bonne modularité.

#### In the Seq2SeqTransformer class, What is the parameter nhead for?

Dans Seq2SeqTransformer, le paramètre nhead permet de modifier le nombre de head qui seront utilisés par le module multi-head attention, qui permet de faire tourner un certain nombre de fois une fonction d'activation sur des inputs Q, K et V.

#### What is the point of the generator?

Dans Seq2SeqTransformer, le générateur va permettre d'associer à chaque valeur la probabilité qu'il s'agisse de la meilleure valeur possible, en fonction des valeurs déjà détectées auparavant et de la valeur actuelle de l'input.

#### Describe the goal of the create_mask function. Why does it handle differently the source and target masks?

# Decoding functions

In [24]:
def greedy_decode(model: Module, src: Tensor, src_mask: Tensor, max_len: int, start_symbol: int) -> Tensor:
    """
    Generate an output sequence using a greedy decoding algorithm.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src (Tensor): The input source sequence.
        src_mask (Tensor): The source mask.
        max_len (int): The maximum length of the output sequence.
        start_symbol (int): The index of the start symbol.

    Returns:
        Tensor: The generated output sequence.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


def translate_greedy(model: torch.nn.Module, src_sentence: str) -> str:
    """
    Translate an input sentence into the target language using greedy decoding.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src_sentence (str): The input source sentence.

    Returns:
        str: The translated sentence in the target language.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")


print(translate_greedy(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

 A group of people stand in front of an igloo 


In [26]:
import torch.nn.functional as F
import numpy as np

def top_k_decode(model: torch.nn.Module, src: torch.Tensor, src_mask: torch.Tensor, max_len: int, start_symbol: int, k: int, temp: float) -> torch.Tensor:    
    """
    Generate an output sequence using top-k decoding algorithm.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src (Tensor): The input source sequence.
        src_mask (Tensor): The source mask.
        max_len (int): The maximum length of the output sequence.
        start_symbol (int): The index of the start symbol.
        k (int): The number of top-k candidates to consider.
        temp (float): The temperature parameter for controlling randomness.

    Returns:
        Tensor: The generated output sequence.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        a, next_word = torch.topk(prob.flatten(), k)
        a = F.softmax(a / temp, dim=0)
        a = a.cpu().detach().numpy()
        next_word = next_word.cpu().numpy()
        next_word = np.random.choice(next_word, 1, p=(a / a.sum()))[0]
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


def translate_top_k(model: torch.nn.Module, src_sentence: str, k: int, temp: float) -> str:
    """
    Translate an input sentence into the target language using top-k decoding.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src_sentence (str): The input source sentence.
        k (int): The number of top-k candidates to consider.
        temp (float): The temperature parameter for controlling randomness.

    Returns:
        str: The translated sentence in the target language.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = top_k_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k=k, temp=temp).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")


print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k=5, temp=0.7))

 A group of people standing in front of an igloo 


In [27]:
def top_p_decode(model: torch.nn.Module, src: torch.Tensor, src_mask: torch.Tensor, max_len: int, start_symbol: int, p: float, temp: float) -> torch.Tensor:
    """
    Generate an output sequence using top-p (nucleus) decoding algorithm.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src (Tensor): The input source sequence.
        src_mask (Tensor): The source mask.
        max_len (int): The maximum length of the output sequence.
        start_symbol (int): The index of the start symbol.
        p (float): The probability threshold for nucleus sampling.
        temp (float): The temperature parameter for controlling randomness.

    Returns:
        Tensor: The generated output sequence.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        prob = prob - prob.min()
        prob = prob / prob.sum()
        sorted_prob, sorted_indices = torch.sort(prob.flatten(), descending=True)
        cumulative_probs = torch.cumsum(sorted_prob, dim=0)
        truncate_indices = cumulative_probs > p
        truncate_indices[1:] = truncate_indices[:-1]
        truncate_indices[0] = False
        next_word_indices = sorted_indices[truncate_indices]
        next_word_probs = sorted_prob[truncate_indices] / sorted_prob[truncate_indices].sum()

        next_word = np.random.choice(next_word_indices.cpu().numpy(), 1, p=next_word_probs.cpu().numpy())[0]
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


def translate_top_p(model: torch.nn.Module, src_sentence: str, p: float, temp: float) -> str:
    """
    Translate an input sentence into the target language using top-p (nucleus) decoding.

    Args:
        model (Seq2SeqTransformer): The trained model.
        src_sentence (str): The input source sentence.
        p (float): The probability threshold for nucleus sampling.
        temp (float): The temperature parameter for controlling randomness.

    Returns:
        str: The translated sentence in the target language.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = top_p_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, p=p, temp=temp).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")


print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p=0.0006, temp=0.7))

 A crowd of People in an igloo area . 


In [28]:
print(translate_greedy(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print()
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.7))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.7))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.7))
print()
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.9))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.9))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 5, temp = 0.9))
print()
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.7))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.7))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.7))
print()
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.9))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.9))
print(translate_top_k(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k = 10, temp = 0.9))
print()
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.7))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.7))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.7))
print()
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.9))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.9))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.0006, temp = 0.9))
print()
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.7))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.7))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.7))
print()
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.9))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.9))
print(translate_top_p(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p = 0.001, temp = 0.9))

 A group of people stand in front of an igloo 

 A group of people stand in front of an igloo 
 A group of people in front of an igloo 
 A group of people stand in front an auditorium . 

 A group of people in front of an envelope . 
 A group of people stand in front of an abandoned facility . 
 Group of people stand in front an igloo . 

 A group of people standing in front of an auditorium . 
 A group of people standing in front of an igloo 
 A group of people stand in front an igloo . 

 A group of people in front an auditorium . 
 A group of people stand in front of an igloo . 
 A group of people standing in front of an igloo . 

 A crowd of individuals stand in front of microphones . 
 Group Group in midair standing in front an office setting . 
 Group of people standing in front of an igloo area . 

 Group of People stand in midair . area 
 A group is standing in an olive igloo . area 
 Group Group of individuals standing in an office area 

 The crowd are gathered outside a iglo

On peut noter que le modèle Greedy étant déterministe et se basant à chaque fois sur la probabilité la plus élevée, il y a une forte probabilité d'obtenir une bonne réponse. Les deux autres modèles, eux, ne sont pas déterministes et peuvent donner des résultats différents. On peut également noter que le modèle top_k reste relativement cohérent à mesure que l'on augmente la taille du k-sample, alors que le modèle top_p va très rapidement diverger. Le paramètre de température, lui, permet de rester plus ou moins dans le thème initial : plus il est faible (inférieur à 1), et plus les valeurs à haute probabilité auront une chance d'être sélectionnées pour la prédiction finale.

De ce que l'on peut en voir, on peut donc en conclure qu'avec le dataset que l'on a, le modèle le plus efficace est celui qui est Greedy, mais le modèle de Top_k est également très efficace et permet d'obtenir plusieurs résultats, ce qui peut dans certains cas se révéler plus avantageux que le Greedy. Le modèle de top_p, lui, n'est pas très efficace dû à la taille importante du dataset qu'il a à prendre en compte ; l'idéal serait pour lui de moduler la valeur p en fonction de chaque mot.

# Compute the BLEU score of the model

In [43]:
import sacrebleu
from torchtext.data.metrics import bleu_score
from sacrebleu.metrics import BLEU

def calculate_bleu(data: Dataset, model: Module, decoding_approach: str, k: int = None, temp: float = None, p: float = None) -> float:    """
    Calculate the BLEU score for a given dataset using a specific decoding approach.

    Args:
        data (torchtext.data.Dataset): The dataset containing source and target sentences.
        model (torch.nn.Module): The trained model for translation.
        decoding_approach (str): The decoding approach to use. Possible values: "greedy", "top_k", "top_p".
        k (int, optional): The value of k for top-k decoding. Only used when decoding_approach is "top_k".
        temp (float, optional): The temperature parameter for decoding. Only used when decoding_approach is "top_k" or "top_p".
        p (float, optional): The value of p for top-p decoding. Only used when decoding_approach is "top_p".

    Returns:
        bleu_score (float): The BLEU score of the predicted translations.

    Raises:
        ValueError: If an invalid decoding_approach is provided.

    """

    trgs = []
    pred_trgs = []
    
    for src, target in data:

        if decoding_approach == "greedy":
            pred_trg = translate_greedy(model, src)
        elif decoding_approach == "top_k":
            pred_trg = translate_top_k(model, src, k=k, temp=temp)
        elif decoding_approach == "top_p":
            pred_trg = translate_top_p(model, src, p=p, temp=temp)
        else:
            raise ValueError("Invalid decoding approach")

        pred_trg = pred_trg[:-1] 
        pred_trgs.append(pred_trg)
        trgs.append([target])

        
    bleu = BLEU()
    return bleu.corpus_score(pred_trgs, trgs) # .corpus_bleu(pred_trgs, trgs)

test_iter = Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

bleu_greedy = calculate_bleu(test_iter, transformer, "greedy")
bleu_top_k = calculate_bleu(test_iter, transformer, "top_k", k=5, temp=0.7)
bleu_top_p = calculate_bleu(test_iter, transformer, "top_p", p=0.0006, temp=0.7)



In [48]:
print('Greedy BLEU score = ', bleu_greedy)
print('Top-k BLEU score = ', bleu_top_k)
print('Top-p BLEU score = ', bleu_top_p)


Greedy BLEU score =  BLEU = 60.43 100.0/80.0/44.4/37.5 (BP = 1.000 ratio = 1.000 hyp_len = 11 ref_len = 11)
Top-k BLEU score =  BLEU = 50.81 91.7/54.5/40.0/33.3 (BP = 1.000 ratio = 1.000 hyp_len = 12 ref_len = 12)
Top-p BLEU score =  BLEU = 33.52 90.9/50.0/22.2/12.5 (BP = 1.000 ratio = 1.000 hyp_len = 11 ref_len = 11)


BLEU : c'est le score BLEU global pour le corpus, calculé comme une moyenne géométrique des précisions n-grammes, pondérée par une pénalisation pour les phrases trop courtes (BP). 

counts: Un tuple contenant le nombre de n-grammes correspondants pour chaque ordre de n-grammes (1-grammes, 2-grammes, etc.).

totals: Un tuple contenant le nombre total de n-grammes pour chaque ordre de n-grammes.

precisions: Un tuple contenant les précisions pour chaque ordre de n-grammes.

BP: c'est le facteur de pénalisation de longueur (Brevity Penalty). Si la longueur de la traduction est égale ou supérieure à la longueur de la référence, BP vaut 1, sinon il est inférieur à 1. Cela pénalise les traductions qui sont beaucoup plus courtes que les références.

ratio: c'est le rapport entre la longueur de la traduction et la longueur de la référence. Un ratio de 1 signifie que la traduction et la référence ont la même longueur.

hyp_len: c'est la longueur totale des traductions (hypothèses) dans le corpus.

ref_len: c'est la longueur totale des références dans le corpus.

# Bonus 1: Hyperparameters search 

In [50]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    'temp': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'k': [3, 5, 7, 9, 11],
    'p': [0.0003, 0.0006, 0.0009, 0.0012, 0.0015]
}
grid = ParameterGrid(param_grid)

best_score = 0
best_params = None

# We will use 20% of the test data for hyperparameter tuning
# Convert the test_iter to a list
test_iter = list(Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)))
validation_data, _ = torch.utils.data.random_split(test_iter, [int(len(test_iter)*0.2), len(test_iter) - int(len(test_iter)*0.2)])

for params in grid:
    temp = params['temp']
    k = params['k']
    p = params['p']
    
    score_top_k= calculate_bleu(validation_data, transformer, "top_k", k=k, temp=temp).score
    score_top_p = calculate_bleu(validation_data, transformer, "top_p", p=p, temp=temp).score
    if score_top_k > best_score:
        best_score = score_top_k
        best_params = params
        best_approach = "top_k"
    if score_top_p > best_score:
        best_score = score_top_p
        best_params = params
        best_approach = "top_p"
        
print(f'Best score: {best_score}')
print(f'Best parameters: {best_params}')
print(f'Best approach: {best_approach}')


Best score: 42.7287006396234
Best parameters: {'k': 11, 'p': 0.0012, 'temp': 0.9}
Best approach: top_k


In [51]:
def translate_batch_greedy(model: torch.nn.Module, src_sentences: List[str]) -> List[str]:
    """
    Translate a batch of source sentences using the greedy decoding approach.

    Args:
        model (torch.nn.Module): The trained model for translation.
        src_sentences (List[str]): A list of source sentences to be translated.

    Returns:
        translated_sentences (List[str]): A list of translated sentences.

    """

    model.eval()
    translated_sentences = []
    for src_sentence in src_sentences:
        src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
        translated_sentence = " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
        translated_sentences.append(translated_sentence)
    return translated_sentences

def translate_batch_top_k(model: torch.nn.Module, src_sentences: List[str], k: int, temp: float) -> List[str]:
    """
    Translate a batch of source sentences using the top-k decoding approach.

    Args:
        model (torch.nn.Module): The trained model for translation.
        src_sentences (List[str]): A list of source sentences to be translated.
        k (int): The value of k for top-k decoding.
        temp (float): The temperature parameter for decoding.

    Returns:
        translated_sentences (List[str]): A list of translated sentences.

    """
    model.eval()
    translated_sentences = []
    for src_sentence in src_sentences:
        src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = top_k_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k = k, temp = temp).flatten()
        translated_sentence = " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
        translated_sentences.append(translated_sentence)
    return translated_sentences

def translate_batch_top_p(model: torch.nn.Module, src_sentences: List[str], p: float, temp: float) -> List[str]:
    """
    Translate a batch of source sentences using the top-p decoding approach.

    Args:
        model (torch.nn.Module): The trained model for translation.
        src_sentences (List[str]): A list of source sentences to be translated.
        p (float): The value of p for top-p decoding.
        temp (float): The temperature parameter for decoding.

    Returns:
        translated_sentences (List[str]): A list of translated sentences.

    """
    model.eval()
    translated_sentences = []
    for src_sentence in src_sentences:
        src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = top_p_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, p = p, temp = temp).flatten()
        translated_sentence = " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
        translated_sentences.append(translated_sentence)
    return translated_sentences
