In [1]:
!pip install spacy sacrebleu torchdata -U
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu, spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed colorama-0.4.6 portalocker-2.

All the code below is taken from : [here](https://pytorch.org/tutorials/beginner/translation_transformer.html)

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List

# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [3]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [4]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [5]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [6]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [7]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [8]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [9]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))




Epoch: 1, Train loss: 5.344, Val loss: 4.114, Epoch time = 47.695s
Epoch: 2, Train loss: 3.761, Val loss: 3.320, Epoch time = 44.350s
Epoch: 3, Train loss: 3.162, Val loss: 2.894, Epoch time = 46.087s
Epoch: 4, Train loss: 2.768, Val loss: 2.640, Epoch time = 44.604s
Epoch: 5, Train loss: 2.481, Val loss: 2.441, Epoch time = 45.880s
Epoch: 6, Train loss: 2.250, Val loss: 2.317, Epoch time = 48.607s
Epoch: 7, Train loss: 2.060, Val loss: 2.204, Epoch time = 45.227s
Epoch: 8, Train loss: 1.897, Val loss: 2.115, Epoch time = 45.813s
Epoch: 9, Train loss: 1.754, Val loss: 2.062, Epoch time = 44.716s
Epoch: 10, Train loss: 1.631, Val loss: 2.003, Epoch time = 45.291s
Epoch: 11, Train loss: 1.524, Val loss: 1.973, Epoch time = 44.827s
Epoch: 12, Train loss: 1.420, Val loss: 1.944, Epoch time = 44.772s
Epoch: 13, Train loss: 1.333, Val loss: 1.964, Epoch time = 45.462s
Epoch: 14, Train loss: 1.251, Val loss: 1.943, Epoch time = 44.646s
Epoch: 15, Train loss: 1.173, Val loss: 1.934, Epoch time

In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

**Theoretical questions**

**In the positional encoding, why are we using a combination of sinus and cosinus?**

- We need a way to indicate the postion of a word in a sentence because the order of the words can change the meaning of the sentence. 

**In the Seq2SeqTransformer class, What is the parameter nhead for? What is the point of the generator?**

- The nhead parameter is the number of head. This means that the input splits in nhead number and each parts will be passed through a separate self-attention mechanism. This allow the model to pay attention to different positional information from different semantic spaces in the data.

- The generator is a linear layer used to transformer the transformer's output into the final output wich is the size of the target vocabulary. So the word with the highest probability will be activated through a softmax function.

**Describe the goal of the create_mask function. Why does it handle differently the source and target masks?**

- create_mask is used to create masks for the Transformer model during translation.

  + Subsequent Attention Mask('tgt_mask'): Prevent the model to have access to thefutur token.
  + Source Mask('src_mask'): Used to ignore some tokens from the source sequence.
  + Padding Mask('src_padding_mask', 'tgt_padding_mask'): Used to ignore padding tokens added to source and target sequences.

- The reason why source and target masks are handled differently is due to the nature of the sequences. For the source sequence all tokens are available. But for the target sequence, the tokens are generated sequentially so the generation of a token depends of the previous one. So we don't want the model to have access to all the informations when we are processing the target sequence. 

**Decoding functions**

 + **A top-k sampling with temperature.**

In [33]:
import torch.nn.functional as F

def top_k_sampling_decode(model, src, src_mask, max_len, start_symbol, k, temperature):
    """
    This function generates an output sequence using top-k sampling and a specified temperature.

    Parameters:
    model (torch.nn.Module): The transformer model.
    src (torch.Tensor): The source input tensor.
    src_mask (torch.Tensor): The source input mask tensor.
    max_len (int): The maximum length for the output sequence.
    start_symbol (int): The start symbol in the target language vocabulary.
    k (int): The number of top elements to be selected from. Higher values will lead to more diverse results.
    temperature (float): This parameter controls the randomness of predictions by scaling the logits before applying softmax. 
                         Higher values make the distribution more uniform (more randomness), lower values make the distribution sharper (less randomness).
    
    Returns:
    ys (torch.Tensor): The output sequence tensor.
    """

    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)

    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        logits = model.generator(out[:, -1])

        # Apply top-k sampling
        indices_to_remove = logits < torch.topk(logits, k)[0][..., -1, None]
        logits[indices_to_remove] = -float('Inf')

        # Apply temperature
        logits /= temperature

        probs = F.softmax(logits, dim=-1)
        next_word = torch.multinomial(probs, num_samples=1).squeeze()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

**Test Greedy and top_k**

In [36]:
# MODIFIED TRANSLATE TO ACCEPT DIFFERENT METHODS IN PARAMETER
def translate(model: torch.nn.Module, src_sentence: str, decoding_method, temp=0.3):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    if decoding_method == top_k_sampling_decode:
      tgt_tokens = decoding_method(
          model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k=10, temperature=temp).flatten()
    else:
        tgt_tokens = decoding_method(
          model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print("Test 1: Eine Gruppe von Menschen steht vor einem Iglu. | A group of people is standing in front of an igloo.")
print("    Top k, temperature = 0.3 : " + translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu.", top_k_sampling_decode))
print("    Top k, temperature = 0.5 : " + translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu.", top_k_sampling_decode, temp = 0.5))
print("    Top k, temperature = 0.7 : " + translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu.", top_k_sampling_decode, temp = 0.7))
print("    Greedy decode            : " + translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu.", greedy_decode))
print("")
print("Test 2: Autos sind schnell auf der Autobahn. | Cars are fast on the highway.")
print("    Top k, temperature = 0.3 : " + translate(transformer, "Autos sind schnell auf der Autobahn.", top_k_sampling_decode))
print("    Top k, temperature = 0.5 : " + translate(transformer, "Autos sind schnell auf der Autobahn.", top_k_sampling_decode, 0.5))
print("    Top k, temperature = 0.7 : " + translate(transformer, "Autos sind schnell auf der Autobahn.", top_k_sampling_decode, 0.7))
print("    Greedy decode            : " + translate(transformer, "Autos sind schnell auf der Autobahn.", greedy_decode))
print("")
print("Test 3: Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt. | Step aside when the train arrives at the station.")
print("    Top k, temperature = 0.3 : " + translate(transformer, "Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt.", top_k_sampling_decode))
print("    Top k, temperature = 0.5 : " + translate(transformer, "Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt.", top_k_sampling_decode, 0.5))
print("    Top k, temperature = 0.7 : " + translate(transformer, "Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt.", top_k_sampling_decode, 0.7))
print("    Greedy decode            : " + translate(transformer, "Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt.", greedy_decode))

Test 1: Eine Gruppe von Menschen steht vor einem Iglu . | A group of people is standing in front of an igloo.
    Top k, temperature = 0.3 :  A group of people stand in front of an igloo . 
    Top k, temperature = 0.5 :  A group of people stand in front of an igloo . 
    Top k, temperature = 0.7 :  A group of people stand in front of an igloo . 
    Greedy decode            :  A group of people stand in front of an igloo . 

Test 2: Autos sind schnell auf der Autobahn. | Cars are fast on the highway.
    Top k, temperature = 0.3 :  cars are racing on the highway . 
    Top k, temperature = 0.5 :  cars are racing on the highway . 
    Top k, temperature = 0.7 :  cars are racing on top of the highway . 
    Greedy decode            :  car are racing on the highway . 

Test 3: Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt. | Step aside when the train arrives at the station.
    Top k, temperature = 0.3 :  These are walking out the path as the train station station . 
    Top k,

**Analyse**

**Compute the BLEU score of the model**

In [43]:
from sacrebleu.metrics import BLEU

ref0 = "Eine Gruppe von Menschen steht vor einem Iglu."
ref1 = "Autos sind schnell auf der Autobahn."
ref2 = "Gehen Sie aus dem Weg, wenn der Zug am Bahnhof ankommt."

refs = [[ref0, ref1, ref2]]

predictions = [
    [translate(transformer, example, top_k_sampling_decode) for example in refs[0]],
    [translate(transformer, example, top_k_sampling_decode, temp = 0.5) for example in refs[0]],
    [translate(transformer, example, top_k_sampling_decode, temp = 0.7) for example in refs[0]],
    [translate(transformer, example, greedy_decode) for example in refs[0]]
]

bleu = BLEU()

print("BLEU Top_K, temperature = 0.3 : ", bleu.corpus_score(predictions[0], refs))
print("BLEU Top_K, temperature = 0.5 : ", bleu.corpus_score(predictions[1], refs))
print("BLEU Top_K, temperature = 0.7 : ", bleu.corpus_score(predictions[2], refs))
print("BLEU greddy method            : ", bleu.corpus_score(predictions[3], refs))

BLEU Top_K, temperature = 0.3 :  BLEU = 1.84 10.0/1.9/1.0/0.6 (BP = 1.000 ratio = 1.034 hyp_len = 30 ref_len = 29)
BLEU Top_K, temperature = 0.5 :  BLEU = 1.64 9.1/1.7/0.9/0.5 (BP = 1.000 ratio = 1.138 hyp_len = 33 ref_len = 29)
BLEU Top_K, temperature = 0.7 :  BLEU = 1.84 10.0/1.9/1.0/0.6 (BP = 1.000 ratio = 1.034 hyp_len = 30 ref_len = 29)
BLEU greddy method            :  BLEU = 1.84 10.0/1.9/1.0/0.6 (BP = 1.000 ratio = 1.034 hyp_len = 30 ref_len = 29)


**What does that means ?**

BLEU : It's the score as percentage, so here it's a very low score for all the methods.

10.0/1.9/1.0/0.6 : For example is the precision scores for 1-gram, 2-gram, 3-gram, and 4-gram. These are also low, suggesting that even on a word-for-word or phrase-for-phrase level, the translations do not match the reference very closely.

BP : Brevity penalty, factor that penalize translations that are too short, the value is between 0 and 1.

Ratio : Length of the generated translation divided by the length of the reference translation.

Hyp_len : it's the length of the generated translation.

Ref_len : it's the length of the reference translation. 