# Assignment 7

## Task

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

- **self-attention** http://jalammar.github.io/illustrated-transformer/
- **transformer** http://nlp.seas.harvard.edu/2018/04/03/attention.html
- **spacy russian**
https://github.com/aatimofeev/spacy_russian_tokenizer
- **beam search** https://github.com/mmehdig/lm_beam_search/blob/master/beam_search.py
- **seq to seq** https://arxiv.org/pdf/1703.01619.pdf

## Imports

In [2]:
!pip install sentencepiece
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from torchtext import datasets, data
import sentencepiece as spm
import copy 
from google.colab import drive
import sys
# drive.mount('/content/drive')

batch_size = 32
num_epochs = 2

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PATH =  '/content/drive/My Drive/Colab Notebooks/model_transl.pth'



## Model

### Model architecture

In [0]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
    
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

    
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
    
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)
    
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / np.sqrt(d_k)
    if mask is not None:
        try:
            scores = scores.masked_fill(mask == 0, -1e9)
        except:
            pass
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model).cuda()
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x).cuda() * np.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
      x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
      return self.dropout(x)   
    
 
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        nn.Linear(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, tgt=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is not None:
            self.trg = tgt[:, :-1]
            self.trg_y = tgt[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

max_src_in_batch = 25000
max_tgt_in_batch = 25000
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)


### Iterator

In [0]:
class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.tgt, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion:
    def __init__(self, generator, pad_idx):
        self.generator = generator
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        self.criterion.cuda()
        
    def __call__(self, x, batch):
        y = batch.trg_y
        x = self.generator(x)
        loss = self.criterion(x.reshape(-1, x.size(-1)), 
                              y.reshape(-1))  / batch.ntokens
        return loss

### Run epoch

In [0]:
def train_epoch(data_iter, model, criterion, optimizer):
    total_loss = 0
    data_iter = tqdm(data_iter)
    counter = 0
    for batch in data_iter:
        out = model.forward(batch.src, batch.trg, 
                          batch.src_mask, batch.trg_mask)
        loss = criterion(out, batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.detach().item() 
        data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm(data_iter)
    counter = 0
    for batch in data_iter:
        with torch.no_grad():
            out = model.forward(batch.src, batch.trg, 
                              batch.src_mask, batch.trg_mask)
            loss = criterion(out, batch)
            total_loss += loss.detach().item() 
            data_iter.set_postfix(loss = loss)
            counter +=1
        
    total_loss /= counter
    return total_loss

import torch.optim as optim

def start_model(new=False, num_epochs=10, exists=False):
   # src_size = 32707
 #   src_size = 32704
   # tgt_size = 28300
  #  tgt_size = 28295
    src_size = len(SRC.vocab)
    tgt_size = len(TGT.vocab)
    model = make_model(src_size, tgt_size, N=2)
    if new is True:
        if exists:
            model.load_state_dict(torch.load(PATH))
        model = model.to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        criterion = MyCriterion(model.generator, pad_idx)
        #share weights TODO
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
        for epoch in range(num_epochs):
            print(f'epoch {epoch}')
            model.train()
            loss = train_epoch(train_iter, model, criterion, optimizer)
            print('train', loss)
          #  torch.save(model.state_dict(), PATH)
            model.eval()
            with torch.no_grad():
                loss = valid_epoch(valid_iter, model, criterion)
                scheduler.step(loss)
                print('valid', loss)
    else:
        model.load_state_dict(torch.load(PATH))
        model = model.to(DEVICE)
    return model

### Beam search

## Training

In [6]:
!wget http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
!gunzip -c training-parallel-nc-v13.tgz | tar xvf -

with open('training-parallel-nc-v13/news-commentary-v13.ru-en.en') as f:
    with open('text.en', 'w') as out:
            out.write(f.read().lower())


with open('training-parallel-nc-v13/news-commentary-v13.ru-en.ru') as f:
    with open('text.ru', 'w') as out:
            out.write(f.read().lower())
        
spm.SentencePieceTrainer.Train('--input=text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')
spm.SentencePieceTrainer.Train('--input=text.ru --model_prefix=bpe_ru --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_ru = spm.SentencePieceProcessor()
tok_ru.load('bpe_ru.model')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

SRC = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_ru.encode_as_pieces(x),
    batch_first=True,
)

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

fields = (('src', SRC), ('tgt', TGT))

with open('text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

pad_idx = TGT.vocab.stoi['<pad>']

print('src: ' + " ".join(train.examples[100].src))
print('tgt: ' + " ".join(train.examples[100].tgt))
print(len(train), len(valid), len(test))
print(len(SRC.vocab), len(TGT.vocab))


torch.cuda.empty_cache()

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(batch_size, batch_size, batch_size), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=DEVICE,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

--2020-02-23 07:57:30--  http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
Resolving data.statmt.org (data.statmt.org)... 129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113157482 (108M) [application/x-gzip]
Saving to: ‘training-parallel-nc-v13.tgz’


2020-02-23 07:57:31 (98.9 MB/s) - ‘training-parallel-nc-v13.tgz’ saved [113157482/113157482]

training-parallel-nc-v13/
training-parallel-nc-v13/news-commentary-v13.ru-en.ru
training-parallel-nc-v13/news-commentary-v13.cs-en.en
training-parallel-nc-v13/news-commentary-v13.de-en.de
training-parallel-nc-v13/news-commentary-v13.ru-en.en
training-parallel-nc-v13/news-commentary-v13.zh-en.zh
training-parallel-nc-v13/news-commentary-v13.zh-en.en
training-parallel-nc-v13/news-commentary-v13.cs-en.cs
training-parallel-nc-v13/news-commentary-v13.de-en.en


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


src: ▁само ▁существование ▁европейской ▁модели ▁по - прежнему ▁направляет ▁и ▁вдохновля ет ▁всех , ▁кто ▁добивается ▁прозрачного , ▁демократи ческого ▁правления ▁во ▁многих ▁пост - коммунистических ▁странах .
tgt: ▁indeed , ▁the ▁e x istence ▁of ▁a ▁european ▁model ▁continues ▁to ▁guide ▁and ▁encourage ▁those ▁pursuing ▁transparent , ▁democratic ▁governance ▁in ▁many ▁post - communist ▁countries .
210743 23416 1000
32731 28303


In [7]:
model = start_model(new=True, num_epochs=2)

epoch 0


HBox(children=(IntProgress(value=0, max=6586), HTML(value='')))


train 6.351225875266012


HBox(children=(IntProgress(value=0, max=732), HTML(value='')))


valid 6.010107225407668
epoch 1


HBox(children=(IntProgress(value=0, max=6586), HTML(value='')))


train 6.019243819062702


HBox(children=(IntProgress(value=0, max=732), HTML(value='')))


valid 5.829468114779947


In [0]:
def beam_search(model, src, src_mask, max_len=20, k=5, offset=0):
    memory = model.encode(src, src_mask)
    start_token = TGT.vocab.stoi["<s>"]
    end_token = TGT.vocab.stoi["</s>"]
    ys = torch.ones(1, 1).fill_(start_token).type_as(src.data)
    beam = [(ys, 0)]
    for i in range(max_len):
        candidates= []
        candidates_proba = []
        for snt, snt_proba in beam:
            if snt[0][-1] == end_token:
                candidates.append(snt)
                candidates_proba.append(snt_proba)
            else:
                proba = model.decode(memory, src_mask, snt,
                                     subsequent_mask(snt.size(1)).type_as(src.data))
                proba = proba[0][i]
                best_k = torch.argsort(-proba)[:k].tolist()
                proba = proba.tolist()
                for tok in best_k:
                    candidates.append(torch.cat([snt, torch.ones(1, 1).type_as(src.data).fill_(tok)], dim=1))
                    candidates_proba.append(snt_proba + np.log(proba[tok])) 
         
        best_candidates = np.argsort(-np.array(candidates_proba))[offset:k+offset]
        beam = [(candidates[j], candidates_proba[j]) for j in best_candidates]
    return beam

In [14]:
model.eval()
with torch.no_grad():
    for idx, batch in enumerate(valid_iter):
        src = batch.src[:1]
        src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
        beam = beam_search(model, src, src_key_padding_mask, k=5, offset=3)
        
        seq = []
        for i in range(1, src.size(1)):
            sym = SRC.vocab.itos[src[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_ru.decode_pieces(seq)
        print("\nSource:", seq)
        
        print("Translation:")
        for pred, pred_proba in beam:                
            seq = []
            for i in range(1, pred.size(1)):
                sym = TGT.vocab.itos[pred[0, i]]
                if sym == "</s>": break
                seq.append(sym)
            seq = tok_en.decode_pieces(seq)
            print(f"pred {pred_proba:.2f}:", seq)
                
        seq = []
        for i in range(1, batch.trg.size(1)):
            sym = TGT.vocab.itos[batch.trg[0, i]]
            if sym == "</s>": break 
            seq.append(sym)
        seq = tok_en.decode_pieces(seq)
        print("Target:", seq)
        break


Source: цепь кризисов
Translation:
pred -16.51: economy financial billion north ever north particularly north particularly north billion north billion north particularly north billion north particularly north
pred -16.54: economy financial billion north ever north particularly north billion north particularly north particularly north billion north particularly north billion north
pred -17.28: economy particularly north particularly north politics north progress north non north progress north billion north billion north billion north billion
pred -17.32: economy particularly north particularly north politics north progress north non north progress north billion north billion north progress north billion
pred -17.32: economy particularly north particularly north politics north progress north non north progress north billion north billion north billion north progress
Target: the crises nexus


In [16]:
from nltk.translate.bleu_score import corpus_bleu
from nltk import translate 

hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_iter):
        for sent in range(len(batch.src)):
            src = batch.src[sent:sent+1]
            src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
            beam = beam_search(model, src, src_key_padding_mask, k=5, offset=0)
            for pred, pred_proba in beam[:1]:                
                seq = []
                for i in range(1, pred.size(1)):
                    sym = TGT.vocab.itos[pred[0, i]]
                    if sym == "</s>": break
                    seq.append(sym)
                seq = tok_en.decode_pieces(seq)
                my_trg = batch.trg[sent:sent+1].tolist()[0]
                my_ref = []
                for i in range(1, batch.trg.size(1)):
                     m_trg =  TGT.vocab.itos[my_trg[i]]
                     if m_trg == "</s>" or m_trg  == "<pad>": break
                     my_ref.append(m_trg)
                my_ref = tok_en.decode_pieces(my_ref)
                hypotheses.append(seq.split())
                references.append(my_ref.split())
corpus_bleu(references, hypotheses, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))

0,05067184337873702

- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial
- https://bastings.github.io/annotated_encoder_decoder/

In [17]:
len(references), len(hypotheses)

(1000, 1000)