# Basic Transformer model for Machine Translation

Based on (code): https://github.com/MLWhiz/data_science_blogs/blob/master/transformers/Translator.ipynb

Based on (paper): https://drive.google.com/viewerng/viewer?url=https://arxiv.org/pdf/1706.03762.pdf   
and also: https://medium.com/@galhever/neural-machine-translation-with-transformers-69d4bf918299

In [None]:
import numpy as np
import torch
from torch.nn.init import xavier_uniform_
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math, copy, time
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize,word_tokenize
#seaborn.set_context(context="talk")
%matplotlib inline
from torchtext import data, datasets
from torchtext.data import TabularDataset
import spacy

from typing import Optional, Any

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
BOS_WORD = '<s>' # beginning of sentence token
EOS_WORD = '</s>' # end of sentence token
BLANK_WORD = "<blank>" # blank token for padding

In [None]:
word_tokenize('á ő ü ó é ö ') 

['á', 'ő', 'ü', 'ó', 'é', 'ö']

In [None]:
# Source (English) field: NLTK word tokenization, padded with BLANK_WORD; no start/end markers are added.
SRC = data.Field(tokenize=word_tokenize, pad_token=BLANK_WORD)
# Target (Hungarian) field: same tokenizer, with every sentence wrapped in BOS_WORD ... EOS_WORD
# so the decoder has an explicit start symbol and can learn when to stop.
TGT = data.Field(tokenize=word_tokenize, init_token = BOS_WORD,
eos_token = EOS_WORD, pad_token=BLANK_WORD)

In [None]:
# (name, Field) pairs in file-column order: the target sentence comes first, the source second.
# NOTE(review): presumably matched positionally against the TSV columns — verify against the data files.
fields = [('tgt',TGT), ('src',SRC)]

### Loading the English-Hungarian sentence pairs

The training set contains ~74000 sentence pairs, while the validation set has ~13000 sentence pairs. There is no test set, since the translations need to be evaluated by humans; instead, I use a few simple sentences for testing at the end and check each of them.

84000 sentence pair data (source: http://www.manythings.org/anki/):

Hunglish data (source: http://mokk.bme.hu/resources/hunglishcorpus/):

In [None]:
# Load the training and validation splits from tab-separated files using the `fields` layout.
# NOTE(review): the path is an absolute Google Drive mount location, so this cell only runs
# where the drive is mounted at /content/drive.
train_data, valid_data = TabularDataset.splits(
    path='/content/drive/My Drive/AML_projekt',
    train = 'hunglish_train_cut_p1.tsv',
    validation = 'hunglish_valid_2_p1c.tsv',
    format ='tsv',
    fields= fields
    )

In [None]:
print(vars(valid_data[14])['tgt'])
#print(train_data[0].__dict__.keys())

['-', 'El', 'kell', 'őt', 'választani', 'a', 'társaitól', ',', 'Uram', '.']


Take a look at the training dataset.

In [None]:
# Show the first 15 training pairs as (source tokens, target tokens).
for i, pair in enumerate(train_data[:15]):
    example = (pair.src, pair.tgt)
    print(f"Example_{i}:{example}")

Example_0:(['The', 'people', 'stared', '.'], ['Az', 'emberek', 'rámeredtek', '.'])
Example_1:(['The', 'hands', '.'], ['A', 'kezek', '?'])
Example_2:(['If', 'the', 'Second', 'Foundation', 'exists', 'and', 'wishes', 'to', 'guard', 'the', 'secret', 'of', 'that', 'existence', ',', 'then', 'one', 'thing', 'is', 'sure', '.'], ['Ha', 'a', 'Második', 'Alapítvány', 'létezik', ',', 'és', 'létezésének', 'titkát', 'meg', 'akarja', 'őrizni', ',', 'akkor', 'egyvalamit', 'biztosra', 'vehetünk', '.'])
Example_3:(['She', 'shot', 'an', 'enquiring', 'look', 'up', 'at', 'Standish', '.'], ['Érdeklődő', 'pillantást', 'vetett', 'Standish-re', '.'])
Example_4:(['I', 'kissed', 'my', 'baby', 'and', 'we', 'put', 'out', 'the', 'lights', '.'], ['Megcsókoltam', 'Terryt', ',', 'és', 'eloltottuk', 'a', 'lámpát', '.'])
Example_5:(['Then', 'there', "'s", 'a', 'travelling', 'company', 'of', 'dwarves', 'going', 'West', 'come', 'in', 'this', 'evening', '.'], ['Aztán', 'ma', 'este', 'törpök', 'jöttek', ':', 'ők', 'nyugat',

Next, I create the vocabularies, keeping only the words that appear at least MIN_FREQ times.

In [None]:
# Words seen fewer than MIN_FREQ times are left out of the vocabulary.
MIN_FREQ = 3
# NOTE(review): both vocabularies are built from `valid_data`, not `train_data`, so training words
# that are rare or absent in the validation file are not in the vocabulary. Confirm this is
# intentional; switching the data changes the vocabulary sizes and invalidates saved checkpoints.
# NOTE(review): the GloVe vectors requested here are not copied into the model's embedding layer
# anywhere in the visible notebook, so the download appears to be unused — verify.
SRC.build_vocab(valid_data.src, min_freq=MIN_FREQ, vectors='glove.6B.100d') # tried adding the file manually but it doesn't work, it has to download it every time
TGT.build_vocab(valid_data.tgt, min_freq=MIN_FREQ)

.vector_cache/glove.6B.zip: 862MB [06:29, 2.21MB/s]                           
 99%|█████████▉| 397877/400000 [00:17<00:00, 21368.51it/s]

In [None]:
len(SRC.vocab) #11104

11104

In [None]:
len(TGT.vocab) #14366

14366

Create iterators that batch sentences of approximately the same length (examples are bucketed by source length) to reduce padding and avoid memory problems. The validation batch size is 1 so that the validation sentences need no padding.

In [None]:
BATCH_SIZE = 25


def _source_length(example):
    """Bucketing key: number of tokens in the example's source sentence."""
    return len(example.src)


# Training batches hold BATCH_SIZE sentence pairs of similar source length.
train_iter = data.BucketIterator(
    train_data,
    batch_size=BATCH_SIZE,
    repeat=False,
    sort_key=_source_length,
)
# Validation goes one sentence pair at a time, so no padding is ever needed there.
val_iter = data.BucketIterator(
    valid_data,
    batch_size=1,
    repeat=False,
    sort_key=_source_length,
)

Let's see what's in a batch (will be the input to the network):

In [None]:
batch = next(iter(train_iter))
src_matrix = batch.src.T
print(src_matrix, src_matrix.size())

tensor([[   74,    75,    15,    31,    24,   252,  1094,    70,   717,    69,
         10997,     2,    15,   204,   209,    14,    11,  1559,   260,     3,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [   30,   262,  3221,    69,     4,  1388,   557,     2,  3055,     4,
          9892,     6,   958,  5948,   225,     4,  6031,     3,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [  274,   615,    51,     0,     3,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [  101,    10,    65,   105,    16,  4566,    61,   356,     4,  5791,
             6,     4,   273,   563,   406,   300,     2,    14,    10,   121,
           423,  

In [None]:
tgt_matrix = batch.tgt.T
print(tgt_matrix, tgt_matrix.size())

tensor([[    2,   186,    28,   573,    22,    11,     0,     0,     4,  4009,
           612,  2084,     5,     3,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1],
        [    2,   413,   136,     8,     0,  4593,     4,   657,     6,     0,
             0,  1949,     0,     5,     3,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1],
        [    2,     0,   968,     6,     0,     5,     3,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1],
        [    2,     0,    11, 11153,     0,     8,    82,  1404,     0,     6,
             0,     5,     3,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [None]:
print(SRC.vocab.itos[1])
print(TGT.vocab.itos[2])
print(SRC.vocab.itos[3])
print(TGT.vocab.itos[154])

<blank>
<s>
.
akik


### Creating the Transformer

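PyTorch ships a Transformer implementation (`nn.Transformer`), but it lacks some pieces described in the paper, such as the token embeddings, the positional encoding and the final projection onto the vocabulary, so these are added in the classes below.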

In [None]:
class PositionalEncoding(nn.Module):
    """Scale embeddings by sqrt(d_model), add sinusoidal position encodings, then apply dropout.

    Args:
        d_model: size of the embedding (last) dimension; odd values are supported.
        dropout: dropout probability applied to the summed tensor.
        max_len: number of positions for which encodings are precomputed.

    The encoding table is stored in the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per (sin, cos) column pair: 10000^(-2i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns, so the
        # frequency table is trimmed to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so the table broadcasts over the batch dimension.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x * sqrt(d_model) + pe[:x.size(0)])``.

        ``x`` is indexed by position along dim 0, i.e. ``(seq_len, batch, d_model)``.
        """
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

    
class MyTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings, positional encoding and an output projection.

    Args:
        d_model: embedding / hidden size used throughout the model.
        nhead: number of attention heads.
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        dim_feedforward: inner size of the position-wise feed-forward blocks.
        dropout: dropout probability for the encoder/decoder layers and the positional encoding.
        activation: feed-forward activation name passed to the PyTorch layers.
        source_vocab_length: number of rows in the source embedding table.
        target_vocab_length: number of rows in the target embedding table and output classes.
    """

    def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: str = "relu",source_vocab_length: int = 60000,target_vocab_length: int = 60000) -> None:
        super(MyTransformer, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_length, d_model)
        # One positional encoder is shared by source and target; it follows the model-wide dropout rate.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        self.target_embedding = nn.Embedding(target_vocab_length, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        # The projection must take d_model inputs; a fixed 512 breaks every non-default d_model.
        self.out = nn.Linear(d_model, target_vocab_length)
        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Encode ``src``, decode ``tgt`` against the encoder memory and return vocabulary logits.

        Args:
            src: source token indices, ``(src_len, batch)``.
            tgt: target token indices fed to the decoder, ``(tgt_len, batch)``.
            src_mask, tgt_mask, memory_mask: attention masks forwarded unchanged to the
                PyTorch encoder/decoder.
            src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask: padding
                masks forwarded unchanged to the PyTorch encoder/decoder.

        Returns:
            Unnormalised scores over the target vocabulary for every target position
            (last dimension is ``target_vocab_length``).

        Raises:
            RuntimeError: if ``src`` and ``tgt`` disagree on the batch dimension (dim 1).
        """
        if src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")
        src = self.source_embedding(src)
        src = self.pos_encoder(src)
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        tgt = self.target_embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        output = self.out(output)
        return output


    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""
        # Only weight matrices (dim > 1) get Xavier-uniform init; biases and norm scales keep their defaults.
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)


Initializing the transformer

In [None]:
source_vocab_length = len(SRC.vocab)
target_vocab_length = len(TGT.vocab)

model = MyTransformer(source_vocab_length=source_vocab_length,target_vocab_length=target_vocab_length)
# Move the model to the GPU *before* creating the optimizer, as recommended by the torch.optim
# documentation, so the optimizer is built over the parameters that are actually trained.
model = model.cuda()
# Adam betas/eps follow "Attention Is All You Need"; the learning rate is a constant 1e-4 here
# rather than the paper's warm-up / inverse-square-root schedule.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

 99%|█████████▉| 397877/400000 [00:30<00:00, 21368.51it/s]

In [None]:
len(train_data)

140500

In [None]:
def _seq2seq_batch_loss(model, batch, use_gpu=True):
    """Run one teacher-forced forward pass and return ``(summed token loss, batch size)``.

    ``batch.src`` / ``batch.tgt`` are ``(seq_len, batch)`` index tensors as produced by the
    iterators, which is also the layout the model expects.
    """
    src = batch.src.cuda() if use_gpu else batch.src
    tgt = batch.tgt.cuda() if use_gpu else batch.tgt

    # Teacher forcing: the decoder sees positions [0, T-1) and must predict positions [1, T).
    tgt_input = tgt[:-1]
    targets = tgt[1:]

    # Padding is the BLANK_WORD entry of each vocabulary (index 1 here), NOT index 0 (unknown word).
    src_pad_idx = SRC.vocab.stoi[BLANK_WORD]
    tgt_pad_idx = TGT.vocab.stoi[BLANK_WORD]
    # Key-padding masks are (batch, seq_len) booleans with True marking positions to ignore.
    # NOTE: a row consisting only of padding (an empty source sentence) would leave attention
    # with nothing to attend to; the data is assumed to contain no empty sentences.
    src_padding = (src == src_pad_idx).transpose(0, 1)
    tgt_padding = (tgt_input == tgt_pad_idx).transpose(0, 1)

    # Causal mask: True above the diagonal, so position i cannot attend to positions > i.
    size = tgt_input.size(0)
    causal_mask = torch.triu(
        torch.ones(size, size, dtype=torch.bool, device=tgt_input.device), diagonal=1)

    preds = model(src, tgt_input, tgt_mask=causal_mask,
                  src_key_padding_mask=src_padding,
                  tgt_key_padding_mask=tgt_padding,
                  memory_key_padding_mask=src_padding)
    # Both tensors are flattened in the same (time-major) order; padded targets add nothing to the sum.
    loss = F.cross_entropy(preds.reshape(-1, preds.size(-1)), targets.reshape(-1),
                           ignore_index=tgt_pad_idx, reduction='sum')
    return loss, src.size(1)


def train(train_iter, val_iter, model, optim, num_epochs,use_gpu=True): 
    """Train ``model`` for ``num_epochs`` epochs, validating and checkpointing after each one.

    Args:
        train_iter: iterator over training batches exposing ``.src`` and ``.tgt``.
        val_iter: iterator over validation batches with the same attributes.
        model: the sequence-to-sequence model to optimise.
        optim: optimizer built over ``model``'s parameters.
        num_epochs: number of passes over ``train_iter``.
        use_gpu: move every batch to CUDA when True.

    Returns:
        ``(train_losses, valid_losses)``: per-epoch averages of the per-sentence summed
        cross-entropy, one entry per epoch.
    """
    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0
        # Train model
        model.train()
        for batch in train_iter:
            optim.zero_grad()
            loss, batch_size = _seq2seq_batch_loss(model, batch, use_gpu)
            loss.backward()
            optim.step()
            # Normalise by the real batch size; the last batch of an epoch can be smaller.
            train_loss += loss.item() / batch_size

        model.eval()
        with torch.no_grad():
            for batch in val_iter:
                loss, batch_size = _seq2seq_batch_loss(model, batch, use_gpu)
                valid_loss += loss.item() / batch_size

        epoch_train_loss = train_loss / len(train_iter)
        epoch_valid_loss = valid_loss / len(val_iter)

        # Log after each epoch
        print(f'''Epoch [{epoch+1}/{num_epochs}] complete. Train Loss: {epoch_train_loss:.3f}. Val Loss: {epoch_valid_loss:.3f}''')

        # Save best model till now.
        # NOTE(review): "best" is judged on the *training* loss, which normally keeps falling, so
        # this effectively saves the latest epoch; compare epoch_valid_loss against valid_losses
        # instead if the checkpoint should track generalisation.
        if epoch_train_loss < min(train_losses, default=1e9):
            print("saving state dict")
            torch.save(model.state_dict(), f"checkpoint_best_epoch.pt")

        train_losses.append(epoch_train_loss)
        valid_losses.append(epoch_valid_loss)

        # Check Example after each epoch:
        sentences = ["This is an example to check how our model is performing."]
        for sentence in sentences:
            print(f"Original Sentence: {sentence}")
            print(f"Translated Sentence: {greeedy_decode_sentence(model,sentence)}")
    return train_losses,valid_losses

In [None]:
def greeedy_decode_sentence(model,sentence, maxlen=25):
    """Translate one raw source sentence by greedy (arg-max) decoding.

    Args:
        model: trained translation model; it is switched to eval mode.
        sentence: raw source-language string.
        maxlen: maximum number of target tokens to generate.

    Returns:
        The generated tokens joined by spaces, with a leading space. The end-of-sentence
        marker is included when the model produced it within ``maxlen`` steps. An input
        that yields no tokens returns the empty string.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    tokens = SRC.preprocess(sentence)
    indexed = [SRC.vocab.stoi[tok] for tok in tokens]
    if not indexed:
        # Nothing to encode; a zero-length source cannot be attended over.
        return ""

    src = torch.LongTensor([indexed]).to(device).transpose(0, 1)      # (src_len, 1)
    tgt = torch.LongTensor([[TGT.vocab.stoi[BOS_WORD]]]).to(device)   # (1, 1): start symbol only
    translated_sentence = ""
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for _ in range(maxlen):
            size = tgt.size(0)
            # Causal mask: -inf above the diagonal, 0 elsewhere.
            np_mask = torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)
            pred = model(src, tgt, tgt_mask = np_mask)
            # Most likely token at the last generated position of the single batch element.
            next_index = pred[-1, 0].argmax().item()
            add_word = TGT.vocab.itos[next_index]
            translated_sentence+=" "+add_word
            if add_word==EOS_WORD:
                break
            tgt = torch.cat((tgt, torch.tensor([[next_index]], dtype=torch.long, device=device)))
    return translated_sentence

In [None]:
model.load_state_dict(torch.load(f"/content/drive/MyDrive/AML_projekt/checkpoint_best_epoch_4.pt"))

<All keys matched successfully>

In [None]:
train_losses,valid_losses = train(train_iter, val_iter, model, optim, 35)

Epoch [1/35] complete. Train Loss: 25.999. Val Loss: 53.519
saving state dict
Original Sentence: This is an example to check how our model is performing.
Translated Sentence:  Ez például például például például például arra , hogy hogyan a `` igazi '' a `` ami a számára jó '' . </s>
Epoch [2/35] complete. Train Loss: 25.212. Val Loss: 54.248
saving state dict
Original Sentence: This is an example to check how our model is performing.
Translated Sentence:  Ez például például abban , hogy ez a dolog a mi egyik fontos , hogy vajon milyen a mi egyik a mi egyik a mi
Epoch [3/35] complete. Train Loss: 24.955. Val Loss: 54.245
saving state dict
Original Sentence: This is an example to check how our model is performing.
Translated Sentence:  Ez például jó dolog , hogy is mondjam el , hogyan lehet a mi egyik fő , hogyan lehet a mi megfelelő a mi megfelelő
Epoch [4/35] complete. Train Loss: 24.651. Val Loss: 53.929
saving state dict
Original Sentence: This is an example to check how our model is

KeyboardInterrupt: ignored

Loading the best model for inference (based on checkpoints at each epoch)

In [None]:
#model.load_state_dict(torch.load(f"/content/drive/MyDrive/AML_projekt/checkpoint_best_epoch_3.pt"))

In [None]:
model.load_state_dict(torch.load(f"checkpoint_best_epoch.pt"))

<All keys matched successfully>

Some simple and not so simple sentences to translate:

In [None]:
sentences = ["I can count to ten.", "He is in his office.", "He sat under a tree.", "There is someone at the door.", "The crocodiles snapped at the boat.", "Put the books on the table.", "There are many apples on the tree.", "The castle was heavily bombed during the war.", "Joe stood up and spoke to the crowd.", "I quickly put on my red winter jacket, black snow pants, waterproof boots, homemade mittens, and handknit scarf.", "What would you like for breakfast?", "When did the train leave the station?", "The man looked in the mirror and adjusted his tie.", "He almost managed to get his car to work.", "Maradona's vision, passing, ball control and dribbling skills were combined with his small stature, which gave him a low centre of gravity allowing him to maneuver better than most other football players.", "Due to illness and injury as well as controversial incidents on the field, Maradona had a difficult tenure in Barcelona."]

In [None]:
sentences = sentences[:5]
sentences

['I can count to ten.',
 'He is in his office.',
 'He sat under a tree.',
 'There is someone at the door.',
 'The crocodiles snapped at the boat.']

In [None]:
reference = ['El tudok számolni tízig.', 'Az irodájában van.', 'A fa alatt ült.', 'Van valaki az ajtónál.', 'A krokodilok rátámadtak a csónakra.']

In [None]:
# Print each test sentence next to its greedy translation.
for source_text in sentences:
  translation = greeedy_decode_sentence(model, source_text)
  print(f"{source_text}:{translation}")

I can count to ten.: Már tíz tíz tíz tudok tudok tíz tíz tíz tíz tíz tíz akár tíz tíz tíz tíz tíz tíz tíz tíz tíz tudok tudok tudok
He is in his office.: - A szobájában van már az ő szobájában van , a szobájában van az ő szobájában van . </s>
He sat under a tree.: Ott ült egy fa alatt , fa alatt , s ült egy fa alatt fa alatt , fa alatt fa alatt a fa alatt ült
There is someone at the door.: Van valaki az ajtóban van valaki az ajtóban , van valaki az ajtóban , aki van az ajtóban van van az ajtó és van valaki
The crocodiles snapped at the boat.: Az csapat a csónak a hajót felé kapott , a hajó a csónak a lépcsőn a csónak felé kapott a lépcsőn . </s>


In [None]:
# Greedy-decode every test sentence and keep the raw output strings for scoring.
candidate = [str(greeedy_decode_sentence(model, source_text)) for source_text in sentences]
print(candidate)

[' Már tíz tíz tíz tudok tudok tíz tíz tíz tíz tíz tíz akár tíz tíz tíz tíz tíz tíz tíz tíz tíz tudok tudok tudok', ' - A szobájában van már az ő szobájában van , a szobájában van az ő szobájában van . </s>', ' Ott ült egy fa alatt , fa alatt , s ült egy fa alatt fa alatt , fa alatt fa alatt a fa alatt ült', ' Van valaki az ajtóban van valaki az ajtóban , van valaki az ajtóban , aki van az ajtóban van van az ajtó és van valaki', ' Az csapat a csónak a hajót felé kapott , a hajó a csónak a lépcsőn a csónak felé kapott a lépcsőn . </s>']


In [None]:
# Tokenize references and model outputs the same way so they can be compared n-gram by n-gram.
reference_tok = [word_tokenize(reference[i]) for i in range(len(candidate))]
# Remove the end-of-sentence marker first: word_tokenize would otherwise split it into
# punctuation tokens that end up in the hypothesis and distort the score.
candidate_tok = [word_tokenize(text.replace(EOS_WORD, '')) for text in candidate]
reference_tok

[['El', 'tudok', 'számolni', 'tízig', '.'],
 ['Az', 'irodájában', 'van', '.'],
 ['A', 'fa', 'alatt', 'ült', '.'],
 ['Van', 'valaki', 'az', 'ajtónál', '.'],
 ['A', 'krokodilok', 'rátámadtak', 'a', 'csónakra', '.']]

This model works relatively well on shorter simple sentences, but fails to give a sensible translation on longer sentences. Also it often repeats the same words twice in a sentence even in the short ones.

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# sentence_bleu(references, hypothesis) expects a LIST of tokenized references per hypothesis,
# e.g. references = [['this', 'is', 'a', 'test']], hypothesis = ['this', 'is', 'a', 'test'].
# Passing the bare token list makes NLTK treat every word as its own character-level reference,
# so the single reference is wrapped in a list here.
# Very short sentences may still have no higher-order n-gram overlap; NLTK warns about that.
sum_score = 0
for i in range(len(candidate)):
  score = sentence_bleu([reference_tok[i]], candidate_tok[i])
  print(score)
  sum_score += score
print('The average bleu score accross sentences: ' + str(sum_score/len(candidate_tok)))

0
0.6147881529512643
0.447213595499958
0
0.5885661912765424
The average bleu score accross sentences: 0.3301135879455529


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


The average BLEU score across sentences: 0.3181742372437443 (first run)  
The average BLEU score across sentences: 0.3301135879455529 (second run)

In [None]:
# Read the tab-separated sentence-pair file: column 0 is the input sentence, column 1 its translation.
input_sentences = []
output_sentences = []


count = 0
# `with` guarantees the file handle is closed even if parsing fails part-way through.
with open(r'/content/drive/MyDrive/AML_projekt/hun.txt', encoding="utf-8") as pairs_file:
    for line in pairs_file:
        count += 1

        #if count > NUM_SENTENCES:
         #   break

        if '\t' not in line:
            continue

        columns = line.rstrip().split('\t')
        if len(columns) < 2:
            # rstrip() also removes a trailing tab, which can leave a line with no second column.
            continue

        input_sentences.append(columns[0])
        output_sentences.append(columns[1])

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))

num samples input: 87437
num samples output: 87437


In [None]:
input_sentences[:15]

['Hi.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Run.',
 'Run.',
 'Who?',
 'Wow!',
 'Wow!',
 'Wow!',
 'Wow!',
 'Wow!',
 'Fire!',
 'Fire!']

In [None]:
output_sentences[:15]

['Cső!',
 'Helóka!',
 'Csövi!',
 'Fuss!',
 'Rohanj!',
 'Fuss!',
 'Rohanj!',
 'Ki?',
 'Hűha!',
 'Váó!',
 'Aszta!',
 'Tyűha!',
 'Nahát!',
 'Tűz!',
 'Lőj!']

In [None]:
# Greedy-decode every sentence of the loaded corpus, one sentence at a time (slow for the full file).
# Appending inside the loop means `candidate` keeps the translations finished so far if the
# cell is interrupted before reaching the end.
candidate = []
for i in input_sentences:
  candidate.append(str(greeedy_decode_sentence(model,i)))
print(candidate[:10])

KeyboardInterrupt: ignored

In [None]:
len(candidate)

10979

In [None]:
# Keep exactly as many references as there are decoded candidates, however far decoding got.
reference = output_sentences[:len(candidate)]

In [None]:
# Tokenize references and model outputs the same way so they can be compared n-gram by n-gram.
reference_tok = [word_tokenize(reference[i]) for i in range(len(candidate))]
# Remove the end-of-sentence marker first: word_tokenize would otherwise split it into
# punctuation tokens that end up in the hypothesis and distort the score.
candidate_tok = [word_tokenize(text.replace(EOS_WORD, '')) for text in candidate]
reference_tok[:10]

[['Cső', '!'],
 ['Helóka', '!'],
 ['Csövi', '!'],
 ['Fuss', '!'],
 ['Rohanj', '!'],
 ['Fuss', '!'],
 ['Rohanj', '!'],
 ['Ki', '?'],
 ['Hűha', '!'],
 ['Váó', '!']]

In [None]:
len(input_sentences)

87437

In [None]:
# sentence_bleu expects a LIST of tokenized references per hypothesis; passing the bare token
# list makes every word act as its own character-level reference, so wrap it in a list.
sum_score = 0
for i in range(len(candidate_tok)):
  score = sentence_bleu([reference_tok[i]], candidate_tok[i])
  #print(score)
  sum_score += score
print('The average bleu score accross sentences: ' + str(sum_score/len(candidate_tok)))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


The average bleu score accross sentences: 0.31839040906794214
