In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE
from train_model import train

In [4]:
# getting data
path_to_data = '../../datasets/Machine_translation_EN_RU/data.txt'
from data_preprocessing import get_dataset


data, vocab = get_dataset(path_to_data)
train_data, valid_data, test_data = data
src_vocab, trg_vocab = vocab
PAD_IDX = trg_vocab.stoi['<pad>']


def _len_sort_key(x):
    return len(x.src)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_iterators(train_data=train_data, 
                  valid_data=valid_data,
                  test_data=test_data,
                  batch_size=512):


    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data), 
        batch_size = batch_size, 
        device = device,
        sort_key=_len_sort_key
    )
    return train_iterator, valid_iterator, test_iterator

Number of training examples: 40000
Number of validation examples: 2500
Number of testing examples: 7500
Unique tokens in source (ru) vocabulary: 9193
Unique tokens in target (en) vocabulary: 6714


In [4]:
from base_line_model import get_base_line_model
baseline = get_base_line_model(len(src_vocab), len(trg_vocab))

In [6]:
optimizer = optim.Adam(baseline.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

train_iterator, valid_iterator, test_iterator = get_iterators()
train(baseline, "baseline", train_iterator, valid_iterator, optimizer, criterion, n_epochs = 10)

NameError: name 'get_loss_on_val' is not defined

In [25]:
%load_ext autoreload
%autoreload seq_to_seq_ATTENTION

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import seq_to_seq_ATTENTION

seq_to_seq_ATTENTION_model = seq_to_seq_ATTENTION.get_model(len(src_vocab), len(trg_vocab))
optimizer = optim.Adam(seq_to_seq_ATTENTION_model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

train_iterator, valid_iterator, test_iterator = get_iterators()
train(seq_to_seq_ATTENTION_model, "seq_to_seq_ATTENTION", train_iterator, valid_iterator, optimizer, criterion, n_epochs = 10)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1237155840 bytes.

In [12]:
from get_score import get_results

In [15]:
get_results(baseline, test_iterator, trg_vocab)

0it [00:00, ?it/s]

The model has 8,951,454 trainable parameters


2it [00:12,  6.05s/it]

BLEU score: 5.27175746941793e-154 

Original: light hostel features free wifi throughout the property .
Generated: the the a a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Original: with classic , the studio comes with a tv and a kitchenette with a dining area .
Generated: the the a a a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Original: it offers a large outdoor pool and massage services upon request . free parking is provided .
Generated: the the a a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Original: resort offers a to surrounding attractions .
Generated: the the a a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Original: it is also less than a 10 - minute walk from the road .
Generated: the the a a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Original: guests at ' s garden can rent a bicycle / car to self - e




In [10]:
import seq_to_seq_ATTENTION

seq_to_seq_ATTENTION_model = seq_to_seq_ATTENTION.get_model(9297, 6718)

In [11]:
seq_to_seq_ATTENTION_model.load_state_dict(torch.load("saved_models/seq_to_seq_ATTENTION.pt", map_location=torch.device('cpu')))
seq_to_seq_ATTENTION_model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9297, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(6718, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=6718, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [14]:
train_iterator, valid_iterator, test_iterator = get_iterators()

In [15]:
get_results(seq_to_seq_ATTENTION_model, test_iterator, trg_vocab)

0it [00:00, ?it/s]

The model has 22,578,494 trainable parameters


15it [08:41, 34.78s/it]


BLEU score: 2.0376431163040403
Original: cancun bus station is 500 metres from hostel & friends , while cancun government palace is 700 metres away . the nearest airport is cancún international airport , 15 km from hostel & friends .
Generated: a ride bbq of the - - , you access find a caves - bathroom the , , you access find a caves - minute or away , you can find a bathroom shops and , you can find a bathroom the of , you can find a bathroom the of , you access find a caves - bathroom . 

Original: all rooms include a tv with cable channels .
Generated: each variety pool a 24 rooms with a . 

Original: saint cathedral and church are a 3 - minute walk away , and the blue mosque is a 3 - minute walk from the property .
Generated: the offers property an dining including , a km , , , and . 

Original: free private parking is available on site .
Generated: the equipped pool a the centre of the centre . 

Original: set within the historic core of zadar , just steps away from the seaside pr

In [16]:
from nltk.translate.bleu_score import corpus_bleu

In [46]:
corpus_bleu([["how"]], [["how"]])

0

In [31]:
hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
        'ensures', 'that', 'the', 'military', 'always',
        'obeys', 'the', 'commands', 'of', 'the', 'party']

ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
         'ensures', 'that', 'the', 'military', 'will', 'forever',
         'heed', 'Party', 'commands']
ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
         'guarantees', 'the', 'military', 'forces', 'always',
         'being', 'under', 'the', 'command', 'of', 'the', 'Party']
ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
         'army', 'always', 'to', 'heed', 'the', 'directions',
         'of', 'the', 'party']

hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
        'interested', 'in', 'world', 'history']
ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
         'because', 'he', 'read', 'the', 'book']

In [42]:
list_of_references = [[ref1a]]
hypotheses = [hyp1]
corpus_bleu(list_of_references, hypotheses) * 100

41.18037635691578