In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

In [3]:
# Shared word/punctuation tokenizer; `tokenize` uses it as its default.
tokenizer_W = WordPunctTokenizer()


def tokenize(x, tokenizer=tokenizer_W):
    """Lower-case a sentence and split it into tokens.

    Args:
        x: Input sentence (str).
        tokenizer: Object exposing ``tokenize(str)``; defaults to the shared
            ``WordPunctTokenizer`` instance.

    Returns:
        The result of ``tokenizer.tokenize`` on the lower-cased sentence
        (a list of str for the default tokenizer, as the demo output shows).
    """
    lowered = x.lower()
    return tokenizer.tokenize(lowered)


# Quick sanity check; the cell displays the resulting token list.
tokenize("Hi! How are you my friend?")

['hi', '!', 'how', 'are', 'you', 'my', 'friend', '?']

In [7]:
# Location of the parallel corpus (read below as a TSV file), relative to the
# notebook's working directory.
path_do_data = '../../datasets/Machine_translation_EN_RU/data_small.txt'

In [8]:
# Source and target fields share one configuration: the notebook's `tokenize`
# function, lower-casing, and '<sos>' / '<eos>' as init / end-of-sentence
# tokens. (The stored examples printed further down hold only the raw tokens.)
_FIELD_OPTIONS = dict(
    tokenize=tokenize,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

SRC = Field(**_FIELD_OPTIONS)
TRG = Field(**_FIELD_OPTIONS)

# Column order of the TSV file: first column -> 'trg', second column -> 'src'.
dataset = torchtext.legacy.data.TabularDataset(
    path=path_do_data,
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)],
)

In [10]:
# Split the corpus; every example in each split exposes `.src` and `.trg`.
#
# NOTE(review): torchtext's legacy `Dataset.split` reads a three-element
# `split_ratio` as (train, test, valid) while returning (train, valid, test).
# The counts printed further down (250 validation, 750 test) agree with that:
# the 0.15 share ends up in the test split and 0.05 in validation. The names
# below make that explicit -- confirm this is the intended split.
# No `random_state` is passed, so the split is not pinned to a seed.
TRAIN_SHARE, TEST_SHARE, VALID_SHARE = 0.8, 0.15, 0.05
train_data, valid_data, test_data = dataset.split(
    split_ratio=[TRAIN_SHARE, TEST_SHARE, VALID_SHARE]
)

first_example = train_data[0]
print(first_example.src)
print(first_example.trg)

['хостел', 'находится', 'недалеко', 'от', 'пристани', 'panajachel', ',', 'где', 'работает', 'множество', 'ресторанов', 'и', 'баров', '.']
['the', 'property', 'is', 'close', 'to', 'the', 'panajachel', 'dock', ',', 'where', 'restaurants', 'and', 'bars', 'are', 'located', '.']


In [11]:
# Report how many examples landed in each split.
for split_label, split in (
    ("training", train_data),
    ("validation", valid_data),
    ("testing", test_data),
):
    print(f"Number of {split_label} examples: {len(split.examples)}")

Number of training examples: 4000
Number of validation examples: 250
Number of testing examples: 750


In [12]:
# Build both vocabularies from the training split only, passing the same
# minimum token frequency to each field.
MIN_FREQ = 3
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=MIN_FREQ)

print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ru) vocabulary: 1886
Unique tokens in target (en) vocabulary: 1435


In [13]:
# Inspect one raw training example: its attributes are the tokenized 'trg'
# and 'src' lists.
print(vars(train_data.examples[9]))

{'trg': ['the', 'shops', 'at', 'prudential', 'center', 'is', '900', 'metres', 'from', 'encore', 'b', '&', 'b', ',', 'while', 'john', 'hancock', 'tower', 'is', '900', 'metres', 'away', '.'], 'src': ['торговый', 'центр', 'prudential', 'center', 'и', 'небоскреб', 'джона', 'хэнкока', 'находятся', 'в', '900', 'м', 'от', 'отеля', 'типа', '«', 'постель', 'и', 'завтрак', '»', 'encore', '.']}


In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
# Batch iterators for the three splits, using the source-sentence length as
# the sort key.
BATCH_SIZE = 2


def _len_sort_key(x):
    """Sort key for an example: the number of tokens in its source sentence."""
    return len(x.src)


_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _splits,
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key,
)

In [16]:
# Model hyper-parameters.
INPUT_DIM = len(SRC.vocab)    # size of the source vocabulary
OUTPUT_DIM = len(TRG.vocab)   # size of the target vocabulary
ENC_EMB_DIM = DEC_EMB_DIM = 8
HID_DIM = 4
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Take the first training batch; the cells below use it to probe tensor shapes.
sample_batch = next(iter(train_iterator))
sample_src, sample_trg = sample_batch.src, sample_batch.trg
sample_src.shape, sample_trg.shape

(torch.Size([16, 2]), torch.Size([21, 2]))

# Encoder

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the final LSTM state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of the token embeddings.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Probability used for the dropout on the embeddings and passed
            to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Plain attributes; Seq2Seq reads hid_dim / n_layers to check that the
        # encoder and decoder are compatible.
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: Token ids, [src_sent_len, batch_size].

        Returns:
            ``(hidden, cell)``, each [n_layers, batch_size, hid_dim].
        """
        # embedded: [src_sent_len, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(src))

        # Only the final states are handed on; the per-step outputs of the
        # top layer are discarded.
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [18]:
# From here on `Encoder` refers to the implementation in the project-local
# `my_network` module, shadowing any class of the same name defined in the
# notebook itself.
import my_network
Encoder = my_network.Encoder

# Use the hyper-parameter constants from the configuration cell instead of
# repeating their literal values here.
enc = Encoder(input_dim=INPUT_DIM,
              emb_dim=ENC_EMB_DIM,
              hid_dim=HID_DIM,
              n_layers=N_LAYERS,
              dropout=ENC_DROPOUT)

In [21]:
# Run the encoder on the sample batch and print the shapes of the returned
# hidden and cell states.
print(sample_src.shape)
enc_hid, enc_cell = enc(sample_src)
print(f"enc hidden shape: {enc_hid.shape}")
print(f"enc cell shape: {enc_cell.shape}")

torch.Size([16, 2])
enc hidden shape: torch.Size([2, 2, 4])
enc dell shape: torch.Size([2, 2, 4])


# Decoder

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that advances the target sequence by one token per call.

    Args:
        output_dim: Target vocabulary size; sets both the embedding table size
            and the width of the returned scores.
        emb_dim: Width of the token embeddings.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Probability used for the dropout on the embeddings and passed
            to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(
            num_embeddings=output_dim,
            embedding_dim=emb_dim
        )

        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout
        )

        self.out = nn.Linear(
            in_features=hid_dim,
            out_features=output_dim
        )

        # `self.dropout` is the dropout module. The raw probability is no
        # longer assigned to the same attribute first, since that value was
        # overwritten immediately and never used.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: Token ids for the current step, [batch_size].
            hidden: LSTM hidden state, [n_layers, batch_size, hid_dim].
            cell: LSTM cell state, [n_layers, batch_size, hid_dim].

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is
            [batch_size, output_dim] and the states keep their input shape.
        """
        # Add a length-1 sequence dimension: [1, batch_size].
        input = input.unsqueeze(0)

        # embedded: [1, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(input))

        # The sequence length is 1 here, so output is [1, batch_size, hid_dim].
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Drop the sequence dimension before projecting onto the vocabulary.
        prediction = self.out(output.squeeze(0))

        return prediction, hidden, cell

In [26]:
# From here on `Decoder` refers to the implementation in `my_network`,
# shadowing any class of the same name defined in the notebook itself.
Decoder = my_network.Decoder

# Use the hyper-parameter constants from the configuration cell instead of
# repeating their literal values here.
dec = Decoder(output_dim=OUTPUT_DIM,
              emb_dim=DEC_EMB_DIM,
              hid_dim=HID_DIM,
              n_layers=N_LAYERS,
              dropout=DEC_DROPOUT)

In [27]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch; must
            expose ``hid_dim`` and ``n_layers``.
        decoder: Module called as ``decoder(input, hidden, cell)``; must expose
            ``hid_dim``, ``n_layers`` and ``output_dim``.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder disagree on ``hid_dim`` or
            ``n_layers``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Compute decoder scores for every target position.

        Args:
            src: Source token ids, [src sent len, batch size].
            trg: Target token ids, [trg sent len, batch size].
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token (rather than the model's own top prediction) as the
                next decoder input.

        Returns:
            Tensor [trg sent len, batch size, output_dim]; position 0 is never
            written and stays zero.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the per-step decoder scores.
        outputs = torch.zeros(max_len, batch_size, vocab_size, device=self.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the <sos> tokens).
        decoder_input = trg[0, :]

        for t in range(1, max_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores

            use_ground_truth = random.random() < teacher_forcing_ratio
            _, best_tokens = step_scores.max(1)
            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = best_tokens

        return outputs

In [44]:
# Re-run the encoder on the sample batch before stepping through the decoder
# by hand.
print(sample_src.shape, sample_trg.shape)
enc_hid, enc_cell = enc(sample_src)
print(f"enc hidden shape: {enc_hid.shape}") # (n_layers*n_direct, batch_size, hid_size)
print(f"enc cell shape: {enc_cell.shape}") # (n_layers*n_direct, batch_size, hid_size)

torch.Size([16, 2]) torch.Size([21, 2])
enc hidden shape: torch.Size([2, 2, 4])
enc dell shape: torch.Size([2, 2, 4])


In [41]:
# Same shape bookkeeping as Seq2Seq.forward, applied to the sample batch.
# The batch size is read from the tensor rather than hard-coded, so the cell
# stays correct if the batch size used for the iterators changes.
max_len = sample_trg.shape[0]      # target sequence length
batch_size = sample_trg.shape[1]   # sentences in the sample batch
trg_vocab_size = OUTPUT_DIM
print(max_len, batch_size, trg_vocab_size)

21 2 1435


In [42]:
# Buffer for the decoder's per-step scores. The decoding loop further down
# starts at t = 1, so row 0 is never written and stays zero.
outputs = torch.zeros(max_len, batch_size, trg_vocab_size)
print(outputs.shape)

torch.Size([21, 2, 1435])


In [43]:
 # The encoder's final hidden and cell states are used as the decoder's
 # initial state.
dec_hid, dec_cell = enc_hid, enc_cell

In [47]:
# The first input to the decoder is row 0 of the target batch (the <sos> tokens).
input_ = sample_trg[0,:]
print(input_.shape)
# unsqueeze(0) adds the leading length-1 dimension, as the printed shapes show.
print(input_.unsqueeze(0).shape)

torch.Size([2])
torch.Size([1, 2])


In [48]:
# Step through the decoder by hand, mirroring the loop in Seq2Seq.forward.
# The loop reassigns dec_hid, dec_cell and input_, so re-running this cell on
# its own continues from the updated values rather than starting over.
for t in range(1, max_len):
    # output: [batch size, output dim]
    output, dec_hid, dec_cell = dec(input_, dec_hid, dec_cell)
    outputs[t] = output

    # Coin flip: feed the ground-truth token half of the time.
    teacher_force = random.random() < 0.5

    # top1: [batch_size], index of the highest score per sentence.
    top1 = output.max(1).indices
    if teacher_force:
        input_ = sample_trg[t]
    else:
        input_ = top1

In [49]:
# Maximum over dimension 1 of the last step's scores; the result carries both
# the values and their indices.
output.max(1)

torch.return_types.max(
values=tensor([0.6828, 0.6942], grad_fn=<MaxBackward0>),
indices=tensor([727, 727]))

In [50]:
# Element [1] of the result holds the indices, one per sentence in the batch.
output.max(1)[1]

tensor([727, 727])

In [51]:
# Shape of the score buffer after the manual decoding loop.
outputs.shape

torch.Size([21, 2, 1435])

In [52]:
# Model class from a project-local module, constructed from the source and
# target vocabulary sizes.
# NOTE(review): the meaning of the two constructor arguments is not visible
# in this notebook -- confirm against the module.
from seq_to_seq_with_Attention_LSTM import seq_to_seq_attent_LSTM
model = seq_to_seq_attent_LSTM(len(SRC.vocab),len(TRG.vocab))

In [54]:
# Forward pass on the sample batch; the cell displays the whole output tensor.
model(sample_src, sample_trg)

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-5.1898e-06,  1.0570e-05, -2.1655e-06,  ..., -1.1921e-05,
           1.1748e-05,  1.8471e-06],
         [ 8.4150e-06, -7.1292e-06,  3.5373e-05,  ...,  1.5775e-05,
           1.6121e-05,  6.3456e-06]],

        [[ 1.0435e-05,  2.6749e-05, -2.7406e-06,  ..., -6.3073e-06,
           3.5059e-05, -1.0482e-05],
         [ 7.6601e-06, -1.3873e-06,  3.7036e-05,  ..., -6.2676e-06,
           2.1456e-05,  6.5172e-07]],

        ...,

        [[-6.1668e-06, -1.0686e-05,  3.5993e-05,  ...,  7.5682e-06,
          -3.2882e-06,  1.2210e-06],
         [ 1.2396e-05, -2.8620e-05,  2.5354e-05,  ..., -6.9123e-07,
          -1.1319e-05, -1.5525e-05]],

        [[-1.5565e-05,  7.0127e-07,  1.9110e-05,  ...,  2.6217e-05,
          -7.5504e-06,  1.6727e-06],
         [ 1.7328e-05, -2.7469e-0

In [None]:
def get_bleu_score(model, test_iterator, trg_vocab): # , bert=False
    """Decode every batch without teacher forcing and compute corpus BLEU.

    Args:
        model: Model called as ``model(src, trg, 0)``. It is switched to eval
            mode and left in that mode.
        test_iterator: Iterable of batches exposing ``.src`` and ``.trg``.
        trg_vocab: Target vocabulary, forwarded unchanged to ``get_text``.

    Returns:
        Tuple ``(original_text, generated_text, score)`` where the first two
        are lists with one ``get_text`` result per sentence and ``score`` is
        ``corpus_bleu(...) * 100``.

    NOTE(review): ``tqdm``, ``get_text`` and ``corpus_bleu`` are not imported
    or defined in the visible cells -- confirm they exist before this runs.
    """
    original_text = []
    generated_text = []
    model.eval()
    with torch.no_grad():

        # Wrap the iterator itself (not enumerate(...)) so tqdm can read its
        # length and show a total; the batch index was never used.
        for batch in tqdm.tqdm(test_iterator):

            src = batch.src
            trg = batch.trg

            # Teacher forcing ratio 0: the model decodes from its own predictions.
            output = model(src, trg, 0)
            output = output.argmax(dim=-1)

            # Transpose so that each row is one sentence.
            # NOTE(review): the reference keeps row 0 of trg while the
            # hypothesis drops row 0 of the output; presumably get_text strips
            # special tokens -- confirm.
            original_text.extend([get_text(x, trg_vocab) for x in trg.cpu().numpy().T])
            generated_text.extend([get_text(x, trg_vocab) for x in output[1:].detach().cpu().numpy().T])
        score = corpus_bleu([[text] for text in original_text], generated_text) * 100

    return original_text, generated_text, score