In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

In [65]:
from Attention import Attention

In [3]:
# Relative path to the parallel corpus file; it is loaded as a TSV by TabularDataset in the next cell.
path_do_data = '../../datasets/Machine_translation_EN_RU/data_small.txt'

In [4]:
"""
Build the fields for source and target. Every sentence in the source and in the target
is tokenized, so each sentence has the form: [<sos>, token_1, token_2, ..., <eos>]
"""

# `tokenize` was passed to both fields without being defined in any earlier cell,
# so a fresh "Restart & Run All" stopped here with a NameError. It is defined
# explicitly with the WordPunctTokenizer imported at the top of the notebook.
# NOTE(review): the tokenizer choice is inferred from that otherwise unused import
# and from the recorded outputs (e.g. 'wi', '-', 'fi'); confirm it matches the
# tokenizer originally used.
_word_punct_tokenizer = WordPunctTokenizer()


def tokenize(text):
    """Split a raw sentence into word and punctuation tokens.

    Args:
        text: the raw sentence as a single string.

    Returns:
        A list of token strings. Case is left untouched here because both
        fields are created with ``lower=True``.
    """
    return _word_punct_tokenizer.tokenize(text)


# Source and target share the same preprocessing: tokenise, lowercase and wrap
# every sentence in <sos> ... <eos>.
SRC = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

# Column order matters: the first TSV column is read as the target sentence and
# the second one as the source sentence.
dataset = torchtext.legacy.data.TabularDataset(
    path=path_do_data,
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)]
)

In [5]:
"""
Split the dataset; every example in train, valid and test has the attributes .src and .trg
"""
# torchtext reads a three-element split_ratio as (train, test, valid) but returns
# the splits as (train, valid, test). These ratios therefore give 80% train,
# 5% validation and 15% test, which is what the recorded split sizes show.
#
# The split used to depend on whatever state the global `random` module happened
# to be in, so every kernel restart produced different splits (and vocabularies).
# A fixed random_state makes it reproducible; it is taken from a private Random
# instance so the global generator is not reseeded as a side effect.
SPLIT_SEED = 1234
train_data, valid_data, test_data = dataset.split(
    split_ratio=[0.8, 0.15, 0.05],
    random_state=random.Random(SPLIT_SEED).getstate(),
)

# Show one training pair as a sanity check of the src/trg assignment.
print(train_data[0].src)
print(train_data[0].trg)

['комплекс', 'cool', 'sun', 'fully', 'furnished', 'home', 'расположен', 'в', 'городе', 'исламабад', ',', 'в', '10', 'км', 'от', 'торгового', 'комплекса', 'the', 'centaurus', '.', 'к', 'услугам', 'гостей', 'бесплатный', 'wi', '-', 'fi', '.']
['featuring', 'free', 'wifi', ',', 'cool', 'sun', 'fully', 'furnished', 'home', 'offers', 'accommodation', 'in', 'islamabad', ',', '10', 'km', 'from', 'the', 'centaurus', 'mall', '.']


In [6]:
# Report how many examples ended up in each split.
for _split_label, _split in (
    ("training", train_data),
    ("validation", valid_data),
    ("testing", test_data),
):
    print(f"Number of {_split_label} examples: {len(_split.examples)}")

Number of training examples: 4000
Number of validation examples: 250
Number of testing examples: 750


In [7]:
"""
Get the vocabulary for train_data
"""
# Both vocabularies are built from the training split only and use the same
# min_freq threshold.
MIN_FREQ = 3
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=MIN_FREQ)

src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)
print(f"Unique tokens in source (ru) vocabulary: {src_vocab_size}")
print(f"Unique tokens in target (en) vocabulary: {trg_vocab_size}")

Unique tokens in source (ru) vocabulary: 1926
Unique tokens in target (en) vocabulary: 1456


In [8]:
# Inspect one training example: its attribute dict holds the 'trg' and 'src' token lists.
sample_example = train_data.examples[19]
print(sample_example.__dict__)

{'trg': ['the', 'tyrolean', '-', 'style', 'rooms', 'feature', 'a', 'balcony', 'with', 'mountain', 'views', ',', 'a', 'flat', '-', 'screen', 'satellite', 'tv', ',', 'and', 'a', 'bathroom', '.'], 'src': ['оформленные', 'в', 'тирольском', 'стиле', 'номера', 'располагают', 'ванной', 'комнатой', 'и', 'балконом', ',', 'с', 'которого', 'открывается', 'вид', 'на', 'горы', ',', 'а', 'также', 'в', 'них', 'предоставляется', 'телевизор', 'с', 'плоским', 'экраном', 'с', 'кабельными', 'каналами', '.']}


In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [90]:
"""
Get the iterators, sorted by the length of the source sentence
"""
def _len_sort_key(x):
    """Return the number of tokens in the example's source sentence."""
    source_tokens = x.src
    return len(source_tokens)


BATCH_SIZE = 2

# One iterator per split; all of them share the batch size, the target device
# and the source-length sort key.
_data_splits = (train_data, valid_data, test_data)
_iterators = BucketIterator.splits(
    _data_splits,
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key,
)
train_iterator, valid_iterator, test_iterator = _iterators

In [91]:
# Vocabulary-dependent sizes
INPUT_DIM = len(SRC.vocab)   # number of tokens in the source vocabulary
OUTPUT_DIM = len(TRG.vocab)  # number of tokens in the target vocabulary

# Embedding sizes
ENC_EMB_DIM = 8
DEC_EMB_DIM = 8

# Recurrent core
HID_DIM = 4
N_LAYERS = 2

# Dropout probabilities
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Pull a single batch from the training iterator to use as a shape probe in the
# cells that follow.
sample_batch = next(iter(train_iterator))
sample_src, sample_trg = sample_batch.src, sample_batch.trg
sample_src.shape, sample_trg.shape

(torch.Size([25, 2]), torch.Size([19, 2]))

In [92]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a stacked LSTM.

    Args:
        input_dim: number of entries in the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used both on the embeddings and as the
            LSTM's own ``dropout`` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Plain size attributes; hid_dim and n_layers are read by Seq2Seq.
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: token ids, [src_length, batch_size].

        Returns:
            A pair ``(outputs, state)``:
            outputs is [src_length, batch_size, hid_dim * n_directions];
            state is the LSTM ``(hid, cell)`` tuple, each
            [n_layers * n_directions, batch_size, hid_dim].
        """
        # token_vectors = [src_length, batch_size, emb_dim]
        token_vectors = self.dropout(self.embedding(src))
        outputs, state = self.rnn(token_vectors)
        return outputs, state

In [95]:
# Shape of the sample source batch taken from the training iterator.
sample_src_shape = sample_src.shape
print(f"sample shape is {sample_src_shape}")

sample shape is torch.Size([25, 2])


In [99]:
# Smoke-test the encoder on the sample batch and print the resulting shapes.
# The iterator creates its batches on `device`, so the encoder has to live there
# as well; leaving it on the CPU raises a device-mismatch error whenever CUDA is
# available.
enc = Encoder(
    input_dim=len(SRC.vocab),
    emb_dim=16,
    hid_dim=18,
    n_layers=2,
    dropout=0.5,
).to(device)
enc_out, enc_hid = enc(sample_src)
print(f"sample shape is {sample_src.shape}")
print(f"enc_out shape is {enc_out.shape}")
# enc_hid is the LSTM (hid, cell) tuple; only the hidden part is shown.
print(f"enc_hid shape is {enc_hid[0].shape}")

sample shape is torch.Size([25, 2])
enc_out shape is torch.Size([25, 2, 18])
enc_hid shape is torch.Size([2, 2, 18])


In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_dim: target vocabulary size (embedding rows and number of logits).
        emb_dim: size of each token embedding.
        hid_dim: hidden size of the LSTM; also passed to ``Attention``.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used both on the embeddings and as the
            LSTM's own ``dropout`` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Plain size attributes; output_dim, hid_dim and n_layers are read by Seq2Seq.
        self.emb_dim, self.hid_dim = emb_dim, hid_dim
        self.output_dim, self.n_layers = output_dim, n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.attention = Attention(hid_dim)
        # The projection receives the attention output concatenated with the
        # LSTM output, hence 2 * hid_dim input features.
        self.out = nn.Linear(2 * hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, enc_seq):
        """Decode one target position for the whole batch.

        Args:
            input: token ids of the current step, [batch_size].
            hidden: LSTM state ``(hid, cell)``, each [n_layers, batch_size, hid_dim].
            enc_seq: encoder outputs, [src_length, batch_size, hid_dim * n_directions].

        Returns:
            A pair ``(prediction, hidden)``: prediction holds one row of
            ``output_dim`` logits per batch element, hidden is the updated LSTM state.
        """
        # step_tokens = [1, batch_size]
        step_tokens = input.unsqueeze(0)
        # embedded = [1, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(step_tokens))

        # rnn_out = [1, batch_size, hid_dim]
        rnn_out, hidden = self.rnn(embedded, hidden)

        # The attention module is called batch-first:
        #   query   = [batch_size, trg_length=1, hid_dim]
        #   context = [batch_size, src_length, hid_dim]
        query = rnn_out.transpose(0, 1)
        context = enc_seq.transpose(0, 1)
        attended, _ = self.attention(query, context)

        # Back to time-first: [trg_len=1, batch_size, hid_dim]
        attended = attended.transpose(0, 1)

        # Drop the length-1 time axis from both tensors and join them feature-wise.
        features = torch.cat((attended.squeeze(0), rnn_out.squeeze(0)), dim=1)
        return self.out(features), hidden

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module returning ``(enc_seq, hidden)`` for a source batch.
        decoder: module called as ``decoder(input, hidden, enc_seq)`` and
            returning ``(prediction, hidden)``.
        device: device on which the output tensor is allocated.

    Raises:
        AssertionError: if encoder and decoder disagree on ``hid_dim`` or ``n_layers``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final state is handed to the decoder unchanged, so the
        # two recurrent stacks must have matching sizes.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Run the encoder once, then decode every target position.

        Args:
            src: source token ids, [src sent len, batch size].
            trg: target token ids, [trg sent len, batch size].
            teacher_forcing_ratio: probability of feeding the ground-truth token
                as the next decoder input, e.g. 0.75 means ground truth is used
                75% of the time; otherwise the decoder's own top prediction is fed back.

        Returns:
            Tensor [trg sent len, batch size, trg vocab size] with the decoder
            outputs; row 0 is never written and stays zero.
        """
        # Sequence length is dimension 0, batch is dimension 1.
        max_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Storage for the decoder output at every target position.
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # enc_seq: all encoder outputs, used by the decoder's attention;
        # hidden: the encoder's final state, used as the decoder's initial state.
        enc_seq, hidden = self.encoder(src)

        # Decoding starts from the first row of the target batch.
        step_input = trg[0, :]
        for t in range(1, max_len):
            step_logits, hidden = self.decoder(step_input, hidden, enc_seq)
            outputs[t] = step_logits

            # Decide per step whether the next input is the ground truth or the
            # model's own highest-scoring token.
            use_ground_truth = random.random() < teacher_forcing_ratio
            _, greedy_tokens = step_logits.max(1)
            step_input = trg[t] if use_ground_truth else greedy_tokens

        return outputs
