In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

In [2]:
"""
tokenize the sentence.lower(). inp: str, return: list of str
"""
tokenizer_W = WordPunctTokenizer()
def tokenize(x, tokenizer=tokenizer_W):
    return tokenizer.tokenize(x.lower())
tokenize("Hi! How are you my friend?")

['hi', '!', 'how', 'are', 'you', 'my', 'friend', '?']

In [3]:
path_do_data = '../../datasets/Machine_translation_EN_RU/data_small.txt'

In [4]:
"""
Get the fields for souce and target. Every sentence in in soruce and in the target 
has been tokenized, so sentence has the form: [<sos>, token_1, token_2, ..., <eos>]
"""

SRC = Field(tokenize=tokenize,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize=tokenize,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

dataset = torchtext.legacy.data.TabularDataset(
    path=path_do_data,
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)]
)

In [5]:
"""
split the dataset, every item in train, val, test has the attributes .src, .trg
"""
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])
print(train_data[0].src)
print(train_data[0].trg)

['в', 'отеле', 'работает', 'кафе', 'и', 'ресторан', ',', 'где', 'ежедневно', 'сервируется', 'завтрак', '«', 'шведский', 'стол', '».']
['the', 'tocina', 'business', 'has', 'a', 'café', 'and', 'a', 'restaurant', ',', 'where', 'a', 'daily', 'breakfast', 'buffet', 'is', 'served', '.']


In [6]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 4000
Number of validation examples: 250
Number of testing examples: 750


In [8]:
"""
Get the vocabulary for train_data
"""
SRC.build_vocab(train_data, min_freq = 3)
TRG.build_vocab(train_data, min_freq = 3)

print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ru) vocabulary: 1865
Unique tokens in target (en) vocabulary: 1424


In [9]:
print(vars(train_data.examples[19]))

{'trg': ['each', 'room', 'has', 'a', 'basin', 'and', 'there', 'are', '2', 'shared', 'bathrooms', 'with', 'a', 'shower', 'for', 'communal', 'use', '.'], 'src': ['гости', 'могут', 'пользоваться', '2', 'общими', 'ванными', 'комнатами', 'с', 'душем', '.']}


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
"""
get the iteratos sorted by length  o source sentence
"""
def _len_sort_key(x):
    return len(x.src)

BATCH_SIZE = 2

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key=_len_sort_key
)

In [12]:
INPUT_DIM = len(SRC.vocab)   # how many words in source vocabulary
OUTPUT_DIM = len(TRG.vocab)  # how many words in target vocabulary
ENC_EMB_DIM = 8
DEC_EMB_DIM = 8
HID_DIM = 4
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

for x in train_iterator:
    sample_src = x.src
    sample_trg = x.trg
    break 
sample_src.shape, sample_trg.shape

(torch.Size([24, 2]), torch.Size([23, 2]))

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        #embedded = [src_len, batch_size, emb_dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src_len, batch_size, hid_dim * num_directions]
        #hidden = [n_layers * num_directions, batch_size, hid_dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src_len, batch_size, enc_hid_dim * 2] - as hidden for every token (concat forw, backw)
        #hidden = [batch_size, dec_hid_dim] - transformed hidden from  last layer and last token
        
        return outputs, hidden

In [17]:
enc = Encoder(input_dim=len(SRC.vocab), 
              emb_dim = 8,
              enc_hid_dim = 4,
              dec_hid_dim = 4,
              dropout = 0.5)

print(sample_src.shape) # [src_len, batch_size]
enc_out, enc_hid = enc(sample_src)
print(f"enc_out shape: {enc_out.shape}")     # [src_len, batch_size, hid_dim*2]
print(f"enc_hid shape: {enc_hid.shape}")  #    [batch_size, hid_dim]

torch.Size([24, 2])
enc_out shape: torch.Size([24, 2, 8])
enc_hid shape: torch.Size([2, 4])


In [18]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec_hid_dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch_size, src_len, dec_hid_dim]
        #encoder_outputs = [batch_size, src_len, enc_hid_dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        #energy = [batch_size, src_len, dec_hid_dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch_size, src_len]
        
        return F.softmax(attention, dim=1)

In [19]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)       
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        #encoder_outputs = [batch_size, src_len, enc_hid_dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        #weighted = [batch_size, 1, enc_hid_dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        #weighted = [1, batch_size, enc_hid_dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        #rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        #prediction = [batch size, output dim] 
        
        return prediction, hidden.squeeze(0)

In [20]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [45]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

TypeError: __init__() missing 1 required positional argument: 'dropout'