In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

In [2]:
# Shared NLTK tokenizer instance; it is the default tokenizer of `tokenize`.
tokenizer_W = WordPunctTokenizer()


def tokenize(x, tokenizer=tokenizer_W):
    """Lower-case the string ``x`` and split it into tokens.

    Args:
        x: Text to tokenize.
        tokenizer: Object exposing ``tokenize(str)``; defaults to the shared
            ``WordPunctTokenizer`` instance.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the lower-cased text.
    """
    lowered = x.lower()
    return tokenizer.tokenize(lowered)

In [4]:
# Relative path of the corpus file that is loaded as a tab-separated dataset.
# NOTE(review): the name looks like a typo for `path_to_data`; it is left
# unchanged because other cells refer to it.
path_do_data = '../datasets/Machine_translation_EN_RU/data_small.txt'

In [5]:
# Source and target fields share one configuration: the custom `tokenize`
# function, <sos>/<eos> markers around every sequence, and lower-casing.
FIELD_KWARGS = dict(
    tokenize=tokenize,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

SRC = Field(**FIELD_KWARGS)
TRG = Field(**FIELD_KWARGS)

# The field list is given in (target, source) order: the first entry is bound
# to TRG and the second to SRC.
dataset = torchtext.legacy.data.TabularDataset(
    path=path_do_data,
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)],
)

In [6]:
# Dataset.split reads a ratio list as [train, test, valid], while the returned
# tuple is ordered (train, valid, test). With these numbers the validation
# split therefore receives 5% of the data and the test split 15%.
# NOTE(review): swap the last two ratios if a 15% validation split was intended.
TRAIN_RATIO, TEST_RATIO, VALID_RATIO = 0.8, 0.15, 0.05
train_data, valid_data, test_data = dataset.split(
    split_ratio=[TRAIN_RATIO, TEST_RATIO, VALID_RATIO]
)

# Show one tokenized training pair as a quick sanity check.
first_example = train_data[0]
print(first_example.src)
print(first_example.trg)

['отель', 'cala', 'ferrera', 'находится', 'в', '44', 'км', 'от', 'курортного', 'поселка', 'эль', '-', 'ареналя', 'и', 'в', '46', 'км', 'от', 'курортного', 'поселка', 'плайя', '-', 'де', '-', 'пальмы', '.']
['el', 'arenal', 'is', '44', 'km', 'from', 'hotel', 'cala', 'ferrera', ',', 'while', 'playa', 'de', 'palma', 'is', '46', 'km', 'from', 'the', 'property', '.']


In [7]:
# Report how many examples ended up in each of the three splits.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 4000
Number of validation examples: 250
Number of testing examples: 750


In [8]:
# Build both vocabularies from the training split only, passing min_freq = 3
# (in torchtext: the minimum number of occurrences for a token to be included).
SRC.build_vocab(train_data, min_freq = 3)
TRG.build_vocab(train_data, min_freq = 3)

# Vocabulary sizes, including the special tokens.
print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ru) vocabulary: 1889
Unique tokens in target (en) vocabulary: 1439


In [10]:
# Dump the raw attributes of one training example (its tokenized 'trg' and
# 'src' lists) to check that the two columns are aligned.
print(vars(train_data.examples[9]))

{'trg': ['if', 'you', 'feel', 'like', 'visiting', 'the', 'surroundings', ',', 'check', 'out', 'marari', 'beach', 'that', 'is', '25', 'km', 'and', 'the', 'karunakaran', 'musuem', 'that', 'is', '30', 'km', '.'], 'src': ['желающие', 'исследовать', 'окрестности', 'могут', 'посетить', 'пляж', 'марари', '(', 'в', '25', 'км', ')', 'и', 'музей', 'карунакаран', '(', 'в', '30', 'км', ').']}


In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
def _len_sort_key(x):
    return len(x.src)

# Number of examples per batch for all three iterators.
BATCH_SIZE = 2

# One bucketing iterator per split. Batches are created on `device`, and
# `_len_sort_key` is supplied as the sort key.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=_len_sort_key,
    batch_size=BATCH_SIZE,
    device=device,
)

In [80]:
# Model hyper-parameters. INPUT_DIM / OUTPUT_DIM are the source / target
# vocabulary sizes; the remaining values are passed to the Encoder and Decoder
# constructors.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 8
DEC_EMB_DIM = 8
HID_DIM = 4
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Take a single batch to trace tensor shapes through the model. Pulling only
# the first batch avoids walking the whole training iterator just to keep the
# last one.
sample_batch = next(iter(train_iterator))
sample_src = sample_batch.src
sample_trg = sample_batch.trg

sample_src.shape, sample_trg.shape

(torch.Size([15, 2]), torch.Size([11, 2]))

# Encoder

In [81]:
import my_network

Encoder = my_network.Encoder
# Move the encoder to `device`: the iterators already place every batch there,
# so a model left on the CPU would hit a device mismatch when CUDA is available.
# Assumes Encoder is an nn.Module — TODO confirm in my_network.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)

In [82]:
# Run the encoder on the sample source batch; it returns two tensors that are
# kept as the hidden and cell states.
enc_hid, enc_cell = enc(sample_src)

In [83]:
# Shape check of the encoder states against the input batch:
# hidden = [n layers * n directions, batch size, hid dim]
# cell   = [n layers * n directions, batch size, hid dim]
print(sample_src.shape, enc_hid.shape, enc_cell.shape)

torch.Size([15, 2]) torch.Size([2, 2, 4]) torch.Size([2, 2, 4])


# Decoder

In [84]:
Decoder = my_network.Decoder
# Move the decoder to `device`: the iterators already place every batch there,
# so a model left on the CPU would hit a device mismatch when CUDA is available.
# Assumes Decoder is an nn.Module — TODO confirm in my_network.
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)

In [85]:
# The target batch is 2-D, laid out as [trg sent len, batch size], so both
# sizes can be unpacked from its shape in one step.
max_len, batch_size = sample_trg.shape
trg_vocab_size = OUTPUT_DIM
print(max_len, batch_size, trg_vocab_size)

11 2 1439


In [86]:
# Buffer for the decoder output of every time step, laid out as
# [trg sent len, batch size, trg vocab size]. It is allocated on `device` so
# that it sits on the same device as the target batch when the loss is computed.
outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=device)

In [87]:
# The last hidden state of the encoder is used as the initial hidden state of the decoder.
# Both encoder states (hidden and cell) seed the decoder; print their shapes.
dec_hid, dec_cell = enc_hid, enc_cell
print(dec_hid.shape, dec_cell.shape)

torch.Size([2, 2, 4]) torch.Size([2, 2, 4])


In [88]:
# The first input to the decoder is the <sos> token of every sequence,
# i.e. row 0 of the target batch.
input_ = sample_trg[0,:]
print(input_)
print(input_.shape)
# Shape after adding a leading dimension of size 1.
print(input_.unsqueeze(0).shape)

tensor([2, 2])
torch.Size([2])
torch.Size([1, 2])


In [89]:
# Probability of feeding the ground-truth token, rather than the model's own
# prediction, as the next decoder input.
TEACHER_FORCING_RATIO = 0.5

# Row 0 of the target batch is <sos>, so decoding starts at t = 1 and
# outputs[0] is left at zero.
for t in range(1, max_len):
    # output = [batch size, output dim]
    output, dec_hid, dec_cell = dec(input_, dec_hid, dec_cell)
    outputs[t] = output

    teacher_force = random.random() < TEACHER_FORCING_RATIO
    # max over dim 1 returns (values, indices); the indices form a tensor of
    # size [batch_size].
    top1 = output.max(1).indices
    if teacher_force:
        input_ = sample_trg[t]
    else:
        input_ = top1

In [90]:
# Inspect the decoder output of the last step: max over dim 1 yields a
# (values, indices) pair with one entry per batch element.
output.max(1)

torch.return_types.max(
values=tensor([0.8479, 0.6821], grad_fn=<MaxBackward0>),
indices=tensor([900, 725]))

In [91]:
# Element [1] of the max result holds only the indices (the arg-max positions
# along dim 1).
output.max(1)[1]

tensor([900, 725])

In [92]:
# Preview the reshaping used for the loss: drop time step 0, then merge the
# time and batch dimensions into one.
outputs.shape, outputs[1:].shape, outputs[1:].view(-1, outputs.shape[-1]).shape

(torch.Size([11, 2, 1439]), torch.Size([10, 2, 1439]), torch.Size([20, 1439]))

In [93]:
# Drop the step-0 row and merge the time and batch dimensions:
# outputs = [(trg sent len - 1) * batch size, output dim]
# NOTE: this rebinds `outputs`, so re-running the cell on its own slices off
# another row; re-run the cells that fill `outputs` first.
vocab_dim = outputs.shape[-1]
outputs = outputs[1:].view(-1, vocab_dim)
outputs.shape

torch.Size([20, 1439])

In [94]:
# Preview the matching reshaping of the targets: drop row 0 (<sos>), then
# flatten to a single dimension.
sample_trg[1:].shape, sample_trg[1:].view(-1).shape

(torch.Size([10, 2]), torch.Size([20]))

In [95]:
# Look the padding index up in the target vocabulary instead of hard-coding it,
# so the loss keeps ignoring padding positions even if the set or order of
# special tokens changes.
PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [96]:
# Flatten the targets to line up with the flattened outputs: drop row 0
# (<sos>), then merge the time and batch dimensions.
# NOTE: this rebinds `sample_trg`, so re-running the cell on its own drops one
# more element; re-run the cell that draws the sample batch first.
sample_trg = sample_trg[1:].view(-1)

In [97]:
# Cross-entropy between the flattened decoder outputs and the flattened target
# ids; positions equal to PAD_IDX are ignored. For reference, a uniform guess
# over 1439 target tokens would score ln(1439) ≈ 7.27.
criterion(outputs, sample_trg)

tensor(7.2299, grad_fn=<NllLossBackward0>)

In [98]:
# Final shapes fed to the loss: one row of scores per target position and one
# target id per row.
outputs.shape, sample_trg.shape

(torch.Size([20, 1439]), torch.Size([20]))