In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE
from train_model import train

In [2]:
# getting data
# Location of the EN-RU data file, relative to the notebook's working directory.
path_to_data = '../../datasets/Machine_translation_EN_RU/data.txt'
from data_preprocessing import get_dataset


# get_dataset is a project-local helper; as unpacked below, it returns a
# (train, valid, test) triple of splits and a (source, target) pair of vocabularies.
data, vocab = get_dataset(path_to_data)
train_data, valid_data, test_data = data
src_vocab, trg_vocab = vocab
# Index of the '<pad>' token in the target vocabulary.
PAD_IDX = trg_vocab.stoi['<pad>']


def _len_sort_key(x):
    return len(x.src)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def get_iterators(train_data=train_data,
                  valid_data=valid_data,
                  test_data=test_data,
                  batch_size=512):
    """Build bucketed iterators for the train / validation / test splits.

    The default datasets are the notebook-level splits, bound when this
    function is defined. All three iterators share the same ``batch_size``,
    place their batches on the module-level ``device`` and bucket examples
    with ``_len_sort_key``.

    Returns:
        A ``(train_iterator, valid_iterator, test_iterator)`` tuple.
    """
    datasets = (train_data, valid_data, test_data)
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        datasets,
        batch_size=batch_size,
        device=device,
        sort_key=_len_sort_key,
    )
    return train_iterator, valid_iterator, test_iterator

Number of training examples: 40000
Number of validation examples: 2500
Number of testing examples: 7500
Unique tokens in source (ru) vocabulary: 9260
Unique tokens in target (en) vocabulary: 6708


In [3]:
from Attention import Attention

In [30]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder over embedded source tokens.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: dimensionality of the token embeddings.
        hid_dim: GRU hidden size per direction.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=emb_dim
        )
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout)
        # nn.Linear needs explicit in/out sizes; a bare nn.Linear() raises
        # TypeError and made the class impossible to instantiate.
        # Sized to map the concatenated forward/backward features (2 * hid_dim)
        # back to hid_dim. NOTE(review): forward() does not apply this layer
        # yet -- presumably meant for projecting encoder states; confirm.
        self.fc = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src):
        """Encode a batch of source token indices.

        Args:
            src: integer tensor of token indices, [src len, batch size]
                (the GRU is used with its default sequence-first layout).

        Returns:
            output: [src len, batch size, hid_dim * 2] per-step features from
                both directions.
            hidden: [n_layers * n_directions, batch size, hid_dim] final
                hidden states (here n_layers * n_directions == 2).
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        output, hidden = self.rnn(embedded)
        # output = [src len, batch size, enc hid dim * 2]
        # hidden = [n_layers * n_directions, batch size, hid_dim]
        return output, hidden

In [31]:
# Rebuild the iterators with a batch size of 2 for the shape experiments below.
train_iterator, valid_iterator, test_iterator = get_iterators(batch_size=2)

In [32]:
# Take only the first batch from the training iterator and keep its
# source / target tensors for the experiments below.
for x in train_iterator:
    sample_src = x.src
    sample_trg = x.trg
    break 
# Display both shapes as the cell output.
sample_src.shape, sample_trg.shape

(torch.Size([16, 2]), torch.Size([14, 2]))

In [33]:
# Instantiate a small Encoder and run the sample source batch through it
# to inspect the shapes of its two return values.
enc = Encoder(input_dim = len(src_vocab), emb_dim = 16, hid_dim = 18, dropout = 0.5)
enc_out, enc_hid = enc(sample_src)
print(f"sample shape is {sample_src.shape}")
print(f"enc_out shape is {enc_out.shape}")
print(f"enc_hid shape is {enc_hid.shape}")

sample shape is torch.Size([16, 2])
enc_out shape is torch.Size([16, 2, 36])
enc_hid shape is torch.Size([2, 2, 18])


In [42]:
# Stand-alone GRU with input size 16 and hidden size 18 (matching the emb_dim /
# hid_dim used for the Encoder above) -- presumably a decoder-step prototype.
gru = nn.GRU(16, 18)

In [43]:
# Random stand-in for one embedded time step:
# embedded = [1, batch size = 2, emb dim = 16]
emb = torch.rand((1, 2, 16))

In [50]:
# Run one GRU step, using the last slice of the encoder hidden state
# (enc_hid[-1], unsqueezed back to [1, batch size, hid dim]) as the initial state.
# `out` is the (output, h_n) tuple returned by nn.GRU.
out = gru(emb, enc_hid[-1].unsqueeze(0))
out[0].shape, out[1].shape 

(torch.Size([1, 2, 18]), torch.Size([1, 2, 18]))

In [51]:
# Attention comes from the project-local `Attention` module; 18 is presumably the
# hidden size it operates on -- TODO confirm against its constructor signature.
attent = Attention(18)

In [54]:
# Shape of the GRU step output after swapping the first two axes: [batch size, 1, hid dim].
out[0].transpose(0, 1).shape

torch.Size([2, 1, 18])

In [55]:
# Shape of the encoder outputs after swapping the first two axes: [batch size, src len, hid dim * 2].
enc_out.transpose(0, 1).shape

torch.Size([2, 16, 36])

In [53]:
# NOTE(review): this call fails with the RuntimeError shown below. The first
# argument is [2, 1, 18] while the encoder outputs are [2, 16, 36] (bidirectional
# encoder, 2 * 18), so their last dimensions disagree. Presumably Attention
# combines them with a batched matmul that needs matching sizes -- confirm
# against the Attention implementation before choosing a fix (e.g. projecting
# the encoder outputs to 18 features or using a 36-wide query).
attention_output = attent(out[0].transpose(0, 1),
                          enc_out.transpose(0, 1))

RuntimeError: Expected batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size to be true, but got false.  (Could this error message be improved?  If so, please report an enhancement request to PyTorch.)