In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
!pip install -U spacy



In [4]:
!pip install https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz (15.9 MB)
[K     |████████████████████████████████| 15.9 MB 4.3 MB/s 
[?25hCollecting spacy<3.2.0,>=3.1.0
  Downloading spacy-3.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 1.3 MB/s 
[?25hCollecting pymorphy2>=0.9
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.5 MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 34.1 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: ru-core-news-sm
  Building

In [5]:
spacy_ru = spacy.load('ru_core_news_sm')

  """


In [6]:
def tokenize_ru(text):
  return [tok.text for tok in spacy_ru.tokenizer(text)]

In [7]:
with open('questions_small.txt') as f:
        questions = f.read().splitlines()
with open('answers_small.txt') as f:
        answers = f.read().splitlines()

In [8]:
def tokenize_q(text):
  return [tok.text for tok in questions.tokenizer(text)]
def tokenize_a(text):
  return [tok.text for tok in answers.tokenizer(text)]

In [9]:
SRC = Field(tokenize = tokenize_q, 
            init_token = '[BOS]', 
            eos_token = '[EOS]', 
            lower = True)

TRG = Field(tokenize = tokenize_a, 
            init_token = '[BOS]', 
            eos_token = '[EOS]', 
            lower = True)

In [14]:
tokenize_q

<function __main__.tokenize_q>

In [10]:
from torchtext import datasets, data
from tqdm import tqdm

In [11]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.0


In [12]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [13]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=[ '[EOS]', '[BOS]', '[PAD]'])
tokenizer.train(files=["questions_small.txt", "answers_small.txt"], trainer=trainer)

In [14]:
from torchtext.legacy import data

In [22]:
TEXT = data.Field(
    fix_length=50,
    init_token='[BOS]',
    eos_token='[EOS]',
    pad_token='[PAD]',
    lower=True,
    tokenize=lambda x: tokenizer.encode(x).tokens,
    batch_first=True,
)

fields = (('src', TEXT), ('tgt', TEXT))

In [15]:
!pip install torchtext



In [16]:
from torchtext.data.metrics import bleu_score

In [17]:
from torchtext.legacy import data

In [23]:
with open('questions_small.txt') as f:
    question_snt = list(map(str.strip, f.readlines()))
    
with open('answers_small.txt') as f:
    answer_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(question_snt, answer_snt),total=len(answer_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)



  0%|          | 0/99999 [02:17<?, ?it/s]
  0%|          | 0/99999 [02:07<?, ?it/s]


  1%|          | 731/99999 [00:00<00:17, 5603.91it/s][A[A

  2%|▏         | 1787/99999 [00:00<00:11, 8192.58it/s][A[A

  3%|▎         | 2901/99999 [00:00<00:10, 9452.28it/s][A[A

  4%|▍         | 3937/99999 [00:00<00:09, 9793.39it/s][A[A

  5%|▌         | 5014/99999 [00:00<00:09, 10135.42it/s][A[A

  6%|▌         | 6069/99999 [00:00<00:09, 10270.86it/s][A[A

  7%|▋         | 7105/99999 [00:00<00:15, 6014.16it/s] [A[A

  8%|▊         | 8219/99999 [00:01<00:12, 7101.10it/s][A[A

  9%|▉         | 9230/99999 [00:01<00:11, 7800.55it/s][A[A

 10%|█         | 10330/99999 [00:01<00:10, 8596.87it/s][A[A

 11%|█▏        | 11317/99999 [00:01<00:09, 8921.16it/s][A[A

 12%|█▏        | 12469/99999 [00:01<00:09, 9627.62it/s][A[A

 14%|█▎        | 13628/99999 [00:01<00:08, 10175.72it/s][A[A

 15%|█▍        | 14731/99999 [00:01<00:08, 10418.47it/s][A[A

 16%|█▌        | 15904/99999 [00:01

In [24]:
TEXT.build_vocab(train, max_size=10000, min_freq=3)

In [25]:
TEXT

<torchtext.legacy.data.field.Field at 0x7fc0de514250>

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, valid, test), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      sort_within_batch=True,
                                                                      sort_key=lambda x: len(x.src),
                                                                      device = device)

In [27]:
test_batch = next(iter(test_iterator))
test_batch


[torchtext.legacy.data.batch.Batch of size 32]
	[.src]:[torch.cuda.LongTensor of size 32x50 (GPU 0)]
	[.tgt]:[torch.cuda.LongTensor of size 32x50 (GPU 0)]

In [45]:
class EncoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super(EncoderLSTM, self).__init__()

    # Size of the one hot vectors that will be the input to the encoder
    #self.input_size = input_size

    # Output size of the word embedding NN
    #self.embedding_size = embedding_size

    # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
    self.hidden_size = hidden_size

    # Number of layers in the lstm
    self.num_layers = num_layers

    # Regularization parameter
    self.dropout = nn.Dropout(p)
    self.tag = True

    # Shape --------------------> (5376, 300) [input size, embedding dims]
    self.embedding = nn.Embedding(input_size, embedding_size)
    
    # Shape -----------> (300, 2, 1024) [embedding dims, hidden size, num layers]
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

  # Shape of x (26, 32) [Sequence_length, batch_size]
  def forward(self, x):

    # Shape -----------> (26, 32, 300) [Sequence_length , batch_size , embedding dims]
    embedding = self.dropout(self.embedding(x))
    
    # Shape --> outputs (26, 32, 1024) [Sequence_length , batch_size , hidden_size]
    # Shape --> (hs, cs) (2, 32, 1024) , (2, 32, 1024) [num_layers, batch_size size, hidden_size]
    outputs, (hidden_state, cell_state) = self.LSTM(embedding)

    return hidden_state, cell_state

input_size_encoder = len(TEXT.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10004, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [46]:
input_size_encoder = len(TEXT.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10004, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [37]:
hidden_state_encoder, cell_state = encoder_lstm(test_batch.src)
cell_state.shape, hidden_state_encoder.shape

(torch.Size([2, 50, 1024]), torch.Size([2, 50, 1024]))

In [38]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=64):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.emb_layer = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.emb_layer(input).view(1, 1, -1)
    
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights
      

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [65]:
output_size = len(TEXT.vocab)
'''attention = AttnDecoderRNN(hidden_size, output_size).to(device)
output, hidden, attention_weight = attention(hidden_state, cell_state)
attention_weight.shape'''
attention = AttnDecoderRNN(hidden_size, output_size).to(device)

In [47]:
class DecoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super(DecoderLSTM, self).__init__()

    # Size of the one hot vectors that will be the input to the encoder
    #self.input_size = input_size

    # Output size of the word embedding NN
    #self.embedding_size = embedding_size

    # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
    self.hidden_size = hidden_size

    # Number of layers in the lstm
    self.num_layers = num_layers

    # Size of the one hot vectors that will be the output to the encoder (English Vocab Size)
    self.output_size = output_size

    # Regularization parameter
    self.dropout = nn.Dropout(p)

    # Shape --------------------> (5376, 300) [input size, embedding dims]
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Shape -----------> (300, 2, 1024) [embedding dims, hidden size, num layers]
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

    # Shape -----------> (1024, 4556) [embedding dims, hidden size, num layers]
    self.fc = nn.Linear(hidden_size, output_size)
    unk_replace: True

  # Shape of x (32) [batch_size]
  def forward(self, x, hidden_state, cell_state):

    # Shape of x (1, 32) [1, batch_size]
    x = x.unsqueeze(0)

    # Shape -----------> (1, 32, 300) [1, batch_size, embedding dims]
    embedding = self.dropout(self.embedding(x))

    # Shape --> outputs (1, 32, 1024) [1, batch_size , hidden_size]
    # Shape --> (hs, cs) (2, 32, 1024) , (2, 32, 1024) [num_layers, batch_size size, hidden_size] (passing encoder's hs, cs - context vectors)
    outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))
    # Shape --> predictions (1, 32, 4556) [ 1, batch_size , output_size]
    predictions = self.fc(outputs)

    # Shape --> predictions (32, 4556) [batch_size , output_size]
    predictions = predictions.squeeze(0)

    return predictions, hidden_state, cell_state



In [48]:
input_size_decoder = len(TEXT.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(TEXT.vocab)

decoder_lstm = DecoderLSTM(input_size_decoder, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)
print(decoder_lstm)

DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10004, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=10004, bias=True)
)


In [34]:

'''attention_scores = torch.bmm(decoder_hidden, encoder_hidden.transpose(1, 2))
# заметим, что у нас добавилась одна размерность и поэтому чуть меняем софтмакc
attention_distribution = torch.softmax(attention_scores, 2)
attention_vectors = torch.bmm(attention_distribution, encoder_hidden)
decoder_with_attention = torch.cat([decoder_hidden, attention_vectors], dim=-1)'''


In [83]:
class Seq2Seq(nn.Module):
  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM


  def forward(self, source, target, tfr=0.5):
    # Shape - Source : (10, 32) [(Sentence length German + some padding), Number of Sentences]
    batch_size = source.shape[1]

    # Shape - Source : (14, 32) [(Sentence length English + some padding), Number of Sentences]
    target_len = target.shape[0]
    target_vocab_size = len(TEXT.vocab)
    
    # Shape --> outputs (14, 32, 5766) 
    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

    # Shape --> (hs, cs) (2, 32, 1024) ,(2, 32, 1024) [num_layers, batch_size size, hidden_size] (contains encoder's hs, cs - context vectors)
    hidden_state_encoder, cell_state = self.Encoder_LSTM(source)

    # attention_scores = torch.bmm(hidden_state_decoder, hidden_state_encoder.transpose(1, 2))
    # заметим, что у нас добавилась одна размерность и поэтому чуть меняем софтмакc
    # attention_distribution = torch.softmax(attention_scores, 2)
    # attention_vectors = torch.bmm(attention_distribution, hidden_state_encoder)


    # Shape of x (32 elements)
    x = target[0] # Trigger token <SOS>

    for i in range(1, target_len):
      # Shape --> output (32, 5766) 
      output, hidden_state_decoder, cell_state = self.Decoder_LSTM(x, hidden_state_encoder, cell_state)
      attention_scores = torch.bmm(hidden_state_decoder, hidden_state_encoder.transpose(1, 2))
      attention_distribution = torch.softmax(attention_scores, 2)
      attention_vectors = torch.bmm(attention_distribution, hidden_state_encoder)
      outputs[i] = output
      decoder_with_attention = torch.cat([hidden_state_decoder, attention_vectors], dim=-1)
      best_guess = output.argmax(1) # 0th dimension is batch size, 1st dimension is word embedding
      x = target[i] if random.random() < tfr else best_guess # Either pass the next word correctly from the dataset or use the earlier predicted word

    # Shape --> outputs (14, 32, 5766) 
    return outputs


In [84]:
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary

In [85]:
# Hyperparameters

learning_rate = 0.001
writer = SummaryWriter(f"runs/loss_plot")
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = TEXT.vocab.stoi['[PAD]']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [86]:
model

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10004, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10004, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=10004, bias=True)
  )
)

In [87]:
def translate_sentence(model, sentence, TEXT, device, max_length=50):
    spacy_ger = spacy.load("ru_core_news_sm")

    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens.insert(0, TEXT.init_token)
    tokens.append(TEXT.eos_token)
    text_to_indices = [TEXT.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden_state_encoder, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [TEXT.vocab.stoi['[BOS]']]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden_state_decoder, cell = model.Decoder_LSTM(previous_word, hidden_state_encoder, cell)
            attention_scores = torch.bmm(hidden_state_decoder, hidden_state_encoder.transpose(1, 2))
            attention_distribution = torch.softmax(attention_scores, 2)
            attention_vectors = torch.bmm(attention_distribution, hidden_state_encoder)
            decoder_with_attention = torch.cat([hidden_state_decoder, attention_vectors], dim=-1)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == TEXT.vocab.stoi['[EOS]']:
            break

    translated_sentence = [TEXT.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    print('saving')
    print()
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, '/content/checkpoint-NMT')
    torch.save(model.state_dict(),'/content/checkpoint-NMT-SD')

In [46]:
question_snt[3] 

'нужен ли автомобиль семье с маленьким ребенком?'

In [47]:
test_batch = next(iter(test_iterator))
test_batch


[torchtext.legacy.data.batch.Batch of size 32]
	[.src]:[torch.cuda.LongTensor of size 32x50 (GPU 0)]
	[.tgt]:[torch.cuda.LongTensor of size 32x50 (GPU 0)]

In [None]:
epoch_loss = 0.0
num_epochs = 100
best_loss = 999999
best_epoch = -1
sentence1 = "нужен ли автомобиль семье с маленьким ребенком?"
ts1  = []

for epoch in range(num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, TEXT, device, max_length=50)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  for batch_idx, batch in tqdm(enumerate(train_iterator), total=len(train_iterator)):
    input = batch.src.to(device)
    target = batch.tgt.to(device)


    # Pass the input and target for model's forward method
    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()

    # Calculate the loss value for every epoch
    loss = criterion(output, target)

    # Calculate the gradients for weights & biases using back-propagation
    loss.backward()

    # Clip the gradient value is it exceeds > 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights values using the gradients we calculated using bp 
    optimizer.step()
    step += 1
    epoch_loss += loss.item()
    writer.add_scalar("Training loss", loss, global_step=step)

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 
    if ((epoch - best_epoch) >= 10):
      print("no improvement in 10 epochs, break")
      break
  print("Epoch_Loss - {}".format(loss.item()))
  print()
  
print(epoch_loss / len(train_iterator))

Epoch - 1 / 100


  """


Translated example sentence 1: 
 ['кали', 'нас', 'летних', 'вконтакте', 'вконтакте', 'перестать', 'перестать', 'перестать', 'перестать', 'перестать', 'перестать', 'влади', 'влади', 'ооооо', 'ооооо', 'девочкам', 'перестать', 'перестать', 'перестать', 'перестать', 'влади', 'влади', 'ооооо', 'ооооо', 'девочкам', 'перестать', 'перестать', 'перестать', 'перестать', 'влади', 'влади', 'ооооо', 'ооооо', 'девочкам', 'перестать', 'перестать', 'перестать', 'перестать', 'влади', 'влади', 'ооооо', 'ооооо', 'девочкам', 'перестать', 'перестать', 'перестать', 'перестать', 'влади', 'влади', 'ооооо']


 19%|█▊        | 519/2785 [06:29<28:24,  1.33it/s]