In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [34]:
SEED = 1234

# Seed every random number generator the notebook touches (Python, NumPy,
# PyTorch CPU and the current CUDA device) with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
  _seed_rng(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [8]:
!pip install https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz (15.9 MB)
[K     |████████████████████████████████| 15.9 MB 5.2 MB/s 
[?25hCollecting spacy<3.2.0,>=3.1.0
  Downloading spacy-3.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 5.1 MB/s 
[?25hCollecting pymorphy2>=0.9
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.9 MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 27.9 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.6-p

In [35]:
# Load the small Russian spaCy pipeline once; `tokenize_ru` uses its tokenizer.
spacy_ru = spacy.load('ru_core_news_sm')

In [36]:
def tokenize_ru(text):
  """Split `text` into a list of token strings with the `spacy_ru` tokenizer."""
  pieces = []
  for token in spacy_ru.tokenizer(text):
    pieces.append(token.text)
  return pieces

In [None]:
# with open('questions_small.txt') as f:
#         questions = f.read().splitlines()
# with open('answers_small.txt') as f:
#         answers = f.read().splitlines()

In [None]:
# def tokenize_q(text):
#   return [tok.text for tok in questions.tokenizer(text)]
# def tokenize_a(text):
#   return [tok.text for tok in answers.tokenizer(text)]

In [None]:
# SRC = Field(tokenize = tokenize_q, 
#             init_token = '<sos>', 
#             eos_token = '<eos>', 
#             lower = True)

# TRG = Field(tokenize = tokenize_a, 
#             init_token = '<sos>', 
#             eos_token = '<eos>', 
#             lower = True)

In [6]:
from torchtext import datasets, data
from tqdm import tqdm

In [7]:
# A single Field is shared by questions and answers, so both sides use the
# same tokenizer, lower-casing, <sos>/<eos> markers and vocabulary.
TEXT = Field(
    tokenize=tokenize_ru,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

# (attribute name, Field) pairs consumed by `Example.fromlist` / `Dataset`.
fields = tuple((column, TEXT) for column in ('src', 'trg'))

In [8]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 4.3 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.0


In [8]:
from torchtext.legacy import data

In [9]:
# Read the parallel question/answer corpora (one sentence per line).
# The files contain Cyrillic text, so the encoding is stated explicitly instead
# of relying on the platform default.
with open('questions_small.txt', encoding='utf-8') as f:
    question_snt = [line.strip() for line in f]

with open('answers_small.txt', encoding='utf-8') as f:
    answer_snt = [line.strip() for line in f]

# zip() stops at the shorter list, so size the progress bar accordingly.
pair_count = min(len(question_snt), len(answer_snt))
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(question_snt, answer_snt), total=pair_count)]

# The last 1000 pairs are held out for testing; the rest is split 90/10 into train/validation.
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

100%|██████████| 99999/99999 [00:15<00:00, 6629.34it/s]


In [10]:
# Build the shared vocabulary from the training split only (max_size=30000, min_freq=5).
TEXT.build_vocab(train, max_size=30000, min_freq=5)

In [11]:
# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 300

# Bucketed batches over the three splits, keyed by source length and sorted
# within each batch; tensors are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
)

In [12]:
class EncoderLSTM(nn.Module):
  """Embeds a batch of token ids and encodes it with a stacked LSTM.

  Args:
    input_size: number of rows of the embedding table (vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and between LSTM layers.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()

    # Plain attributes recording the LSTM geometry.
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Dropout applied to the embedded tokens in forward().
    self.dropout = nn.Dropout(p)
    # Not read anywhere inside this class.
    self.tag = True

    # Token id -> dense vector of size `embedding_size`.
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Sequence-first LSTM (batch_first is left at its default).
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

  def forward(self, x):
    """Encode `x` and return only the final states.

    Args:
      x: LongTensor of token ids, laid out [sequence_length, batch_size].

    Returns:
      (hidden_state, cell_state), each [num_layers, batch_size, hidden_size].
      The per-step LSTM outputs are discarded.
    """
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)
    _, (hidden_state, cell_state) = self.LSTM(embedded)
    return hidden_state, cell_state

# Encoder hyperparameters; the vocabulary is the shared question/answer vocabulary.
num_layers = 2
hidden_size = 1024
encoder_dropout = 0.5
encoder_embedding_size = 300
input_size_encoder = len(TEXT.vocab)

encoder_lstm = EncoderLSTM(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(18052, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [13]:
class DecoderLSTM(nn.Module):
  """Single-step LSTM decoder: one token per batch element in, vocabulary scores out.

  Args:
    input_size: number of rows of the embedding table (vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and between LSTM layers.
    output_size: number of scores produced per step (width of the final linear layer).
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super().__init__()

    # Plain attributes recording the decoder geometry.
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size

    # Dropout applied to the embedded input token.
    self.dropout = nn.Dropout(p)

    # Token id -> dense vector of size `embedding_size`.
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Sequence-first LSTM (batch_first is left at its default).
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    # Projects each LSTM output vector onto `output_size` scores.
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden_state, cell_state):
    """Run one decoding step.

    Args:
      x: LongTensor of token ids, one per batch element ([batch_size]).
      hidden_state: previous hidden state, [num_layers, batch_size, hidden_size].
      cell_state: previous cell state, same layout as `hidden_state`.

    Returns:
      (predictions, hidden_state, cell_state) where `predictions` is
      [batch_size, output_size] and the states are the updated LSTM states.
    """
    # Add a length-1 sequence axis: [batch_size] -> [1, batch_size].
    step_input = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_input))

    lstm_out, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

    # [1, batch_size, output_size] -> [batch_size, output_size].
    predictions = self.fc(lstm_out).squeeze(0)
    return predictions, hidden_state, cell_state

# Decoder hyperparameters; input and output vocabularies are the same shared one.
num_layers = 2
hidden_size = 1024
decoder_dropout = 0.5
decoder_embedding_size = 300
input_size_decoder = len(TEXT.vocab)
output_size = len(TEXT.vocab)

decoder_lstm = DecoderLSTM(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    num_layers,
    decoder_dropout,
    output_size,
)
decoder_lstm = decoder_lstm.to(device)
print(decoder_lstm)

DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(18052, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=18052, bias=True)
)


In [14]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes step by step with teacher forcing.

  Args:
    Encoder_LSTM: module mapping a source batch to (hidden_state, cell_state).
    Decoder_LSTM: module mapping (token batch, hidden_state, cell_state) to
      (scores, hidden_state, cell_state); must expose `output_size`.
  """

  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    """Compute decoder scores for every target position.

    Args:
      source: LongTensor [source_length, batch_size] of source token ids.
      target: LongTensor [target_length, batch_size] of target token ids;
        row 0 is fed to the decoder as the first input.
      tfr: teacher-forcing ratio, the probability of feeding the ground-truth
        token (instead of the model's own argmax) as the next decoder input.

    Returns:
      Tensor [target_length, batch_size, output_size]. Row 0 is left as zeros
      because decoding starts at position 1.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]

    # Sizes and device come from the model and its inputs, not from notebook
    # globals, so the module still works after `device` is rebound.
    target_vocab_size = self.Decoder_LSTM.output_size
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    # The encoder's final states are the decoder's initial states.
    hidden_state, cell_state = self.Encoder_LSTM(source)

    # First decoder input: row 0 of the target batch.
    x = target[0]

    for i in range(1, target_len):
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      # Highest-scoring vocabulary index for each batch element.
      best_guess = output.argmax(1)
      # Teacher forcing: feed the reference token with probability `tfr`.
      x = target[i] if random.random() < tfr else best_guess

    return outputs


In [15]:
from torch.utils.tensorboard import SummaryWriter
# from torchsummary import summary

In [16]:
# Hyperparameters

learning_rate = 0.001

# TensorBoard logging: `step` is the global batch counter used as the x-axis.
step = 0
writer = SummaryWriter("runs/loss_plot")

# Full model and its optimiser.
model = Seq2Seq(encoder_lstm, decoder_lstm)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions are excluded from the loss.
pad_idx = TEXT.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [17]:
model

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=18052, bias=True)
  )
)

In [18]:
_RU_NLP_CACHE = {}


def _get_ru_nlp():
    """Load the `ru_core_news_sm` spaCy pipeline on first use and reuse it afterwards."""
    if 'nlp' not in _RU_NLP_CACHE:
        _RU_NLP_CACHE['nlp'] = spacy.load("ru_core_news_sm")
    return _RU_NLP_CACHE['nlp']


def translate_sentence(model, sentence, TEXT, device, max_length=50):
    """Greedy-decode a reply for `sentence`.

    Args:
        model: object exposing `Encoder_LSTM` and `Decoder_LSTM` callables.
            The caller is responsible for putting it into eval mode.
        sentence: a raw string (tokenized with the spaCy tokenizer, the same
            way `tokenize_ru` does) or an iterable of token strings.
        TEXT: field providing `init_token`, `eos_token`, `vocab.stoi` and `vocab.itos`.
        device: device the index tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        List of generated token strings without the leading start token.
        The end-of-sentence token is included when the model emitted it.
    """
    # The spaCy model is only needed (and only loaded) for raw strings.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _get_ru_nlp().tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [TEXT.init_token] + tokens + [TEXT.eos_token]
    text_to_indices = [TEXT.vocab.stoi[token] for token in tokens]
    # Column vector: [sequence_length, 1] (a batch holding a single sentence).
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    sos_idx = TEXT.vocab.stoi[TEXT.init_token]
    eos_idx = TEXT.vocab.stoi[TEXT.eos_token]
    outputs = [sos_idx]

    with torch.no_grad():
        # Encoder final states seed the decoder.
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Stop as soon as the model predicts the end of the sentence.
            if best_guess == eos_idx:
                break

    return [TEXT.vocab.itos[idx] for idx in outputs[1:]]

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    """Write a full training checkpoint and a weights-only checkpoint under `content/`.

    Args:
        model: the module to save; it is pickled whole in the full checkpoint
            and its `state_dict()` is saved separately.
        best_loss: best loss so far, stored in the full checkpoint.
        epoch: epoch index, stored in the full checkpoint.
        optimizer: optimiser whose `state_dict()` is stored in the full checkpoint.
        epoch_loss: accepted for call-site compatibility; not stored.
    """
    import os

    print('saving')
    print()

    full_path = 'content/checkpoint-NMT_small'
    weights_path = 'content/checkpoint-NMT-SD_small'
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(full_path), exist_ok=True)

    state = {
        'model': model,
        'best_loss': best_loss,
        'epoch': epoch,
        'rng_state': torch.get_rng_state(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, full_path)
    torch.save(model.state_dict(), weights_path)

In [None]:
question_snt[3] 

'нужен ли автомобиль семье с маленьким ребенком?'

In [None]:
num_epochs = 100
patience = 10               # stop after this many epochs without improvement
best_loss = float("inf")
best_epoch = -1
epoch_loss = 0.0            # summed training loss of the current epoch
sentence1 = "ты в кого тут влюбилась ( ся ) признавайся ?"
ts1  = []                   # reply to `sentence1` recorded before every epoch

num_batches = max(len(train_iterator), 1)

for epoch in range(num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))

  # Qualitative check: decode the probe sentence with dropout disabled.
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, TEXT, device, max_length=50)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  # Reset per epoch; otherwise the sum keeps growing and no later epoch can
  # ever beat the first one.
  epoch_loss = 0.0
  for batch in tqdm(train_iterator, total=len(train_iterator)):
    src = batch.src.to(device)
    target = batch.trg.to(device)

    # Clear gradients left over from the previous batch.
    optimizer.zero_grad()

    # Forward pass with teacher forcing (see Seq2Seq.forward).
    output = model(src, target)
    # Position 0 is never predicted; flatten the rest for CrossEntropyLoss.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    loss = criterion(output, target)
    loss.backward()

    # Clip the global gradient norm to 1.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()
    step += 1
    batch_loss = loss.item()
    epoch_loss += batch_loss
    writer.add_scalar("Training loss", batch_loss, global_step=step)

  # NOTE(review): model selection uses the training loss; `valid_iterator`
  # is built earlier in the notebook but not used here.
  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)
  elif (epoch - best_epoch) >= patience:
    # This check must live outside the improvement branch, where
    # epoch == best_epoch and the condition could never be true.
    print("no improvement in {} epochs, break".format(patience))
    break
  print("Epoch_Loss - {}".format(epoch_loss / num_batches))
  print()

# Mean training loss per batch of the last epoch that ran.
print(epoch_loss / num_batches)

Epoch - 1 / 100


  0%|          | 0/297 [00:00<?, ?it/s]

Translated example sentence 1: 
 ['а', 'я', 'скажу', '.', '<eos>']


 38%|███▊      | 112/297 [02:19<03:50,  1.24s/it]


KeyboardInterrupt: 

In [None]:
# Manual checkpoint of the current model state (the recorded output of the training cell ends in KeyboardInterrupt).
checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 

saving



In [29]:
# Detokenized version of every probe reply collected in `ts1` during training.
progress = []
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
progress.extend(TreebankWordDetokenizer().detokenize(reply_tokens) for reply_tokens in ts1)
print(progress)

NameError: ignored

In [None]:
# Decode a few hand-picked questions with dropout disabled.
model.eval()
test_sentences  = ["нужен ли автомобиль семье с маленьким ребенком?", "Что делать?", "Когда спать?"]
pred_sentences = []

for question in test_sentences:
  reply_tokens = translate_sentence(model, question, TEXT, device, max_length=50)
  # The detokenized reply is appended to the same `progress` list as the training probes.
  progress.append(TreebankWordDetokenizer().detokenize(reply_tokens))
  print(f"? : {question}")
  print(f" : {progress[-1]}")
  print()


? : нужен ли автомобиль семье с маленьким ребенком?
 : если только суббота <eos>

? : Что делать?
 : <unk> <unk> <unk> отмечать . <eos>

? : Когда спать?
 : когда уже печень . <eos>



# ChatBot

In [19]:
# Load the weights-only checkpoint onto the CPU. The file is opened in a
# `with` block so the handle is closed once the tensors have been read.
# NOTE(review): checkpoint_and_save writes this file under `content/`, while it
# is read from the working directory here - confirm the path.
with open("checkpoint-NMT-SD_small", 'rb') as checkpoint_file:
    model.load_state_dict(torch.load(checkpoint_file, map_location=torch.device("cpu")))

model.to("cpu")

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=18052, bias=True)
  )
)

In [20]:
# Put the model into evaluation mode before serving replies.
model.eval()

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18052, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=18052, bias=True)
  )
)

In [21]:
import os

# SECURITY: the bot token is a secret and must not be stored in the notebook.
# It is read from the environment instead. A token that has already been
# committed should be revoked via @BotFather and replaced.
BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
if not BOT_TOKEN:
    raise RuntimeError("Set the TELEGRAM_BOT_TOKEN environment variable before starting the bot.")

In [26]:
!pip install pytelegrambotapi

Collecting pytelegrambotapi
  Downloading pyTelegramBotAPI-4.2.2.tar.gz (140 kB)
[?25l[K     |██▎                             | 10 kB 31.4 MB/s eta 0:00:01[K     |████▋                           | 20 kB 18.9 MB/s eta 0:00:01[K     |███████                         | 30 kB 10.2 MB/s eta 0:00:01[K     |█████████▎                      | 40 kB 8.3 MB/s eta 0:00:01[K     |███████████▋                    | 51 kB 5.2 MB/s eta 0:00:01[K     |██████████████                  | 61 kB 5.3 MB/s eta 0:00:01[K     |████████████████▎               | 71 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▋             | 81 kB 5.9 MB/s eta 0:00:01[K     |█████████████████████           | 92 kB 6.1 MB/s eta 0:00:01[K     |███████████████████████▎        | 102 kB 4.7 MB/s eta 0:00:01[K     |█████████████████████████▋      | 112 kB 4.7 MB/s eta 0:00:01[K     |████████████████████████████    | 122 kB 4.7 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 133 kB 4.7 MB/s eta 0:0

In [22]:
import telebot

# Telegram client used to register the message handler and to poll for updates.
bot = telebot.TeleBot(BOT_TOKEN)

In [30]:
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [25]:
# Rebind the global `device` to the CPU; the bot handler passes it to `translate_sentence`.
device = 'cpu'

In [37]:
@bot.message_handler(content_types=['text'])
def get_text_messages(message):
    """Reply to a text message: greet on /start, hint on /help, otherwise answer with the model.

    Args:
        message: incoming Telegram message; `message.text` is the user's text
            and `message.from_user.id` is used as the chat to reply to.
    """
    if message.text == "/start":
        bot.send_message(message.from_user.id, "Привет, задай мне любой вопрос")
    elif message.text == "/help":
        bot.send_message(message.from_user.id, "Задай свой вопрос")
    else:
        question = message.text
        answer_tokens = translate_sentence(model, question, TEXT, device, max_length=50)

        # Drop special tokens before detokenizing so they leave no stray spaces.
        special_tokens = ('<unk>', '<eos>', '<sos>', '<pad>')
        answer_tokens = [token for token in answer_tokens if token not in special_tokens]
        answer = TreebankWordDetokenizer().detokenize(answer_tokens).strip()

        # Telegram rejects messages with empty text, which would happen when
        # the model produced nothing but special tokens.
        if not answer:
            answer = "Не знаю, что ответить. Попробуй спросить по-другому."

        bot.send_message(message.from_user.id, answer)

In [38]:
# Start polling Telegram for updates and dispatch them to the registered handler.
# NOTE(review): none_stop=True presumably keeps polling after errors - confirm against the pyTelegramBotAPI docs.
bot.polling(none_stop=True, interval=0)