<a href="https://colab.research.google.com/github/ArtemNechaev/stepik_nnets/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Samsung-IT-Academy/stepik-dl-nlp.git && pip install -r stepik-dl-nlp/requirements.txt
import sys; sys.path.append('./stepik-dl-nlp')

Cloning into 'stepik-dl-nlp'...
remote: Enumerating objects: 293, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 293 (delta 12), reused 12 (delta 5), pack-reused 266[K
Receiving objects: 100% (293/293), 42.27 MiB | 20.20 MiB/s, done.
Resolving deltas: 100% (141/141), done.
Collecting spacy-udpipe
  Downloading spacy_udpipe-1.0.0-py3-none-any.whl (11 kB)
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.5 MB/s 
Collecting ipymarkup
  Downloading ipymarkup-0.9.0-py3-none-any.whl (14 kB)
Collecting youtokentome
  Downloading youtokentome-1.0.6-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.5 MB/s 
Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Collecting gensim==3.8.1
  Downloading gensim-3.8.1-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |█████████████████

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import math

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

from dlnlputils.pipeline import train_eval_loop, predict_with_model, init_random_seed

# Fix the seed through the course helper imported from dlnlputils.pipeline so
# that runs are repeatable.
# NOTE(review): presumably this seeds the Python/NumPy/PyTorch RNGs — confirm in dlnlputils.
init_random_seed(765)


In [None]:
# Read the whole quotes file; the context manager closes the handle, which the
# bare open() call never did.
with open('./stepik-dl-nlp/datasets/author_quotes.txt') as input_file:
  quotes_text = input_file.read()

# Drop the final newline only when it is really there, so that a file without a
# trailing newline does not lose the last character of its last quote.
if quotes_text.endswith('\n'):
  quotes_text = quotes_text[:-1]

# One quote per line.
quotes = quotes_text.split('\n')


In [None]:
# A token is either a run of 1-3 word characters or a single whitespace
# character; anything else (punctuation) is skipped by findall.
tokenizer = re.compile(r'[\w\d]{1,3}|\s')

# Lower-case every quote and split it into tokens.
tokenize_quotes = list(map(tokenizer.findall, map(str.lower, quotes)))

# Preview the tokenisation of one quote.
",".join(tokenize_quotes[2])

'did, ,you, ,eve,r, ,sto,p, ,to, ,thi,nk, ,and, ,for,get, ,to, ,sta,rt, ,aga,in'

In [None]:
# Service tokens occupy the first four ids: 0=<PAD>, 1=<UNK>, 2=<BEGIN>, 3=<END>.
special_tokens = ['<PAD>', '<UNK>', '<BEGIN>', '<END>']

# Sorted distinct tokens across all quotes.
unique_tokens = list(np.unique(np.concatenate(tokenize_quotes)))

# token -> integer id, service tokens first.
vocab = {token: index for index, token in enumerate(special_tokens + unique_tokens)}

# Peek at the tail of the vocabulary.
list(vocab.items())[-5:]

[('zyi', 4544), ('zyk', 4545), ('zze', 4546), ('zzi', 4547), ('zzl', 4548)]

In [None]:
from typing import Dict
class SeqDataset(Dataset):
  """Padded next-token prediction dataset.

  Every sentence is stored as one row ``<BEGIN> t1 ... tn <END> <PAD> ...`` of
  width ``max_sentence_length + 2``. Item ``i`` is the pair
  ``(row[:-1], row[1:])``: the input sequence and the same sequence shifted
  left by one position (the targets).

  Args:
    data: sequence of sentences, each a sequence of tokens.
    vocab: token -> id mapping. The ids of ``<PAD>``, ``<UNK>``, ``<BEGIN>``
      and ``<END>`` are looked up in it, falling back to 0, 1, 2 and 3.
      Tokens missing from ``vocab`` are encoded as ``<UNK>``.
  """

  def __init__(self, data, vocab: Dict):
    super().__init__()
    pad_id = vocab.get('<PAD>', 0)
    unk_id = vocab.get('<UNK>', 1)
    begin_id = vocab.get('<BEGIN>', 2)
    end_id = vocab.get('<END>', 3)

    # default=0 keeps an empty dataset constructible (zero rows).
    max_length = max((len(d) for d in data), default=0)
    self.data = torch.full((len(data), max_length + 2), pad_id, dtype=torch.long)
    self.data[:, 0] = begin_id
    for n_sent, sentence in enumerate(data):
      token_ids = [vocab.get(token, unk_id) for token in sentence]
      if token_ids:
        self.data[n_sent, 1:len(token_ids) + 1] = torch.tensor(token_ids, dtype=torch.long)
      # Place <END> from the sentence length rather than from a leaked loop
      # variable, which is stale (or undefined) when a sentence is empty.
      self.data[n_sent, len(token_ids) + 1] = end_id

  def __len__(self):
    # One item per sentence; the last sentence is no longer dropped.
    return self.data.shape[0]

  def __getitem__(self, id):
    # Inputs are all positions but the last, targets all but the first.
    return self.data[id, :-1], self.data[id, 1:]

In [None]:
# Encode all tokenised quotes into one padded id tensor.
dataset = SeqDataset(tokenize_quotes, vocab)

# 90 % of the items go to training, the remainder to validation.
n_samples = len(dataset)
train_size = int(n_samples*0.9)
train_dataset, val_dataset = random_split(dataset, [train_size, n_samples - train_size])

# Inspect one (input, target) pair.
train_dataset[0]

(tensor([    2,  3100,   715,  1285, 10463,   611,  6899,  8914,  6768,  9598,
          8218, 10899,  4835,   747,  7138,   710, 11641,  6777, 10192,  6357,
           662, 10237,  4610,  9443, 10288,   672,  9588,   662,  9991,  7211,
           792,  5969,  9513,  5624, 10463,   758,  4684,  5221,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [None]:
class Model(nn.Module):
  """LSTM language model: embedding -> single-layer LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token ids (size of the embedding table and
      of the output layer).
    emb_size: embedding dimension.
    h_size: LSTM hidden state dimension.
  """

  def __init__(self, vocab_size, emb_size=64, h_size=64):
    super().__init__()
    # Index 0 is registered as the padding index of the embedding table.
    self.embed = nn.Embedding(vocab_size, emb_size, padding_idx=0)
    self.RNN = nn.LSTM(emb_size, h_size, batch_first=True)
    self.fc = nn.Linear(h_size, vocab_size)

  def forward(self, x):
    """Score the next token for every position.

    x - tensor BatchSize x MaxSeqLen of token ids.

    Returns a tensor BatchSize x VocabSize x MaxSeqLen (the vocabulary axis is
    moved to dimension 1).
    """
    embedded = self.embed(x)
    hidden_states, _ = self.RNN(embedded)
    scores = self.fc(hidden_states)
    return scores.transpose(1, 2)

    

In [None]:
rnn = Model(len(vocab))
# Rows are right-padded with id 0 and most target positions are padding, so
# padding targets are excluded from the loss; otherwise the average is dominated
# by the trivial "predict <PAD>" positions and says little about real tokens.
loss = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# Train the language model with the course training loop, which returns the
# best validation loss together with the corresponding model.
# NOTE(review): early_stopping_patience (30) exceeds epoch_n (10), so early
# stopping presumably can never trigger — confirm against train_eval_loop.
(best_val_loss,
 rnn_best_model) = train_eval_loop(
    rnn,
    train_dataset,
    val_dataset,
    loss,
    lr=2e-2,
    epoch_n=10,
    batch_size=128,
    # Fall back to the CPU instead of crashing on a runtime without a GPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    early_stopping_patience=30,
    max_batches_per_epoch_train=500,
    max_batches_per_epoch_val=100,
    lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                                             verbose=True),
)

Эпоха 0
Эпоха: 255 итераций, 79.13 сек
Среднее значение функции потерь на обучении 1.6991025863909255
Среднее значение функции потерь на валидации 1.715894814195304
Новая лучшая модель!

Эпоха 1
Эпоха: 255 итераций, 78.90 сек
Среднее значение функции потерь на обучении 1.6375979666616403
Среднее значение функции потерь на валидации 1.7340531513608735

Эпоха 2
Эпоха: 255 итераций, 78.73 сек
Среднее значение функции потерь на обучении 1.6512958320916868
Среднее значение функции потерь на валидации 1.749472083716557

Эпоха 3
Досрочно остановлено пользователем


In [None]:
def generarate_text(generator, temperature=1, max_len=134, token_to_id=None):
  """Sample a text from a language model, one token at a time.

  Generation starts from ``<BEGIN>`` and stops when ``<END>`` is sampled or
  after ``max_len`` sampled tokens, whichever comes first.

  Args:
    generator: callable mapping a ``1 x T`` tensor of token ids to scores of
      shape ``1 x VocabSize x T``. If it exposes ``parameters()``, the sequence
      is created on the device of its first parameter; otherwise on the CPU.
    temperature: positive divisor applied to the scores before the softmax;
      smaller values make sampling greedier.
    max_len: maximum number of tokens to sample.
    token_to_id: token -> id mapping; defaults to the notebook-level ``vocab``.

  Returns:
    The generated tokens joined into one string, ``<BEGIN>`` excluded.

  Raises:
    ValueError: if ``temperature`` is not positive.
  """
  if temperature <= 0:
    raise ValueError('temperature must be positive')
  if token_to_id is None:
    token_to_id = vocab
  begin_id = token_to_id.get('<BEGIN>', 2)
  end_id = token_to_id.get('<END>', 3)
  # Explicit inverse mapping instead of relying on dict insertion order.
  id_to_token = {index: token for token, index in token_to_id.items()}

  # Follow the model's device rather than assuming a GPU is present.
  try:
    device = next(generator.parameters()).device
  except (AttributeError, StopIteration):
    device = torch.device('cpu')

  seq = torch.tensor([[begin_id]], dtype=torch.long, device=device)
  # Sampling needs no gradients; skip building the autograd graph.
  with torch.no_grad():
    for _ in range(max_len):
      last_scores = generator(seq).permute(0, 2, 1)[0, -1]
      probs = (last_scores / temperature).softmax(-1).cpu().numpy().astype(np.float64)
      # Renormalise in float64 so np.random.choice never rejects the
      # distribution because of float32 rounding.
      probs /= probs.sum()
      new_token = int(np.random.choice(len(probs), p=probs))
      if new_token == end_id:
        break
      new_token = torch.tensor([[new_token]], dtype=torch.long, device=device)
      seq = torch.cat([seq, new_token], dim=1)

  return ''.join(id_to_token[ix] for ix in seq[0].tolist() if ix != begin_id)

In [None]:
# Draw ten independent samples from the best model at a low temperature.
for _ in range(10):
    sample = generarate_text(rnn_best_model, temperature=0.5)
    print(sample)

  self.dropout, self.training, self.bidirectional, self.batch_first)


There is no based in a really aware of the science of the past of the world.
There is no matter of the world, is just something that's going to have to be an individual things that is the straight.
My mother reaching to have to be doing talking about it is the world been collective.
There is no way is a lot of the experience.
My father is the most anything it.
My life not there are find the reality to just love that he whole the kind of the time, I had to be really excited in the delight with their own.
There are been of the record with the world and thought, there is no matter.
I meet many to make a markets of the world.
There is no one is me to be nice of the world have the point of the same time when we good painted the specific around in the first living a lot of the shadcess and the more and that they're more the United States and the hands and they are a lot of the children, and they were all statement than there are little, because they are like a lot of human country.
There are