<a href="https://colab.research.google.com/github/ArtemNechaev/stepik_nnets/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Samsung-IT-Academy/stepik-dl-nlp.git && pip install -r stepik-dl-nlp/requirements.txt
import sys; sys.path.append('./stepik-dl-nlp')

In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import math

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

from dlnlputils.pipeline import train_eval_loop, predict_with_model, init_random_seed

init_random_seed(765)


In [3]:
input_file =  open('./stepik-dl-nlp/datasets/author_quotes.txt') 
quotes = input_file.read()[:-1].split('\n')
##


In [22]:
tokenizer = re.compile(r'[\w\d]{1,4}|\s')
tokenize_quotes = [tokenizer.findall(q.lower()) for q in quotes]
",".join(tokenize_quotes[2])

'did, ,you, ,ever, ,stop, ,to, ,thin,k, ,and, ,forg,et, ,to, ,star,t, ,agai,n'

In [23]:
vocab = list(np.unique(np.concatenate(tokenize_quotes)))
vocab = ['<PAD>', '<UNK>', '<BEGIN>', '<END>'] + vocab
vocab = {v: i for i, v in enumerate(vocab)}
list(vocab.items())[-5:]

[('zuko', 12191),
 ('zulu', 12192),
 ('zure', 12193),
 ('zy', 12194),
 ('zzi', 12195)]

In [24]:
from typing import Dict
class SeqDataset(Dataset):
  def __init__(self, data, vocab: Dict):
    super().__init__()
    max_length = max([ len(d) for d in data ])
    self.data = torch.zeros((len(data), max_length + 2), dtype=torch.long)
    self.data[:,0] = 2
    for n_sent, sentence in enumerate(data):
      for n_token, token in enumerate(sentence):
        self.data[n_sent, n_token + 1] = vocab.get(token, 1)
      self.data[n_sent, n_token + 2] = 3

  def __len__(self):
    return self.data.shape[0] - 1
  def __getitem__(self, id):
    return self.data[id, :-1], self.data[id, 1:]

In [25]:
dataset = SeqDataset(tokenize_quotes, vocab)

train_size = int(len(dataset)*0.9)
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) -  train_size])
train_dataset[0]

(tensor([    2,  4998,  6496,     4, 10292,  5975,     4, 11546, 10225,     4,
          4057,     4, 10640,     4,  7181,  4642,  4855,     4,  7022,     4,
          8218,  7529,     4,  8132,     4, 10640,     4,  9872,     4,   780,
             4,  4998,  6496,     4, 10651,  2921,     4,   682,     4, 10640,
             4, 10735,     4, 10637,     4,  4998,     4, 11949,  3031,     4,
          5261,     4, 11849,     4,  4998,     4, 11774,     4,   332,     4,
         10573,   538,     4,  4998,     4,  4624,     4,   332,     4,  1960,
          2355,     4,  4057,     4,  7022,     4,  1960,  2586,  2355,     4,
         10416, 10641,  9437,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [26]:
class Model(nn.Module):
  def __init__(self, vocab_size, emb_size = 64, h_size = 64 ):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, emb_size, padding_idx=0)
    self.RNN = nn.LSTM(emb_size, h_size, batch_first=True)
    self.fc = nn.Linear(h_size, vocab_size)

  def forward(self, x):
    """
    x - tensor BatchSize x MaxSeqLen

    """
    h, _ = self.RNN(self.embed(x))
    logits = self.fc(h)

    return logits.permute(0,2,1)

    

In [27]:
rnn = Model(len(vocab))
loss = nn.CrossEntropyLoss()

In [28]:
(best_val_loss,
 rnn_best_model) = train_eval_loop(rnn,       train_dataset,
                                            val_dataset,
                                            loss,
                                            lr=2e-3,
                                            epoch_n=10,
                                            batch_size=128,
                                            device='cuda',
                                            early_stopping_patience=30,
                                            max_batches_per_epoch_train=500,
                                            max_batches_per_epoch_val=100,
                                            lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                                                                                         verbose=True ))

Эпоха 0
Эпоха: 255 итераций, 93.13 сек
Среднее значение функции потерь на обучении 2.0368499540815166
Среднее значение функции потерь на валидации 1.177451228273326
Новая лучшая модель!

Эпоха 1
Эпоха: 255 итераций, 93.08 сек
Среднее значение функции потерь на обучении 1.1083298814062978
Среднее значение функции потерь на валидации 1.0339915752410889
Новая лучшая модель!

Эпоха 2
Эпоха: 255 итераций, 92.79 сек
Среднее значение функции потерь на обучении 0.9998902909895953
Среднее значение функции потерь на валидации 0.9567129057029198
Новая лучшая модель!

Эпоха 3
Эпоха: 255 итераций, 92.87 сек
Среднее значение функции потерь на обучении 0.9353294603964861
Среднее значение функции потерь на валидации 0.9068694587411552
Новая лучшая модель!

Эпоха 4
Эпоха: 255 итераций, 92.95 сек
Среднее значение функции потерь на обучении 0.8952510494811862
Среднее значение функции потерь на валидации 0.8780339154703863
Новая лучшая модель!

Эпоха 5
Эпоха: 255 итераций, 92.82 сек
Среднее значение функц

In [29]:
def generarate_text(generator, temperature=1, max_length = 30):
  seq = vocab.get('<BEGIN>', 2)
  seq = torch.tensor([[seq]], dtype=torch.long).cuda()
  k_list = list(vocab.keys())
  for i in range(max_length):
    probs = (generator(seq).permute(0,2,1)[0,-1]/temperature).softmax(-1).data.cpu().numpy()
    new_token = np.random.choice(len(vocab), p = probs)
    if new_token == 3:
      return ''.join([k_list[ix] for ix in seq.data.cpu().numpy()[0] if ix != 2] )
    new_token = torch.tensor([[new_token]], dtype=torch.long).cuda()
    seq = torch.cat([seq, new_token], dim=1)
    

  return ''.join([k_list[ix] for ix in seq.data.cpu().numpy()[0] if ix != 2])

In [30]:
for _ in range(10):
    print(generarate_text(rnn_best_model, temperature=0.7, max_length = max([ len(_) for _ in tokenize_quotes ])), )

  self.dropout, self.training, self.bidirectional, self.batch_first)


i see not home to be an great extrentertaity of you go to do the rule
when i know i think there to do you just i have tours that i dont enjoy a less still i do in the past of the very continue and i administ and there a lot of your greatest of a really little named that your own i dont do like it when they  would last it and that esses of it i really my law it is to be everyone is the country if you was a about the playksgin by the most 
i want and much in the world on in the experience and i did playing that i understanding to be it must my feel is a galle of the art of away to the potential i have am laugh in those it it back and become has that one still at me out and to say my films but i nation
the dull of the number of the mean of the much ideas to be to begins that everyone is a york but i saw us past and i didnt have my old father that painting in my beautiful of his characters and the little day can do that to bring it it are no job of an prefer in the world and a as well what