# RNN

## import libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms.v2 as v2

from torch.utils.data import BatchSampler, SequentialSampler
from torchvision import models

from tqdm import tqdm

In [None]:
# NOTE(review): the code below imports `navec`, not `natasha`; presumably
# installing natasha pulls navec in as a dependency — confirm.
!pip install natasha

In [3]:
# Load pretrained Navec word embeddings from a local archive.
# The archive must already be present in the working directory.
# NOTE(review): the file name suggests 300-dimensional vectors, which matches
# the in_features=300 used when the model is built — confirm.
from navec import Navec
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

## Class Dataset

In [4]:
class WordsDataset(data.Dataset):
  """Sliding-window next-word dataset built from a plain-text file.

  Each sample is a pair ``(context, target)``: ``context`` is the vertical
  stack of the embeddings of ``prev_word`` consecutive words and ``target``
  is the integer vocabulary index of the word that follows them.

  Args:
    path: path to a UTF-8 text file.
    navec_emb: word -> vector mapping supporting ``in`` and ``[]``
      (e.g. a loaded Navec model).
    prev_word: number of context words per sample.
  """

  def __init__(self, path, navec_emb, prev_word=3):
    self.prev_word = prev_word
    self.navec_emb = navec_emb

    with open(path, 'r', encoding='utf-8') as f:
      raw = f.read()

    # Keep only Cyrillic/Latin letters and hyphens; everything else (BOM,
    # newlines, punctuation, digits) becomes a separator. The ranges are
    # spelled out because `A-z` also matches "[\]^_`" and `А-я` does not
    # include "Ё"/"ё".
    self.text = re.sub(r'[^А-Яа-яЁёA-Za-z-]', ' ', raw)

    # Drop words that have no embedding: they can be neither input nor target.
    self.words = [word for word in self.text.lower().split()
                  if word in self.navec_emb]

    # Sort the vocabulary so that word <-> index mapping is identical on every
    # run; iterating a raw set of strings depends on hash randomisation, which
    # would make a saved model unusable in another session.
    vocab = sorted(set(self.words))

    self.int_to_word = dict(enumerate(vocab))
    self.word_to_int = {word: idx for idx, word in self.int_to_word.items()}
    self.vocab_size = len(vocab)

  def __getitem__(self, item):
    """Return ``(context, target)`` for the window starting at word ``item``.

    Negative indices count from the end of the dataset.

    Raises:
      IndexError: if ``item`` is outside ``[-len(self), len(self))``.
    """
    size = len(self)
    if item < 0:
      item += size
    if not 0 <= item < size:
      raise IndexError('WordsDataset index out of range')

    window = self.words[item:item + self.prev_word]
    _data = torch.vstack([torch.tensor(self.navec_emb[word]) for word in window])

    target_word = self.words[item + self.prev_word]
    return _data, self.word_to_int[target_word]

  def __len__(self):
    # A window needs prev_word context words plus one target word, so the
    # last valid start index is len(words) - prev_word - 1.
    return max(len(self.words) - self.prev_word, 0)

## RNN

In [5]:
class wordsRNN(nn.Module):
  """Single-layer RNN that scores the next word from a sequence of embeddings.

  Args:
    in_features: size of each input embedding vector.
    out_features: number of output scores (the vocabulary size).
    hidden_size: width of the recurrent hidden state; defaults to 256.
  """

  def __init__(self, in_features, out_features, hidden_size=256):
    super().__init__()
    self.hidden_size = hidden_size
    self.in_features = in_features
    self.out_features = out_features

    # Attribute names `rnn` and `Linear` are part of the state_dict keys and
    # must stay as they are for saved checkpoints to load.
    self.rnn = nn.RNN(in_features, self.hidden_size, batch_first=True)
    self.Linear = nn.Linear(self.hidden_size, out_features)

  def forward(self, x):
    """Map a batch of sequences to next-word scores.

    Args:
      x: tensor of shape (batch, seq_len, in_features).

    Returns:
      Tensor of shape (1, batch, out_features): the linear layer applied to
      the final hidden state. The leading dimension is the RNN layer axis,
      which callers remove with ``squeeze(0)``.
    """
    # Only the final hidden state is used; per-step outputs are discarded.
    _, h = self.rnn(x)
    y = self.Linear(h)
    return y

In [6]:
# Training samples: 3 context words -> next word, taken from the corpus file.
d_train = WordsDataset(path='/content/text_2', navec_emb=navec, prev_word=3)
# Shuffled mini-batches of 8 samples.
train_loader = data.DataLoader(dataset=d_train, batch_size=8, shuffle=True)

# 300 input features per word; one output score per vocabulary entry.
model = wordsRNN(in_features=300, out_features=d_train.vocab_size)

## Train

In [7]:
# Adam over all model parameters, cross-entropy over the vocabulary scores.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
loss_f = nn.CrossEntropyLoss()

epoch = 100
model.train()

for _e in range(epoch):
  # Running mean of the batch losses seen so far in this epoch.
  loss_mean = 0
  lm_count = 0

  train_tqdm = tqdm(train_loader, leave=False)
  for x_train, y_train in train_tqdm:
    # The model returns (1, batch, vocab); drop the leading layer axis.
    logits = model(x_train).squeeze(0)
    targets = y_train.long()
    loss = loss_f(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Incremental mean: new = w * value + (1 - w) * old, with w = 1 / count.
    lm_count += 1
    weight = 1/lm_count
    loss_mean = weight * loss.item() + (1-weight)*loss_mean
    train_tqdm.set_description(f'[epoch ({_e+1}/{epoch}], loss_mean: {loss_mean:.3f}')



## save

In [8]:
# Persist only the model's state_dict (its parameters), not the module object.
# NOTE(review): the dataset's word <-> index mapping is not saved here; a
# reloaded model presumably needs the same mapping to decode predictions — confirm.
st = model.state_dict()
torch.save(st, 'model_rnn_1.tar')

## Test

In [9]:
# Greedy text generation: repeatedly feed the last `prev_word` words to the
# model and append the highest-scoring next word.
model.eval()
predict = "Мой дядя самый".lower().split()
total = 10

# Inference only: disable autograd so no computation graph is built per step.
with torch.no_grad():
    for _ in range(total):
        # Embeddings of the last prev_word words, oldest first.
        _data = torch.vstack([torch.tensor(d_train.navec_emb[predict[-x]])
                              for x in range(d_train.prev_word, 0, -1)])
        # Add a batch axis for the model, then drop the leading layer axis.
        p = model(_data.unsqueeze(0)).squeeze(0)
        indx = torch.argmax(p, dim=1)
        predict.append(d_train.int_to_word[indx.item()])

print(" ".join(predict))

мой дядя самый честных правил когда не в шутку занемог он уважать себя
