# 5. Классификация текстов при помощи сетей прямого распространения

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г. 

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [606]:
import math

import nltk
import numpy as np
import torch

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [607]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

Реализовать функцию `preprocess_text(text: str) -> str`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [608]:
import re
def preprocess_text(text: str) -> str:
    return re.sub(r'[^a-zA-Z.,!?]', ' ', text).lower()

In [609]:
print(preprocess_text(text))

select your preferences and run the install command. stable represents the most currently tested and supported version of pytorch. note that libtorch is only available for c  


1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [610]:
first_sen = text.split(".")[0]
first_sen_list = word_tokenize(first_sen)
first_sen_list

['Select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command']

In [611]:
first_sentence_list = word_tokenize(sent_tokenize(text)[0])
first_sentence_list

['Select',
 'your',
 'preferences',
 'and',
 'run',
 'the',
 'install',
 'command',
 '.']

In [612]:
all_words = set(word_tokenize(text))
all_words.remove(".")

In [613]:
all_words

{'C++',
 'LibTorch',
 'Note',
 'PyTorch',
 'Select',
 'Stable',
 'and',
 'available',
 'command',
 'currently',
 'for',
 'install',
 'is',
 'most',
 'of',
 'only',
 'preferences',
 'represents',
 'run',
 'supported',
 'tested',
 'that',
 'the',
 'version',
 'your'}

In [614]:
[1 if i in first_sen_list else 0 for i in all_words]

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0]

## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [615]:
import pandas as pd

surname_dataset = pd.read_csv("surnames/surnames.csv")
surname_dataset.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [616]:
all_nations = surname_dataset.nationality
all_nations_dict = pd.Series(all_nations.unique()).to_dict()
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [617]:
all_nations_dict_reverse = {v: k for k, v in all_nations_dict.items()}
all_nations_dict_reverse

{'English': 0,
 'French': 1,
 'Arabic': 2,
 'Russian': 3,
 'Japanese': 4,
 'Chinese': 5,
 'Italian': 6,
 'Czech': 7,
 'Irish': 8,
 'German': 9,
 'Greek': 10,
 'Spanish': 11,
 'Polish': 12,
 'Dutch': 13,
 'Vietnamese': 14,
 'Korean': 15,
 'Portuguese': 16,
 'Scottish': 17}

In [618]:
all_nations_dict_reverse["Greek"]

10

In [619]:
class Vocab:
  def __init__(self, data: pd.DataFrame):
    """
    Инициализация словаря со всеми символами из данных
    :param data: ВЕСЬ датафрейм
    """
    all_chars = pd.Series(pd.unique(data.values.ravel())).map(lambda x: list(x.lower())).explode().unique()
    self.idx_to_token = {index: token for index, token in enumerate(all_chars)}
    self.token_to_idx = {token: index for index, token in enumerate(all_chars)}
    self.vocab_len = len(all_chars)


In [620]:
from torch.utils.data import Dataset


class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab
    self.max_X = 17
    self.max_y = 10

  def vectorize(self, surname: str):
    '''Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)'''
    tensor = torch.zeros(len(surname), 1, vocab.vocab_len)
    for li, letter in enumerate(surname.lower()):
      tensor[li][0][vocab.token_to_idx[letter]] = 1
    return tensor
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]
    surname_vectorize = self.vectorize(surname)

    nation = self.y.iloc[idx]
    nation_vectorize = all_nations_dict_reverse[nation]
    n = torch.tensor(nation_vectorize)

    return surname_vectorize, n


In [621]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(surname_dataset["surname"], surname_dataset["nationality"], test_size=0.2)

In [622]:
x_train.head(), y_train.head()

(2396     Wrench
 7689    Geftler
 4418    Valovoi
 4843     Kollen
 5977        Kan
 Name: surname, dtype: object,
 2396     English
 7689     Russian
 4418     Russian
 4843       Dutch
 5977    Japanese
 Name: nationality, dtype: object)

In [623]:
vocab = Vocab(surname_dataset)
snds_train = SurnamesDataset(X=x_train, y=y_train, vocab=vocab)
snds_test = SurnamesDataset(X=x_test, y=y_test, vocab=vocab)

In [624]:
vocab.idx_to_token

{0: 'w',
 1: 'o',
 2: 'd',
 3: 'f',
 4: 'r',
 5: 'e',
 6: 'n',
 7: 'g',
 8: 'l',
 9: 'i',
 10: 's',
 11: 'h',
 12: 'c',
 13: 't',
 14: 'é',
 15: 'k',
 16: 'u',
 17: 'y',
 18: 'a',
 19: 'b',
 20: 'z',
 21: 'j',
 22: 'p',
 23: 'm',
 24: 'v',
 25: "'",
 26: 'q',
 27: 'à',
 28: 'x',
 29: 'ü',
 30: '-',
 31: 'í',
 32: 'ú',
 33: 'ä',
 34: 'ö',
 35: 'ó',
 36: '1',
 37: 'ò',
 38: 'ñ',
 39: 'ż',
 40: 'ß',
 41: 'á',
 42: 'è',
 43: 'ã',
 44: 'ê',
 45: 'ì',
 46: 'ś',
 47: 'ń',
 48: 'ù',
 49: 'ç',
 50: '/',
 51: 'õ',
 52: 'ą',
 53: 'ł',
 54: ':'}

In [625]:
test_surname = "Kodama"

In [626]:
snds_train.vectorize(test_surname)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [627]:
[snds_train.vocab.idx_to_token[torch.where(snds_train.vectorize(test_surname)[i] == 1)[1][0].item()] for i in range(len(test_surname))]

['k', 'o', 'd', 'a', 'm', 'a']

In [628]:
snds_train.X.iloc[0]

'Wrench'

In [629]:
def tensor_to_letter(ten):
  return vocab.idx_to_token[torch.where(ten == 1)[1][0].item()]

In [630]:
def tensor_to_surname(ten):
  return [tensor_to_letter(ten[i]) for i in range(ten.shape[0])]

In [631]:
print(tensor_to_surname(snds_train[0][0]))

['w', 'r', 'e', 'n', 'c', 'h']


In [632]:
snds_train.X

2396        Wrench
7689       Geftler
4418       Valovoi
4843        Kollen
5977           Kan
           ...    
10928    Rameckers
5916         Otten
10512       Robson
1745       Zientek
3466        Jijnov
Name: surname, Length: 8784, dtype: object

In [633]:
snds_train[0][0].shape

torch.Size([6, 1, 55])

In [634]:
def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    target = torch.LongTensor(target)
    return data, target

In [635]:
from torch.utils.data import DataLoader

batch_size = 1
trainloader = DataLoader(
    dataset=snds_train,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=True,
    num_workers=0
)
testloader = DataLoader(
    dataset=snds_test,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=False,
    num_workers=0
)

In [636]:
x_batch, y_batch = next(iter(trainloader))
x_batch[0].shape, y_batch.shape

(torch.Size([4, 1, 55]), torch.Size([1]))

In [637]:
tensor_to_surname(x_batch[0])

['s', 'a', 'i', 'd']

In [638]:
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [639]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_nations_dict[category_i], category_i

In [640]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
n_categories = len(all_nations_dict)
rnn = RNN(vocab.vocab_len, n_hidden, n_categories)

In [527]:
criterion = nn.NLLLoss()

In [145]:
learning_rate = 0.005

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate, momentum=0.4)

def train(line_tensor, category_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # torch.tensor(line_tensor, dtype=torch.long)
    for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    # loss.backward()
    # for p in rnn.parameters():
    #     p.data.add_(p.grad.data, alpha=-learning_rate)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return output, loss.item()

In [147]:
current_loss = 0
all_losses = []

n_iters = 1000
print_every = 100
plot_every = 100

for epoch in range(20):
    for x, y in trainloader:
        output, loss = train(x[0], y)
        current_loss += loss
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == y else f"✗ {categoryFromOutput(output[0])[0]}"
        all_losses.append(current_loss)
        current_loss = 0
    print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

  for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):


Epoch 0: 1.0636049954589166
Epoch 1: 1.0614157269646602
Epoch 2: 1.057570503232305
Epoch 3: 1.056241238270601
Epoch 4: 1.0590577744021739
Epoch 5: 1.0597386062412515
Epoch 6: 1.0595415833941046
Epoch 7: 1.0640670757887145
Epoch 8: 1.0651761796413342
Epoch 9: 1.0640110882567988
Epoch 10: 1.0627886950877345
Epoch 11: 1.0616400745520522
Epoch 12: 1.0603995775345845
Epoch 13: 1.058959929896873
Epoch 14: 1.0576691283489335
Epoch 15: 1.056795507380418
Epoch 16: 1.055909173243771
Epoch 17: 1.0549067473706333
Epoch 18: 1.0545531746191665
Epoch 19: 1.054667610437214


In [152]:
torch.save(rnn, "./nationalities_model.pth")

In [148]:
def evaluate(line_tensor):
    hidden = rnn.initHidden()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    return output


def predict(input_line, n_predictions=3):
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(snds_train.vectorize(input_line))

        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []

        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_nations_dict[category_index]))
            predictions.append([value, all_nations_dict[category_index]])

In [149]:
predict('Dovesky')
predict('Jackson')
predict('Satoshi')
predict('Kurtaev')
predict('Turin')
predict('Kuznetsov')


> Dovesky
(-0.41) Russian
(-1.33) English
(-2.74) Czech

> Jackson
(-0.30) English
(-1.86) Russian
(-3.52) Scottish

> Satoshi
(-0.37) Japanese
(-2.10) Russian
(-3.12) Korean

> Kurtaev
(-0.00) Russian
(-7.53) Czech
(-7.79) German

> Turin
(-0.46) Russian
(-1.59) English
(-3.23) German

> Kuznetsov
(-0.00) Russian
(-8.01) Greek
(-8.69) Polish


In [150]:
correct_list = []
for x, y in testloader:
    output = evaluate(x[0])
    # current_loss += loss
    guess, guess_i = categoryFromOutput(output)
    if guess_i == y:
        correct = '✓'
        correct_list.append(1)
    else:
        correct = "✗"
        correct_list.append(0)
    print(f"Name: {''.join(tensor_to_surname(x[0]))} \nCorrect: {all_nations_dict[y.item()]} | Predicted: {guess} {correct}")
    # all_losses.append(current_loss)
    # current_loss = 0
# print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

Name: zhmaev 
Correct: Russian | Predicted: Russian ✓
Name: takewaki 
Correct: Japanese | Predicted: Polish ✗
Name: avtokratov 
Correct: Russian | Predicted: Russian ✓
Name: totah 
Correct: Arabic | Predicted: Arabic ✓
Name: shahtmeister 
Correct: Russian | Predicted: English ✗
Name: tchaly 
Correct: Russian | Predicted: English ✗
Name: pakulski 
Correct: Polish | Predicted: Russian ✗
Name: christie 
Correct: Scottish | Predicted: English ✗
Name: santos 
Correct: English | Predicted: Arabic ✗
Name: issa 
Correct: Arabic | Predicted: Arabic ✓
Name: bazzi 
Correct: Arabic | Predicted: Arabic ✓
Name: yamamoto 
Correct: Japanese | Predicted: Japanese ✓
Name: bishara 
Correct: Arabic | Predicted: Japanese ✗
Name: jindra 
Correct: Czech | Predicted: Czech ✓
Name: filippenkov 
Correct: Russian | Predicted: Russian ✓
Name: pirojkov 
Correct: Russian | Predicted: Russian ✓
Name: rim 
Correct: Korean | Predicted: English ✗
Name: awrorin 
Correct: Russian | Predicted: English ✗
Name: martinez 
Co

In [1]:
np.array(correct_list).mean()

NameError: name 'np' is not defined

## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [763]:
import nltk
import numpy as np
import torch

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [764]:
import re
def preprocess_text(text: str) -> str:
    return re.sub(r'[^a-zA-Z.,!?]', ' ', text).lower()

In [765]:
import pandas as pd

revs_full_dataset = pd.read_csv("yelp/raw_test.csv", header=None)
revs_full_dataset.columns = ["score", "review"]
revs_full_dataset.head()

Unnamed: 0,score,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [766]:
revs_dataset = revs_full_dataset.sample(frac=0.1)
revs_dataset

Unnamed: 0,score,review
28037,1,If I could give Mark Taylor no stars I would. ...
27644,2,The restaurant is very nice on the inside comp...
31312,1,Maybe the bar is nice but the hotel is horribl...
4496,1,I was here in 2006 and starving. It was meh. N...
23552,1,Let me start by saying: I know my dog is getti...
...,...,...
9427,1,We went back for a second time just to see a b...
25738,1,We will never take our dog here again! We sche...
37218,2,WOW! The Cosmopolitan is definitely as gorgeo...
28163,1,The pizza was warm well I would say almost col...


In [767]:
revs_full_dataset.iloc[:, 0].size

38000

In [768]:
revs_dataset.iloc[:, 0].size

3800

In [769]:
revs_dataset.review = revs_dataset.review.map(preprocess_text)
revs_dataset.score = revs_dataset.score - 1
revs_dataset

Unnamed: 0,score,review
28037,0,if i could give mark taylor no stars i would. ...
27644,1,the restaurant is very nice on the inside comp...
31312,0,maybe the bar is nice but the hotel is horribl...
4496,0,i was here in and starving. it was meh. n...
23552,0,let me start by saying i know my dog is getti...
...,...,...
9427,0,we went back for a second time just to see a b...
25738,0,we will never take our dog here again! we sche...
37218,1,wow! the cosmopolitan is definitely as gorgeo...
28163,0,the pizza was warm well i would say almost col...


In [770]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(revs_dataset.review, revs_dataset.score, test_size=0.2)

In [771]:
x_train, y_train

(30601    it was okay i ve been there before in californ...
 11248    hot n juicy is one of those places that i neve...
 25958    it s only getting one star because i like the ...
 2686     i love the pharmacy staff at this fry s! mr. p...
 2347     don t get me wrong, the enchiladas are very go...
                                ...                        
 18000    ok, this place was really weird. i wanted to l...
 11787    holy cow i am so glad i was talked into coming...
 18290    my brother just moved into the neighborhood an...
 3036     really nice place but i don t believe they ser...
 29247    i think this is one of those restaurants for t...
 Name: review, Length: 3040, dtype: object,
 30601    0
 11248    1
 25958    0
 2686     1
 2347     1
         ..
 18000    0
 11787    1
 18290    1
 3036     1
 29247    0
 Name: score, Length: 3040, dtype: int64)

In [772]:
revs_dataset_unique_words= pd.Series(revs_dataset.review.map(word_tokenize).explode().unique())
reviews_dict = revs_dataset_unique_words.to_dict()
reviews_dict

{0: 'if',
 1: 'i',
 2: 'could',
 3: 'give',
 4: 'mark',
 5: 'taylor',
 6: 'no',
 7: 'stars',
 8: 'would',
 9: '.',
 10: 'all',
 11: 'of',
 12: 'their',
 13: 'leasing',
 14: 'clowns',
 15: 'get',
 16: 'hour',
 17: 'ngreat',
 18: 'pay',
 19: 'for',
 20: 'what',
 21: 'they',
 22: 'do',
 23: '!',
 24: 'guess',
 25: 'are',
 26: 'told',
 27: 'to',
 28: 'not',
 29: 'let',
 30: 'u.s.',
 31: 'vets',
 32: 'be',
 33: 'approved',
 34: 'my',
 35: 'credit',
 36: 'score',
 37: 'was',
 38: 'below',
 39: 'acceptable',
 40: 'security',
 41: 'deposit',
 42: 'wanted',
 43: 'a',
 44: 'more',
 45: 'plus',
 46: 'rent',
 47: 'go',
 48: 'up',
 49: 'per',
 50: 'month',
 51: 'the',
 52: 'things',
 53: 'on',
 54: 'report',
 55: 'were',
 56: 'even',
 57: 'listed',
 58: 'as',
 59: 'identity',
 60: 'theft',
 61: 'didn',
 62: 't',
 63: 'care',
 64: 'women',
 65: 'named',
 66: 'amanda',
 67: 'said',
 68: 'tough',
 69: 'refund',
 70: 'apartment',
 71: 'is',
 72: 'true',
 73: 'scumbag',
 74: 'restaurant',
 75: 'very',
 

In [773]:
reviews_dict_reverse = {v: k for k, v in reviews_dict.items()}
reviews_dict_reverse

{'if': 0,
 'i': 1,
 'could': 2,
 'give': 3,
 'mark': 4,
 'taylor': 5,
 'no': 6,
 'stars': 7,
 'would': 8,
 '.': 9,
 'all': 10,
 'of': 11,
 'their': 12,
 'leasing': 13,
 'clowns': 14,
 'get': 15,
 'hour': 16,
 'ngreat': 17,
 'pay': 18,
 'for': 19,
 'what': 20,
 'they': 21,
 'do': 22,
 '!': 23,
 'guess': 24,
 'are': 25,
 'told': 26,
 'to': 27,
 'not': 28,
 'let': 29,
 'u.s.': 30,
 'vets': 31,
 'be': 32,
 'approved': 33,
 'my': 34,
 'credit': 35,
 'score': 36,
 'was': 37,
 'below': 38,
 'acceptable': 39,
 'security': 40,
 'deposit': 41,
 'wanted': 42,
 'a': 43,
 'more': 44,
 'plus': 45,
 'rent': 46,
 'go': 47,
 'up': 48,
 'per': 49,
 'month': 50,
 'the': 51,
 'things': 52,
 'on': 53,
 'report': 54,
 'were': 55,
 'even': 56,
 'listed': 57,
 'as': 58,
 'identity': 59,
 'theft': 60,
 'didn': 61,
 't': 62,
 'care': 63,
 'women': 64,
 'named': 65,
 'amanda': 66,
 'said': 67,
 'tough': 68,
 'refund': 69,
 'apartment': 70,
 'is': 71,
 'true': 72,
 'scumbag': 73,
 'restaurant': 74,
 'very': 75,
 

In [774]:
def word_to_index(word: str, unique) -> list | torch.Tensor:
    return unique[word]

In [775]:
def sentence_to_index(sentence: torch.Tensor, unique) -> list | torch.Tensor:
    return [word_to_index(word, unique) for word in sentence]

In [776]:
def index_to_word(idx, unique):
    return unique[idx]

In [777]:
def index_list_to_sentence(idx_list: list | torch.Tensor, unique) -> list | torch.Tensor:
    return [index_to_word(idx.item(), unique) for idx in idx_list]

In [778]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = pd.Series(data.map(word_tokenize).explode().unique()).to_dict()
    self.token_to_idx = {v: k for k, v in self.idx_to_token.items()}
    self.vocab_len = len(reviews_dict)

In [779]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review: str):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    review = word_tokenize(review)
    tensor = torch.zeros(len(review), 1, self.vocab.vocab_len)
    for i, word in enumerate(review):
        tensor[i][0][self.vocab.token_to_idx[word]] = 1
    return tensor
    # return [reviews_dict_reverse[word] for word in word_tokenize(review)]
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
      review = self.vectorize(self.X.iloc[idx])
      score = self.y.iloc[idx]
      return torch.tensor(review), torch.tensor(score, dtype=torch.float)

In [780]:
vocab = Vocab(revs_dataset.review)
vocab.vocab_len

19598

In [781]:
review_train = ReviewDataset(X=x_train, y=y_train, vocab=vocab)

In [782]:
review_test = ReviewDataset(X=x_test, y=y_test, vocab=vocab)

In [783]:
from torch.utils.data import DataLoader

batch_size = 8
trainloader = DataLoader(
    dataset=review_train,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=True,
    num_workers=0
)
testloader = DataLoader(
    dataset=review_test,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=False,
    num_workers=0
)

In [784]:
# x_batch, y_batch = next(iter(trainloader))
# x_batch[0].shape, y_batch.shape

In [785]:
score_dict = {
    0: "Bad",
    1: "Good"
}

In [786]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return score_dict[category_i], category_i

In [787]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
n_categories = len(reviews_dict)
rnn = RNN(vocab.vocab_len, n_hidden, 2)

In [788]:
criterion = nn.MSELoss()
learning_rate = 0.005

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate, momentum=0.9)

def train(line_tensor, category_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # torch.tensor(line_tensor, dtype=torch.long)
    for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    # loss.backward()
    # for p in rnn.parameters():
    #     p.data.add_(p.grad.data, alpha=-learning_rate)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return output, loss.item()

In [792]:
current_loss = 0
all_losses = []

for epoch in range(1):
    for i, data in enumerate(trainloader):
        x, y = data
        output, loss = train(x[0], y)
        current_loss += loss
        # if i % 5 == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess_i == y.item() else "✗"
        print(f"Actual: {score_dict[y.item()]} Predicted: {guess} {correct}")
        all_losses.append(current_loss)
        current_loss = 0
    print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

  return torch.tensor(review), torch.tensor(score, dtype=torch.float)
  for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):
  return F.mse_loss(input, target, reduction=self.reduction)


Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Good Predicted: Good ✓
Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Good Predicted: Good ✓
Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Good ✗
Actual: Bad Predicted: Bad ✓
Actual: Good Predicted: Bad ✗
Actual: Good Predicted: Good ✓
Actual: Bad Predicted: Good ✗
Actual: Good Predicted: Bad ✗
Actual: Good Predicted: Bad ✗
Actual: Bad Predicted: Bad ✓
Actual: Good Predicted: Bad ✗
Actual: Bad Predicted: Bad ✓
Actual: Good Predicted: Bad ✗
Actual: Bad Predicted: Bad ✓
Actual: Good Predicted: Bad ✗
Actual: Good Predicted: Bad ✗
Actual: Bad Predicted: Bad ✓
Actual:

KeyboardInterrupt: 

In [152]:
torch.save(rnn, "./review_model.pth")