# 5. Классификация текстов при помощи сетей прямого распространения

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г. 

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [199]:
import nltk
import numpy as np
import torch

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [200]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

Реализовать функцию `preprocess_text(text: str) -> str`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [204]:
import re
def preprocess_text(text: str) -> str:
    return re.sub(r'[^a-zA-Z.,!?] ', '', text).lower()

In [205]:
print(preprocess_text(text))

select your preferences and run the install command. stable represents the most currently tested and supported version of pytorch. note that libtorch is only available for c++


1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [6]:
first_sen = text.split(".")[0]
first_sen_list = word_tokenize(first_sen)
first_sen_list

['Select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command']

In [7]:
first_sentence_list = word_tokenize(sent_tokenize(text)[0])
first_sentence_list

['Select',
 'your',
 'preferences',
 'and',
 'run',
 'the',
 'install',
 'command',
 '.']

In [8]:
all_words = set(word_tokenize(text))
all_words.remove(".")

In [9]:
all_words

{'C++',
 'LibTorch',
 'Note',
 'PyTorch',
 'Select',
 'Stable',
 'and',
 'available',
 'command',
 'currently',
 'for',
 'install',
 'is',
 'most',
 'of',
 'only',
 'preferences',
 'represents',
 'run',
 'supported',
 'tested',
 'that',
 'the',
 'version',
 'your'}

In [10]:
[1 if i in first_sen_list else 0 for i in all_words]

[1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]

## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [11]:
import pandas as pd

surname_dataset = pd.read_csv("surnames/surnames.csv")
surname_dataset.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [12]:
all_nations = surname_dataset.nationality
all_nations_dict = pd.Series(all_nations.unique()).to_dict()
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [13]:
all_nations_dict_reverse = {v: k for k, v in all_nations_dict.items()}
all_nations_dict_reverse

{'English': 0,
 'French': 1,
 'Arabic': 2,
 'Russian': 3,
 'Japanese': 4,
 'Chinese': 5,
 'Italian': 6,
 'Czech': 7,
 'Irish': 8,
 'German': 9,
 'Greek': 10,
 'Spanish': 11,
 'Polish': 12,
 'Dutch': 13,
 'Vietnamese': 14,
 'Korean': 15,
 'Portuguese': 16,
 'Scottish': 17}

In [14]:
all_nations_dict_reverse["Greek"]

10

In [15]:
class Vocab:
  def __init__(self, data: pd.DataFrame):
    """
    Инициализация словаря со всеми символами из данных
    :param data: ВЕСЬ датафрейм
    """
    all_chars = pd.Series(pd.unique(data.values.ravel())).map(lambda x: list(x.lower())).explode().unique()
    self.idx_to_token = {index: token for index, token in enumerate(all_chars)}
    self.token_to_idx = {token: index for index, token in enumerate(all_chars)}
    self.vocab_len = len(all_chars)


In [16]:
from torch.utils.data import Dataset


class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab
    self.max_X = 17
    self.max_y = 10

  def vectorize(self, surname: str):
    '''Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)'''
    tensor = torch.zeros(len(surname), 1, vocab.vocab_len)
    for li, letter in enumerate(surname.lower()):
      tensor[li][0][vocab.token_to_idx[letter]] = 1
    return tensor
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]
    surname_vectorize = self.vectorize(surname)

    nation = self.y.iloc[idx]
    nation_vectorize = all_nations_dict_reverse[nation]
    n = torch.tensor(nation_vectorize)

    return surname_vectorize, n


In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(surname_dataset["surname"], surname_dataset["nationality"], test_size=0.2)

In [18]:
x_train.head(), y_train.head()

(4243     Willetts
 10763     Penfold
 3074         Pazi
 9875        Kosko
 7577      Sleiman
 Name: surname, dtype: object,
 4243     English
 10763    English
 3074     Russian
 9875       Czech
 7577      Arabic
 Name: nationality, dtype: object)

In [19]:
vocab = Vocab(surname_dataset)
snds_train = SurnamesDataset(X=x_train, y=y_train, vocab=vocab)
snds_test = SurnamesDataset(X=x_test, y=y_test, vocab=vocab)

In [20]:
vocab.idx_to_token

{0: 'w',
 1: 'o',
 2: 'd',
 3: 'f',
 4: 'r',
 5: 'e',
 6: 'n',
 7: 'g',
 8: 'l',
 9: 'i',
 10: 's',
 11: 'h',
 12: 'c',
 13: 't',
 14: 'é',
 15: 'k',
 16: 'u',
 17: 'y',
 18: 'a',
 19: 'b',
 20: 'z',
 21: 'j',
 22: 'p',
 23: 'm',
 24: 'v',
 25: "'",
 26: 'q',
 27: 'à',
 28: 'x',
 29: 'ü',
 30: '-',
 31: 'í',
 32: 'ú',
 33: 'ä',
 34: 'ö',
 35: 'ó',
 36: '1',
 37: 'ò',
 38: 'ñ',
 39: 'ż',
 40: 'ß',
 41: 'á',
 42: 'è',
 43: 'ã',
 44: 'ê',
 45: 'ì',
 46: 'ś',
 47: 'ń',
 48: 'ù',
 49: 'ç',
 50: '/',
 51: 'õ',
 52: 'ą',
 53: 'ł',
 54: ':'}

In [21]:
test_surname = "Kodama"

In [22]:
snds_train.vectorize(test_surname)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [23]:
[snds_train.vocab.idx_to_token[torch.where(snds_train.vectorize(test_surname)[i] == 1)[1][0].item()] for i in range(len(test_surname))]

['k', 'o', 'd', 'a', 'm', 'a']

In [24]:
snds_train.X.iloc[0]

'Willetts'

In [25]:
def tensor_to_letter(ten):
  return vocab.idx_to_token[torch.where(ten == 1)[1][0].item()]

In [26]:
def tensor_to_surname(ten):
  return [tensor_to_letter(ten[i]) for i in range(ten.shape[0])]

In [27]:
print(tensor_to_surname(snds_train[0][0]))

['w', 'i', 'l', 'l', 'e', 't', 't', 's']


In [28]:
snds_train.X

4243     Willetts
10763     Penfold
3074         Pazi
9875        Kosko
7577      Sleiman
           ...   
10925     Yakubov
2173      Whiting
2221        Daher
7393      Dobbins
8743      Bahtiev
Name: surname, Length: 8784, dtype: object

In [29]:
snds_train[0][0].shape

torch.Size([8, 1, 55])

In [30]:
def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    target = torch.LongTensor(target)
    return data, target

In [31]:
from torch.utils.data import DataLoader

batch_size = 1
trainloader = DataLoader(
    dataset=snds_train,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=True,
    num_workers=0
)
testloader = DataLoader(
    dataset=snds_test,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=False,
    num_workers=0
)

In [32]:
x_batch, y_batch = next(iter(trainloader))
x_batch[0].shape, y_batch.shape

(torch.Size([5, 1, 55]), torch.Size([1]))

In [33]:
tensor_to_surname(x_batch[0])

['t', 'r', 'i', 'e', 'u']

In [34]:
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [59]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_nations_dict[category_i], category_i

In [143]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
n_categories = len(all_nations_dict)
rnn = RNN(vocab.vocab_len, n_hidden, n_categories)

In [144]:
criterion = nn.NLLLoss()

In [145]:
learning_rate = 0.005

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate, momentum=0.4)

def train(line_tensor, category_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # torch.tensor(line_tensor, dtype=torch.long)
    for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    # loss.backward()
    # for p in rnn.parameters():
    #     p.data.add_(p.grad.data, alpha=-learning_rate)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return output, loss.item()

In [147]:
current_loss = 0
all_losses = []

n_iters = 1000
print_every = 100
plot_every = 100

for epoch in range(20):
    for x, y in trainloader:
        output, loss = train(x[0], y)
        current_loss += loss
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == y else f"✗ {categoryFromOutput(output[0])[0]}"
        all_losses.append(current_loss)
        current_loss = 0
    print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

  for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):


Epoch 0: 1.0636049954589166
Epoch 1: 1.0614157269646602
Epoch 2: 1.057570503232305
Epoch 3: 1.056241238270601
Epoch 4: 1.0590577744021739
Epoch 5: 1.0597386062412515
Epoch 6: 1.0595415833941046
Epoch 7: 1.0640670757887145
Epoch 8: 1.0651761796413342
Epoch 9: 1.0640110882567988
Epoch 10: 1.0627886950877345
Epoch 11: 1.0616400745520522
Epoch 12: 1.0603995775345845
Epoch 13: 1.058959929896873
Epoch 14: 1.0576691283489335
Epoch 15: 1.056795507380418
Epoch 16: 1.055909173243771
Epoch 17: 1.0549067473706333
Epoch 18: 1.0545531746191665
Epoch 19: 1.054667610437214


In [152]:
torch.save(rnn, "./nationalities_model.pth")

In [148]:
def evaluate(line_tensor):
    hidden = rnn.initHidden()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    return output


def predict(input_line, n_predictions=3):
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(snds_train.vectorize(input_line))

        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []

        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_nations_dict[category_index]))
            predictions.append([value, all_nations_dict[category_index]])

In [149]:
predict('Dovesky')
predict('Jackson')
predict('Satoshi')
predict('Kurtaev')
predict('Turin')
predict('Kuznetsov')


> Dovesky
(-0.41) Russian
(-1.33) English
(-2.74) Czech

> Jackson
(-0.30) English
(-1.86) Russian
(-3.52) Scottish

> Satoshi
(-0.37) Japanese
(-2.10) Russian
(-3.12) Korean

> Kurtaev
(-0.00) Russian
(-7.53) Czech
(-7.79) German

> Turin
(-0.46) Russian
(-1.59) English
(-3.23) German

> Kuznetsov
(-0.00) Russian
(-8.01) Greek
(-8.69) Polish


In [150]:
correct_list = []
for x, y in testloader:
    output = evaluate(x[0])
    # current_loss += loss
    guess, guess_i = categoryFromOutput(output)
    if guess_i == y:
        correct = '✓'
        correct_list.append(1)
    else:
        correct = "✗"
        correct_list.append(0)
    print(f"Name: {''.join(tensor_to_surname(x[0]))} \nCorrect: {all_nations_dict[y.item()]} | Predicted: {guess} {correct}")
    # all_losses.append(current_loss)
    # current_loss = 0
# print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

Name: zhmaev 
Correct: Russian | Predicted: Russian ✓
Name: takewaki 
Correct: Japanese | Predicted: Polish ✗
Name: avtokratov 
Correct: Russian | Predicted: Russian ✓
Name: totah 
Correct: Arabic | Predicted: Arabic ✓
Name: shahtmeister 
Correct: Russian | Predicted: English ✗
Name: tchaly 
Correct: Russian | Predicted: English ✗
Name: pakulski 
Correct: Polish | Predicted: Russian ✗
Name: christie 
Correct: Scottish | Predicted: English ✗
Name: santos 
Correct: English | Predicted: Arabic ✗
Name: issa 
Correct: Arabic | Predicted: Arabic ✓
Name: bazzi 
Correct: Arabic | Predicted: Arabic ✓
Name: yamamoto 
Correct: Japanese | Predicted: Japanese ✓
Name: bishara 
Correct: Arabic | Predicted: Japanese ✗
Name: jindra 
Correct: Czech | Predicted: Czech ✓
Name: filippenkov 
Correct: Russian | Predicted: Russian ✓
Name: pirojkov 
Correct: Russian | Predicted: Russian ✓
Name: rim 
Correct: Korean | Predicted: English ✗
Name: awrorin 
Correct: Russian | Predicted: English ✗
Name: martinez 
Co

In [151]:
np.array(correct_list).mean()

0.6621129326047359

## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [206]:
import pandas as pd

revs_full_dataset = pd.read_csv("yelp/raw_test.csv", header=None)
revs_full_dataset.columns = ["score", "review"]
revs_full_dataset.head()

Unnamed: 0,score,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [207]:
revs_dataset = revs_full_dataset.sample(frac=0.1)
revs_dataset

Unnamed: 0,score,review
23766,1,Field Kitchen threw away everything good Sweet...
5389,1,I have not been impressed with Fitchburg Vet. ...
13067,2,This is a fantastic place to find great gifts ...
6885,1,Customer non-oriented. Business ethics in read...
2828,1,"OK, I am not seeing what others see here. For ..."
...,...,...
4920,1,This place is depressing. They certainly do no...
24010,1,If you don't want to grab yourself something f...
18241,2,Asked around for a unique dessert in Montr\u00...
35074,2,I didn't realize I had been to this place so m...


In [208]:
revs_full_dataset.iloc[:, 0].size

38000

In [209]:
revs_dataset.iloc[:, 0].size

3800

In [210]:
revs_dataset.review = revs_dataset.review.map(preprocess_text)
revs_dataset

Unnamed: 0,score,review
23766,1,field kitchen threw away everything good sweet...
5389,1,i have not been impressed with fitchburg vet.w...
13067,2,this is a fantastic place to find great gifts ...
6885,1,customer non-oriented. business ethics in read...
2828,1,"ok, i am not seeing what others see here. for ..."
...,...,...
4920,1,this place is depressing. they certainly do no...
24010,1,if you don't want to grab yourself something f...
18241,2,asked around for a unique dessert in montr\u00...
35074,2,i didn't realize i had been to this place so m...


In [211]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(revs_dataset.review, revs_dataset.score, test_size=0.2)

In [212]:
x_train, y_train

(23836    house red, large white pizza with garlic and i...
 21180    the food is star i would rather starve to deat...
 18712    hard to describe but hilarious and tons of fun...
 37733    been here a couple times with the bf, and i ha...
 30586    i visit olive garden regularly and this one ha...
                                ...                        
 26517    i remember this buffet from years ago, when it...
 11711                 they do a great job on my dirty car!
 31649    out-of-towners need to put fogo de ch\u00e3o b...
 768      el rico's dos is a perfect example of what tra...
 1669     if you're looking for great chicago-style pizz...
 Name: review, Length: 3040, dtype: object,
 23836    2
 21180    1
 18712    2
 37733    2
 30586    1
         ..
 26517    1
 11711    2
 31649    2
 768      2
 1669     2
 Name: score, Length: 3040, dtype: int64)

In [215]:
x_train_words = x_train.map(word_tokenize)
x_train_words

23836    [house, red, ,, large, white, pizza, with, gar...
21180    [the, food, is, star, i, would, rather, starve...
18712    [hard, to, describe, but, hilarious, and, tons...
37733    [been, here, a, couple, times, with, the, bf, ...
30586    [i, visit, olive, garden, regularly, and, this...
                               ...                        
26517    [i, remember, this, buffet, from, years, ago, ...
11711     [they, do, a, great, job, on, my, dirty, car, !]
31649    [out-of-towners, need, to, put, fogo, de, ch\u...
768      [el, rico, 's, dos, is, a, perfect, example, o...
1669     [if, you, 're, looking, for, great, chicago-st...
Name: review, Length: 3040, dtype: object

In [229]:
x_train_unique_words = pd.Series(x_train_words.explode().unique())
x_train_unique_words

0                   house
1                     red
2                       ,
3                   large
4                   white
               ...       
28912    spicy.\n\nrosati
28913     sauce.\n\ntheir
28914             pans.my
28915              3times
28916      2011.excellent
Length: 28917, dtype: object

In [255]:
def word_to_index(word: str) -> list:
    return x_train_unique_words[x_train_unique_words == word].index[0]

In [256]:
def sentence_to_index(sentence: list) -> list:
    return [word_to_index(word) for word in sentence]

In [261]:
def index_to_word(idx):
    return x_train_unique_words.iloc[idx]

In [262]:
def index_list_to_sentence(idx_list: list) -> list:
    return [index_to_word(idx) for idx in idx_list]

In [259]:
print(word_to_index("house"))

0


In [249]:
x_train_words.iloc[3]

['been',
 'here',
 'a',
 'couple',
 'times',
 'with',
 'the',
 'bf',
 ',',
 'and',
 'i',
 'have',
 'no',
 'complaints',
 '...',
 'get',
 'the',
 'killer',
 'shrimp',
 ',',
 'it',
 'is',
 'my',
 'favorite',
 '!',
 'nice',
 'staff',
 ',',
 'and',
 'now',
 'the',
 'happy',
 'hour',
 'is',
 'like',
 'all',
 'afternoon/evening',
 '!',
 'pretty',
 'rad',
 '.']

In [258]:
print(sentence_to_index(x_train_words.iloc[3]))

[83, 84, 16, 85, 86, 6, 23, 87, 2, 8, 26, 39, 88, 89, 90, 53, 23, 91, 92, 2, 93, 15, 94, 95, 22, 96, 97, 2, 8, 98, 23, 99, 100, 15, 101, 102, 103, 22, 104, 105, 11]


In [263]:
print(index_to_word(0))

house


In [264]:
print(index_list_to_sentence(sentence_to_index(x_train_words.iloc[3])))

['been', 'here', 'a', 'couple', 'times', 'with', 'the', 'bf', ',', 'and', 'i', 'have', 'no', 'complaints', '...', 'get', 'the', 'killer', 'shrimp', ',', 'it', 'is', 'my', 'favorite', '!', 'nice', 'staff', ',', 'and', 'now', 'the', 'happy', 'hour', 'is', 'like', 'all', 'afternoon/evening', '!', 'pretty', 'rad', '.']


In [265]:
# x_train_words.map(sentence_to_index)

KeyboardInterrupt: 

In [266]:
import numpy as np
from multiprocessing import cpu_count, Pool

cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want

def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [None]:
out_data = parallelize(x_train_words, sentence_to_index)

In [42]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = ...
    self.token_to_idx = ...
    self.vocab_len = data.len

In [43]:
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return ...