# 5. Классификация текстов при помощи сетей прямого распространения

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г. 

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [318]:
import nltk
import numpy as np
import torch

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [319]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

Реализовать функцию `preprocess_text(text: str) -> str`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [320]:
import re
def preprocess_text(text: str) -> str:
    """Normalize raw text before tokenization.

    Every character that is neither an ASCII letter nor one of ``.,!?`` is
    replaced by a single space (one space per replaced character); the
    result is then lower-cased.
    """
    disallowed = re.compile(r'[^a-zA-Z.,!?]')
    cleaned = disallowed.sub(' ', text)
    return cleaned.lower()

In [321]:
print(preprocess_text(text))

select your preferences and run the install command. stable represents the most currently tested and supported version of pytorch. note that libtorch is only available for c  


1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присутствует в предложении.

In [322]:
# Take everything before the first "." as the first sentence, then split it
# into word tokens with NLTK.
first_sen = text.split(".")[0]
first_sen_list = word_tokenize(first_sen)
first_sen_list

['Select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command']

In [323]:
# Alternative: let NLTK locate the sentence boundary. The tokens of this
# variant also contain the trailing "." (see the output below).
first_sentence_list = word_tokenize(sent_tokenize(text)[0])
first_sentence_list

['Select',
 'your',
 'preferences',
 'and',
 'run',
 'the',
 'install',
 'command',
 '.']

In [324]:
# Vocabulary of the whole text: the set of distinct tokens, minus the period.
all_words = set(word_tokenize(text))
all_words.remove(".")

In [325]:
all_words

{'C++',
 'LibTorch',
 'Note',
 'PyTorch',
 'Select',
 'Stable',
 'and',
 'available',
 'command',
 'currently',
 'for',
 'install',
 'is',
 'most',
 'of',
 'only',
 'preferences',
 'represents',
 'run',
 'supported',
 'tested',
 'that',
 'the',
 'version',
 'your'}

In [326]:
# Binary encoding of the first sentence over the vocabulary `all_words`:
# sentence_t[i] == 1 when the i-th vocabulary word occurs in the sentence.
# NOTE: `all_words` is a set, so the word order (and therefore the meaning
# of index i) is only stable within one interpreter session.
sentence_t = torch.tensor([int(word in first_sen_list) for word in all_words])
sentence_t.tolist()

[0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1]

## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модель и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [578]:
import pandas as pd

# Load the surname/nationality table and preview its first rows.
surname_dataset = pd.read_csv("surnames/surnames.csv")
surname_dataset.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [579]:
# Map class index -> nationality name. Indices are 0-based and follow the
# order in which the nationalities first appear in the dataset.
all_nations = surname_dataset.nationality
all_nations_dict = pd.Series(all_nations.unique()).to_dict()
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [675]:
# Inverse mapping: nationality name -> class index (used to encode targets).
all_nations_dict_reverse = {v: k for k, v in all_nations_dict.items()}
all_nations_dict_reverse

{'English': 0,
 'French': 1,
 'Arabic': 2,
 'Russian': 3,
 'Japanese': 4,
 'Chinese': 5,
 'Italian': 6,
 'Czech': 7,
 'Irish': 8,
 'German': 9,
 'Greek': 10,
 'Spanish': 11,
 'Polish': 12,
 'Dutch': 13,
 'Vietnamese': 14,
 'Korean': 15,
 'Portuguese': 16,
 'Scottish': 17}

In [677]:
all_nations_dict_reverse["Greek"]

10

In [576]:
class Vocab:
  def __init__(self, data: pd.DataFrame):
    """
    Build a character-level vocabulary from every cell of ``data``.

    All distinct cell values of the WHOLE dataframe (every column) are
    lower-cased and split into characters; each distinct character gets an
    index in order of first appearance.

    :param data: the complete dataframe; every cell must be a string
    """
    distinct_cells = pd.Series(pd.unique(data.values.ravel()))
    chars_per_cell = distinct_cells.map(lambda cell: list(cell.lower()))
    alphabet = chars_per_cell.explode().unique()

    self.idx_to_token = dict(enumerate(alphabet))
    self.token_to_idx = {token: index for index, token in self.idx_to_token.items()}
    self.vocab_len = len(alphabet)


In [678]:
from torch.utils.data import Dataset


class SurnamesDataset(Dataset):
  """Dataset of (surname, nationality) pairs encoded as tensors.

  Each item is a tuple ``(surname_tensor, label)``: ``surname_tensor`` has
  shape ``(n_chars, 1, vocab.vocab_len)`` with one one-hot row per
  character, and ``label`` is a 0-d integer tensor holding the class index.
  """

  def __init__(self, X, y, vocab: "Vocab"):
    """
    :param X: surnames; read positionally via ``X.iloc[idx]``
    :param y: nationality names aligned with ``X``; read via ``y.iloc[idx]``
    :param vocab: character vocabulary providing ``token_to_idx`` and ``vocab_len``
    """
    self.X = X
    self.y = y
    self.vocab = vocab
    # Not read by this class; kept so existing attribute access keeps working.
    self.max_X = 17
    self.max_y = 10

  def vectorize(self, surname: str):
    """Encode ``surname`` as a sequence of one-hot character vectors.

    :param surname: surname to encode; it is lower-cased before the lookup
    :return: float tensor of shape ``(n_chars, 1, vocab_len)``
    :raises KeyError: if a character is missing from the vocabulary
    """
    # Lower-case once and size the tensor from the lower-cased string:
    # lower-casing can change the length of some Unicode strings.
    chars = surname.lower()
    tensor = torch.zeros(len(chars), 1, self.vocab.vocab_len)
    for position, char in enumerate(chars):
      # Use the vocabulary given to the constructor, not a module-level global.
      tensor[position][0][self.vocab.token_to_idx[char]] = 1
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]
    nation = self.y.iloc[idx]
    # NOTE: relies on the module-level ``all_nations_dict_reverse`` mapping
    # (nationality name -> class index).
    label = torch.tensor(all_nations_dict_reverse[nation])
    return self.vectorize(surname), label


In [679]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed seed makes the split (and
# every number computed from it) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    surname_dataset["surname"],
    surname_dataset["nationality"],
    test_size=0.2,
    random_state=42,
)

In [680]:
x_train.head(), y_train.head()

(3368       Mckay
 1680     Yaburov
 5003        Wang
 60        Scarsi
 8657    Charlton
 Name: surname, dtype: object,
 3368    English
 1680    Russian
 5003     German
 60      Italian
 8657    English
 Name: nationality, dtype: object)

In [681]:
# One shared character vocabulary (built from the full dataframe) serves
# both the training and the test dataset.
vocab = Vocab(surname_dataset)
snds_train = SurnamesDataset(X=x_train, y=y_train, vocab=vocab)
snds_test = SurnamesDataset(X=x_test, y=y_test, vocab=vocab)

In [682]:
vocab.idx_to_token

{0: 'w',
 1: 'o',
 2: 'd',
 3: 'f',
 4: 'r',
 5: 'e',
 6: 'n',
 7: 'g',
 8: 'l',
 9: 'i',
 10: 's',
 11: 'h',
 12: 'c',
 13: 't',
 14: 'é',
 15: 'k',
 16: 'u',
 17: 'y',
 18: 'a',
 19: 'b',
 20: 'z',
 21: 'j',
 22: 'p',
 23: 'm',
 24: 'v',
 25: "'",
 26: 'q',
 27: 'à',
 28: 'x',
 29: 'ü',
 30: '-',
 31: 'í',
 32: 'ú',
 33: 'ä',
 34: 'ö',
 35: 'ó',
 36: '1',
 37: 'ò',
 38: 'ñ',
 39: 'ż',
 40: 'ß',
 41: 'á',
 42: 'è',
 43: 'ã',
 44: 'ê',
 45: 'ì',
 46: 'ś',
 47: 'ń',
 48: 'ù',
 49: 'ç',
 50: '/',
 51: 'õ',
 52: 'ą',
 53: 'ł',
 54: ':'}

In [683]:
test_surname = "Kodama"

In [684]:
snds_train.vectorize(test_surname)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [685]:
[snds_train.vocab.idx_to_token[torch.where(snds_train.vectorize(test_surname)[i] == 1)[1][0].item()] for i in range(len(test_surname))]

['k', 'o', 'd', 'a', 'm', 'a']

In [686]:
snds_train.X.iloc[0]

'Mckay'

In [687]:
def tensor_to_letter(ten):
  """Decode one one-hot character tensor back into its character.

  Finds the second-dimension index of the first element of ``ten`` equal
  to 1 and looks it up in the module-level ``vocab``.
  """
  hot_positions = torch.where(ten == 1)
  column = hot_positions[1][0].item()
  return vocab.idx_to_token[column]

In [688]:
def tensor_to_surname(ten):
  """Decode an encoded surname into the list of its characters.

  Applies ``tensor_to_letter`` to every slice of ``ten`` along dimension 0.
  """
  return [tensor_to_letter(char_slice) for char_slice in ten]

In [689]:
print(tensor_to_surname(snds_train[0][0]))

['m', 'c', 'k', 'a', 'y']


In [690]:
snds_train.X

3368       Mckay
1680     Yaburov
5003        Wang
60        Scarsi
8657    Charlton
          ...   
3450     Higuchi
4674    Cattaneo
3761      Penzin
7060      Morcos
9990       Nakao
Name: surname, Length: 8784, dtype: object

In [691]:
snds_train[0][0].shape

torch.Size([5, 1, 55])

In [692]:
def my_collate(batch):
    """Identity collate function: return the received batch object unchanged."""
    return batch

In [707]:
from torch.utils.data import DataLoader

# NOTE: `batch_size` is only referenced in the commented-out arguments below,
# so both loaders run with the DataLoader default batch size.
batch_size = 1
trainloader = DataLoader(
    dataset=snds_train,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=True,  # reshuffle the training samples on every pass
    num_workers=0
)
testloader = DataLoader(
    dataset=snds_test,
    # batch_size=batch_size,
    # collate_fn=my_collate,
    shuffle=False,
    num_workers=0
)

In [714]:
x_batch, y_batch = next(iter(trainloader))
x_batch[0].shape, y_batch.shape

(torch.Size([9, 1, 55]), torch.Size([1]))

In [715]:
tensor_to_surname(x_batch[0])

['l', 'e', 'i', 'b', 'o', 'v', 's', 'k', 'y']

In [717]:
all_nations_dict

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [718]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell with a log-softmax classification head.

    Both the next hidden state and the class logits are linear functions of
    the concatenated ``[input, hidden]`` vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Consume one time step and return ``(log_probs, next_hidden)``."""
        joint = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return log_probs, next_hidden

    def initHidden(self):
        """Return the all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))

# One input unit per vocabulary character, one output unit per nationality.
n_hidden = 128
n_categories = len(all_nations_dict)
rnn = RNN(vocab.vocab_len, n_hidden, n_categories)

In [719]:
criterion = nn.NLLLoss()

In [720]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(line_tensor, category_tensor):
    """Run one SGD step on a single encoded surname.

    :param line_tensor: encoded surname; it is consumed one slice at a time
        along dimension 0, each slice being fed to ``rnn`` as one time step
    :param category_tensor: target class index tensor passed to ``criterion``
    :return: ``(output, loss_value)`` - the network output after the last
        time step and the loss as a Python float
    :raises ValueError: if ``line_tensor`` has no time steps
    """
    n_steps = line_tensor.size(0)
    if n_steps == 0:
        raise ValueError("line_tensor must contain at least one time step")

    hidden = rnn.initHidden()
    rnn.zero_grad()

    # Read the length from the tensor itself: re-wrapping it in torch.tensor()
    # copies the data and triggers a UserWarning on every call.
    for i in range(n_steps):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Plain SGD: p <- p - learning_rate * grad, performed outside autograd.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

In [736]:
# all_categories

def categoryFromOutput(output):
    """Return ``(nationality_name, class_index)`` for the top-scoring class.

    The index is the first entry of the ``output.topk(1)`` indices; the name
    is looked up in the module-level ``all_nations_dict``.
    """
    _, best_indices = output.topk(1)
    class_index = best_indices[0].item()
    nation_name = all_nations_dict[class_index]
    return nation_name, class_index

In [769]:
current_loss = 0
all_losses = []  # one entry per training sample, accumulated over all epochs

# NOTE: the three settings below are not read anywhere in this cell.
n_iters = 1000
print_every = 100
plot_every = 100

for epoch in range(20):
    for x, y in trainloader:
        # The loader yields batches of one sample; x[0] drops the batch axis.
        output, loss = train(x[0], y)
        guess, guess_i = categoryFromOutput(output)
        # Compare class indices: `guess` is a name (str) while `y` is a
        # tensor, so comparing those two says nothing about correctness.
        correct = '✓' if guess_i == y.item() else f"✗ {guess}"
        all_losses.append(loss)
    # Running mean over every sample seen so far (all epochs), not a
    # per-epoch mean.
    print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

  for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):


Epoch 0: 1.1728065510908756
Epoch 1: 1.1589196624866227
Epoch 2: 1.1455561319036685
Epoch 3: 1.1354402756099449
Epoch 4: 1.1268581589133442
Epoch 5: 1.1186911872677694
Epoch 6: 1.1113091041832905
Epoch 7: 1.1047102787004137
Epoch 8: 1.0984479405423184
Epoch 9: 1.0925621968027757
Epoch 10: 1.0875657079992633
Epoch 11: 1.0822583346132586
Epoch 12: 1.0776309314725394
Epoch 13: 1.0734226002741223
Epoch 14: 1.0695862895127246
Epoch 15: 1.06651692168031
Epoch 16: 1.0633078501598971
Epoch 17: 1.0603134043829991
Epoch 18: 1.0572203702247644
Epoch 19: 1.0542386163320954


In [770]:
def evaluate(line_tensor):
    """Feed an encoded surname through ``rnn`` and return the final output.

    The hidden state starts from ``rnn.initHidden()`` and is threaded through
    one call of ``rnn`` per slice of ``line_tensor`` along dimension 0.
    """
    state = rnn.initHidden()
    n_steps = line_tensor.shape[0]
    for step in range(n_steps):
        output, state = rnn(line_tensor[step], state)
    return output

In [781]:
def predict(input_line, n_predictions=3):
    """Print and return the most likely nationalities for a surname.

    :param input_line: surname as plain text
    :param n_predictions: how many top-scoring classes to report
    :return: list of ``[score, nationality_name]`` pairs, highest score
        first; ``score`` is the raw model output for that class
    """
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(snds_train.vectorize(input_line))

    topv, topi = output.topk(n_predictions, 1, True)
    predictions = []
    for value, category_index in zip(topv[0].tolist(), topi[0].tolist()):
        name = all_nations_dict[category_index]
        print('(%.2f) %s' % (value, name))
        predictions.append([value, name])
    # The list used to be built and silently dropped; hand it to the caller.
    return predictions

# Smoke-test the classifier on a handful of surnames.
predict('Dovesky')
predict('Jackson')
predict('Satoshi')
predict('Kurtaev')
predict('Turin')
predict('Kuznetsov')


> Dovesky
(-0.24) Russian
(-2.08) Czech
(-2.60) English

> Jackson
(-0.55) English
(-1.91) Scottish
(-2.12) Russian

> Satoshi
(-0.79) Japanese
(-0.84) Arabic
(-3.05) Italian

> Kurtaev
(-0.08) Russian
(-3.91) English
(-3.96) Japanese

> Turin
(-0.83) Russian
(-0.95) English
(-3.44) Dutch

> Kuznetsov
(-0.02) Russian
(-4.71) Greek
(-5.45) Japanese


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [None]:
# NOTE: re-declares the name `Vocab` used earlier for the character-level
# vocabulary; once this cell runs, `Vocab` refers to this class instead.
class Vocab:
  # Unfinished template for the word-level vocabulary (task 3.4): every
  # attribute is still the `...` placeholder.
  def __init__(self, data):
    self.idx_to_token = ...
    self.token_to_idx = ...
    self.vocab_len = ...

In [None]:
class ReviewDataset(Dataset):
  # Unfinished template for task 3.5: `vectorize` has no body besides its
  # docstring and `__getitem__` returns the `...` placeholder.
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Generate a representation of `review` using binary encoding (see 1.2)'''
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return ...