<a href="https://colab.research.google.com/github/Amikuto/DAaML/blob/master/06_CNN_embeddings_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [124]:
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn

torch.manual_seed(0)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 6. Классификация текстов при помощи сверточных сетей

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г. 

## 1. Представление и предобработка текстовых данных в виде последовательностей

1.1 Представьте первое предложение из строки `text` как последовательность из индексов слов, входящих в это предложение

In [2]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [3]:
first_sen = nltk.sent_tokenize(text)[0].replace(".", "").lower()
first_sen

'select your preferences and run the install command'

In [4]:
words = nltk.word_tokenize(first_sen)
words

['select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command']

In [5]:
words_dict = {k: v for v, k in enumerate(words)}
words_dict

{'select': 0,
 'your': 1,
 'preferences': 2,
 'and': 3,
 'run': 4,
 'the': 5,
 'install': 6,
 'command': 7}

In [6]:
[words_dict[i] for i in words]

[0, 1, 2, 3, 4, 5, 6, 7]

1.2 Представьте первое предложение из строки `text` как последовательность векторов, соответствующих индексам слов. Для представления индекса в виде вектора используйте унитарное кодирование. В результате должен получиться двумерный тензор размера `количество слов в предложении` x `количество уникальных слов`

In [7]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [8]:
all_words = nltk.word_tokenize(text.lower().replace(".", ""))
all_words_dict = {k: v for v, k in enumerate(all_words)}
all_words_dict

{'select': 0,
 'your': 1,
 'preferences': 2,
 'and': 14,
 'run': 4,
 'the': 10,
 'install': 6,
 'command': 7,
 'stable': 8,
 'represents': 9,
 'most': 11,
 'currently': 12,
 'tested': 13,
 'supported': 15,
 'version': 16,
 'of': 17,
 'pytorch': 18,
 'note': 19,
 'that': 20,
 'libtorch': 21,
 'is': 22,
 'only': 23,
 'available': 24,
 'for': 25,
 'c++': 26}

In [9]:
first_sen = nltk.sent_tokenize(text)[0].replace(".", "").lower()
first_sen_words = nltk.word_tokenize(first_sen)
first_sen_words

['select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command']

In [10]:
tensor = torch.zeros(len(first_sen_words), len(all_words_dict))

In [11]:
for i, word in enumerate(first_sen_words):
  tensor[i][all_words_dict[word]] = 1
  print(all_words_dict[word])

0
1
2
14
4
10
6
7


In [12]:
tensor

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.]])

In [13]:
tensor.shape

torch.Size([8, 25])

1.3 Решите задачу 1.2, используя модуль `nn.Embedding`

In [14]:
from collections import Counter

vocab = Counter(all_words_dict)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)
vocab_size

25

In [15]:
word2idx = {word: ind for ind, word in enumerate(vocab)}
word2idx

{'c++': 0,
 'for': 1,
 'available': 2,
 'only': 3,
 'is': 4,
 'libtorch': 5,
 'that': 6,
 'note': 7,
 'pytorch': 8,
 'of': 9,
 'version': 10,
 'supported': 11,
 'and': 12,
 'tested': 13,
 'currently': 14,
 'most': 15,
 'the': 16,
 'represents': 17,
 'stable': 18,
 'command': 19,
 'install': 20,
 'run': 21,
 'preferences': 22,
 'your': 23,
 'select': 24}

In [16]:
encoded_sentences = [word2idx[word] for word in first_sen_words]
encoded_sentences

[24, 23, 22, 12, 21, 16, 20, 19]

In [17]:
emb_dim = 25
emb_layer = nn.Embedding(vocab_size, emb_dim)
word_vectors = emb_layer(torch.LongTensor(encoded_sentences))
word_vectors

tensor([[-0.1112,  0.3557, -0.7150,  1.6158,  0.3499,  0.6634,  0.0181,  0.8812,
          0.9868, -0.7545, -0.5677,  2.0593, -0.7072, -0.5910,  0.1844, -1.4870,
          1.4205, -0.1919, -2.0925, -0.8850,  0.0351,  0.7003, -2.6861, -1.4419,
          2.2907],
        [ 0.9362, -0.9281, -0.8066, -0.6602, -0.8632, -0.0907, -0.4594, -0.7365,
         -0.1554, -0.4295,  0.9135, -0.0341, -0.8026, -0.4474,  0.4627, -1.3719,
          1.0284, -0.0563, -1.5617,  1.6311,  0.3002, -1.3049,  0.2713, -0.8726,
         -1.0428],
        [-2.2612, -0.6849, -0.5145,  0.6234, -1.2462, -0.8086, -0.5281,  2.0206,
         -1.4252,  1.5869, -0.5484,  0.3505, -2.2405, -0.2322, -0.0751,  1.9459,
         -0.5312, -1.3641,  1.2861, -1.1854,  0.1358,  1.0592, -0.6069,  1.1976,
         -0.9881],
        [ 0.5762,  0.3497,  0.6203,  0.5666, -1.5259, -0.6523, -0.7427,  0.1311,
          0.2687,  0.4992, -0.3572, -1.0284, -0.9737, -2.5078, -0.0549,  1.0692,
          0.4076, -0.5583, -0.6936,  0.8231,  0.3338

In [18]:
word_vectors.shape

torch.Size([8, 25])

## 2. Классификация фамилий по национальности (ConvNet)

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)
  * добавьте в словарь специальный токен `<PAD>` с индексом 0
  * при создании словаря сохраните длину самой длинной последовательности из набора данных в виде атрибута `max_seq_len`

2.5 Реализовать класс `SurnamesDataset`
  * метод `__getitem__` возвращает пару: <последовательность индексов токенов (см. 1.1 ), номер класса> 
  * длина каждой такой последовательности должна быть одинаковой и равной `vocab.max_seq_len`. Чтобы добиться этого, дополните последовательность справа индексом токена `<PAD>` до нужной длины

2.6. Обучить классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`. Рассмотрите два варианта: 
    - когда токен представляется в виде унитарного вектора и модуль `nn.Embedding` не обучается
    - когда токен представляется в виде вектора небольшой размерности (меньше, чем размер словаря) и модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [19]:
surname_dataset = pd.read_csv("./surnames/surnames.csv")
surname_dataset.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [103]:
surname_dict = pd.Series(surname_dataset.nationality.unique()).to_dict()
surname_dict = dict(map(reversed, surname_dict.items()))
surname_dict_reverse = {v: k for k, v in surname_dict.items()}
surname_dict

{'English': 0,
 'French': 1,
 'Arabic': 2,
 'Russian': 3,
 'Japanese': 4,
 'Chinese': 5,
 'Italian': 6,
 'Czech': 7,
 'Irish': 8,
 'German': 9,
 'Greek': 10,
 'Spanish': 11,
 'Polish': 12,
 'Dutch': 13,
 'Vietnamese': 14,
 'Korean': 15,
 'Portuguese': 16,
 'Scottish': 17}

In [21]:
dataset_nation_as_index = surname_dataset.copy()
dataset_nation_as_index.nationality = surname_dataset.nationality.map(lambda x: surname_dict[x])
dataset_nation_as_index

Unnamed: 0,surname,nationality
0,Woodford,0
1,Coté,1
2,Kore,0
3,Koury,2
4,Lebzak,3
...,...,...
10975,Quraishi,2
10976,Innalls,0
10977,Król,12
10978,Purvis,0


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset_nation_as_index["surname"], dataset_nation_as_index["nationality"], test_size=0.2)

In [23]:
X_train

7323      Dagher
3943       Lyall
6050      Saliba
7376       Deane
10925    Yakubov
          ...   
8422     Douglas
9758     Mansour
5815     Yukhman
9087       Mcrae
4063     Burgess
Name: surname, Length: 8784, dtype: object

In [24]:
max(X_train.map(len))

17

In [25]:
y_train

7323      2
3943      0
6050      2
7376      0
10925     3
         ..
8422     17
9758      2
5815      3
9087      0
4063      0
Name: nationality, Length: 8784, dtype: int64

In [72]:
class Vocab:
  def __init__(self, column: pd.DataFrame | pd.Series):
    all_chars = pd.Series(column.values).map(lambda x: list(x.lower())).explode().unique()
    all_chars = np.insert(all_chars, 0, "<PAD>")
    self.idx_to_token = {index: token for index, token in enumerate(all_chars)}
    self.token_to_idx = {token: index for index, token in enumerate(all_chars)}
    self.max_seq_len = max(column.map(len))
    self.vocab_len = len(all_chars)

In [73]:
vocab = Vocab(dataset_nation_as_index["surname"])
vocab.max_seq_len

17

In [87]:
vocab.vocab_len

56

In [74]:
vocab.token_to_idx

{'<PAD>': 0,
 'w': 1,
 'o': 2,
 'd': 3,
 'f': 4,
 'r': 5,
 'c': 6,
 't': 7,
 'é': 8,
 'k': 9,
 'e': 10,
 'u': 11,
 'y': 12,
 'l': 13,
 'b': 14,
 'z': 15,
 'a': 16,
 'i': 17,
 'n': 18,
 'h': 19,
 'm': 20,
 's': 21,
 'v': 22,
 'p': 23,
 'g': 24,
 'j': 25,
 "'": 26,
 'q': 27,
 'à': 28,
 'x': 29,
 'ü': 30,
 '-': 31,
 'í': 32,
 'ú': 33,
 'ä': 34,
 'ö': 35,
 'ó': 36,
 '1': 37,
 'ò': 38,
 'ñ': 39,
 'ż': 40,
 'ß': 41,
 'á': 42,
 'è': 43,
 'ã': 44,
 'ê': 45,
 'ì': 46,
 'ś': 47,
 'ń': 48,
 'ù': 49,
 'ç': 50,
 '/': 51,
 'õ': 52,
 'ą': 53,
 'ł': 54,
 ':': 55}

2.5 Реализовать класс `SurnamesDataset`
  * метод `__getitem__` возвращает пару: <последовательность индексов токенов (см. 1.1 ), номер класса>
  * длина каждой такой последовательности должна быть одинаковой и равной `vocab.max_seq_len`. Чтобы добиться этого, дополните последовательность справа индексом токена `<PAD>` до нужной длины

In [225]:
from torch.utils.data import Dataset


class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab, embed: bool=False):
    self.embed = embed
    self.X = X
    self.y = y
    self.vocab = vocab
    self.max_X = vocab.max_seq_len
    self.max_y = 10

  def vectorize(self, surname):
    if not self.embed:
      tensor = torch.zeros(self.max_X, vocab.vocab_len)
      for li, letter in enumerate(surname.lower()):
        tensor[li][vocab.token_to_idx[letter]] = 1
      return tensor

    tensor = torch.zeros(self.vocab.max_seq_len, dtype=torch.long)
    for i, val in enumerate(surname):
      tensor[i] = self.vocab.token_to_idx[val.lower()]
    tensor[tensor==0] = self.vocab.token_to_idx["<PAD>"]
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]

    nation_tensor = torch.zeros(len(surname_dict))
    nation = self.y.iloc[idx]
    nation_tensor[nation] = 1

    # return emb(torch.LongTensor(self.vectorize(surname))), nation
    return self.vectorize(surname), torch.tensor(nation)

In [226]:
X_train.iloc[0], y_train.iloc[0]

('Dagher', 2)

In [227]:
index_dataset_train = SurnamesDataset(X=X_train, y=y_train, vocab=vocab)
index_dataset_test = SurnamesDataset(X=X_test, y=y_test, vocab=vocab)

In [228]:
index_dataset_train[0][0].shape

torch.Size([17, 56])

In [229]:
hot_dataset_train = SurnamesDataset(X=X_train, y=y_train, vocab=vocab, embed=True)
hot_dataset_test = SurnamesDataset(X=X_test, y=y_test, vocab=vocab, embed=True)

In [230]:
hot_dataset_train[0][0]

tensor([ 3, 16, 24, 19, 10,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

2.6. Обучить классификатор.

  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`. Рассмотрите два варианта:
    - когда токен представляется в виде унитарного вектора и модуль `nn.Embedding` не обучается
    - когда токен представляется в виде вектора небольшой размерности (меньше, чем размер словаря) и модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`

In [205]:
import torch.nn as nn
import torch.nn.functional as F

class SurnameClassifier(nn.Module):
  """ A 2-layer multilayer perceptron for classifying surnames """
  def __init__(self, input_dim, hidden_dim, output_dim, embeded=False):
    """
    Args:
        input_dim (int): the size of the input vectors
        hidden_dim (int): the output size of the first Linear layer
        output_dim (int): the output size of the second Linear layer
    """
    super(SurnameClassifier, self).__init__()
    self.embeded = embeded
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)
    self.embedding = nn.Embedding(num_embeddings=len(vocab.alphabet), embedding_dim=64)

  def forward(self, x_in, apply_softmax=False):
    """The forward pass of the classifier

    Args:
        x_in (torch.Tensor): an input data tensor
            x_in.shape should be (batch, input_dim)
        apply_softmax (bool): a flag for the softmax activation
            should be false if used with the cross-entropy losses
    Returns:
        the resulting tensor. tensor.shape should be (batch, output_dim).
    """
    intermediate_vector = F.relu(self.fc1(x_in))
    prediction_vector = self.fc2(intermediate_vector)

    if apply_softmax:
      prediction_vector = F.softmax(prediction_vector, dim=1)

    return prediction_vector

In [233]:
class SurnamesClassifier(nn.Module):

  def __init__(
          self,
          vocab: Vocab,
          out_features: int,
          embedding_dim: int = 56,
          use_embedding: bool = True,
          debug: bool = False,
  ):
    super(SurnamesClassifier, self).__init__()
    self.use_embedding = use_embedding
    self.debug = debug

    self.embedding_dim = embedding_dim

    last_conv_out_channels = 64
    adaptive_avg_pool = 8

    self.embedding = nn.Embedding(num_embeddings=vocab.max_seq_len+1, embedding_dim=embedding_dim)
    self.features = nn.Sequential(
      nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3),
      nn.BatchNorm1d(num_features=64),
      nn.ReLU(),
      nn.MaxPool1d(kernel_size=2),
      nn.Conv1d(in_channels=64, out_channels=last_conv_out_channels, kernel_size=3),
      nn.BatchNorm1d(num_features=last_conv_out_channels),
      nn.ReLU(),
      nn.MaxPool1d(kernel_size=2),
    )
    # Единственный полезный (и понятный зачем) слой. Зачем? - Позволяет не думать о размерностях
    self.avgpool = nn.AdaptiveAvgPool1d(adaptive_avg_pool)
    self.classifier = nn.Sequential(
      nn.Linear(last_conv_out_channels * adaptive_avg_pool, 256),
      nn.ReLU(),
      nn.Dropout(),
      nn.Linear(256, out_features),
    )

    if self.debug:
      self.forward = self._debug_forward
    else:
      self.forward = self._forward

  def _forward(self, x: torch.Tensor):
    # print(x.shape)
    if self.use_embedding:
      x = self.embedding(x)
    # else:
    #   # Для эксперимента с one-hot - что будет, если растянуть вектора до размера embedding_dim?
    #   # Ответ: ничего
    #   x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)

    # (batch_size, num_features [embedding_dim], n_tokens)
    x = x.reshape(x.size(0), x.size(2), x.size(1))
    x = self.features(x)
    x = self.avgpool(x)
    x = torch.flatten(x, 1)
    x = self.classifier(x)

    return torch.log_softmax(x, dim=1)

  def _debug_forward(self, x: torch.Tensor):
    print("x: ", x.size())
    if self.use_embedding:
      x = self.embedding(x)
      print("embedding: ", x.size())
    else:
      x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)
      print("pad: ", x.size())

    x = x.reshape(x.size(0), x.size(2), x.size(1))
    print("reshape: ", x.size())
    x = self.features(x)
    print("features: ", x.size())
    x = self.avgpool(x)
    print("avgpool: ", x.size())
    x = torch.flatten(x, 1)
    print("flatten: ", x.size())
    x = self.classifier(x)
    print("classifier: ", x.size())
    return torch.log_softmax(x, dim=1)

In [207]:
common_net = SurnamesClassifier(vocab, len(surname_dict))

loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(common_net.parameters(), lr=0.001)

In [208]:
from torch.utils.data import random_split, Subset
import typing as t
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

def on_cuda(device: str) -> bool:
  return device == "cuda"


def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        test_dataloader: DataLoader = None,
        lr_scheduler=None,
        verbose: int = 100,
        device: str = "cpu",
) -> t.List[float]:
  train_losses = []
  for epoch in range(epochs):
    print(f"Epoch {epoch + 1}\n" + "-" * 32)
    train_loss = train_loop(
      train_dataloader,
      model,
      loss_fn,
      optimizer,
      verbose=verbose,
      device=device,
    )
    train_losses.append(train_loss.item())
    if test_dataloader:
      loss, acc = test_loop(test_dataloader, model, loss_fn, device=device)
      if lr_scheduler:
        lr_scheduler.step(loss)
    torch.cuda.empty_cache()
  return train_losses


def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
  model.train()

  size = len(dataloader.dataset)  # noqa
  num_batches = len(dataloader)
  avg_loss = 0


  for batch, (x, y) in enumerate(dataloader):
    x, y = x.to(device), y.to(device)

    pred = model(x)
    loss = loss_fn(pred, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    avg_loss += loss
    if batch % verbose == 0:
      print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

    del x, y, pred, loss
    torch.cuda.empty_cache()

  return avg_loss / num_batches


@torch.no_grad()
def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
  model.eval()

  size = len(dataloader.dataset)  # noqa
  num_batches = len(dataloader)
  avg_loss, correct = 0, 0

  for x, y in dataloader:
    x, y = x.to(device), y.to(device)
    pred = model(x)
    avg_loss += loss_fn(pred, y)
    correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

    del x, y, pred
    torch.cuda.empty_cache()

  avg_loss /= num_batches
  accuracy = correct / size
  print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

  return avg_loss, accuracy


def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
  train_size = round(train_part * len(dataset))
  test_size = len(dataset) - train_size
  train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
  return train_dataset, test_dataset


@torch.no_grad()
def get_y_test_y_pred(
        model: nn.Module,
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
  model.eval()

  y_test = []
  y_pred = []
  for x, y in test_dataloader:
    x, y = x.to(device), y.to(device)
    pred = model(x).argmax(1)
    y_test.append(y)
    y_pred.append(pred)

    del x
    torch.cuda.empty_cache()

  return torch.hstack(y_test).detach().cpu(), torch.hstack(y_pred).detach().cpu()

In [181]:
from torch.utils.data import Dataset, DataLoader


common_net.use_embedding = False
_ = common_train(
  epochs=10,
  model=common_net,
  loss_fn=loss_fn,
  optimizer=optimizer,
  train_dataloader=DataLoader(index_dataset_train, batch_size=8, shuffle=True),
  test_dataloader=DataLoader(index_dataset_test, batch_size=512),
  verbose=500,
  device=DEVICE,
)

Epoch 1
--------------------------------
loss: 3.112618  [    0/ 8784]
loss: 3.026118  [ 4000/ 8784]
loss: 1.229229  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.548270, Avg loss: 1.595330 

Epoch 2
--------------------------------
loss: 1.313912  [    0/ 8784]
loss: 0.992270  [ 4000/ 8784]
loss: 1.251219  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.574681, Avg loss: 1.456847 

Epoch 3
--------------------------------
loss: 1.970159  [    0/ 8784]
loss: 1.351185  [ 4000/ 8784]
loss: 1.808311  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.596084, Avg loss: 1.393186 

Epoch 4
--------------------------------
loss: 1.332764  [    0/ 8784]
loss: 0.734504  [ 4000/ 8784]
loss: 1.124861  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.605191, Avg loss: 1.346658 

Epoch 5
--------------------------------
loss: 0.415921  [    0/ 8784]
loss: 1.039611  [ 4000/ 8784]
loss: 0.977130  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.612477, Avg loss: 1.352463 

Epoch 6
--------------------------------
loss: 1.267020  [    0/ 8784]

In [209]:
train_losses = []
dataloader = DataLoader(index_dataset_train, batch_size=8, shuffle=True)
for epoch in range(10):
  print(f"Epoch {epoch + 1}\n" + "-" * 32)
  # train_loss = train_loop(
  #   train_dataloader,
  #   model,
  #   loss_fn,
  #   optimizer,
  #   verbose=verbose,
  #   device=device,
  # )
  common_net.train()

  size = len(dataloader.dataset)  # noqa
  num_batches = len(dataloader)
  avg_loss = 0
  print(next(enumerate(dataloader)))

  for idx, batch in enumerate(dataloader):
    print('Batch index: ', idx)
    print('Batch size: ', batch[0].size())
    print('Batch label: ', batch[1])
    break


  # for batch, (x, y) in enumerate(dataloader):
  #   print(batch)
  #   # x, y = x.to(device), y.to(device)
  #
  #   pred = common_net(x)
  #   loss = loss_fn(pred, y)
  #
  #   optimizer.zero_grad()
  #   loss.backward()
  #   optimizer.step()
  #
  #   avg_loss += loss
  #   if batch % 500 == 0:
  #     print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")
  #
  #   del x, y, pred, loss
  #   torch.cuda.empty_cache()
  #
  # train_losses.append(avg_loss / num_batches.item())

Epoch 1
--------------------------------
(0, [tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0.,

In [234]:
torch.manual_seed(0)

embeddings_net = SurnamesClassifier(vocab, len(surname_dict))

loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(embeddings_net.parameters(), lr=0.001)

In [235]:
%%time

# Embedding представление
embeddings_net.use_embedding = True
_ = common_train(
  epochs=15,
  model=embeddings_net,
  loss_fn=loss_fn,
  optimizer=optimizer,
  train_dataloader=DataLoader(hot_dataset_train, batch_size=8, shuffle=True),
  test_dataloader=DataLoader(hot_dataset_test, batch_size=512),
  verbose=500,
  device=DEVICE,
)

Epoch 1
--------------------------------


IndexError: index out of range in self

In [34]:
from torch import optim

classifier = SurnameClassifier(input_dim=17,
                               hidden_dim=300,
                               output_dim=len(surname_dict))

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

In [35]:
from torch.utils.data import DataLoader

batch_size = 1
trainloader = DataLoader(
  dataset=dataset_train,
  batch_size=batch_size,
  # collate_fn=my_collate,
  # shuffle=True,
  num_workers=0
)

In [36]:
emb = nn.Embedding(len(vocab.token_to_idx) + 1, 18)
print(emb.weight.shape)
def get_embedding_index(x):
  results = torch.where(torch.sum((emb.weight==x), axis=1))
  if len(results[0])==len(x):
    return None
  else:
    return results[0][0]

torch.Size([57, 18])


In [37]:
dataset_train[2][0]

tensor([21, 16, 13, 17, 14, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [38]:
emb(torch.LongTensor(dataset_train[2][0]))

tensor([[ 5.4045e-02, -1.8993e+00,  2.2038e+00,  6.0082e-03, -1.7900e+00,
         -6.0637e-01, -7.2622e-01,  5.3939e-02,  1.0849e+00,  1.2687e+00,
         -7.1206e-01, -3.8479e-01, -1.3890e+00,  2.6024e-01, -6.4970e-01,
         -1.7900e-01, -1.5749e+00, -2.1649e+00],
        [ 1.9536e+00, -1.2741e+00, -2.2306e+00,  7.7549e-01, -1.8824e+00,
          2.2897e+00, -3.7220e-01,  1.0720e+00,  1.2969e+00, -1.0190e+00,
          6.9980e-01,  2.9742e-02, -2.8295e-01, -3.0373e-01, -8.0670e-02,
          1.1569e+00,  3.8202e-01, -9.0691e-01],
        [-9.8267e-01, -3.4703e-01, -5.5266e-01, -3.3055e-01, -2.6223e-01,
          1.0678e-01,  8.6874e-01,  1.6919e+00,  1.6781e+00, -2.1355e+00,
         -2.3432e+00,  9.3682e-01,  1.3464e+00,  2.5378e-01, -1.5317e+00,
         -1.1624e+00, -4.8173e-01,  4.2428e-01],
        [-3.1010e-01,  9.7680e-01,  2.7301e+00, -4.1161e-01,  4.5675e-01,
          4.0371e-01, -7.7948e-02,  2.4716e-01,  1.4026e+00, -7.9675e-03,
         -7.1862e-01,  9.3086e-01,  1.2

In [39]:
all_losses = []
for x, y in trainloader:
  y_pred = classifier.forward(x.to(torch.float32), False)
  y_item = y_pred.topk(1)[1].item()
  print(f"Predicted: {surname_dict_reverse[y_item]} Real: {surname_dict_reverse[torch.where(y)[1].item()]}")
  loss = loss_func(y_pred, y)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  all_losses.append(loss.item())
  # break

Predicted: Czech Real: Arabic
Predicted: Czech Real: English
Predicted: Czech Real: Arabic
Predicted: English Real: English
Predicted: English Real: Russian
Predicted: English Real: Russian
Predicted: English Real: Arabic
Predicted: English Real: English
Predicted: English Real: Japanese
Predicted: Arabic Real: Russian
Predicted: English Real: Russian
Predicted: English Real: Arabic
Predicted: Arabic Real: Arabic
Predicted: Arabic Real: English
Predicted: English Real: English
Predicted: Arabic Real: English
Predicted: English Real: English
Predicted: Arabic Real: Russian
Predicted: Arabic Real: Russian
Predicted: Arabic Real: English
Predicted: Arabic Real: Chinese
Predicted: Russian Real: Russian
Predicted: Russian Real: Japanese
Predicted: Russian Real: English
Predicted: Russian Real: Russian
Predicted: Russian Real: English
Predicted: Russian Real: Russian
Predicted: Russian Real: Italian
Predicted: Russian Real: French
Predicted: Russian Real: English
Predicted: Russian Real: Ger

In [40]:
all_losses = []
for x, y in trainloader:
  e = emb(torch.LongTensor(x))[0]
  print(e)
  y_pred = classifier.forward(e, False)
  # y_item = y_pred.topk(1)[1].item()
  # print(f"Predicted: {surname_dict_reverse[y_item]} Real: {surname_dict_reverse[torch.where(y)[1].item()]}")
  # loss = loss_func(y_pred, y)
  # optimizer.zero_grad()
  # loss.backward()
  # optimizer.step()
  # all_losses.append(loss.item())
  break

tensor([[-9.4274e-01,  1.1051e+00,  1.5441e+00,  1.1604e+00, -2.7230e-01,
          3.9574e-01, -1.7473e+00,  6.7534e-01,  3.9269e-01,  3.2758e-01,
         -5.6866e-02,  7.0646e-01,  8.9522e-02, -8.0921e-01, -5.1779e-01,
         -6.9201e-02, -1.0517e+00,  8.7163e-01],
        [ 1.9536e+00, -1.2741e+00, -2.2306e+00,  7.7549e-01, -1.8824e+00,
          2.2897e+00, -3.7220e-01,  1.0720e+00,  1.2969e+00, -1.0190e+00,
          6.9980e-01,  2.9742e-02, -2.8295e-01, -3.0373e-01, -8.0670e-02,
          1.1569e+00,  3.8202e-01, -9.0691e-01],
        [ 1.3241e+00,  6.5159e-01, -5.8455e-01,  8.3924e-01, -2.5403e+00,
         -4.8510e-01, -1.9725e-01,  1.3487e+00,  5.2571e-01,  1.8815e-01,
          7.2298e-04,  3.6663e+00, -2.4268e+00,  1.0689e+00, -9.3866e-01,
          2.9003e-01, -1.6014e-01,  7.1566e-01],
        [-8.0775e-01,  1.1291e+00,  1.3913e+00, -2.3054e-01,  1.1414e+00,
         -3.0274e+00, -8.2377e-01,  1.2445e+00,  1.2338e+00,  6.0105e-02,
          7.9939e-01, -7.1459e-01,  1.6

RuntimeError: mat1 and mat2 shapes cannot be multiplied (17x18 and 17x300)

In [None]:
for epoch in range(20):
  for x, y in trainloader:
    output, loss = train(x[0], y)
    current_loss += loss
    guess, guess_i = categoryFromOutput(output)
    correct = '✓' if guess == y else f"✗ {categoryFromOutput(output[0])[0]}"
    all_losses.append(current_loss)
    current_loss = 0
  print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

In [318]:
a = torch.nn.Embedding(10, 50)
b = torch.LongTensor([2,8])
results = a(b)



indices = torch.Tensor(list(map(get_embedding_index, results)))
indices

tensor([2., 8.])

In [319]:
results

tensor([[-0.4098,  0.2533,  1.2605,  0.7038,  0.2500,  0.9289, -0.2418,  0.5768,
          0.4933, -0.2881, -0.0395, -0.2379, -0.6801,  0.5488,  1.8835,  1.4410,
          1.0493, -0.1259, -0.1851, -1.4215,  0.8290, -0.2408,  0.3402, -1.4582,
          2.0032,  0.4850, -1.2842,  0.1348,  0.8905, -0.9637,  0.7791, -1.8000,
          0.6366,  0.2020,  1.3995, -0.6345, -0.4659, -1.3530,  0.7784,  0.5687,
          0.2248, -1.1422,  0.2366, -0.8745,  1.5936, -2.4422,  0.9036, -0.2850,
         -0.9802,  0.4497],
        [ 0.3270, -1.1546, -1.2967, -0.5088, -1.7634,  0.0211,  0.9224,  0.2901,
          0.4979,  1.0452,  2.3358, -0.8311,  0.7015,  0.2112,  0.7016, -0.0864,
          0.6148, -1.3134,  0.2297,  1.2282, -1.1078, -1.7340,  0.4388,  0.1766,
          0.0893,  2.1184,  0.7886,  0.2694, -0.0800,  0.5345,  0.9193,  1.6917,
         -0.5858,  1.1435,  0.3770, -0.0127,  0.5832,  1.5580,  1.7289,  1.7571,
         -0.0380, -0.7875, -1.8100, -1.1700,  0.6741,  1.6696,  0.3956, -1.6179,


In [273]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()

    self.hidden_size = hidden_size

    self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
    self.i2o = nn.Linear(input_size + hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    combined = torch.cat((input, hidden), 1)
    hidden = self.i2h(combined)
    output = self.i2o(combined)
    output = self.softmax(output)
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, self.hidden_size)

In [274]:
n_hidden = 128
n_categories = len(surname_dict)
rnn = RNN(dataset_train.vocab.max_seq_len, n_hidden, n_categories)

In [275]:
criterion = nn.NLLLoss()

In [276]:
learning_rate = 0.005

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate, momentum=0.4)

def train(line_tensor, category_tensor):
  hidden = rnn.initHidden()
  rnn.zero_grad()

  # torch.tensor(line_tensor, dtype=torch.long)
  for i in range(torch.tensor(line_tensor, dtype=torch.long).size()[0]):
    output, hidden = rnn(line_tensor[i], hidden)

  loss = criterion(output, category_tensor)
  # loss.backward()
  # for p in rnn.parameters():
  #     p.data.add_(p.grad.data, alpha=-learning_rate)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  return output, loss.item()

In [277]:
from torch.utils.data import DataLoader

batch_size = 1
trainloader = DataLoader(
  dataset=dataset_train,
  # batch_size=batch_size,
  # collate_fn=my_collate,
  shuffle=True,
  num_workers=0
)
# testloader = DataLoader(
#   dataset=snds_test,
#   # batch_size=batch_size,
#   # collate_fn=my_collate,
#   shuffle=False,
#   num_workers=0
# )

In [279]:
def categoryFromOutput(output):
  top_n, top_i = output.topk(1)
  category_i = top_i[0].item()
  return surname_dict[category_i], category_i

In [282]:
current_loss = 0
all_losses = []

n_iters = 1000
print_every = 100
plot_every = 100

for epoch in range(20):
  for x, y in trainloader:
    print(x)
    break
  break
  #   output, loss = train(x[0], y)
  #   current_loss += loss
  #   guess, guess_i = categoryFromOutput(output)
  #   correct = '✓' if guess == y else f"✗ {categoryFromOutput(output[0])[0]}"
  #   all_losses.append(current_loss)
  #   current_loss = 0
  # print(f"Epoch {epoch}: {np.array(all_losses).mean()}")

[tensor([5]), tensor([17]), tensor([24]), tensor([14]), tensor([12]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0])]


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MultilayerPerceptron(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    """
    Args:
        input_dim (int): the size of the input vectors
        hidden_dim (int): the output size of the first Linear layer
        output_dim (int): the output size of the second Linear layer
    """
    super(MultilayerPerceptron, self).__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x_in, apply_softmax=False):
    """The forward pass of the MLP

    Args:
        x_in (torch.Tensor): an input data tensor
            x_in.shape should be (batch, input_dim)
        apply_softmax (bool): a flag for the softmax activation
            should be false if used with the cross-entropy losses
    Returns:
        the resulting tensor. tensor.shape should be (batch, output_dim)
    """
    intermediate = F.relu(self.fc1(x_in))
    output = self.fc2(intermediate)

    if apply_softmax:
      output = F.softmax(output, dim=1)
    return output

In [None]:
batch_size = 2 # number of samples input at once
input_dim = 17
hidden_dim = 50
output_dim = len(surname_dict)

# Initialize model
mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)
print(mlp)

In [None]:
y_output = mlp(x_input, apply_softmax=False)
describe(y_output)

## 3. Классификация обзоров на фильмы (ConvNet)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

2.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding` 
    - подберите адекватную размерность вектора эмбеддинга: 
    - модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`


2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [None]:
positive_raw = pd.read_csv("./polarity/positive_reviews.csv", header=None)
negative_raw = pd.read_csv("./polarity/negative_reviews.csv", header=None)

In [None]:
positive_raw.head()

In [None]:
negative_raw.head()

In [None]:
positive_raw["state"] = 1
negative_raw["state"] = 0

In [None]:
negative_raw

In [None]:
positive_raw

In [None]:
all_reviews = pd.concat([negative_raw, positive_raw], axis=0)
all_reviews.columns = ["review", "rating"]
all_reviews

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(all_reviews["review"], all_reviews["rating"], test_size=0.2)

In [None]:
nltk.word_tokenize(all_reviews.review)

In [None]:
all_reviews["review"].flatten()