# 6. Классификация текстов при помощи сверточных сетей

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г.

In [16]:
import re
import typing as t
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords, wordnet
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset, random_split

In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
DATA_DIR = Path("data/")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE.upper()} device")

Using CPU device


In [19]:
def on_cuda(device: str) -> bool:
    return device == "cuda"

In [20]:
def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        test_dataloader: DataLoader = None,
        lr_scheduler=None,
        verbose: int = 100,
        device: str = "cpu",
) -> t.List[float]:
    train_losses = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n" + "-" * 32)
        train_loss = train_loop(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            verbose=verbose,
            device=device,
        )
        train_losses.append(train_loss.item())
        if test_dataloader:
            loss, acc = test_loop(test_dataloader, model, loss_fn, device=device)
            if lr_scheduler:
                lr_scheduler.step(loss)
        torch.cuda.empty_cache()
    return train_losses

In [21]:
def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
    model.train()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss
        if batch % verbose == 0:
            print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

        del x, y, pred, loss
        torch.cuda.empty_cache()

    return avg_loss / num_batches

In [22]:
@torch.no_grad()
def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss, correct = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        avg_loss += loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

        del x, y, pred
        torch.cuda.empty_cache()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

    return avg_loss, accuracy

In [23]:
def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
    train_size = round(train_part * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
    return train_dataset, test_dataset

In [24]:
@torch.no_grad()
def get_y_test_y_pred(
        model: nn.Module,
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    y_test = []
    y_pred = []
    for x, y in test_dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x).argmax(1)
        y_test.append(y)
        y_pred.append(pred)
        del x
        torch.cuda.empty_cache()
    return torch.hstack(y_test).detach().cpu(), torch.hstack(y_pred).detach().cpu()

## 1. Представление и предобработка текстовых данных в виде последовательностей

1.1 Представьте первое предложение из строки `text` как последовательность из индексов слов, входящих в это предложение

In [25]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [26]:
text = text.lower()
alphabet = list(set(nltk.word_tokenize(text.replace(".", ""))))
word2index = {w: i for i, w in enumerate(alphabet)}
first_sentence = nltk.sent_tokenize(text)[0].replace(".", "")
[word2index[w] for w in nltk.word_tokenize(first_sentence)]

[13, 0, 21, 11, 22, 10, 6, 9]

1.2 Представьте первое предложение из строки `text` как последовательность векторов, соответствующих индексам слов. Для представления индекса в виде вектора используйте унитарное кодирование. В результате должен получиться двумерный тензор размера `количество слов в предложении` x `количество уникальных слов`

In [27]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [28]:
text = text.lower()
alphabet = list(set(nltk.word_tokenize(text.replace(".", ""))))
word2index = {w: i for i, w in enumerate(alphabet)}
first_sentence = nltk.sent_tokenize(text)[0].replace(".", "")
words = nltk.word_tokenize(first_sentence)
vectors = torch.zeros(len(words), len(alphabet))
indices = [(i, word2index[w]) for i, w in enumerate(words)]
vectors[list(zip(*indices))] = 1
vectors

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.]])

1.3 Решите задачу 1.2, используя модуль `nn.Embedding`

In [29]:
torch.manual_seed(0)
embeds = nn.Embedding(num_embeddings=len(alphabet), embedding_dim=len(alphabet))
indices = torch.tensor([word2index[w] for w in nltk.word_tokenize(first_sentence)])
embeds(indices)

tensor([[ 1.0077,  1.0046, -0.4335, -1.2426,  1.2846,  0.2438,  0.5304, -0.0145,
         -2.2357,  1.4660, -1.2191,  0.6442,  3.9300, -0.1244,  0.2953,  0.3827,
         -0.5497, -0.9940,  1.3459,  1.9457, -1.2904, -2.3495, -2.0689,  0.9094,
         -0.6946],
        [-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
          0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473,
         -1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530,
          0.7502],
        [ 1.2532, -0.4445,  0.8185, -0.8180,  0.3603, -1.6146, -2.4734,  0.0362,
         -0.3422, -0.3817, -0.0569,  0.8436,  0.6829,  3.3944, -1.6688,  0.5109,
         -0.2860,  0.3351,  1.1719,  1.2955,  0.8909, -0.4898, -1.1727, -0.6870,
         -2.3349],
        [ 0.2683, -2.0589,  0.5340, -0.5354, -0.8637, -0.0235,  1.1717,  0.3987,
         -0.1987, -1.1559, -0.3167,  0.9403, -1.1470,  0.5588,  0.7918, -0.1847,
         -0.7318, -0.0807, -0.9801,  0.0605, -0.4890

## 2. Классификация фамилий по национальности (ConvNet)

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`.

In [30]:
surnames_df = pd.read_csv("surnames.csv")
surnames_df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


2.2 Закодировать национальности числами, начиная с 0.

In [31]:
surnames_labeler = LabelEncoder()
surnames_df["target"] = surnames_labeler.fit_transform(surnames_df["nationality"])
print(f"classes: {len(surnames_labeler.classes_)}")
surnames_df.head()

classes: 18


Unnamed: 0,surname,nationality,target
0,Woodford,English,4
1,Coté,French,5
2,Kore,English,4
3,Koury,Arabic,0
4,Lebzak,Russian,14


2.4 Реализовать класс `Vocab` (токен = __символ__)
  * добавьте в словарь специальный токен `<PAD>` с индексом 0
  * при создании словаря сохраните длину самой длинной последовательности из набора данных в виде атрибута `max_seq_len`


In [32]:
class Vocab:
    pad = "<PAD>"
    def __init__(self, series: pd.Series):
        uniques = set()
        max_len = 0
        for w in map(str.lower, series):
            uniques.update(w)
            max_len = max(len(w), max_len)

        self.alphabet = [self.pad, *uniques]
        self.max_len = max_len
        self.ch2i = {ch: i for i, ch in enumerate(self.alphabet)}

    def encode(self, word: str) -> torch.Tensor:
        indices = [self.ch2i[ch] for ch in word]
        # дополняем до одинакового размера индексом служебного символа
        indices += [self.ch2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.ch2i[self.pad], as_tuple=True)[0]  # noqa
        if len(pad_indices):
            indices = indices[:pad_indices[0]]  # отрезаем служебные символы
        return "".join(self.alphabet[i] for i in indices)
vocab = Vocab(surnames_df["surname"])
indices = vocab.encode("kovalev")
print(indices, vocab.decode(indices))

tensor([10,  5, 23, 20, 35, 54, 23,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]) kovalev


2.5 Реализовать класс `SurnamesDataset`
  * метод `__getitem__` возвращает пару: <последовательность индексов токенов (см. 1.1 ), номер класса>
  * длина каждой такой последовательности должна быть одинаковой и равной `vocab.max_seq_len`. Чтобы добиться этого, дополните последовательность справа индексом токена `<PAD>` до нужной длины


In [33]:
class SurnamesDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vocab: Vocab, transform: t.Callable = None):
        self.surnames = df["surname"].tolist()

        if transform:
            # 1 раз transform - прохождение эпох быстрее
            size = transform(self.surnames[0]).size()
            self.data = torch.vstack([transform(w) for w in self.surnames]).view(len(self.surnames), *size)
        else:
            self.data = self.surnames
        self.targets = torch.tensor(df["target"], dtype=torch.long)

        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [34]:
def to_indices(word: str) -> torch.Tensor:
    return vocab.encode(word.lower())

In [35]:
def one_hot(word: str) -> torch.Tensor:
    vectors = torch.zeros(vocab.max_len, len(vocab.alphabet))
    indices = [(i, vocab.ch2i[ch]) for i, ch in enumerate(word.lower())]
    vectors[list(zip(*indices))] = 1
    return vectors
surnames_indices_dataset = SurnamesDataset(surnames_df, vocab, transform=to_indices)
surnames_one_hot_dataset = SurnamesDataset(surnames_df, vocab, transform=one_hot)
surnames_indices_dataset[0], surnames_one_hot_dataset[0]

((tensor([47,  5,  5, 25, 41,  5, 26, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
  tensor(4)),
 (tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0

2.3 Разбить датасет на обучающую и тестовую выборку

In [36]:
torch.manual_seed(0)
train_indices_dataset, test_indices_dataset = train_test_split(surnames_indices_dataset, train_part=0.8)
train_one_hot_dataset, test_one_hot_dataset = train_test_split(surnames_one_hot_dataset, train_part=0.8)
print(len(train_indices_dataset), len(test_indices_dataset))

8784 2196


2.6. Обучить классификатор.

  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`. Рассмотрите два варианта:
    - когда токен представляется в виде унитарного вектора и модуль `nn.Embedding` не обучается
    - когда токен представляется в виде вектора небольшой размерности (меньше, чем размер словаря) и модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`


In [37]:
class SurnamesClassifier(nn.Module):
    def __init__(
            self,
            vocab: Vocab,
            out_features: int,
            embedding_dim: int = 128,
            use_embedding: bool = True,
            debug: bool = False,
    ):
        super(SurnamesClassifier, self).__init__()
        self.use_embedding = use_embedding
        self.debug = debug
        self.embedding_dim = embedding_dim
        last_conv_out_channels = 64
        adaptive_avg_pool = 8
        self.embedding = nn.Embedding(num_embeddings=len(vocab.alphabet), embedding_dim=embedding_dim)
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(in_channels=64, out_channels=last_conv_out_channels, kernel_size=3),
            nn.BatchNorm1d(num_features=last_conv_out_channels),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )
        self.avgpool = nn.AdaptiveAvgPool1d(adaptive_avg_pool)
        self.classifier = nn.Sequential(
            nn.Linear(last_conv_out_channels * adaptive_avg_pool, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, out_features),
        )
        if self.debug:
            self.forward = self._debug_forward
        else:
            self.forward = self._forward

    def _forward(self, x: torch.Tensor):
        if self.use_embedding:
            x = self.embedding(x)
        else:
            x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)
        x = x.reshape(x.size(0), x.size(2), x.size(1))
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return torch.log_softmax(x, dim=1)

    def _debug_forward(self, x: torch.Tensor):
        print("x: ", x.size())
        if self.use_embedding:
            x = self.embedding(x)
            print("embedding: ", x.size())
        else:
            x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)
            print("pad: ", x.size())

        x = x.reshape(x.size(0), x.size(2), x.size(1))
        print("reshape: ", x.size())
        x = self.features(x)
        print("features: ", x.size())
        x = self.avgpool(x)
        print("avgpool: ", x.size())
        x = torch.flatten(x, 1)
        print("flatten: ", x.size())
        x = self.classifier(x)
        print("classifier: ", x.size())
        return torch.log_softmax(x, dim=1)


In [38]:
%%time
torch.manual_seed(0)
common_net = SurnamesClassifier(vocab, len(surnames_labeler.classes_)).to(DEVICE)
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(common_net.parameters(), lr=0.001)
common_net.use_embedding = False
_ = common_train(
    epochs=10,
    model=common_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=DataLoader(train_one_hot_dataset, batch_size=8, shuffle=True),
    test_dataloader=DataLoader(test_one_hot_dataset, batch_size=512),
    verbose=500,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 3.008995  [    0/ 8784]
loss: 1.674296  [ 4000/ 8784]
loss: 1.310381  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.558743, Avg loss: 1.596468 

Epoch 2
--------------------------------
loss: 1.325799  [    0/ 8784]
loss: 1.230295  [ 4000/ 8784]
loss: 1.573640  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.605647, Avg loss: 1.398547 

Epoch 3
--------------------------------
loss: 1.450298  [    0/ 8784]
loss: 0.586835  [ 4000/ 8784]
loss: 1.431115  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.622495, Avg loss: 1.357075 

Epoch 4
--------------------------------
loss: 1.118284  [    0/ 8784]
loss: 0.827805  [ 4000/ 8784]
loss: 0.768332  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.618852, Avg loss: 1.330902 

Epoch 5
--------------------------------
loss: 1.292893  [    0/ 8784]
loss: 1.024059  [ 4000/ 8784]
loss: 1.152696  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.649818, Avg loss: 1.301598 

Epoch 6
--------------------------------
loss: 0.818191  [    0/ 8784]

In [39]:
%%time
torch.manual_seed(0)
embeddings_net = SurnamesClassifier(vocab, len(surnames_labeler.classes_)).to(DEVICE)
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(embeddings_net.parameters(), lr=0.001)
embeddings_net.use_embedding = True
_ = common_train(
    epochs=15,
    model=embeddings_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=DataLoader(train_indices_dataset, batch_size=8, shuffle=True),
    test_dataloader=DataLoader(test_indices_dataset, batch_size=512),
    verbose=500,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.871342  [    0/ 8784]
loss: 1.331137  [ 4000/ 8784]
loss: 1.249040  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.542805, Avg loss: 1.669320 

Epoch 2
--------------------------------
loss: 1.539569  [    0/ 8784]
loss: 1.800099  [ 4000/ 8784]
loss: 1.033039  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.634791, Avg loss: 1.307033 

Epoch 3
--------------------------------
loss: 1.952749  [    0/ 8784]
loss: 1.417925  [ 4000/ 8784]
loss: 0.917647  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.633880, Avg loss: 1.268972 

Epoch 4
--------------------------------
loss: 0.280862  [    0/ 8784]
loss: 1.097283  [ 4000/ 8784]
loss: 0.986332  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.673042, Avg loss: 1.116463 

Epoch 5
--------------------------------
loss: 0.746358  [    0/ 8784]
loss: 1.943287  [ 4000/ 8784]
loss: 0.667607  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.693989, Avg loss: 1.097032 

Epoch 6
--------------------------------
loss: 0.788891  [    0/ 8784]

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [40]:
test_loop(
    dataloader=DataLoader(test_indices_dataset, batch_size=512),
    model=embeddings_net,
    loss_fn=loss_fn,
    device=DEVICE,
);

Test Error: 
 Accuracy: 0.698543, Avg loss: 1.392302 



In [41]:
def inference(
        surname: str,
        target: str,
        model: nn.Module,
        vocab: Vocab,
        labeler: LabelEncoder,
        k: int = 3,
        device: str = "cpu",
):
    x = vocab.encode(surname.lower())
    x = x.to(device)

    pred = model(x.unsqueeze(0))
    pred_proba, pred_label_indices = F.softmax(pred, 1).topk(k, dim=1)
    pred_labels = labeler.inverse_transform(pred_label_indices.squeeze().cpu())

    predicts = ", ".join(
        [f"{label} ({prob:.2f})" for (label, prob) in zip(pred_labels, pred_proba.squeeze())]
    )
    print(f"Surname : {surname}")
    print(f"True    : {target}")
    print(f"Predicts: {predicts}\n")

In [42]:
students = [
    "Alexandrova",
    "Baranov",
    "Brusova",
    "Volkova",
    "Kovalev",
    "Kostyuchenko",
    "Kuzin",
    "Likhachev",
    "Telitsyn",
    "Ustimova",
    "Khamikoeva",
]
for surname in students:
    inference(
        surname=surname,
        target="Russian",
        model=embeddings_net,
        vocab=vocab,
        labeler=surnames_labeler,
        device=DEVICE,
    )

Surname : Alexandrova
True    : Russian
Predicts: English (0.52), Russian (0.24), French (0.18)

Surname : Baranov
True    : Russian
Predicts: Russian (1.00), Czech (0.00), English (0.00)

Surname : Brusova
True    : Russian
Predicts: Czech (0.60), Russian (0.19), Japanese (0.11)

Surname : Volkova
True    : Russian
Predicts: Russian (0.66), Czech (0.34), Polish (0.00)

Surname : Kovalev
True    : Russian
Predicts: Russian (1.00), Czech (0.00), Polish (0.00)

Surname : Kostyuchenko
True    : Russian
Predicts: Russian (1.00), English (0.00), German (0.00)

Surname : Kuzin
True    : Russian
Predicts: Japanese (0.65), Russian (0.16), Czech (0.09)

Surname : Likhachev
True    : Russian
Predicts: Russian (1.00), Irish (0.00), English (0.00)

Surname : Telitsyn
True    : Russian
Predicts: English (0.84), Russian (0.14), French (0.02)

Surname : Ustimova
True    : Russian
Predicts: Japanese (0.92), Russian (0.08), Czech (0.01)

Surname : Khamikoeva
True    : Russian
Predicts: Russian (0.99), 

**Вывод:** использование Embedding позволило увеличить точность модели.

## 3. Классификация обзоров на фильмы (ConvNet)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

In [43]:
def get_pos(word: str) -> str:
    tag = nltk.pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [44]:
STOPWORDS = set(stopwords.words("english"))

In [45]:
def preprocess_review(text: str) -> str:
    text = text.lower()
    # удаляем все символы кроме букв латинского алфавита
    text = re.sub(r"[^a-z]", repl=" ", string=text, flags=re.MULTILINE)

    lemmatizer = nltk.WordNetLemmatizer()
    words = []
    for word in nltk.word_tokenize(text):
        if word not in STOPWORDS:  # удаляем стоп-слова до лемматизации - так можно чуть-чуть сэкономить
            lemma = lemmatizer.lemmatize(word, pos=get_pos(word))
            # удаляем стоп-слова, наивное предположение - не брать леммы короче 3-х символов дало значительный прирост точности
            if lemma not in STOPWORDS and len(lemma) > 2:
                words.append(lemma)

    return " ".join(words)

In [46]:
class ReviewsDataset(Dataset):

    def __init__(self, positive_path: Path, negative_path: Path, seed: int = None):
        self.positive_path = positive_path
        self.negative_path = negative_path
        self.positive_reviews = self.read_reviews(positive_path, preprocess_review)
        self.negative_reviews = self.read_reviews(negative_path, preprocess_review)

        data = self.positive_reviews + self.negative_reviews
        targets = torch.cat([torch.ones(len(self.positive_reviews)), torch.zeros(len(self.negative_reviews))])

        if seed is not None:
            torch.manual_seed(seed)
        indices = torch.randperm(len(data))

        self.data = [data[i] for i in indices]
        self.targets = targets[indices].to(torch.long)

    @staticmethod
    def read_reviews(path: Path, process: t.Callable[[str], str]) -> list[str]:
        reviews = []
        with open(path) as f:
            for review in f.readlines():
                review = process(review)
                if review:
                    reviews.append(review)
        return reviews

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

In [50]:
reviews_dataset = ReviewsDataset(
    "positive_reviews.txt",
    "negative_reviews.txt",
    seed=0,
)

In [51]:
len(reviews_dataset), reviews_dataset[0]

(10660,
 ('none happily ever spangle monsoon wedding late marriage part make dover kosashvili outstanding feature debut potent',
  tensor(0)))

In [52]:
torch.manual_seed(0)
train_reviews, test_reviews = train_test_split(reviews_dataset, train_part=0.8)
len(train_reviews), len(test_reviews)

(8528, 2132)

In [53]:
class ReviewsVocab:
    pad = "<PAD>"
    unknown = "<UNK>"

    def __init__(self, reviews: t.List[str]):
        uniques = set()
        max_len = 0
        for review in reviews:
            words = nltk.word_tokenize(review)
            uniques.update(words)
            max_len = max(len(words), max_len)

        self.alphabet = [self.pad, self.unknown, *uniques]
        self.max_len = max_len

        w2i = {w: i for i, w in enumerate(self.alphabet)}
        # если ключ отсутствует, будет возвращена 1 - индекс служебного символа
        self.w2i = defaultdict(lambda: 1, w2i)

    def __len__(self):
        return len(self.alphabet)

    @lru_cache(maxsize=8192)  # сомнительная эффективность? Ну да
    def encode(self, review: str) -> torch.Tensor:
        indices = [self.w2i[w] for w in nltk.word_tokenize(review)]
        indices += [self.w2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.w2i[self.pad], as_tuple=True)[0]  # noqa
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return " ".join(self.alphabet[i] for i in indices)


In [54]:
vocab = ReviewsVocab([review for review, _ in train_reviews])
print(f"alphabet: {len(vocab)}", f"longest: {vocab.max_len}")
encoded = vocab.encode("this is a neutral review")
encoded, vocab.decode(encoded)

alphabet: 13287 longest: 29


(tensor([   1,    1,    1, 8247, 2391,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0]),
 '<UNK> <UNK> <UNK> neutral review')

2.2. Обучите классификатор.

  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`
    - подберите адекватную размерность вектора эмбеддинга:
    - модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`


In [55]:
class ReviewsClassifier(nn.Module):
    LAST_CONV_OUT_CHANNELS = 64
    ADAPTIVE_AVG_POOL = 8

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super(ReviewsClassifier, self).__init__()

        # Как же этой модели все это... безразлично
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=self.LAST_CONV_OUT_CHANNELS, kernel_size=2),
            nn.BatchNorm1d(num_features=self.LAST_CONV_OUT_CHANNELS),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )
        # Единственный полезный (и понятный зачем) слой. Зачем? - Позволяет не думать о размерностях
        self.avgpool = nn.AdaptiveAvgPool1d(self.ADAPTIVE_AVG_POOL)
        self.classifier = nn.Sequential(
            nn.Linear(self.LAST_CONV_OUT_CHANNELS * self.ADAPTIVE_AVG_POOL, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 2),
        )

    def forward(self, x: torch.Tensor):
        x = self.embedding(x)
        x = x.reshape(x.size(0), x.size(2), x.size(1))
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


def collate(batch: t.List[t.Tuple[str, torch.Tensor]]) -> t.Tuple[torch.Tensor, torch.Tensor]:
    xs, ys = [], []
    for x, y in batch:
        xs.append(vocab.encode(x))
        ys.append(y)
    return torch.vstack(xs), torch.hstack(ys)

In [56]:
torch.manual_seed(0)
net = ReviewsClassifier(num_embeddings=len(vocab), embedding_dim=128).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.000914092001)  # а почему нет?
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode="min",
    patience=5,
    factor=0.333333,
    min_lr=0.000001,
    threshold=0.001,
    verbose=True,
)
train_dataloader = DataLoader(train_reviews, batch_size=22, collate_fn=collate, shuffle=True)
test_dataloader = DataLoader(test_reviews, batch_size=512, collate_fn=collate)

In [57]:
%%time
_ = common_train(
    epochs=20,
    model=net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    lr_scheduler=lr_scheduler,
    verbose=150,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 0.710937  [    0/ 8528]
loss: 0.721881  [ 3300/ 8528]
loss: 0.690325  [ 6600/ 8528]
Test Error: 
 Accuracy: 0.500000, Avg loss: 0.692950 

Epoch 2
--------------------------------
loss: 0.685188  [    0/ 8528]
loss: 0.695510  [ 3300/ 8528]
loss: 0.693171  [ 6600/ 8528]
Test Error: 
 Accuracy: 0.500938, Avg loss: 0.692922 

Epoch 3
--------------------------------
loss: 0.691839  [    0/ 8528]
loss: 0.686422  [ 3300/ 8528]
loss: 0.713710  [ 6600/ 8528]
Test Error: 
 Accuracy: 0.502345, Avg loss: 0.695628 

Epoch 4
--------------------------------
loss: 0.746132  [    0/ 8528]
loss: 0.698044  [ 3300/ 8528]
loss: 0.711347  [ 6600/ 8528]
Test Error: 
 Accuracy: 0.499062, Avg loss: 0.689848 

Epoch 5
--------------------------------
loss: 0.668270  [    0/ 8528]
loss: 0.733846  [ 3300/ 8528]
loss: 0.616055  [ 6600/ 8528]
Test Error: 
 Accuracy: 0.614447, Avg loss: 0.655173 

Epoch 6
--------------------------------
loss: 0.574306  [    0/ 8528]

У модели явная проблема с переобучением. После 3-ей эпохи тестовая ошибка начала увеличиваться,
в то же время ошибка на обучающей выборки быстро приблизилась к 0.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [58]:
y_test, y_pred = get_y_test_y_pred(net, test_dataloader, DEVICE)
print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=["negative", "positive"],
))

              precision    recall  f1-score   support

    negative       0.66      0.69      0.68      1061
    positive       0.68      0.65      0.66      1071

    accuracy                           0.67      2132
   macro avg       0.67      0.67      0.67      2132
weighted avg       0.67      0.67      0.67      2132



In [59]:
def inference(
        review: str,
        target: str,
        model: nn.Module,
        vocab: ReviewsVocab,
        target_names: list[str],
        device: str = "cpu",
):
    x = vocab.encode(preprocess_review(review))
    x = x.to(device)
    pred = model(x.unsqueeze(0))
    pred_proba, pred_label_idx = F.softmax(pred, 1).max(dim=1)
    pred_label = target_names[pred_label_idx.cpu()]
    print(f"Review : {review}")
    print(f"True   : {target}")
    print(f"Predict: {pred_label} ({pred_proba.item():.2f})\n")

In [60]:
reviews = [
    ("No intrigue, poor character disclosure.", "negative"),
    ("A fascinating story. The actors played their characters perfectly.", "positive"),
]
for review, target in reviews:
    inference(
        review=review,
        target=target,
        model=net,
        vocab=vocab,
        target_names=["negative", "positive"],
        device=DEVICE,
    )

Review : No intrigue, poor character disclosure.
True   : negative
Predict: positive (1.00)

Review : A fascinating story. The actors played their characters perfectly.
True   : positive
Predict: negative (0.88)

