# Фаза 2 • Неделя 10 • Понедельник
## Обработка естественного языка
### Рекуррентные нейронные сети • RNN • LSTM

In [61]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re
import string
from collections import Counter

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torchutils as tu
from torchmetrics.classification import BinaryAccuracy

from dataclasses import dataclass
from typing import Union
from src.train_rnn import train
from src.rnn_preprocessing import preprocess_single_string

In [63]:
df = pd.read_csv('data/imdb.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [64]:
def data_preprocessing(text: str) -> str:
    """preprocessing string: lowercase, removing html-tags, punctuation and stopwords

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """

    text = text.lower()
    text = re.sub("<.*?>", "", text)  # html tags
    text = "".join([c for c in text if c not in string.punctuation])
    splitted_text = [word for word in text.split() if word not in stop_words]
    text = " ".join(splitted_text)
    return text


df["cleaned_reviews"] = df["review"].apply(data_preprocessing)
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


Обычно при токенизации и сопоставлении целых чисел словам принцип такой: наиболее популярным словам соответсвуют меньшие индексы (но ноль зарезервирован – `pad`).

In [65]:
corpus = [word for text in df['cleaned_reviews'] for word in text.split()]
count_words = Counter(corpus)
# print(count_words)
sorted_words = count_words.most_common()

In [66]:
print(sorted_words[-10:])
# print(sorted_words[:10])

[('frenchonly', 1), ('terminalannie', 1), ('bumi', 1), ('clatter', 1), ('superbi', 1), ('horriblecatwoman', 1), ('theirsall', 1), ('originalif', 1), ('goodat', 1), ('yosemitei', 1)]


In [67]:
# Можем взять только слова, которые чаще всего встречаются в данных, 
# но это гиперпараметр
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    return list(filter(lambda x: x[1] > n, sorted_words))

In [68]:
sorted_words = get_words_by_freq(sorted_words, 1000)

In [69]:
sorted_words[-10:]

[('suddenly', 1006),
 ('convincing', 1006),
 ('biggest', 1006),
 ('follows', 1006),
 ('younger', 1005),
 ('rate', 1004),
 ('office', 1003),
 ('portrayal', 1003),
 ('moves', 1002),
 ('former', 1001)]

Сопоставляем каждому слову в словаре целое число – эти данные пойдут в нейросеть. 

In [70]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
print(f"movie: {vocab_to_int['movie']}")
print(f"cool : {vocab_to_int['cool']}")
print(f"film : {vocab_to_int['film']}")
print(f"good : {vocab_to_int['good']}")

movie: 1
cool : 476
film : 2
good : 5


In [103]:
import json 
with open('vocab2int.json', 'w') as f:
    json.dump(vocab_to_int, f)

In [111]:
stop_words_txt = ','.join([str(i) for i in stop_words])
with open('stop_words.txt', 'w') as f:
    f.write(stop_words_txt)

In [71]:
df["cleaned_reviews"].iloc[99]

'mario fan long remember fond memories playing super mario world kid game brought back many memories adding something new super mario galaxy latest installment amazing mario franchise much different game mario still keeping intact greatest elements mario first noticeable difference story takes place spacethe story begins much like mario game mario receives letter princess peach inviting celebration castle mushroom kingdom upon arriving peachs castle mario finds bowser son bowser jr attacking castle airships bowser kidnaps princess peach lifts castle space midst castle lifted space mario falls lands unknown planet mario found talking star named luma taken back lumas home floating space station mario meets many lumas also meets leader woman named rosalina rosalina tells mario bowser taken away space stations power stars scattered across universe mario help lumas find save peach thus adventure beginsthe way play game flying space station galaxies galaxy consists multiple planets mario tra

In [72]:
reviews_int = []
for text in df["cleaned_reviews"]:
    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
    reviews_int.append(r)
print([i for i in reviews_int[99]])

[214, 102, 265, 278, 92, 427, 362, 672, 53, 30, 51, 73, 375, 12, 164, 362, 46, 654, 664, 19, 11, 192, 170, 11, 696, 12, 4, 362, 559, 544, 388, 632, 632, 605, 145, 560, 235, 638, 466, 53, 236, 632, 764, 30, 17, 764, 152, 638, 580, 466, 140, 632, 545, 304, 478, 229, 67, 457, 26, 185, 362, 632, 304, 545, 304, 30, 114, 26, 87, 101, 287, 272, 94, 30, 607, 246, 39, 57, 362, 405, 4, 46, 217, 30, 818, 644, 362, 217, 362, 13, 304, 17, 122, 178, 233, 865, 235, 126, 39, 158, 15, 362, 171, 25, 135, 111, 45, 171, 5, 70, 808, 192, 262, 217, 17, 11, 84, 712, 143, 77, 356, 358, 362, 395, 30, 133, 576, 356, 46, 575, 785, 362, 428, 318, 456, 362, 909, 290, 193, 193]


In [73]:
# Label Encoding
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


Все отзывы состоят из разного числа слова: для удобства можно подсчитать длину всех отзывов: 

In [74]:
review_len = [len(x) for x in reviews_int]
df['Review len'] = review_len
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews,Review len
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching 1 oz episode ...,78
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...,43
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...,43
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake thinks...,43
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...,79


In [75]:
print(df['Review len'].describe())

count    50000.000000
mean        66.094540
std         45.719724
min          0.000000
25%         38.000000
50%         52.000000
75%         80.000000
max        595.000000
Name: Review len, dtype: float64


Есть и очень длинные отзывы, возмножно с ними стоит поработать отдельно, чтобы посмотреть, какую часть отзыва можно убрать из объекта. 

### Padding

При работе с текстом почти всегда применяется `padding` – дополнение нулями до фиксированной длины. Это необходимо для сохранения размерностей. 
Если последовательность слишком длинная, то мы вынуждены обрезать ее. 

In [76]:
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Make left-sided padding for input list of tokens

    Args:
        review_int (list): input list of tokens
        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros

    Returns:
        np.array: padded sequences
    """
    features = np.zeros((len(reviews_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features

In [77]:
# SEQ_LEN – гиперпараметр, от длины будет зависеть качество
SEQ_LEN = 32
features = padding(reviews_int, SEQ_LEN)
print(features[1, :])

[277  37 252 283 407 176 310 299  60 433  13 387  88 247  10 675  13 165
  56 319 299 252   3  15 115  42   9 157 236  37  83 946]


In [78]:
features[32]

array([  0,   0,  19,   5,   3,  67, 322, 635, 280, 672,  76, 851, 372,
       206, 508,  78,  91,   2, 637, 124, 484, 581, 271, 180, 116,  57,
       163, 222,   1, 490,   1, 280])

In [79]:
X_train, X_valid, y_train, y_valid = train_test_split(features, df['sentiment'].to_numpy(), test_size=0.2, random_state=1)

In [80]:
print(X_train)
print(y_train)

[[  2 957 472 ... 341 594 741]
 [  0   0   0 ... 302 339 787]
 [915  21 886 ...  78  52 253]
 ...
 [910 701  37 ... 436  49 824]
 [ 11 332 633 ... 358 243 121]
 [180 242  75 ...  36 713   8]]
[0 1 1 ... 0 0 1]


In [81]:
# create tensor dataset
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))

# dataloaders
BATCH_SIZE = 48

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE)

In [82]:
# посмотрим, что внутри
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: BATCH_SIZE x SEQ_LEN', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size: BATCH_SIZE x SEQ_LEN torch.Size([48, 32])
Sample input: 
 tensor([[  3, 566,   3,  ...,  25, 346, 180],
        [ 39, 578, 113,  ..., 224, 113,  29],
        [  3,  21,  82,  ..., 776, 671, 758],
        ...,
        [  0,   0,   0,  ..., 798, 206, 583],
        [ 68,   9, 471,  ..., 805,  30, 114],
        [  0,   0,   0,  ..., 789, 905, 624]])
Sample input: 
 tensor([0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1])


### Simple RNN model

Создадим dataclass для хранения конфигурации сети: теперь параметров больше, их можно задать и константами, но мы используем немного более сложный подход. Подробнее о dataclasses: https://habr.com/ru/articles/415829/

In [83]:
@dataclass
class ConfigRNN:
    vocab_size: int
    device: str
    n_layers: int
    embedding_dim: int
    hidden_size: int
    seq_len: int
    bidirectional: Union[bool, int]

In [84]:
net_config = ConfigRNN(
    vocab_size=len(vocab_to_int) + 1,
    device="cpu",
    n_layers=1,
    embedding_dim=8,
    hidden_size=16,
    seq_len=SEQ_LEN,
    bidirectional=False,
)
net_config

ConfigRNN(vocab_size=968, device='cpu', n_layers=1, embedding_dim=8, hidden_size=16, seq_len=32, bidirectional=False)

PyTorch: [docs](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

In [85]:
class RNNNet(nn.Module):
    """
    vocab_size: int, размер словаря (аргумент embedding-слоя)
    emb_size:   int, размер вектора для описания каждого элемента последовательности
    hidden_dim: int, размер вектора скрытого состояния, default 0
    batch_size: int, размер batch
    """

    def __init__(self, rnn_conf=net_config) -> None:
        super().__init__()
        self.rnn_conf = rnn_conf
        self.seq_len = rnn_conf.seq_len
        self.emb_size = rnn_conf.embedding_dim
        self.hidden_dim = rnn_conf.hidden_size
        self.n_layers = rnn_conf.n_layers
        self.vocab_size = rnn_conf.vocab_size
        self.bidirectional = bool(rnn_conf.bidirectional)

        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.rnn_cell = nn.RNN(
            input_size=self.emb_size,
            hidden_size=self.hidden_dim,
            batch_first=True,
            bidirectional=self.bidirectional,
            num_layers=self.n_layers,
        )
        self.bidirect_factor = 2 if self.bidirectional == 1 else 1
        self.linear = nn.Sequential(
            nn.Linear(self.hidden_dim * self.seq_len * self.bidirect_factor, 16),
            nn.Tanh(),
            nn.Linear(16, 1)
        )

    def model_description(self):
        direction = "bidirect" if self.bidirectional else "onedirect"
        return f"rnn_{direction}_{self.n_layers}"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x.to(self.rnn_conf.device))
        output, _ = self.rnn_cell(x)  # Забираем hidden states со всех промежуточных состояний, второй выход отправляем в _
        output = output.contiguous().view(output.size(0), -1)
        out = self.linear(output)
        return out


model_rnn = RNNNet(net_config)
tu.get_model_summary(model_rnn, sample_x.to(net_config.device))

Layer                Kernel        Output      Params       FLOPs
0_embedding          [8, 968]    [48, 32, 8]    7,744       1,536
1_rnn_cell                  -   [48, 32, 16]      416   6,021,120
2_linear.Linear_0   [512, 16]       [48, 16]    8,208     785,664
3_linear.Tanh_1             -       [48, 16]        0       3,840
4_linear.Linear_2     [16, 1]        [48, 1]       17       1,488
Total params: 16,385
Trainable params: 16,385
Non-trainable params: 0
Total FLOPs: 6,813,648 / 6.81 MFLOPs
-----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.29
Params size (MB): 0.06
Estimated Total Size (MB): 0.37


In [86]:
criterion = nn.BCEWithLogitsLoss()
optimizer_rnn = torch.optim.Adam(model_rnn.parameters())
metric = BinaryAccuracy()

In [87]:
train_losses, val_losses, train_metric, val_metric, rnn_time = train(
    epochs=1,
    model=model_rnn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer_rnn,
    rnn_conf=net_config,
    criterion=criterion,
    metric=metric,
)

Epoch 1
train_loss : 0.6418 val_loss : 0.5729
train_accuracy : 0.61 val_accuracy : 0.71


### LSTM-модель

Весь процесс останется прежним, мы изменим только саму модель

In [88]:
class LSTMClassifier(nn.Module):
    def __init__(self, rnn_conf=net_config) -> None:
        super().__init__()

        self.embedding_dim = rnn_conf.embedding_dim
        self.hidden_size = rnn_conf.hidden_size
        self.bidirectional = rnn_conf.bidirectional
        self.n_layers = rnn_conf.n_layers

        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True,
            num_layers=self.n_layers,
        )
        self.bidirect_factor = 2 if self.bidirectional else 1
        self.clf = nn.Sequential(
            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(32, 1)
        )

    def model_description(self):
        direction = "bidirect" if self.bidirectional else "onedirect"
        return f"lstm_{direction}_{self.n_layers}"

    def forward(self, x: torch.Tensor):
        embeddings = self.embedding(x)
        out, _ = self.lstm(embeddings)
        # print(out.shape)
        out = out[:, -1, :]  # [все элементы батча, последний h_n, все элементы последнего h_n]
        # print(out.shape)
        out = self.clf(out)
        return out


model_lstm = LSTMClassifier(net_config)
tu.get_model_summary(model_lstm, sample_x)

Layer              Kernel       Output      Params        FLOPs
0_embedding       [8, 968]    [48, 32, 8]    7,744        1,536
1_lstm                   -   [48, 32, 16]    1,664   20,692,992
2_clf.Linear_0    [16, 32]       [48, 32]      544       47,616
3_clf.Tanh_1             -       [48, 32]        0        7,680
4_clf.Dropout_2          -       [48, 32]        0            0
5_clf.Linear_3     [32, 1]        [48, 1]       33        3,024
Total params: 9,985
Trainable params: 9,985
Non-trainable params: 0
Total FLOPs: 20,752,848 / 20.75 MFLOPs
---------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.32
Params size (MB): 0.04
Estimated Total Size (MB): 0.37


In [89]:
optimizer_lstm = torch.optim.Adam(model_lstm.parameters(), lr=0.005)

In [90]:
train_losses, val_losses, train_metric, val_metric, lstm_time = train(
    epochs=1, 
    model=model_lstm, 
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer_lstm,
    rnn_conf=net_config,
    criterion=criterion,
    metric=metric
    )

Epoch 1
train_loss : 0.5675 val_loss : 0.4795
train_accuracy : 0.69 val_accuracy : 0.77


In [91]:
## Используем функцию для препроцессинга отдельной строки
preprocess_single_string("great movie", net_config.seq_len, vocab_to_int)

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15,  1])

In [92]:
model_rnn.eval();
model_lstm.eval();

In [93]:
preprocess_single_string('good bad start film', net_config.seq_len, vocab_to_int)

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          5,  16, 258,   2])

In [94]:
def predict_review(review_text: str, seq_len: int = net_config.seq_len) -> torch.tensor:
    sample = preprocess_single_string(review_text, seq_len, vocab_to_int)
    probability_rnn = model_rnn(sample.unsqueeze(0)).to(net_config.device).sigmoid()
    probability_lstm = model_lstm(sample.unsqueeze(0)).to(net_config.device).sigmoid()
    print(f"RNN: {probability_rnn.item():.3f} LSTM: {probability_lstm.item():.3f}")

In [95]:
positive_reviews = [
    "This film is a masterpiece! The storytelling is captivating, the acting is phenomenal, and the visuals are breathtaking. I couldn't take my eyes off the screen.",
    "I absolutely loved this movie. It's a rollercoaster of emotions, and the characters are so relatable. The ending left me in tears, but in the best way possible.",
    "What a gem of a film! It's heartwarming and filled with humor. I left the theater with a big smile on my face, and I can't wait to watch it again.",
    "Incredible cinematography and a thought-provoking plot make this movie a must-see. I was glued to my seat the entire time, and it left me thinking about it for days.",
    "This film is a true work of art. The music, the visuals, and the performances all come together to create an unforgettable cinematic experience. I highly recommend it to everyone."
]

negative_reviews = [
    "I couldn't stand this movie. The plot was convoluted, and the characters were one-dimensional. It was a complete waste of my time.",
    "The acting in this film was cringe-worthy. I couldn't connect with any of the characters, and the dialogue felt forced and unrealistic.",
    "I had high hopes for this movie, but it was a major disappointment. The pacing was off, and I found myself checking my watch throughout the entire film.",
    "This film is just another generic Hollywood cash grab. It's full of clichés, and the story is completely predictable. I wish I had skipped it.",
    "I can't believe the hype around this movie. It was boring, and the ending was so unsatisfying. I wouldn't recommend it to anyone."
]


In [96]:
for i in positive_reviews+negative_reviews:
    predict_review(i)

RNN: 0.492 LSTM: 0.721
RNN: 0.625 LSTM: 0.797
RNN: 0.573 LSTM: 0.675
RNN: 0.696 LSTM: 0.583
RNN: 0.874 LSTM: 0.888
RNN: 0.424 LSTM: 0.154
RNN: 0.527 LSTM: 0.387
RNN: 0.746 LSTM: 0.625
RNN: 0.648 LSTM: 0.341
RNN: 0.260 LSTM: 0.256
