# Импортирование библиотек

In [1]:
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import pandas as pd
import torch.nn as nn
import numpy as np

import torch

# Выгрузка данных
## Информация про датасет
### Columns - [text, label]
1. text - Содержит отзыв о фильме
2. label:
    - 0 - негативный отзыв
    - 1 - позитивный отзыв
- ссылка на датасет: https://www.kaggle.com/datasets/thedevastator/imdb-movie-review-sentiment-dataset?select=train.csv

In [2]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')
df = pd.concat([df1, df2])
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [3]:
df.label.value_counts()

label
0    25000
1    25000
Name: count, dtype: int64

#### Пример негативного отзыва

In [4]:
df[df['label'] == 0].iloc[15].text

"This film is just plain horrible. John Ritter doing pratt falls, 75% of the actors delivering their lines as if they were reading them from cue cards, poor editing, horrible sound mixing (dialogue is tough to pick up in places over the background noise), and a plot that really goes nowhere. I didn't think I'd ever say this, but Dorothy Stratten is not the worst actress in this film. There are at least 3 others that suck more. Patti Hansen delivers her lines with the passion of Ben Stein. I started to wonder if she wasn't dead inside. Even Bogdanovich's kids are awful (the oldest one is definitely reading her lines from a cue card). This movie is seriously horrible. There's a reason Bogdanovich couldn't get another project until 4 years later. Please don't watch it. If you see it in your television listings, cancel your cable. If a friend suggests it to you, reconsider your friendship. If your spouse wants to watch it, you're better off finding another soulmate. I'd rather gouge my eye

##### Пример положительного отзыва

In [5]:
df[df['label'] == 1].iloc[10].text

"Lars von Trier's Europa is a worthy echo of The Third Man, about an American coming to post-World War II Europe and finds himself entangled in a dangerous mystery.<br /><br />Jean-Marc Barr plays Leopold Kessler, a German-American who refused to join the US Army during the war, arrives in Frankfurt as soon as the war is over to work with his uncle as a sleeping car conductor on the Zentropa Railway. What he doesn't know is the war is still secretly going on with an underground terrorist group called the Werewolves who target American allies. Leopold is strongly against taking any sides, but is drawn in and seduced by Katharina Hartmann (Barbara Sukowa), the femme fatale daughter of the owner of the railway company. Her father was a Nazi sympathizer, but is pardoned by the American Colonel Harris (Eddie Considine) because he can help get the German transportation system up and running again. The colonel soon enlists, or forces, Leopold to be a spy (without giving him a choice or chance

# Обработка данных 
1. Привести всё к нижнему регистру и токенезировать слова.
2. Векторизовать строки с помощью Tf-IDF алгоритма

In [6]:
df['text'] = df['text'].apply(lambda x: ' '.join(wordpunct_tokenize(x.lower())))

In [7]:
vec = TfidfVectorizer()
X = vec.fit_transform(df['text'])
y = df['label']

# Разделение данных на тестовую и тренировочную выборку

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
X_train.shape

(33500, 101895)

# Обучение и тест моделей

### Обучение логистической регрессии

In [10]:
log_reg = LogisticRegression(random_state=0).fit(X_train, y_train)


print("Точность на тренировочной выборке: {:.2f}".format(log_reg.score(X_train, y_train))) 
print("Точность на тестовой выборке: {:.2f}".format(log_reg.score(X_test, y_test)))

Точность на тренировочной выборке: 0.93
Точность на тестовой выборке: 0.90


### Линейная нейронная сеть (состоящая из линейных слоёв)

In [11]:
class LinearNetwork(nn.Module):
    def __init__(self, seq_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(seq_size, seq_size//1024),
            nn.ReLU(),
            nn.Linear(seq_size//1024, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )
    def forward(self, x):
        return self.model(x)

In [53]:
class RecNetwork(nn.Module):
    def __init__(self, seq_size):
        super().__init__()
        self.lin1 = nn.Linear(seq_size, seq_size//1024)
        self.rnn = nn.LSTM(seq_size//1024, 32, 2)
        self.rnn2 = nn.LSTM(32, 32, 2)
        self.lin2 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = self.lin1(x)
        x = nn.ReLU()(x)
        x = nn.Dropout(0.25)(x)
        x, hn = self.rnn(x)
        x = nn.Dropout(0.25)(x)
        x, _ = self.rnn2(x, hn)
        x = nn.Dropout(0.25)(x)
        x = self.lin2(x)
        return nn.Sigmoid()(x)

In [54]:
device = torch.device('cuda')

In [55]:
lin_net = LinearNetwork(X_train.shape[1]).to(device)
loss_fn = nn.BCELoss()
lin_opt = torch.optim.Adam(lin_net.parameters())

In [56]:
rec_net = RecNetwork(X_train.shape[1]).to(device)
loss_fn = nn.BCELoss()
rec_opt = torch.optim.Adam(rec_net.parameters())

In [57]:
batch_size = 48
epochs = 1

In [58]:
def test(model):
    total_loss = 0
    accuracy = 0
    for i in tqdm(range(0, X_test.shape[0] - batch_size, batch_size)):
        X = torch.tensor(X_test[i:i+batch_size].toarray()).float().to(device)
        y = torch.tensor(y_test[i:i+batch_size].to_numpy()).reshape(batch_size, 1).float().to(device)

        with torch.no_grad():
            out = model(X)
            loss = loss_fn(out, y)
            total_loss += loss
        results = torch.round(out)
        for i in range(batch_size):
            if results[i] == y[i]:
                accuracy += 1
        accuracy /= batch_size

    return accuracy

In [64]:
def train(network, optim, epochs=1):
    for epoch in range(epochs):
        accuracy = 0
        for i in tqdm(range(0, X_train.shape[0] - batch_size, batch_size)):
            X = torch.tensor(X_train[i:i+batch_size].toarray()).float().to(device)
            y = torch.tensor(y_train[i:i+batch_size].to_numpy()).reshape(batch_size, 1).float().to(device)
            
            out = network(X)
            loss = loss_fn(out, y)

            network.zero_grad()
            loss.backward()
            optim.step()

            results = torch.round(out)
            for i in range(batch_size):
                if results[i] == y[i]:
                    accuracy += 1
            accuracy /= batch_size
        print("Точность для тренировочного набора: {:.2f}%".format(accuracy / batch_size))
        print("Точность для тестового набора: {:.2f}%".format(test(network)*100))

In [65]:
train(lin_net, lin_opt, epochs=epochs)

100%|██████████| 697/697 [00:25<00:00, 27.56it/s]


Точность для тестового набора: 64979.17%


100%|██████████| 343/343 [00:12<00:00, 27.36it/s]

Точность для тестового набора: 85.28%





In [61]:
train(rec_net, rec_opt, epochs=epochs)

  0%|          | 0/697 [00:00<?, ?it/s]


NameError: name 'results' is not defined

In [73]:
lin_net.eval(), rec_net.eval()

(LinearNetwork(
   (model): Sequential(
     (0): Linear(in_features=101895, out_features=99, bias=True)
     (1): ReLU()
     (2): Linear(in_features=99, out_features=32, bias=True)
     (3): ReLU()
     (4): Linear(in_features=32, out_features=1, bias=True)
     (5): Sigmoid()
   )
 ),
 RecNetwork(
   (lin1): Linear(in_features=101895, out_features=99, bias=True)
   (rnn): LSTM(99, 20, num_layers=2)
   (lin2): Linear(in_features=20, out_features=1, bias=True)
   (relu): ReLU()
 ))

# Пример использования

### Пример предсказания негативного отзыва

In [74]:
review = "I really hate this movie! The plot is so boring! Don't recommend it to anyone"
text = [' '.join(wordpunct_tokenize(review.lower()))]
text = vec.transform(text)

text_to_model = torch.tensor(text.toarray().reshape(1, -1)).float()

print("Отзыв: ", review)
log_pred = log_reg.predict_proba(text.reshape(1, -1)[0])
print(f"Предсказания логистической регрессии: {'Хороший' if np.argmax(log_pred) == 1 else 'Плохой'}")
print("С вероятностью: {:.2f}%".format(max(log_reg.predict_proba(text.reshape(1, -1))[0])*100))
print()
lin_model_pred = lin_net(text_to_model).item()
print(f"Предсказание линейной нейронной сети: {'Хороший' if lin_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(lin_model_pred*100 if lin_model_pred > 0.5 else (1 - lin_model_pred)*100))
print()
rec_model_pred = rec_net(text_to_model).item()
print(f"Предсказание рекуррентной нейронной сети: {'Хороший' if rec_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(rec_model_pred*100 if rec_model_pred > 0.5 else (1 - rec_model_pred)*100))


Отзыв:  I really hate this movie! The plot is so boring! Don't recommend it to anyone
Предсказания логистической регрессии: Плохой
С вероятностью: 94.28%

Предсказание линейной нейронной сети: Плохой
С вероятность 99.92%

Предсказание рекуррентной нейронной сети: Плохой
С вероятность 98.90%


### Пример предсказания позитивного отзыва

In [75]:
review = "It was the best movie I've ever seen! I mean the plot is so awesome and intresting. I'm defenetly going to rewatch it!"
text = [' '.join(wordpunct_tokenize(review.lower()))]
text = vec.transform(text)

text_to_model = torch.tensor(text.toarray().reshape(1, -1)).float()

print("Отзыв:", review)
log_pred = log_reg.predict_proba(text.reshape(1, -1)[0])
print(f"Предсказания логистической регрессии: {'Хороший' if np.argmax(log_pred) == 1 else 'Плохой'}")
print("С вероятностью: {:.2f}%".format(max(log_reg.predict_proba(text.reshape(1, -1))[0])*100))
print()
lin_model_pred = lin_net(text_to_model).item()
print(f"Предсказание линейной нейронной сети: {'Хороший' if lin_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(lin_model_pred*100 if lin_model_pred > 0.5 else (1 - lin_model_pred)*100))
print()
rec_model_pred = rec_net(text_to_model).item()
print(f"Предсказание рекуррентной нейронной сети: {'Хороший' if rec_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(rec_model_pred*100 if rec_model_pred > 0.5 else (1 - rec_model_pred)*100))


Отзыв: It was the best movie I've ever seen! I mean the plot is so awesome and intresting. I'm defenetly going to rewatch it!
Предсказания логистической регрессии: Хороший
С вероятностью: 80.51%

Предсказание линейной нейронной сети: Хороший
С вероятность 99.63%

Предсказание рекуррентной нейронной сети: Хороший
С вероятность 99.07%


### Пример предсказания нейтрального отзыва

In [76]:
review = "It wasn't that bad but I mean the plot is not that intesting either. Would I recommend it? I'm not sure. You can watch it, maybe you will like it more."
text = [' '.join(wordpunct_tokenize(review.lower()))]
text = vec.transform(text)

text_to_model = torch.tensor(text.toarray().reshape(1, -1)).float()

print("Отзыв: ", review)
log_pred = log_reg.predict_proba(text.reshape(1, -1)[0])
print(f"Предсказания логистической регрессии: {'Хороший' if np.argmax(log_pred) == 1 else 'Плохой'}")
print("С вероятностью: {:.2f}%".format(max(log_reg.predict_proba(text.reshape(1, -1))[0])*100))
print()
lin_model_pred = lin_net(text_to_model).item()
print(f"Предсказание линейной нейронной сети: {'Хороший' if lin_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(lin_model_pred*100 if lin_model_pred > 0.5 else (1 - lin_model_pred)*100))
print()
rec_model_pred = rec_net(text_to_model).item()
print(f"Предсказание рекуррентной нейронной сети: {'Хороший' if rec_model_pred > 0.5 else 'Плохой'}")
print("С вероятность {:.2f}%".format(rec_model_pred*100 if rec_model_pred > 0.5 else (1 - rec_model_pred)*100))


Отзыв:  It wasn't that bad but I mean the plot is not that intesting either. Would I recommend it? I'm not sure. You can watch it, maybe you will like it more.
Предсказания логистической регрессии: Плохой
С вероятностью: 81.81%

Предсказание линейной нейронной сети: Плохой
С вероятность 99.13%

Предсказание рекуррентной нейронной сети: Плохой
С вероятность 98.90%


#### Вывод: логистическая регрессия показывает намного лучше результат, чем случайный лес, поэтому подходит намного лучше для предсказания.
##### Логичестическая регрессия итак показывает довольно неплохой результат, но этот результат можно улучшить ещё больше, если использовать более продвинутые модели глубого обучения для анализа текста, использующие Attention и Self-Attention