The Dataset used for this is taken from: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import word_tokenize
import nltk

Data Cleaning

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
df = pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
review = df['review'].values
sentiment = df['sentiment'].values

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return word_tokenize(text)

tokenized_reviews = [preprocess_text(rev) for rev in review]

In [6]:
# Build vocabulary
all_words = [word for review in tokenized_reviews for word in review]
word_to_index = {word: i+1 for i, word in enumerate(set(all_words))}
word_to_index[''] = 0

# Convert text to sequences
def text_to_sequence(text, word_to_index, max_len):
    return [word_to_index.get(word, 0) for word in text][:max_len] + [0] * max(0, max_len - len(text))

In [10]:
max_len = 100
x = [text_to_sequence(review, word_to_index, max_len) for review in tokenized_reviews]

In [11]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(sentiment)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [13]:
def create_data_loader(x, y, batch_size):
    tensor_x = torch.tensor(x, dtype=torch.long)
    tensor_y = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(tensor_x, tensor_y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [14]:
train_loader = create_data_loader(x_train, y_train, batch_size=128)
test_loader = create_data_loader(x_test, y_test, batch_size=128)

In [28]:
def train(model, train_loader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

In [37]:
def evaluate(model, test_loader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs).squeeze()
            preds = torch.sigmoid(outputs).round().long()
            y_true.extend(targets.tolist())
            y_pred.extend(preds.tolist())
    return classification_report(y_true, y_pred, target_names=label_encoder.classes_)

In [38]:
embedding_dim = 50
hidden_dim = 64

RNN Model using Pytorch

In [20]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Take the output of the last time step
        return out

In [39]:
model1 = RNNModel(len(word_to_index), embedding_dim, hidden_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

In [41]:
train(model1, train_loader, epochs=5)
print(f"Accuracy: {evaluate(model1, test_loader)}")

Epoch 1, Loss: 0.7059938273490808
Epoch 2, Loss: 0.7059434105793889
Epoch 3, Loss: 0.7060132257092875
Epoch 4, Loss: 0.7060090621439413
Epoch 5, Loss: 0.7059678589574064
Accuracy:               precision    recall  f1-score   support

    negative       0.49      0.56      0.52      4961
    positive       0.50      0.43      0.46      5039

    accuracy                           0.49     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.49      0.49     10000



LSTM Model Using Pytorch

In [22]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim,1)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Take the output of the last time step
        return out

In [40]:
model2 = LSTMModel(len(word_to_index), embedding_dim, hidden_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

In [42]:
train(model2, train_loader, epochs=5)
print(f"Accuracy: {evaluate(model2, test_loader)}")

Epoch 1, Loss: 0.693507554812934
Epoch 2, Loss: 0.6915980870731342
Epoch 3, Loss: 0.6917920899086486
Epoch 4, Loss: 0.6928271001901108
Epoch 5, Loss: 0.6921248119860031
Accuracy:               precision    recall  f1-score   support

    negative       0.50      0.88      0.64      4961
    positive       0.55      0.14      0.23      5039

    accuracy                           0.51     10000
   macro avg       0.52      0.51      0.43     10000
weighted avg       0.52      0.51      0.43     10000

