In [24]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# pandas display: never truncate cell text, render floats with six decimals.
pd.options.display.max_colwidth = None
pd.options.display.float_format = '{:.6f}'.format

import matplotlib.pyplot as plt


In [2]:
# Read the reviews CSV (path is relative to the working directory) into a DataFrame.
csv_path = "IMDB Dataset.csv"
df = pd.read_csv(csv_path)

In [3]:
# Binary encoding for labels: 'positive' -> 1, 'negative' -> 0.
# Guarded so the cell is idempotent: mapping an already-encoded (numeric)
# column through the dict a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Fail early instead of silently training on NaN targets.
if df['sentiment'].isna().any():
    raise ValueError(
        "df['sentiment'] has missing or unmapped labels "
        "(expected only 'positive'/'negative'); reload the CSV and re-run."
    )

# Split data into training and testing (20% held out, fixed random_state so
# the split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)


In [4]:
## Tokenize and pad sequences

# The vocabulary is fitted on the training reviews only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to a fixed length of 100.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=100) for seqs in (X_train_seq, X_test_seq)
)

## Convert labels to tensors
y_train_tensor, y_test_tensor = (
    torch.tensor(split.values, dtype=torch.float32) for split in (y_train, y_test)
)

In [16]:
# Spot-check one padded training row (index 39996); in the recorded output the
# zero padding sits at the front of the sequence.
X_train_padded[39996]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         10,  252,   11,   17,    5,   26,  178,  726,    2,  997,  430,
         49,  102,   49,  151,    2,  192,  747,    5,  394,  125,  610,
        475,    1,  110, 1267,  236,   25,   75,    3,  228,   73,   18,
        442,    3,  535,  699,   43,   22,  423,   11,   19,   10,   59,
        374,    1, 1946, 4125,    1,  319,    4, 1659,    2,   54,   95,
         41])

In [21]:
## Defining a dataset class

class IMDBDataset(Dataset):
    """Map-style dataset pairing padded review token ids with their labels.

    Args:
        reviews: Array-like of integer token ids, one row per review
            (tensor, NumPy array or nested list). Stored as a ``torch.long``
            tensor.
        labels: Array-like with one label per review (tensor, NumPy array or
            list). Stored as a tensor; its dtype is left unchanged.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self,reviews,labels):
        # as_tensor avoids the copy-construct warning when a tensor is passed
        # and reuses the input's memory when no dtype conversion is needed.
        self.reviews = torch.as_tensor(reviews,dtype=torch.long)
        # Convert labels too, so indexing always yields a tensor element
        # regardless of the container the caller supplied.
        self.labels = torch.as_tensor(labels)

        # __len__ is driven by labels while __getitem__ indexes both, so a
        # mismatch would otherwise surface late or silently drop reviews.
        if len(self.reviews) != len(self.labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(self.reviews)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of (review, label) pairs."""
        return len(self.labels)

    def __getitem__(self,idx):
        """Return the ``(review_ids, label)`` pair at position ``idx``."""
        return self.reviews[idx],self.labels[idx]
        

In [23]:
## Creating datasets and dataloaders

# Wrap the padded sequences and label tensors as datasets.
train_datasets = IMDBDataset(X_train_padded,y_train_tensor)
test_datasets = IMDBDataset(X_test_padded,y_test_tensor)

# Shuffle only the training batches. Evaluation does not benefit from
# shuffling, and a fixed order keeps per-batch results comparable across runs.
train_loader = DataLoader(train_datasets,batch_size=32,shuffle=True)
test_loader = DataLoader(test_datasets,batch_size=32, shuffle=False)

In [25]:
## RNN implementation

class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector (RNN input size).
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token-id sequences to sigmoid outputs.

        The RNN output at the last time step is passed through the linear
        layer and the sigmoid.
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        # batch_first=True, so axis 1 is time; keep only the final step.
        final_step = step_outputs[:, -1]
        return self.sigmoid(self.fc(final_step))


In [26]:
## LSTM implementation

class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token-id sequences to sigmoid outputs.

        The final hidden state of the last LSTM layer is passed through the
        linear layer and the sigmoid.
        """
        embedded = self.embedding(x)
        # Per-step outputs and the cell state are not needed here.
        _, (final_hidden, _) = self.lstm(embedded)
        last_layer_hidden = final_hidden[-1]
        return self.sigmoid(self.fc(last_layer_hidden))

In [27]:
def train_model(model, train_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(reviews)``.
        train_loader: Iterable yielding ``(reviews, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order. The same value
        is printed after every epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        for reviews, labels in train_loader:
            optimizer.zero_grad()
            # Flatten to 1-D explicitly. A bare .squeeze() would also drop the
            # batch axis when a batch holds a single sample, leaving a 0-d
            # tensor that no longer matches labels of shape (1,).
            outputs = model(reviews).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")
        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {mean_loss:.4f}")
    return epoch_losses

def evaluate_model(model, test_loader):
    """Print the accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    An output of at least 0.5 counts as class 1; predictions are compared
    with the labels of each batch.

    Args:
        model: Module called as ``model(reviews)``.
        test_loader: Iterable yielding ``(reviews, labels)`` batches.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for batch_reviews, batch_labels in test_loader:
            probabilities = model(batch_reviews).squeeze()
            predicted = (probabilities >= 0.5).float()
            matches = predicted == batch_labels
            hits += matches.sum().item()
            seen += batch_labels.size(0)
    accuracy = hits / seen
    print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [28]:
# RNN Model Training and evaluation

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable when this cell is re-run.
torch.manual_seed(42)

vocab_size = 5000  # same value as the Tokenizer's num_words
embed_dim = 128
hidden_dim = 128
output_dim = 1  # single sigmoid output for the binary label

rnn_model = RNNClassifier(vocab_size=vocab_size,embed_dim=embed_dim,hidden_dim=hidden_dim,output_dim=output_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn_model.parameters(),lr = 0.001)

print("Training RNN...")
train_model(rnn_model, train_loader, criterion, optimizer, epochs=5)
print("Evaluating RNN...")
evaluate_model(rnn_model, test_loader)

Training RNN...
Epoch [1/5], Loss: 0.6308
Epoch [2/5], Loss: 0.5211
Epoch [3/5], Loss: 0.5175
Epoch [4/5], Loss: 0.5248
Epoch [5/5], Loss: 0.4885
Evaluating RNN...
Test Accuracy: 78.19%


In [29]:
# LSTM Model
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable when this cell is re-run.
torch.manual_seed(42)

# Same hyperparameters as the RNN run; a fresh loss and optimizer are bound to
# the LSTM's parameters.
lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)

print("Training LSTM...")
train_model(lstm_model, train_loader, criterion, optimizer, epochs=5)
print("Evaluating LSTM...")
evaluate_model(lstm_model, test_loader)


Training LSTM...
Epoch [1/5], Loss: 0.5241
Epoch [2/5], Loss: 0.3580
Epoch [3/5], Loss: 0.2843
Epoch [4/5], Loss: 0.2375
Epoch [5/5], Loss: 0.1890
Evaluating LSTM...
Test Accuracy: 86.73%


In [30]:
## Validation and testing



# Hand-written reviews used to probe the trained models. The group comments
# below are a reviewer's reading of the sentences, not labels used by any code.
new_review_raw = [
# Praise wording with a negative meaning (these read as sarcastic)
"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",
"Best movie ever! I loved wasting 3 hours of my life on this masterpiece.",
"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",
"Definitely recommend this movie... if you want to bore yourself to death.",
# Descriptive statements with no obvious opinion
"The movie was about two friends who embark on a journey. It has a runtime of two hours.",
"It is a typical superhero movie with action scenes and some emotional moments.",
"The cinematography was colorful, and the soundtrack was loud.",
"The second half of the movie was longer than the first.",
# Mixed or lukewarm opinions
"It was okay, I guess, but I wouldn’t watch it again.",
"Not bad, but not great either.",
"I laughed, I cried, but I still don’t know if I liked it or not.",
"The second half was much better than the first, though the ending was questionable.",
# Short variations, several built around the phrase "second half"
"It was very good, super, fantastic.",
"It was good until the second half ",
"It was second half ",
"Second half was good ",
"Movie is amazing, especially in the Second half",
"Terrible movie, Second half was hilarious",
"It was okay, Second half was hilarious",
"Best movie if you are looking for a headache",
"Lots of fun"

]



In [32]:
# Encode the hand-written reviews with the fitted tokenizer, pad them to
# length 100 (same maxlen as the training data) and wrap them as a long tensor.
new_review_seq = tokenizer.texts_to_sequences(new_review_raw)
new_review_padded = pad_sequences(new_review_seq, maxlen=100)
new_review_tensor = torch.tensor(new_review_padded, dtype=torch.long)

# Inference only: eval mode, no gradient tracking.
rnn_model.eval()
with torch.no_grad():
    rnn_output = rnn_model(new_review_tensor).squeeze().numpy()  # Output probabilities

# One row per review: both class probabilities and the thresholded label.
validationResults = pd.DataFrame(
    {
        'Test cases': new_review_raw,
        'RNN Negative Probabilities': 1 - rnn_output,
        'RNN Positive Probabilities': rnn_output,
        'RNN Prediction': np.where(rnn_output > 0.5, "Positive", "Negative"),
    }
)
validationResults

Unnamed: 0,Test cases,RNN Negative Probabilities,RNN Positive Probabilities,RNN Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.50298,0.49702,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.17907,0.82093,Positive
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.875031,0.124969,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.732772,0.267228,Negative
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.190263,0.809737,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.045783,0.954217,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.092349,0.907651,Positive
7,The second half of the movie was longer than the first.,0.190367,0.809633,Positive
8,"It was okay, I guess, but I wouldn’t watch it again.",0.554983,0.445017,Negative
9,"Not bad, but not great either.",0.585304,0.414696,Negative


In [33]:
# Encode the hand-written reviews with the fitted tokenizer, pad them to
# length 100 (same maxlen as the training data) and wrap them as a long tensor.
new_review_seq = tokenizer.texts_to_sequences(new_review_raw)
new_review_padded = pad_sequences(new_review_seq, maxlen=100)
new_review_tensor = torch.tensor(new_review_padded, dtype=torch.long)

# Inference only: eval mode, no gradient tracking.
lstm_model.eval()
with torch.no_grad():
    lstm_output = lstm_model(new_review_tensor).squeeze().numpy()  # Output probabilities

# One row per review: both class probabilities and the thresholded label.
validationResults = pd.DataFrame(
    {
        'Test cases': new_review_raw,
        'LSTM Negative Probabilities': 1 - lstm_output,
        'LSTM Positive Probabilities': lstm_output,
        'LSTM Prediction': np.where(lstm_output > 0.5, "Positive", "Negative"),
    }
)

validationResults

Unnamed: 0,Test cases,LSTM Negative Probabilities,LSTM Positive Probabilities,LSTM Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.887635,0.112365,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.738373,0.261627,Negative
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.992719,0.007281,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.947287,0.052713,Negative
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.303149,0.696851,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.073287,0.926713,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.401341,0.598659,Positive
7,The second half of the movie was longer than the first.,0.254328,0.745672,Positive
8,"It was okay, I guess, but I wouldn’t watch it again.",0.552467,0.447533,Negative
9,"Not bad, but not great either.",0.789199,0.210801,Negative
