In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manoj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})


In [5]:
def preprocess_text(text):
    # Lower case
    text = text.lower()
    # removing the punctuations and special characters
    text = re.sub(r'[^\w\s]','',text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)
    

In [6]:
## applying the transformation

data['cleaned_review'] = data['review'].apply(preprocess_text)

In [93]:
# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['sentiment'], test_size=0.2, random_state=42)

In [94]:
## Applying bag of words/ count vectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [95]:
## Initializing the countvectorizer

vectorizer = CountVectorizer(max_features=5000)

x_train_bow = vectorizer.fit_transform(X_train)
x_test_bow = vectorizer.transform(X_test)

In [96]:
vectorizer.get_feature_names_out()

array(['10', '100', '1010', ..., 'zombie', 'zombies', 'zone'],
      dtype=object)

In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report


In [98]:
LR_model = LogisticRegression(max_iter=1000)
LR_model.fit(x_train_bow,y_train)

y_pred = LR_model.predict(x_test_bow)

print(f"Accuracy :",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy : 0.876
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [99]:
new_review = ["The movie was not bad, but it could have been better."]
new_review = vectorizer.transform(new_review)

output = LR_model.predict_proba(new_review)

print(output)
prediction = "positive" if output[0][0] <= 0.5 else "negative"
print(f"Prediction: {prediction}")

[[0.72647093 0.27352907]]
Prediction: negative


In [110]:
new_review_raw = [
"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",
"Best movie ever! I loved wasting 3 hours of my life on this masterpiece.",
"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",
"Definitely recommend this movie... if you want to bore yourself to death.",
"The movie was about two friends who embark on a journey. It has a runtime of two hours.",
"It is a typical superhero movie with action scenes and some emotional moments.",
"The cinematography was colorful, and the soundtrack was loud.",
"The second half of the movie was longer than the first.",
"It was okay, I guess, but I wouldn’t watch it again.",
"Not bad, but not great either.",
"I laughed, I cried, but I still don’t know if I liked it or not.",
"The second half was much better than the first, though the ending was questionable.",
"It was very good, super, fantastic.",
"It was good until the second half ",
"It was second half ",
"Second half was good ",
"Movie is amazing, especially in the Second half",
"Terrible movie, Second half was hilarious",
"It was okay, Second half was hilarious",
"Best movie if you are looking for a headache",
"Lots of fun"

]

new_review = vectorizer.transform(new_review_raw)

output = LR_model.predict_proba(new_review)

# Create a DataFrame for validation results
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = output[:, 0]  # Corrected: All rows, 0th column
validationResults['Positive Probabilities'] = output[:, 1]  # Corrected: All rows, 1st column

# Add predictions based on the threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] > 0.5, "Positive", "Negative")


In [116]:
pd.set_option('display.max_colwidth', None)  # No truncation for column values


In [117]:
validationResults

Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.632273,0.367727,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.514737,0.485263,Negative
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.912701,0.087299,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.410663,0.589337,Positive
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.486831,0.513169,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.23964,0.76036,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.375903,0.624097,Positive
7,The second half of the movie was longer than the first.,0.561964,0.438036,Negative
8,"It was okay, I guess, but I wouldn’t watch it again.",0.708351,0.291649,Negative
9,"Not bad, but not great either.",0.564788,0.435212,Negative


In [13]:
## Implementing TF-IDF technique

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [119]:
tf_idf = TfidfVectorizer(max_features=5000)

x_train_tfidf = tf_idf.fit_transform(X_train)
x_test_tfidf = tf_idf.transform(X_test)

In [120]:
LR_model2 = LogisticRegression()
LR_model2.fit(x_train_tfidf,y_train)

In [121]:
y_pred2 = LR_model2.predict(x_test_tfidf)

print(f"Accuracy :",accuracy_score(y_test,y_pred2))
print(classification_report(y_test,y_pred2))

Accuracy : 0.8891
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [122]:
new_review_raw = [
"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",
"Best movie ever! I loved wasting 3 hours of my life on this masterpiece.",
"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",
"Definitely recommend this movie... if you want to bore yourself to death.",
"The movie was about two friends who embark on a journey. It has a runtime of two hours.",
"It is a typical superhero movie with action scenes and some emotional moments.",
"The cinematography was colorful, and the soundtrack was loud.",
"The second half of the movie was longer than the first.",
"It was okay, I guess, but I wouldn’t watch it again.",
"Not bad, but not great either.",
"I laughed, I cried, but I still don’t know if I liked it or not.",
"The second half was much better than the first, though the ending was questionable.",
"It was very good, super, fantastic.",
"It was good until the second half ",
"It was second half ",
"Second half was good ",
"Movie is amazing, especially in the Second half",
"Terrible movie, Second half was hilarious",
"It was okay, Second half was hilarious",
"Best movie if you are looking for a headache",
"Lots of fun"

]

new_review = tf_idf.transform(new_review_raw)

output = LR_model2.predict_proba(new_review)

# Create a DataFrame for validation results
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = output[:, 0]  # Corrected: All rows, 0th column
validationResults['Positive Probabilities'] = output[:, 1]  # Corrected: All rows, 1st column

# Add predictions based on the threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] > 0.5, "Positive", "Negative")


In [123]:
validationResults

Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.666979,0.333021,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.063021,0.936979,Positive
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.894854,0.105146,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.194494,0.805506,Positive
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.400014,0.599986,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.215763,0.784237,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.299297,0.700703,Positive
7,The second half of the movie was longer than the first.,0.608536,0.391464,Negative
8,"It was okay, I guess, but I wouldn’t watch it again.",0.909385,0.090615,Negative
9,"Not bad, but not great either.",0.81963,0.18037,Negative


In [18]:
## Building a neural network model

In [124]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

In [125]:
## Creating Pytorch dataset and dataloader objects

In [126]:
class IMDBDataset(Dataset):
    def __init__(self,features,labels):
        self.features = torch.tensor(features.todense(),dtype = torch.float32)
        self.labels = torch.tensor(labels.values,dtype=torch.float32)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self,idx):
        return self.features[idx], self.labels[idx]


In [127]:
## Creating dataset

train_dataset = IMDBDataset(x_train_tfidf,y_train)
test_dataset = IMDBDataset(x_test_tfidf,y_test)

In [128]:
## Creatinh a dataloader

train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
test_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [129]:
## Defining a model architecture

class IMDB_Network(nn.Module):
    def __init__(self,input_size):
        super(IMDB_Network,self).__init__()
        self.fc1 = nn.Linear(input_size,256)
        # self.fc4 = nn.Linear(256,128)
        self.fc5 = nn.Linear(256,1)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        x = self.fc1(x)
        # x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)

        # x = self.fc2(x)
        # x = self.relu(x)
        # x = self.fc3(x)
        # x = self.relu(x)
        # x = self.fc4(x)

        # x = self.relu(x)
        # x = self.dropout(x)

        x = self.fc5(x)
        x = self.sigmoid(x)
        return x        

In [130]:
nnModel = IMDB_Network(x_train_tfidf.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(nnModel.parameters(),lr=0.01)

In [131]:
# Training function
epochs = 10
def train_model(nnModel, train_dataloader, criterion, optimizer, epochs):

    nnModel.train()  # Set model to training mode
    
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (features, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()  # Clear gradients
    
            # Forward pass
            outputs = nnModel(features).squeeze()  # Squeeze output to match label shape
            loss = criterion(outputs, labels)
    
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()
    
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(train_dataloader):.4f}")
    
# Train the model
train_model(nnModel, train_dataloader, criterion, optimizer, epochs=10)

Epoch [1/10], Loss: 0.3076
Epoch [2/10], Loss: 0.2145
Epoch [3/10], Loss: 0.1185
Epoch [4/10], Loss: 0.0350
Epoch [5/10], Loss: 0.0105
Epoch [6/10], Loss: 0.0067
Epoch [7/10], Loss: 0.0076
Epoch [8/10], Loss: 0.0098
Epoch [9/10], Loss: 0.0053
Epoch [10/10], Loss: 0.0031


In [132]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for features, labels in test_loader:
            outputs = model(features).squeeze()
            predictions = (outputs >= 0.5).float()  # Threshold at 0.5
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model
evaluate_model(nnModel, test_dataloader)


Test Accuracy: 0.8735


In [134]:
# Transform the raw reviews using the trained TF-IDF vectorizer
new_review_tfidf = tf_idf.transform(new_review_raw)

# Convert the sparse matrix to a dense tensor for PyTorch
new_review_tensor = torch.tensor(new_review_tfidf.todense(), dtype=torch.float32)
# Set the model to evaluation mode
nnModel.eval()

# Get predictions
with torch.no_grad():  # No need to compute gradients for inference
    outputs = nnModel(new_review_tensor).squeeze()  # Get the output probabilities

# Convert outputs to probabilities
predicted_probs = outputs.numpy() 
# Create the validation results DataFrame
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = 1 - predicted_probs  # Complement for the negative class
validationResults['Positive Probabilities'] = predicted_probs  # Output from sigmoid (positive class)

# Add predictions based on a threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] >= 0.5, "Positive", "Negative")


In [136]:
pd.set_option('display.float_format', '{:.6f}'.format)


In [137]:
validationResults

Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",1.0,0.0,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.999696,0.000304,Negative
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",1.0,0.0,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.797786,0.202214,Negative
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.365811,0.634189,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.035161,0.964839,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.034176,0.965824,Positive
7,The second half of the movie was longer than the first.,0.995704,0.004296,Negative
8,"It was okay, I guess, but I wouldn’t watch it again.",0.999717,0.000283,Negative
9,"Not bad, but not great either.",0.979542,0.020458,Negative


In [28]:
## N gram models

In [150]:
# Use TF-IDF with n-grams
vectorizer_ngrams = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)  # Unigrams and bigrams
X_train_tfidf_ngram = vectorizer_ngrams.fit_transform(X_train)
X_test_tfidf_ngram = vectorizer_ngrams.transform(X_test)

In [153]:
LR_model_bigrams = LogisticRegression(max_iter=1000)
LR_model_bigrams.fit(X_train_tfidf_ngram,y_train)

y_pred = LR_model_bigrams.predict(X_test_tfidf_ngram)

print(f"Accuracy :",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy : 0.8999
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4961
           1       0.89      0.92      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [154]:
new_review = vectorizer_ngrams.transform(new_review_raw)

output = LR_model_bigrams.predict_proba(new_review)

# Create a DataFrame for validation results
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = output[:, 0]  # Corrected: All rows, 0th column
validationResults['Positive Probabilities'] = output[:, 1]  # Corrected: All rows, 1st column

# Add predictions based on the threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] > 0.5, "Positive", "Negative")

validationResults

Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.718667,0.281333,Negative
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.043447,0.956553,Positive
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.879831,0.120169,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.145549,0.854451,Positive
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.534178,0.465822,Negative
5,It is a typical superhero movie with action scenes and some emotional moments.,0.251785,0.748215,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.296571,0.703429,Positive
7,The second half of the movie was longer than the first.,0.594796,0.405204,Negative
8,"It was okay, I guess, but I wouldn’t watch it again.",0.916377,0.083623,Negative
9,"Not bad, but not great either.",0.844127,0.155873,Negative


In [155]:
train_dataset = IMDBDataset(X_train_tfidf_ngram, y_train)
test_dataset = IMDBDataset(X_test_tfidf_ngram, y_test)

## Creatinh a dataloader

train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
test_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [156]:
# Initialize the neural network
input_dim = X_train_tfidf_ngram.shape[1]  # Adjust to include n-grams (5000 features)
model = IMDB_Network(input_dim)

# Train the model with n-gram features
train_model(model, train_dataloader, criterion, optimizer, epochs=10)

# Evaluate the model
evaluate_model(model, test_dataloader)


Epoch [1/10], Loss: 0.6935
Epoch [2/10], Loss: 0.6935
Epoch [3/10], Loss: 0.6935
Epoch [4/10], Loss: 0.6935
Epoch [5/10], Loss: 0.6935
Epoch [6/10], Loss: 0.6935
Epoch [7/10], Loss: 0.6935
Epoch [8/10], Loss: 0.6935
Epoch [9/10], Loss: 0.6935
Epoch [10/10], Loss: 0.6935
Test Accuracy: 0.5039


In [157]:
# Transform the raw reviews using the trained TF-IDF vectorizer
new_review_tfidf = vectorizer_ngrams.transform(new_review_raw)

# Convert the sparse matrix to a dense tensor for PyTorch
new_review_tensor = torch.tensor(new_review_tfidf.todense(), dtype=torch.float32)
# Set the model to evaluation mode
model.eval()

# Get predictions
with torch.no_grad():  # No need to compute gradients for inference
    outputs = model(new_review_tensor).squeeze()  # Get the output probabilities

# Convert outputs to probabilities
predicted_probs = outputs.numpy() 
# Create the validation results DataFrame
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = 1 - predicted_probs  # Complement for the negative class
validationResults['Positive Probabilities'] = predicted_probs  # Output from sigmoid (positive class)

# Add predictions based on a threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] >= 0.5, "Positive", "Negative")


In [158]:
validationResults

Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.487196,0.512804,Positive
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.48761,0.51239,Positive
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.487925,0.512075,Positive
3,Definitely recommend this movie... if you want to bore yourself to death.,0.487745,0.512255,Positive
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.487811,0.512189,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.48728,0.51272,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.487663,0.512337,Positive
7,The second half of the movie was longer than the first.,0.487328,0.512672,Positive
8,"It was okay, I guess, but I wouldn’t watch it again.",0.48763,0.51237,Positive
9,"Not bad, but not great either.",0.48875,0.51125,Positive
