# Movie Review Sentiment Classification using Naive Bayes and the IMDB Dataset

## The IMDB Movie Review Dataset

### Downloading the Dataset

In [1]:
import requests
import zipfile
import io
import os
import pandas as pd

# URL of the dataset (IMDB movie reviews)
url = "https://academy.hackthebox.com/storage/modules/292/skills_assessment_data.zip"

# Download the dataset
response = requests.get(url)
if response.status_code == 200:
    print("Download successful")
else:
    print("Failed to download the dataset")

# Extract the dataset to a folder called 'imdb_reviews'
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall("imdb_reviews")
    print("Extraction successful")

# List the extracted files
extracted_files = os.listdir("imdb_reviews")
print("Extracted files:", extracted_files)

Download successful
Extraction successful
Extracted files: ['test.json', 'train.json']


### Loading the Dataset

In [2]:
# Adjust the file name and structure as needed
train_df = pd.read_json("imdb_reviews/train.json")  # Use `lines=True` if the JSON file is line-delimited

# Display basic information about the dataset
print("-------------------- HEAD (Train) --------------------")
print(train_df.head())
print("-------------------- DESCRIBE (Train) --------------------")
print(train_df.describe())
print("-------------------- INFO (Train) --------------------")
print(train_df.info())

# Check for missing values
print("Missing values in training set:\n", train_df.isnull().sum())

# Check for duplicates
print("Duplicate entries in training set:", train_df.duplicated().sum())

# Remove duplicates if any
train_df = train_df.drop_duplicates()

-------------------- HEAD (Train) --------------------
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1
-------------------- DESCRIBE (Train) --------------------
             label
count  25000.00000
mean       0.50000
std        0.50001
min        0.00000
25%        0.00000
50%        0.50000
75%        1.00000
max        1.00000
-------------------- INFO (Train) --------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+

## Preprocessing the IMDB Dataset

In [3]:
import nltk

# Download the necessary NLTK data files
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

print("=== BEFORE ANY PREPROCESSING (Train) ===") 
print(train_df.head(5))

=== BEFORE ANY PREPROCESSING (Train) ===
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lowercasing the Text

In [4]:
# Convert all review text to lowercase
train_df["text"] = train_df["text"].str.lower()
print("\n=== AFTER LOWERCASING (Train) ===")
print(train_df["text"].head(5))


=== AFTER LOWERCASING (Train) ===
0    bromwell high is a cartoon comedy. it ran at t...
1    homelessness (or houselessness as george carli...
2    brilliant over-acting by lesley ann warren. be...
3    this is easily the most underrated film inn th...
4    this is not the typical mel brooks film. it wa...
Name: text, dtype: object


### Removing Punctuation and Numbers


In [5]:
import re

# Remove non-essential punctuation and numbers, keep symbols like $ and !
train_df["text"] = train_df["text"].apply(lambda x: re.sub(r"[^a-z\s$!]", "", x))
print("\n=== AFTER REMOVING PUNCTUATION & NUMBERS (Train) ===")
print(train_df["text"].head(5))


=== AFTER REMOVING PUNCTUATION & NUMBERS (Train) ===
0    bromwell high is a cartoon comedy it ran at th...
1    homelessness or houselessness as george carlin...
2    brilliant overacting by lesley ann warren best...
3    this is easily the most underrated film inn th...
4    this is not the typical mel brooks film it was...
Name: text, dtype: object


### Tokenizing the Text

In [6]:
from nltk.tokenize import word_tokenize

# Split each review into individual tokens
train_df["text"] = train_df["text"].apply(word_tokenize)
print("\n=== AFTER TOKENIZATION (Train) ===")
print(train_df["text"].head(5))


=== AFTER TOKENIZATION (Train) ===
0    [bromwell, high, is, a, cartoon, comedy, it, r...
1    [homelessness, or, houselessness, as, george, ...
2    [brilliant, overacting, by, lesley, ann, warre...
3    [this, is, easily, the, most, underrated, film...
4    [this, is, not, the, typical, mel, brooks, fil...
Name: text, dtype: object


### Removing Stop Words

In [7]:
from nltk.corpus import stopwords

# Define a set of English stop words and remove them
stop_words = set(stopwords.words("english"))
train_df["text"] = train_df["text"].apply(lambda x: [word for word in x if word not in stop_words])
print("\n=== AFTER REMOVING STOP WORDS (Train) ===")
print(train_df["text"].head(5))


=== AFTER REMOVING STOP WORDS (Train) ===
0    [bromwell, high, cartoon, comedy, ran, time, p...
1    [homelessness, houselessness, george, carlin, ...
2    [brilliant, overacting, lesley, ann, warren, b...
3    [easily, underrated, film, inn, brooks, cannon...
4    [typical, mel, brooks, film, much, less, slaps...
Name: text, dtype: object


### Stemming

In [8]:
from nltk.stem import PorterStemmer

# Stem each token
stemmer = PorterStemmer()
train_df["text"] = train_df["text"].apply(lambda x: [stemmer.stem(word) for word in x])
print("\n=== AFTER STEMMING (Train) ===")
print(train_df["text"].head(5))


=== AFTER STEMMING (Train) ===
0    [bromwel, high, cartoon, comedi, ran, time, pr...
1    [homeless, houseless, georg, carlin, state, is...
2    [brilliant, overact, lesley, ann, warren, best...
3    [easili, underr, film, inn, brook, cannon, sur...
4    [typic, mel, brook, film, much, less, slapstic...
Name: text, dtype: object



### Joining Tokens Back into a Single String

In [9]:
# Rejoin tokens into a single string for feature extraction
train_df["text"] = train_df["text"].apply(lambda x: " ".join(x))
print("\n=== AFTER JOINING TOKENS BACK INTO STRINGS (Train) ===")
print(train_df["text"].head(5))


=== AFTER JOINING TOKENS BACK INTO STRINGS (Train) ===
0    bromwel high cartoon comedi ran time program s...
1    homeless houseless georg carlin state issu yea...
2    brilliant overact lesley ann warren best drama...
3    easili underr film inn brook cannon sure flaw ...
4    typic mel brook film much less slapstick movi ...
Name: text, dtype: object


## Feature Extraction

### Using CountVectorizer for the Bag-of-Words Approach

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer (bigrams, min_df=1, max_df=0.9 to focus on relevant terms)
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the review column
X_train = vectorizer.fit_transform(train_df["text"])

# Labels (target variable)
# According to the prompt: positive=0, negative=1
# If your 'label' column is already 0 or 1, you can just use it directly.
# If not, map them accordingly.
y_train = train_df["label"].apply(lambda x: 1 if x == 1 else 0)

## Training and Evaluation

### Training

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV


# Build a pipeline that includes vectorization and classification
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 3))),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],  # Regularization strength
    "classifier__solver": ["lbfgs", "liblinear"],  # Different solvers
    "vectorizer__ngram_range": [(1, 2), (1, 3)],  # Bi-grams or Tri-grams
    "vectorizer__max_df": [0.75, 0.9, 1.0],  # Remove too frequent terms
    "vectorizer__min_df": [1, 2, 3]  # Remove rare terms
}

# Perform RandomizedSearchCV
grid_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    cv=5,  # 5-fold cross-validation
    scoring="f1",
    verbose=1,
    n_jobs=-1,  # Use all processors
    random_state=42
)

# Fit the grid search on the training data
grid_search.fit(train_df["text"], y_train)

# Extract the best model
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best model parameters: {'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 1, 'vectorizer__max_df': 0.9, 'classifier__solver': 'liblinear', 'classifier__C': 100}


### Loading & Preprocessing the Test Set

In [15]:
# Similar approach for test data
test_df = pd.read_json("imdb_reviews/train.json")

print("\n-------------------- HEAD (Test) --------------------")
print(test_df.head())

# Basic cleanup and checks
test_df = test_df.drop_duplicates()
test_df["text"] = test_df["text"].str.lower()
test_df["text"] = test_df["text"].apply(lambda x: re.sub(r"[^a-z\s$!]", "", x))
test_df["text"] = test_df["text"].apply(word_tokenize)
test_df["text"] = test_df["text"].apply(lambda x: [word for word in x if word not in stop_words])
test_df["text"] = test_df["text"].apply(lambda x: [stemmer.stem(word) for word in x])
test_df["text"] = test_df["text"].apply(lambda x: " ".join(x))

X_test = best_model.named_steps["vectorizer"].transform(test_df["text"])
y_test = test_df["label"].apply(lambda x: 1 if x == 1 else 0)


-------------------- HEAD (Test) --------------------
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


### Making Predictions on the Test Set

In [13]:
# Ensure X_test contains the raw text (not preprocessed)
# The pipeline will handle preprocessing and vectorization internally
test_predictions = best_model.predict(test_df["text"])  # Pass raw test reviews to the pipeline
test_prediction_probabilities = best_model.predict_proba(test_df["text"])  # Same here for probabilities

# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score

acc = accuracy_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions)

print(f"\nAccuracy on test set: {acc:.4f}")
print(f"F1-score on test set: {f1:.4f}")


Accuracy on test set: 1.0000
F1-score on test set: 1.0000


### Using joblib for Saving Models

In [16]:
import joblib

# Save the trained model
model_filename = 'imdb_sentiment_nb_model.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

Model saved to imdb_sentiment_nb_model.joblib
