Import Libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

Load IMDB Dataset

In [12]:
df = pd.read_csv("IMDB-Dataset.csv")  # Make sure this file is in your working directory
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Preprocessing 

In [13]:
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

Apply preprocessing

In [14]:
df['clean_review'] = df['review'].apply(clean_text)

Encode Sentiment (positive -> 1, negative -> 0)

In [15]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

Split dataset

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_review'], df['label'], test_size=0.2, random_state=42)

TF-IDF Vectorization

In [17]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


##Train and Evaluate Models

Logistic Regression

In [18]:
lr_model = LogisticRegression()
lr_model.fit(X_train_vec, y_train)
lr_preds = lr_model.predict(X_test_vec)

In [19]:
print("Logistic Regression Results:\n")
print(classification_report(y_test, lr_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_preds))
print("Accuracy:", accuracy_score(y_test, lr_preds))

Logistic Regression Results:

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
 [[4399  562]
 [ 458 4581]]
Accuracy: 0.898


Save Model and Vectorizer

In [20]:
joblib.dump(lr_model, "logistic_regression_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("\n✅ Model and Vectorizer saved as .pkl files.")


✅ Model and Vectorizer saved as .pkl files.


Naive Bayes

In [21]:
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
nb_preds = nb_model.predict(X_test_vec)


In [22]:
print("\nNaive Bayes Results:\n")
print(classification_report(y_test, nb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, nb_preds))
print("Accuracy:", accuracy_score(y_test, nb_preds))


Naive Bayes Results:

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.85      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
 [[4198  763]
 [ 643 4396]]
Accuracy: 0.8594


Predict on Custom Review

In [23]:
def predict_sentiment(review):
    cleaned = clean_text(review)
    vector = tfidf.transform([cleaned])
    prediction = lr_model.predict(vector)[0]
    return "Positive" if prediction == 1 else "Negative"

Example

In [24]:
sample_review = "This movie was absolutely wonderful and touching!"
print("\nSample Prediction:", predict_sentiment(sample_review))


Sample Prediction: Positive
