In [38]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.metrics import accuracy_score

from joblib import dump, load

In [2]:
df = pd.read_csv('Data/imdbdataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Text Cleaning

In [5]:
df.shape

(50000, 2)

In [6]:
# import random
# import pandas as pd
# from faker import Faker

# fake = Faker()

# # Parameters
# N = 50000
# spam_ratio = 0.5
# n_spam = int(N * spam_ratio)
# n_ham = N - n_spam

# # Some templates for variety
# spam_templates = [
#     "Congratulations! You've won {prize}. Click here to claim: {link}",
#     "Limited offer: Get {discount}% off on {product}. Visit {link} now!",
#     "Dear user, your account has been selected for a reward. Confirm here: {link}",
#     "You have an unclaimed parcel waiting. Track it here: {link}",
#     "Earn ${amount} per day working from home! Apply now at {link}",
#     "Act fast! Your {service} subscription expires soon. Renew at {link}",
#     "Your invoice for ${amount} is ready. View it online: {link}",
#     "Claim your free {product} sample today! Only while supplies last: {link}"
# ]

# ham_templates = [
#     "Hey {name}, just checking if you're available for a quick call tomorrow.",
#     "Meeting reminder: {event} at {time} on {date}.",
#     "Here are the notes from our last discussion. Let me know your feedback.",
#     "Hi {name}, please find attached the report for this week.",
#     "Thanks for your help with {project}, really appreciate it.",
#     "Dinner at {restaurant} tonight? Let me know what time works for you.",
#     "Can you review the presentation before our {time} meeting?",
#     "Happy birthday {name}! Hope you have a wonderful day!"
# ]

# def generate_spam(n):
#     data = []
#     for i in range(n):
#         template = random.choice(spam_templates)
#         text = template.format(
#             prize=random.choice(["a brand new iPhone", "a $500 gift card", "a free vacation", "exclusive voucher"]),
#             link=fake.uri(),
#             discount=random.randint(20, 90),
#             product=random.choice(["headphones", "smartwatch", "laptop", "tablet"]),
#             service=random.choice(["Netflix", "Amazon Prime", "Spotify"]),
#             amount=random.randint(100, 5000)
#         )
#         data.append(text)
#     return data

# def generate_ham(n):
#     data = []
#     for i in range(n):
#         template = random.choice(ham_templates)
#         text = template.format(
#             name=fake.first_name(),
#             event=random.choice(["team meeting", "doctor appointment", "project review"]),
#             time=fake.time(pattern="%I:%M %p"),
#             date=fake.date_this_year().strftime("%b %d"),
#             project=random.choice(["the website redesign", "marketing campaign", "budget analysis"]),
#             restaurant=fake.company()
#         )
#         data.append(text)
#     return data

# # Generate data
# spam_texts = generate_spam(n_spam)
# ham_texts = generate_ham(n_ham)

# # Combine
# data = pd.DataFrame({
#     "id": range(1, N + 1),
#     "text": spam_texts + ham_texts,
#     "label": ["spam"] * n_spam + ["ham"] * n_ham
# })

# # Shuffle for realism
# data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# # Save
# data.to_csv("mail_dataset.csv", index=False)
# print("✅ Dataset generated: mail_dataset.csv with", len(data), "rows.")

### Mapping Binary class to 0 and 1

In [7]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


### Removing the HTML tags using BS4 from review

In [9]:
df['review'] = df['review'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
df['review'] = df['review'].str.strip()

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


### Converting the reviews in lower case

In [11]:
df['review'] = df['review'].str.lower()

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


### Removing the special characters

In [13]:
df['review'] = df['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x))

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


### Removing the stop words

In [15]:
stop_words = set(stopwords.words('english'))

In [16]:
df['review'] = df['review'].apply(lambda x: ' '.join(
    [word for word in word_tokenize(x.lower()) if word not in stop_words]
))

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


### Stemming

In [18]:
stemmer = PorterStemmer()

In [19]:

df['review'] = df['review'].apply(lambda x: ' '.join(
    [stemmer.stem(word) for word in word_tokenize(x)]
))

In [20]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


### Let's initialize CountVectorizer

In [21]:
cv = CountVectorizer(max_features=1000)

In [22]:
X = cv.fit_transform(df['review']).toarray()

In [23]:
X.shape

(50000, 1000)

In [24]:
y = df.iloc[:, -1].values

In [25]:
y.shape

(50000,)

### Train Test Split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((35000, 1000), (15000, 1000), (35000,), (15000,))

### Model Training

In [28]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [29]:
gnb.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [30]:
mnb.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [31]:
bnb.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [32]:
y_pred1 = gnb.predict(X_test)

In [33]:
y_pred2 = mnb.predict(X_test)

In [34]:
y_pred3 = bnb.predict(X_test)

In [35]:
y_test.shape == y_pred3.shape

True

In [36]:
print('Gaussian:', accuracy_score(y_test, y_pred1))
print('Multinomial:', accuracy_score(y_test, y_pred2))
print('Bernoulli:', accuracy_score(y_test, y_pred3))

Gaussian: 0.7816666666666666
Multinomial: 0.8268666666666666
Bernoulli: 0.8259333333333333


In [37]:
# Example new review
new_review = "Not worth watching. The story was confusing and the dialogues were cringy."

# Preprocess (match your training preprocessing)
def preprocess(text):
    text = text.lower()  # lowercase
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove special characters
    words = word_tokenize(text)
    words = [stemmer.stem(w) for w in words if w not in stop_words]  # remove stopwords + stem
    return ' '.join(words)

clean_review = preprocess(new_review)

# Convert to numeric features using your trained vectorizer
review_vec = cv.transform([clean_review])

# Predict using your trained model
prediction = mnb.predict(review_vec)

# Show result
print("Predicted sentiment:", "positive" if prediction[0]==1 else "negative")


Predicted sentiment: negative


In [39]:
dump(cv, 'Model/vectorizer.joblib')

['Model/vectorizer.joblib']

In [41]:
dump(mnb, 'Model/model.joblib')

['Model/model.joblib']