### Assignment - 1
## Sentiment Analysis project 
### with IMBD reviews.

In [12]:
# 1. Import Libraries : 
import pandas as pd
import numpy as np
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import nltk

#these libraries cover data handling, preprocessing , text vectorization, modeling, and evaluation

In [13]:
# 2. Load Data
df = pd.read_csv('IMDB_Dataset.csv')
df.head() #phli 5 lines dega df (data) ki.

#Loading the data and inspecting the first few rows

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
# 3. Text Preprocessing
# defining a function for : Lowercasing, Removing punctuation, Removing digits, Removing stopwords

nltk.download('stopwords')  #downloading stopwords...

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deeplatiyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
#defining pre processing function :
stop_words = set(stopwords.words('english'))

def clean_text(text):
    #Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    #remove digits
    text = re.sub(r'\d+', '', text)
    
    #remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

#applying preprocessing
df['clean_review'] = df['review'].apply(clean_text)
df.head()

#clean and noramlize text to help the model focus on actial sentiment rather than irrelevant symbols

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [16]:
# 4. Label Encoding Target Varibale

df['label'] = df['sentiment'].map({'positive' :1, 'negative': 0})

#model require numerical targets.

In [17]:
# 5. Split Data

X = df['clean_review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#separates training and testing data to evaluate model performance on unseen data

In [18]:
# 6. Feature Extraction
# a)Bag of Words(CountVectorizer)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [19]:
# b)TF-IDF vectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#above both are ways to transform text into numeric features, will compare to which will work better for the given data 

In [27]:
# 7. Train and Evaluate Naive Bayes Models 
# will do for both CV and TF-IDF features
#creating a helper function to train/test models :

from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(model, X_tr, X_te, y_tr, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    acc= accuracy_score(y_te, y_pred)
    report = classification_report(y_te, y_pred)
    return acc, report

In [28]:
# a)With CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

models = {
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
}


results_cv = {}

for name, model in models.items():
    acc, report = evaluate_model(model, X_train_cv, X_test_cv, y_train, y_test)
    results_cv[name] = (acc, report)




In [29]:
#b)with TF-IDF
results_tfidf = {}

for name, model in models.items():
    acc, report = evaluate_model(model, X_train_tfidf, X_test_tfidf, y_train, y_test)
    results_tfidf[name] = (acc, report)


In [31]:
# 8. Compare Results

print("===== CountVectorizer Results =====")
for name, (acc, report) in results_cv.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(report)

print("===== TF-IDF Results =====")
for name, (acc, report) in results_tfidf.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(report)


===== CountVectorizer Results =====

Model: MultinomialNB
Accuracy: 0.8478
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4961
           1       0.85      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


Model: BernoulliNB
Accuracy: 0.8520
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

===== TF-IDF Results =====

Model: MultinomialNB
Accuracy: 0.8695
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      4961
           1       0.88      0.85      0.87      5039

    

In [32]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1.0]}
grid = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_cv, y_train)

print(f"Best alpha: {grid.best_params_['alpha']}")
print(f"Best CV accuracy: {grid.best_score_}")


Best alpha: 0.1
Best CV accuracy: 0.8458249999999999


In [33]:
best_alpha = 0.1
best_mnb = MultinomialNB(alpha=best_alpha)
best_mnb.fit(X_train_cv, y_train)

y_pred = best_mnb.predict(X_test_cv)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Test Accuracy with alpha={best_alpha}: {acc:.4f}")
print("Classification Report:\n", report)

import joblib
joblib.dump(best_mnb, 'best_multinomial_nb_model.pkl')


Test Accuracy with alpha=0.1: 0.8476
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.85      4961
           1       0.85      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



['best_multinomial_nb_model.pkl']