## Assignment - 1:
### Sentiment Anaysis on IMBD Dataset with Naive Bayes

In [None]:
# Step 1: Load the Dataset

import pandas as pd

#loading CSV file
df = pd.read_csv('IMDB_Dataset.csv')

#adding numeric label (0=negative, 1=positive)
df['label'] = df['sentiment'].map({'negative' : 0, 'positive':1})

print(df.shape)
df.head()

#sentiment is converted to number (0 and 1)

(50000, 3)


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [6]:
# Step 2: Train - Test Split
# splitting into training and validation sets

from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(X_train.shape, X_val.shape)

# stratify = y ensures equal distribution of positive/negative in both train & validation

(40000,) (10000,)


In [None]:
# Step 3: Text CLeaning
#converting reviews into cleaner text

# Theory : LowerCasing reduces vocab size.
#        : Remove HTML tags, URLS, punctaitons, digits (noise).
#        : Keep negations like "not" as they change sentiment.

import re, html
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords') #comented after downloading.

#to Keep negations
stop_words = set(stopwords.words('english'))
for neg in ['not', 'no', 'nor', 'never']:
    stop_words.discard(neg)
    
def clean_text(text):
    #removing html tags :
    text = re.sub('r<[^>]+>', ' ', text)
    
    #decoding HTML entities 
    text = html.unescape(text)
    
    #removing URLS :
    text = re.sub(r"http\S+|www.\S+", " ", text)
    
    #lowercasing : 
    text = text.lower()
    
    #removing punctuatuins/digits :
    text = re.sub(r"[^a-z\s]", " ", text)
    
    #removing extra spaces
    text = re.sub(r"\s+", " ",text).strip()
    
    return text


print(clean_text("I didn't LIKE this movie! <br> Visit: http://abc.com"))

#if we remove "not", nodek may misclassify "not good" as "good"
#normalised spacing after regex replacements

i didn t like this movie br visit


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deeplatiyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Step 4: Feature Extraction(Bag-of-Words & TF-IDF)
# convering text to numbers using COuntVectorizer(BOW) and TfidfVectorizer.

#Theory : 
#   Bag-of-Words : counts word frequency. works wll with MultinomialNB
#   TF-IDF : scales down very common words, boosts rare but useful words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_vectorizer = CountVectorizer(preprocessor=clean_text,
                                 stop_words=stop_words,
                                 ngram_range=(1,2),   # unigrams + bigrams
                                 min_df=5, max_df=0.9)

tfidf_vectorizer = TfidfVectorizer(preprocessor=clean_text,
                                   stop_words=stop_words,
                                   ngram_range=(1,2),
                                   min_df=5, max_df=0.9)

#adding both bigrams (1,2) captures phrases like "not good".
#min_df = 5 ignores very rare words.
#max_df = 0.9 ignores extremnly frequent words.

In [16]:
# Step 5: Train Models (MultinomialNB, BernoulliNB)
# lets train n compare both NB classifiers 

#Theory : 
#MultinomialNB : Best for words counts or tf-idf(works on frequencies).
#BernoulliNB : Best when features are binary (word present / absent).

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

def train_and_eval(vec , clg, X_train, X_val, y_train, y_val):
    #vectorize : 
    X_train_vec = vec.fit_transform(X_train)
    X_val_vec = vec.tranform(X_val)
    
    #fit :
    clf.fit(X_train_vec, y_train)
    
    #predict :
    preds = clf.predict(X_val_vec)
    
    #evaluare :
    acc = accuracy_score(y_val, preds)
    print(f"Model: {clf.__class__.__name__}, Vectorizer: {vec.__class__.__name__}, Accuracy: {acc:.4f}")
    print(classification_report(y_val, preds, digits=4))
    print("=" *80)
    

# Run experiments
train_and_eval(bow_vectorizer, MultinomialNB(), X_train, X_val, y_train, y_val)
train_and_eval(tfidf_vectorizer, MultinomialNB(), X_train, X_val, y_train, y_val)

train_and_eval(bow_vectorizer, BernoulliNB(), X_train, X_val, y_train, y_val)
train_and_eval(tfidf_vectorizer, BernoulliNB(), X_train, X_val, y_train, y_val)

#accuracy n precision/recall are printed
#normally, MultinomialNB with TF-IDF gives best performance (!85-90%)
#BernoulliNB can be weaker unless features are stricly binary

InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'just', 'with', 'this', 'which', 'against', 'mightn', 'ourselves', 'has', "it'd", 'mustn', 'its', "we'd", 'does', 'any', 'why', "you've", 'from', 'couldn', 'such', 'at', "mustn't", "couldn't", 'yourselves', 'those', 're', 'don', "haven't", 'of', 'should', 'our', 'them', 'you', 'didn', 'during', 'shouldn', 'into', "they'd", 'we', 'myself', 'wouldn', 'while', 'having', 'was', "he'll", 'm', 'being', 'weren', 'will', 'so', 'an', 'for', 'where', 'about', 'few', "she's", 'some', 'because', 'but', 'her', "i'll", 'or', "he'd", 'ain', 'doing', 'other', "hadn't", 'needn', 'shan', 'me', 'up', 'it', 'had', 'be', "that'll", 'between', 'i', 'before', "we've", 's', "they'll", 'to', 'now', 've', 'o', 'whom', 'who', 'been', "hasn't", 'his', 'd', 'all', 'above', 'my', 'are', "weren't", 'by', 'do', 'most', 'only', "wasn't", "aren't", 'they', 'themselves', 'ma', "i've", 'down', 'yourself', 'hasn', 't', 'over', 'wasn', "it'll", 'hers', 'again', "she'll", 'doesn', 'once', 'after', 'what', 'both', 'here', "they're", 'that', 'ours', 'yours', 'won', 'did', 'out', 'aren', 'through', "he's", 'isn', 'the', "we'll", "shouldn't", "isn't", 'as', "it's", "i'm", 'll', "shan't", 'haven', 'their', 'theirs', 'very', 'and', 'himself', 'than', "don't", "doesn't", 'he', "i'd", "they've", 'she', "you're", 'under', 'on', 'y', "you'd", 'there', 'each', 'is', 'itself', 'a', "didn't", "mightn't", 'these', 'can', 'more', 'same', 'too', "should've", "won't", 'off', 'your', 'in', "you'll", 'have', 'him', 'herself', 'below', 'until', "we're", 'if', 'further', 'am', 'how', 'were', "needn't", 'when', 'hadn', 'own', 'then', "wouldn't", "she'd"} instead.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')   # list, not set
for neg in ['not','no','nor','never']:
    if neg in stop_words:
        stop_words.remove(neg)   # keep negations

bow_vectorizer = CountVectorizer(preprocessor=clean_text,
                                 stop_words=stop_words,
                                 ngram_range=(1,2),   
                                 min_df=5, max_df=0.9)

tfidf_vectorizer = TfidfVectorizer(preprocessor=clean_text,
                                   stop_words=stop_words,
                                   ngram_range=(1,2),
                                   min_df=5, max_df=0.9)

#to fix above error
#converted stopwords to list and
#fixed tying error

In [19]:
# Step 5: Train Models (MultinomialNB, BernoulliNB)
# Theory Notes:
# - MultinomialNB: Best for word counts or TF-IDF (works on frequencies).
# - BernoulliNB: Best when features are binary (word present / absent).

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

def train_and_eval(vec , clf, X_train, X_val, y_train, y_val):
    # 1. Vectorize text
    X_train_vec = vec.fit_transform(X_train)
    X_val_vec = vec.transform(X_val)
    
    # 2. Train model
    clf.fit(X_train_vec, y_train)
    
    # 3. Predict on validation data
    preds = clf.predict(X_val_vec)
    
    # 4. Evaluate model
    acc = accuracy_score(y_val, preds)
    print(f"Model: {clf.__class__.__name__}, Vectorizer: {vec.__class__.__name__}, Accuracy: {acc:.4f}")
    print(classification_report(y_val, preds, digits=4))
    print("=" *80)
    

# Run experiments
train_and_eval(bow_vectorizer, MultinomialNB(), X_train, X_val, y_train, y_val)
train_and_eval(tfidf_vectorizer, MultinomialNB(), X_train, X_val, y_train, y_val)

train_and_eval(bow_vectorizer, BernoulliNB(), X_train, X_val, y_train, y_val)
train_and_eval(tfidf_vectorizer, BernoulliNB(), X_train, X_val, y_train, y_val)

# Notes:
# - MultinomialNB + TF-IDF usually gives the best performance (≈85–90%).
# - BernoulliNB is better when features are binary (word presence/absence).

Model: MultinomialNB, Vectorizer: CountVectorizer, Accuracy: 0.8849
              precision    recall  f1-score   support

           0     0.8825    0.8880    0.8853      5000
           1     0.8873    0.8818    0.8845      5000

    accuracy                         0.8849     10000
   macro avg     0.8849    0.8849    0.8849     10000
weighted avg     0.8849    0.8849    0.8849     10000

Model: MultinomialNB, Vectorizer: TfidfVectorizer, Accuracy: 0.8919
              precision    recall  f1-score   support

           0     0.8942    0.8890    0.8916      5000
           1     0.8896    0.8948    0.8922      5000

    accuracy                         0.8919     10000
   macro avg     0.8919    0.8919    0.8919     10000
weighted avg     0.8919    0.8919    0.8919     10000

Model: BernoulliNB, Vectorizer: CountVectorizer, Accuracy: 0.8889
              precision    recall  f1-score   support

           0     0.8942    0.8822    0.8882      5000
           1     0.8838    0.8956  

In [22]:
#Step 6: Summary of Results
#comparing all models side by side

#Theory : We comapare which combination of features representation (BOW/TF-IDF) and classifier works best.

results = []

for vec in [bow_vectorizer, tfidf_vectorizer]:
    for clf in [MultinomialNB(), BernoulliNB()]:
        X_train_vec = vec.fit_transform(X_train)
        X_val_vec = vec.transform(X_val)
        
        clf.fit(X_train_vec, y_train)
        preds = clf.predict(X_val_vec)
        acc = accuracy_score(y_val, preds)
        
        results.append({
            "Vectorizer": vec.__class__.__name__,
            "Classifier": clf.__class__.__name__,
            "Accuracy": acc
        })

pd.DataFrame(results)

Unnamed: 0,Vectorizer,Classifier,Accuracy
0,CountVectorizer,MultinomialNB,0.8849
1,CountVectorizer,BernoulliNB,0.8889
2,TfidfVectorizer,MultinomialNB,0.8919
3,TfidfVectorizer,BernoulliNB,0.8889


In [23]:
#TfidfVectorizer + MultinomilaNB wins with accuracy = 89.19%

#NOTEs : 
# 1) Naive Bayes : simple, fast, interpretable, assumes features independence, works well for text
# 2) MultinomialNB : Best for counts/TF_IDF, assumes word frequencies as multinomial distribution.
# 3) BernoulliNB : Binary features (present/absent), useful if only word presence matters.
# 4) BOW vs TF-IDF : BOW=rawa frequency , TF-IDF : frequency weighted by importance, usually better for long text(like IMBD reviews)
# 5) Evaluation : use accuracy and classificartion report(precision/recall/F1), important because accuracy alone may hide class imbalance.