## Assignment - 1:
### Sentiment Anaysis on IMBD Dataset with Naive Bayes

In [None]:
# Step 1: Load the Dataset

import pandas as pd

#loading CSV file
df = pd.read_csv('IMDB_Dataset.csv')

#adding numeric label (0=negative, 1=positive)
df['label'] = df['sentiment'].map({'negative' : 0, 'positive':1})

print(df.shape)
df.head()

#sentiment is converted to number (0 and 1)

(50000, 3)


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [6]:
# Step 2: Train - Test Split
# splitting into training and validation sets

from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(X_train.shape, X_val.shape)

# stratify = y ensures equal distribution of positive/negative in both train & validation

(40000,) (10000,)


In [None]:
# Step 3: Text CLeaning
#converting reviews into cleaner text

# Theory : LowerCasing reduces vocab size.
#        : Remove HTML tags, URLS, punctaitons, digits (noise).
#        : Keep negations like "not" as they change sentiment.

import re, html
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords') #comented after downloading.

#to Keep negations
stop_words = set(stopwords.words('english'))
for neg in ['not', 'no', 'nor', 'never']:
    stop_words.discard(neg)
    
def clean_text(text):
    #removing html tags :
    text = re.sub('r<[^>]+>', ' ', text)
    
    #decoding HTML entities 
    text = html.unescape(text)
    
    #removing URLS :
    text = re.sub(r"http\S+|www.\S+", " ", text)
    
    #lowercasing : 
    text = text.lower()
    
    #removing punctuatuins/digits :
    text = re.sub(r"[^a-z\s]", " ", text)
    
    #removing extra spaces
    text = re.sub(r"\s+", " ",text).strip()
    
    return text


print(clean_text("I didn't LIKE this movie! <br> Visit: http://abc.com"))

#if we remove "not", nodek may misclassify "not good" as "good"
#normalised spacing after regex replacements

i didn t like this movie br visit


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deeplatiyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Step 4: Feature Extraction(Bag-of-Words & TF-IDF)
# convering text to numbers using COuntVectorizer(BOW) and TfidfVectorizer.

#Theory : 
#   Bag-of-Words : counts word frequency. works wll with MultinomialNB
#   TF-IDF : scales down very common words, boosts rare but useful words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_vectorizer = CountVectorizer(preprocessor=clean_text,
                                 stop_words=stop_words,
                                 ngram_range=(1,2),   # unigrams + bigrams
                                 min_df=5, max_df=0.9)

tfidf_vectorizer = TfidfVectorizer(preprocessor=clean_text,
                                   stop_words=stop_words,
                                   ngram_range=(1,2),
                                   min_df=5, max_df=0.9)

#adding both bigrams (1,2) captures phrases like "not good".
#min_df = 5 ignores very rare words.
#max_df = 0.9 ignores extremnly frequent words.

In [None]:
# Step 5: Train Models (MultinomialNB, BernoulliNB)
# lets train n compare both NB classifiers 

#Theory : 
#MultinomialNB : Best for words counts or tf-idf(works on frequencies).
#BernoulliNB : Best when features are binary (word present / absent).

form sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.mertics import accuracy_score, classification_report

def train_and_eval(vec , clg, X_train, X_val. y_train, y_val):
    #vectorize : 
    X_train_vec = vec.fit_transform(X_train)
    X_val_vec = vec.tranform(X_val)
    
    