In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🎬 Movie Review Sentiment Analysis (IMDB Dataset)

In this project, we perform **Sentiment Analysis** on movie reviews using the **IMDB 50K Movie Reviews Dataset**.  
The objective is to classify a given movie review as **Positive** or **Negative** based on its textual content.

---

## 📂 Dataset Description

- **Dataset:** IMDB Dataset of 50K Movie Reviews  
- **Source:** Kaggle  
- **Total Samples:** 50,000  

### Columns:
- `review` → Textual movie review  
- `sentiment` → Target label (`positive`, `negative`)  

✔ The dataset is **balanced**, containing an equal number of positive and negative reviews.

---

## ⚙️ Data Preprocessing Steps

To prepare the raw text for modeling, the following preprocessing steps were applied:

- Removal of HTML tags  
- Conversion of text to lowercase  
- Removal of special characters and punctuation  
- Stopword removal using **NLTK**  
- Stemming using **Porter Stemmer**  
- Conversion of text data into numerical form using **CountVectorizer**

---

## 🧠 Model Building

We applied **Naive Bayes classifiers**, which are well-suited for text classification problems:

- **Multinomial Naive Bayes**
- **Bernoulli Naive Bayes**

The dataset was split into **training** and **testing** sets, and models were trained on vectorized text data.

---

## 📊 Model Evaluation

Model performance was evaluated using:

- **Accuracy Score**
- Comparison between different Naive Bayes variants  

These metrics help determine how well the model predicts sentiment on unseen data.

---

## 🚀 Conclusion

This project demonstrates how **classical machine learning techniques**, combined with proper text preprocessing, can effectively solve sentiment analysis problems.  
Naive Bayes models perform efficiently on large text datasets and serve as a strong **baseline for NLP tasks**.


**In this step, we load the IMDb movie reviews dataset using pandas and take a quick look at the first few records to understand the structure of the data.**

In [None]:
import pandas as pd
import numpy as np
# Read the IMDB reviews CSV from the Kaggle input mount, then preview the first rows.
IMDB_CSV_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(IMDB_CSV_PATH)
df.head()

**Statistical overview of all features.**

In [None]:
# Summary statistics for every column; include="all" also covers the text columns.
df.describe(include="all")


In [None]:
# Encode the sentiment labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['sentiment']`: that in-place call acts on
# an intermediate Series, which pandas warns about and which leaves `df`
# unchanged once copy-on-write is active.
# The `.get(label, label)` fallback keeps already-encoded values intact so the
# cell can be re-run, and `astype(int)` fails loudly on any unexpected label.
sentiment_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = (
    df['sentiment']
    .map(lambda label: sentiment_codes.get(label, label))
    .astype(int)
)

In [None]:
# Preview the first rows after label encoding. The call parentheses are
# required: a bare `df.head` only displays the bound-method repr.
df.head()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many reviews fall into each sentiment class.
sentiment_counts = df['sentiment'].value_counts()

fig, ax = plt.subplots()
sentiment_counts.plot(kind='bar', ax=ax)
ax.set_xlabel("Sentiment (0 = Negative, 1 = Positive)")
ax.set_ylabel("Number of Reviews")
ax.set_title("Sentiment Distribution")
plt.show()

**In this step, HTML tags are removed from the movie reviews to clean the raw text data before further preprocessing.**

In [None]:
# Show the first review as loaded, before any cleaning is applied.
df['review'][0]

In [None]:
import re
# Try the tag-stripping pattern on a single review before applying it to every row.
clean = re.compile('<.*?>')
clean.sub('', df.iloc[2]['review'])

In [None]:
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    A tag is matched non-greedily from ``<`` to the nearest ``>``; because
    ``.`` does not match a newline, a tag split across lines is left as is.
    Matched tags are deleted outright (replaced with the empty string).
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)



In [None]:
# Strip HTML tags from every review (element-wise over the column).
df['review'] = df['review'].map(clean_html)

In [None]:
# Spot-check the review at index 2 after tag removal.
df['review'][2]

**All review text is converted to lowercase to ensure consistency and reduce vocabulary size during text processing.**

In [None]:
def convert_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lowercase every review, then show the first one as a spot check.
df['review'] = df['review'].map(convert_lower)
df.loc[0, 'review']

**This step removes special characters and punctuation from the review text, keeping only alphanumeric characters to simplify text processing.**

In [None]:
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Characters for which ``str.isalnum()`` is true are kept unchanged; every
    other character (punctuation, whitespace, symbols) becomes a single
    space, so the result has the same length as the input.

    The output is assembled with one ``join`` rather than by repeated string
    concatenation, which avoids re-copying the growing string on long reviews.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [None]:
# Blank out punctuation and other non-alphanumeric characters in every review.
df['review'] = df['review'].map(remove_special)

**Common English stopwords are removed from the review text to reduce noise and focus on meaningful words for sentiment classification.**

In [None]:
import nltk
from nltk.corpus import stopwords

# Lazily-built cache of NLTK's English stopword list (filled on first use).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """Split ``text`` on whitespace and drop English stopwords.

    Parameters
    ----------
    text : str
        Review text; tokens are produced with ``str.split()``.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Note that this is a
        list, not a re-joined string.

    The stopword list is fetched once and kept as a set. Calling
    ``stopwords.words('english')`` inside the loop rebuilt the list for every
    single token and searched it linearly, which dominates the run time when
    this is applied to the whole review column.
    """
    stop_set = _english_stopwords()
    return [token for token in text.split() if token not in stop_set]
    

In [None]:
# Tokenise each review and drop stopwords; the column now holds lists of tokens.
df['review'] = df['review'].map(remove_stopwords)

In [None]:
# Display the frame; at this point `review` holds the token lists returned by remove_stopwords.
df

**Stemming is a text preprocessing technique that reduces words to their root or base form.  
It helps in minimizing vocabulary size and treating different word forms as the same feature, which improves model efficiency.
In this step, the Porter Stemmer is applied to each word in the review text.**

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem_words(text):
    """Return a new list with every token of ``text`` reduced to its Porter stem.

    Parameters
    ----------
    text : iterable of str
        Tokens of one review; each item is passed to ``ps.stem``.

    Returns
    -------
    list of str
        The stemmed tokens, in the same order.

    The result is built in a local list. The earlier version accumulated into
    a module-level list named ``y``; the notebook later rebinds ``y`` to the
    label array, after which any further call would fail on ``y.append``.
    """
    return [ps.stem(token) for token in text]

# Stem every token of every review, then show the first review as a spot check.
df['review'] = df['review'].map(stem_words)

df.loc[0, 'review']


In [None]:
# Re-join each review's token list into one space-separated string.
df['review'] = df['review'].map(" ".join)
df.loc[0, 'review']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings, used below to turn the cleaned reviews into a feature matrix.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from all reviews and build the feature matrix in one step.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so the vocabulary also reflects the test reviews —
# confirm whether it should be fitted on the training portion only.
X=cv.fit_transform(df['review'])

In [None]:
# Dimensions of the feature matrix returned by the vectorizer.
X.shape

In [None]:
# Target vector: the encoded sentiment labels as a NumPy array.
# The column is selected by name rather than by position (`iloc[:, -1]`) so
# the labels stay correct even if another column is later added to `df`.
y = df['sentiment'].to_numpy()
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing. A fixed random_state makes the
# split, and therefore the accuracies reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Report the dimensions of each piece of the train/test split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape :", X_test),
    ("y_train shape:", y_train),
    ("y_test shape :", y_test),
):
    print(label, part.shape)


In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# Instantiate one estimator per Naive Bayes variant, all with default settings.
# NOTE(review): clf1 (GaussianNB) is never fitted or evaluated in the cells
# that follow — only clf2 and clf3 are; confirm whether it can be dropped.
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [None]:

# Train the Multinomial and Bernoulli models on the training split.
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [None]:

# Predict sentiment for the held-out reviews with each trained model.
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [None]:
# Sanity check: shape of the Multinomial model's prediction array.
y_pred2.shape

## 📈 Model Accuracy Evaluation

To evaluate the performance of the trained models, we used **Accuracy Score** from `sklearn.metrics`.

Accuracy measures the proportion of correctly classified reviews out of the total test samples.

The following Naive Bayes models were evaluated:

- **Multinomial Naive Bayes**
- **Bernoulli Naive Bayes**

In [None]:
from sklearn.metrics import accuracy_score

# Share of test reviews each model classified correctly.
# The second label previously read "Bernaulli"; it is spelled "Bernoulli" here.
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernoulli",accuracy_score(y_test,y_pred3))