# TASK B : Text Classification

## Importing Required Libraries

In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\SANJAY
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Dataset (IMDb Reviews Dataset)
- Link of Dataset : https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews/input?select=IMDB+Dataset.csv

In [4]:
df = pd.read_csv("IMDB Dataset.csv")

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Data Preprocessing:
- Clean the text data by removing unnecessary characters, digits, and stop words.
- Convert text to lowercase, and tokenize it (split text into words).

In [7]:
def clean_text(text):
    text = text.lower() 
    text = re.sub(f"[{string.punctuation}]", "", text)  
    text = re.sub("\d+", "", text)  
    text = " ".join([word for word in text.split() if word not in stop_words])  
    return text

In [9]:
df["cleaned_review"] = df["review"].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


## Feature Extraction:
- Use TF-IDF (Term Frequency-Inverse Document Frequency) to convert the text into numerical features.
- TF-IDF is better than Bag of Words because it gives more importance to meaningful words and less importance to common words

In [11]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_review"])
y = df["sentiment"].map({"positive": 1, "negative": 0}) 

## Train-Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Selection and Training

In [14]:
model = MultinomialNB()
model.fit(X_train, y_train)

## Predictions

In [17]:
y_pred = model.predict(X_test)

## Evaluation

In [18]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [19]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.85
Precision: 0.85
Recall: 0.85


## Manual Testing

In [26]:
def predict_sentiment(review):
    review_cleaned = clean_text(review)  
    review_tfidf = vectorizer.transform([review_cleaned])  
    prediction = model.predict(review_tfidf)  
    sentiment = "Positive" if prediction[0] == 1 else "Negative"
    return sentiment

print(predict_sentiment("This movie was not good!"))

Negative
