In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the review dataset from the working directory. Later statements read
# the "review" (raw text) and "sentiment" (class label) columns from it.
df = pd.read_csv('IMDB Dataset.csv')


_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() rebuilds the list on every call, so cache it and
        # use a set for O(1) membership tests.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise one review into a space-separated string of content words.

    Steps, in order: lowercase, replace HTML tags and URLs with a space,
    replace every non-ASCII-letter character with a space, split on
    whitespace, and drop English stopwords.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)                  # HTML tags such as <br />
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)  # URLs
    text = re.sub(r"[^a-zA-Z]", " ", text)              # digits, punctuation, etc.
    # str.split() with no argument already collapses whitespace runs, so no
    # separate whitespace-normalising substitution is needed.
    stop_words = _english_stopwords()
    return " ".join(w for w in text.split() if w not in stop_words)

# --- Preprocess ---------------------------------------------------------
# Coerce each review to str before cleaning, then store the cleaned text.
df["clean_text"] = df["review"].astype(str).map(clean_text)

# --- Encode labels ------------------------------------------------------
# Map the sentiment strings to integer codes; `le` is kept so predictions
# can be decoded back to the original class names.
le = LabelEncoder().fit(df["sentiment"])
df["label"] = le.transform(df["sentiment"])

# --- Train / test split -------------------------------------------------
# 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# --- Vectorise ----------------------------------------------------------
# Unigram + bigram TF-IDF capped at 5000 features. The vocabulary is fitted
# on the training split only and then applied to the test split.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# --- Train --------------------------------------------------------------
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# --- Evaluate -----------------------------------------------------------
y_pred = nb_model.predict(X_test_vec)

print("\nMultinomial Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, target_names=le.classes_),
)


def predict_sentiment(text):
    """Classify a single piece of text with the fitted pipeline.

    The text is passed through ``clean_text``, vectorised with the fitted
    ``tfidf`` and scored by ``nb_model``.

    Args:
        text: Raw review text.

    Returns:
        A ``(label, confidence)`` tuple: the predicted class decoded through
        ``le``, and the largest value returned by ``predict_proba``.
    """
    features = tfidf.transform([clean_text(text)])
    class_probs = nb_model.predict_proba(features)
    encoded_label = nb_model.predict(features)[0]
    label = le.inverse_transform([encoded_label])[0]
    return label, np.max(class_probs)

# Spot check: run one hand-written review through predict_sentiment and
# print the decoded label with the confidence rounded to two decimals.
text = "This movie was absolutely wonderful and emotionally touching."
sentiment, confidence = predict_sentiment(text)
print(f"\nSample Prediction:\nSentiment: {sentiment}, Confidence: {confidence:.2f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Multinomial Naive Bayes Results:
Accuracy: 0.8579
Confusion Matrix:
 [[4176  785]
 [ 636 4403]]
Classification Report:
               precision    recall  f1-score   support

    negative       0.87      0.84      0.85      4961
    positive       0.85      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


Sample Prediction:
Sentiment: positive, Confidence: 0.87


In [9]:
# Second spot check. Uses predict_sentiment from the earlier cell, so that
# cell must have been run first. The input string (including its spacing and
# spelling) is passed to the model exactly as written.
text = "This movie was  worst  and emotionally hillarious."
sentiment, confidence = predict_sentiment(text)
print(f"\nSample Prediction:\nSentiment: {sentiment}, Confidence: {confidence:.2f}")


Sample Prediction:
Sentiment: negative, Confidence: 0.66
