In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:

# Location of the IMDB reviews CSV; kept in one named place so it is easy to repoint.
DATA_PATH = "D:/ML Data Sets/IMDB Dataset.csv"

# Load the raw reviews into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Model input is the review body only. The `sentiment` column is the
# ground-truth class of each review, so concatenating it into the text would
# leak the answer into the features. Missing reviews become empty strings.
df['text'] = df['review'].fillna('')
def clean_text(text):
    """Normalise one raw review into lowercase, letters-only text.

    Steps, in order:
      1. Coerce to ``str`` and lowercase.
      2. Replace HTML tags (e.g. ``<br />``) with a space.
      3. Delete every character that is not ``a-z`` or whitespace.
      4. Strip leading/trailing whitespace.

    Parameters
    ----------
    text : Any
        Raw value; it is passed through ``str()`` so non-string inputs
        (``None``, numbers, NaN) do not raise.

    Returns
    -------
    str
        The cleaned text. Internal runs of whitespace are preserved.
    """
    text = str(text).lower()
    # Remove markup BEFORE stripping punctuation. Otherwise only the angle
    # brackets and slash are deleted, and the tag name ("br") survives as a
    # bogus token that can also glue onto the preceding word.
    # The pattern requires a letter right after "<" or "</" so that plain
    # comparisons such as "<3" are not mistaken for tags.
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

# Run the cleaner element-wise over the text column, overwriting it in place.
df['text'] = df['text'].map(clean_text)

In [4]:
# Derive the target from the dataset's own `sentiment` column instead of
# drawing random integers: random labels carry no relationship to the text,
# so every model could only score at chance level, and without a seed the
# numbers would differ on each run.
# The displayed head() shows the values 'positive' / 'negative'; whitespace
# and case are normalised defensively before mapping.
sentiment_to_label = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].str.strip().str.lower().map(sentiment_to_label)

# Fail early with a clear message if any row has a missing or unexpected
# sentiment, rather than passing NaN labels on to the classifiers.
if df['label'].isna().any():
    raise ValueError(
        "The 'sentiment' column contains missing values or values other than "
        "'positive'/'negative'; cannot build labels."
    )
df['label'] = df['label'].astype(int)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
features, target = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=2
)

In [6]:
# TF-IDF settings: drop English stop words and cap the vocabulary at 2000 terms.
tfidf_params = {"stop_words": 'english', "max_features": 2000}
vect = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary/IDF weights from the training split only, then reuse
# the fitted vectorizer to encode the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

In [7]:
# Registry of the classifiers being compared, keyed by display name.
# The decision-tree entry previously had a misspelled key and instantiated a
# second RandomForestClassifier; it now uses DecisionTreeClassifier and the
# same 'Decision Tree' spelling used when accuracies are recorded.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier()
}


In [None]:


# --- Training -------------------------------------------------------------
# Fit one estimator per algorithm on the TF-IDF training matrix. Each fitted
# estimator keeps its own short variable name (nb, lr, dt, rf, knn).

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train_vec, y_train)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_vec, y_train)

# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train_vec, y_train)

# --- Evaluation -----------------------------------------------------------
# Accuracy of every fitted estimator on the held-out matrix, keyed by the
# display name and kept in the same order the models were trained.
fitted_models = [
    ('Naive Bayes', nb),
    ('Logistic Regression', lr),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('KNN', knn),
]
accuracies = {
    name: accuracy_score(y_test, estimator.predict(X_test_vec))
    for name, estimator in fitted_models
}




In [None]:
# Build the entire chart in a single cell. With the notebook's inline
# backend a figure is rendered and closed when its cell finishes, so adding
# the value labels, title and axis settings from a later cell would draw them
# on a new, empty figure instead of on the bar chart.
fig, ax = plt.subplots(figsize=(8, 5))

# One bar per model, in the order the accuracies were recorded.
model_names = list(accuracies.keys())
model_scores = list(accuracies.values())
ax.bar(model_names, model_scores, color=['skyblue', 'lightgreen', 'salmon'])

# Print each accuracy (two decimals) just above its bar.
for model, acc in accuracies.items():
    ax.text(model, acc + 0.01, f"{acc:.2f}", ha='center')

ax.set_title("Model Accuracies Comparison")
ax.set_xlabel("Models")
ax.set_ylabel("Accuracy Score")
ax.set_ylim(0, 1)  # accuracy is a fraction, so pin the axis to [0, 1]
ax.grid(axis='y', linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()