In [None]:
import os
import pandas as pd

# Directory that holds the dataset files.
DATASET_PATH = "/kaggle/input/datasets/shreann/imdb-dataset"

# List the directory contents as a quick check that the dataset is reachable.
dataset_files = os.listdir(DATASET_PATH)
print(dataset_files)

# Read the reviews CSV into a DataFrame and preview its first rows.
reviews_csv = f"{DATASET_PATH}/IMDBDataset.csv"
messages = pd.read_csv(reviews_csv)
messages.head()


In [None]:
# Data Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

def _ensure_nltk_resource(name, locator):
    """Download the NLTK resource `name` only if `locator` is not found locally."""
    try:
        nltk.data.find(locator)
    except LookupError:
        nltk.download(name, quiet=True)


# The stopword list and WordNet are separate NLTK downloads; without them
# the corpus readers below raise LookupError on a fresh environment.
_ensure_nltk_resource("stopwords", "corpora/stopwords")
_ensure_nltk_resource("wordnet", "corpora/wordnet")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Compiled once so the patterns are not re-parsed for every review.
_BR_TAG = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def clean_review(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: drop HTML line-break tags, replace every non-letter character with
    a space, lowercase, split on whitespace, remove English stopwords, and
    lemmatize the remaining words.

    Parameters
    ----------
    text : object
        Raw review. It is converted with ``str()`` first, so non-string
        cells do not raise.

    Returns
    -------
    str
        The cleaned review; may be an empty string.
    """
    # Without this step "<br />" survives the letters-only filter as the
    # token "br" and ends up in the vocabulary.
    text = _BR_TAG.sub(" ", str(text))
    tokens = _NON_LETTERS.sub(" ", text).lower().split()
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Iterating over the column's values is positional, so this does not depend
# on `messages` having a default 0..n-1 index (label lookups by `i` would).
corpus = [clean_review(review) for review in messages["review"]]

# Bag of Words
# Limit the vocabulary to the 2000 most frequent terms across the corpus.
cv = CountVectorizer(max_features=2000)
# fit_transform returns a SciPy sparse matrix. It is kept sparse on purpose:
# densifying allocates n_reviews x 2000 integers even though most counts are
# zero, and both train_test_split and MultinomialNB accept sparse input.
X = cv.fit_transform(corpus)

# One indicator column per distinct sentiment value; the column at
# position 1 is taken as the target vector.
sentiment_dummies = pd.get_dummies(messages['sentiment'])
y = sentiment_dummies.iloc[:, 1].values

# Train-Test Split
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Naive Bayes Model
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
# Score the held-out labels against the model's predictions.
accuracy = accuracy_score(y_test, y_pred)
confusion_m = confusion_matrix(y_test, y_pred)

# The matrix is printed first, followed by the accuracy on its own line.
print(confusion_m, accuracy, sep="\n")
