# Explore here

In [None]:
# Your code here

import pandas as pd

# Step 1: Load the dataset
# Play Store reviews CSV, read directly from the tutorial's GitHub repository
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Quick sanity check: report the dimensions and preview the first five rows
print("Dataset shape:", df.shape)
print(df.head(n=5))

# Step 2: Preprocessing
# The app identifier is not used as a sentiment feature, so discard that column
df = df.drop(columns='package_name')

# Normalise the review text in two passes: trim surrounding whitespace,
# then convert everything to lowercase
df['review'] = df['review'].str.strip()
df['review'] = df['review'].str.lower()

# Predictor (review text) and target (polarity label)
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions similar in both partitions; the fixed seed makes the
# split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Bag-of-Words encoding: the vocabulary is learned from the training reviews
# only and then reused for the test reviews. Both count matrices are
# converted to dense arrays for the models trained below.
from sklearn.feature_extraction.text import CountVectorizer
vec_model = CountVectorizer(stop_words="english")
vec_model.fit(X_train)
X_train_vec = vec_model.transform(X_train).toarray()
X_test_vec = vec_model.transform(X_test).toarray()

# Step 3: Train and evaluate Naive Bayes models
# Each variant is fitted on the same training matrix and scored on the same
# held-out test matrix so the reports are directly comparable.

from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

# MultinomialNB (recommended for text data): works on the word counts
nb_m = MultinomialNB().fit(X_train_vec, y_train)
y_pred_m = nb_m.predict(X_test_vec)
print("MultinomialNB accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_m))
print(classification_report(y_true=y_test, y_pred=y_pred_m))

# BernoulliNB (good for binary word occurrence features)
nb_b = BernoulliNB().fit(X_train_vec, y_train)
y_pred_b = nb_b.predict(X_test_vec)
print("BernoulliNB accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_b))
print(classification_report(y_true=y_test, y_pred=y_pred_b))

# GaussianNB (not ideal for text, but included for comparison)
nb_g = GaussianNB().fit(X_train_vec, y_train)
y_pred_g = nb_g.predict(X_test_vec)
print("GaussianNB accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_g))
print(classification_report(y_true=y_test, y_pred=y_pred_g))

# Step 4: Random Forest as an alternative
from sklearn.ensemble import RandomForestClassifier

# Fixed seed for repeatable results; n_jobs=-1 uses all available CPU cores
rf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

# Score the forest on the same held-out test set used for the Naive Bayes models
print("Random Forest accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

# Step 5: Save the models
import os

import joblib

# Create the output folder if it does not exist yet (no error when it does)
os.makedirs('models', exist_ok=True)

# Persist the MultinomialNB classifier and the Random Forest
joblib.dump(nb_m, 'models/best_naive_bayes.pkl')
joblib.dump(rf, 'models/random_forest_vec.pkl')

# Persist the fitted vectorizer as well: both classifiers were trained on the
# columns produced by this exact vocabulary, so new review text can only be
# scored after being transformed by the same CountVectorizer instance.
joblib.dump(vec_model, 'models/count_vectorizer.pkl')
print("Models saved to 'models/' directory.")

# Step 6: Try another model: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Allow up to 1000 solver iterations when fitting on the Bag-of-Words features
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

# Score on the same held-out test set as the other models
print("Logistic Regression accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print(classification_report(y_true=y_test, y_pred=y_pred_lr))
