# Explore here

Step 1: Load and Explore Data


In [5]:
# Your code here
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Load data
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(url)

# Show first 5 rows
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


Step 2: Preprocess Text Data

In [6]:
# Remove irrelevant column
df = df.drop(columns=['package_name'])

# Clean text: lowercase + remove extra spaces
df['review'] = df['review'].str.strip().str.lower()

# Split data
X = df['review']
y = df['polarity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step 3: Convert Text to Numbers

In [7]:
# Create word-count matrix (ignore English stopwords like "the", "and")
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()

Step 4: Build & Compare Naive Bayes Models

In [8]:
# Initialize models
models = {
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "GaussianNB": GaussianNB()
}

# Train and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

# Show results
pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])

Unnamed: 0,Accuracy
MultinomialNB,0.815642
BernoulliNB,0.77095
GaussianNB,0.804469


Step 5: Optimize the Best Model


In [9]:
# Tune MultinomialNB (best performer)
alphas = [0.1, 0.5, 1.0, 2.0]  # Smoothing parameters

for alpha in alphas:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train_vec, y_train)
    acc = model.score(X_test_vec, y_test)
    print(f"Alpha {alpha}: Accuracy = {acc:.4f}")

Alpha 0.1: Accuracy = 0.8324
Alpha 0.5: Accuracy = 0.8268
Alpha 1.0: Accuracy = 0.8156
Alpha 2.0: Accuracy = 0.8324


Step 6: Compare with Random Forest


In [10]:
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_vec, y_train)
rf_acc = rf.score(X_test_vec, y_test)

print(f"Random Forest Accuracy: {rf_acc:.4f}")
print(f"Best NB Accuracy: {max(results.values()):.4f}")

Random Forest Accuracy: 0.8045
Best NB Accuracy: 0.8156


Step 7: Save the Model


In [None]:
# Save best model
best_model = MultinomialNB(alpha=0.5)
best_model.fit(X_train_vec, y_train)
pickle.dump(best_model, open('naive_bayes_model.pkl', 'wb'))

# Save vectorizer too (for new text preprocessing)
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))
