# Explore here

In [96]:
import pandas as pd

import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [97]:


# Location of the reviews CSV (playstore_reviews.csv) in the tutorial repository.
url = (
    "https://raw.githubusercontent.com/4GeeksAcademy/"
    "naive-bayes-project-tutorial/main/playstore_reviews.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [98]:
# Drop the package_name column; the cells below only read "review" and "polarity".
# errors="ignore" makes the cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=["package_name"], errors="ignore")

In [99]:
# Normalise the review text: cast to str, trim surrounding whitespace, lowercase.
df["review"] = (
    df["review"]
    .astype(str)
    .str.strip()
    .str.lower()
)

Train/test split

In [100]:
# Features are the review text; the target is the polarity column.
X, y = df["review"], df["polarity"]

In [101]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [102]:
# Peek at the first five training reviews (the index shows the shuffled row labels).
X_train.head(n=5)

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [103]:
# List the distinct polarity labels present in the target column.
y.unique()

array([0, 1])

In [104]:

# TF-IDF features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english")

# Fit on the training reviews only, then reuse the fitted vectorizer on the
# held-out reviews so the test set does not influence the fit.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [None]:
# Baseline: MultinomialNB with default settings on the TF-IDF features.
mnb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_mnb = mnb.predict(X_test_vec)

mnb_report = classification_report(y_test, y_pred_mnb)
print("♡ MultinomialNB:\n", mnb_report)

♡ MultinomialNB:
               precision    recall  f1-score   support

           0       0.78      0.98      0.87       126
           1       0.90      0.36      0.51        53

    accuracy                           0.80       179
   macro avg       0.84      0.67      0.69       179
weighted avg       0.82      0.80      0.77       179



In [106]:
# Baseline: BernoulliNB with default settings on the same TF-IDF features.
bnb = BernoulliNB().fit(X_train_vec, y_train)
y_pred_bnb = bnb.predict(X_test_vec)

bnb_report = classification_report(y_test, y_pred_bnb)
print("♡ BernoulliNB:\n", bnb_report)


♡ BernoulliNB:
               precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



In [107]:
# The vectorized matrices are converted to dense arrays before being handed
# to GaussianNB.
X_train_dense = X_train_vec.toarray()
X_test_dense = X_test_vec.toarray()

# Baseline: GaussianNB with default settings on the dense features.
gnb = GaussianNB().fit(X_train_dense, y_train)
y_pred_gnb = gnb.predict(X_test_dense)

gnb_report = classification_report(y_test, y_pred_gnb)
print("♡ GaussianNB:\n", gnb_report)

♡ GaussianNB:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       126
           1       0.70      0.62      0.66        53

    accuracy                           0.81       179
   macro avg       0.78      0.76      0.76       179
weighted avg       0.81      0.81      0.81       179



In [108]:

# Scalar metrics for the BernoulliNB predictions.
accuracy = accuracy_score(y_test, y_pred_bnb)
precision = precision_score(y_test, y_pred_bnb)
recall = recall_score(y_test, y_pred_bnb)
f1 = f1_score(y_test, y_pred_bnb)

# Print each metric rounded to two decimals.
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{label}: {value:.2f}")

Accuracy: 0.77
Precision: 0.70
Recall: 0.40
F1-score: 0.51


In [109]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest on the sparse TF-IDF features; the seed fixes the forest's randomness.
rf = RandomForestClassifier(random_state=42).fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

rf_report = classification_report(y_test, y_pred_rf)
print("🌲 Random Forest:\n", rf_report)

🌲 Random Forest:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       126
           1       0.69      0.68      0.69        53

    accuracy                           0.82       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.81      0.82      0.82       179



In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train a logistic regression model (iteration cap raised to 1000).
lr_model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict on the held-out reviews.
y_pred_lr = lr_model.predict(X_test_vec)

# Classification report.
lr_report = classification_report(y_test, y_pred_lr)
print("📈 Regresión Logística:\n", lr_report)

📈 Regresión Logística:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88       126
           1       0.79      0.49      0.60        53

    accuracy                           0.81       179
   macro avg       0.80      0.72      0.74       179
weighted avg       0.81      0.81      0.79       179



In [111]:
# Scalar metrics for the baseline MultinomialNB predictions, printed unrounded.
print("🔍 MultinomialNB Metrics")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_mnb))

🔍 MultinomialNB Metrics
Accuracy: 0.7988826815642458
Precision: 0.9047619047619048
Recall: 0.3584905660377358
F1 Score: 0.5135135135135135


Optimization

In [112]:
# Re-create and refit the Random Forest with an explicit tree count.
# NOTE(review): the saved metrics for this model match the earlier Random Forest
# cell exactly — confirm whether a different configuration was intended here.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train_vec, y_train)


In [113]:
# Predict polarity for the held-out reviews with the refitted forest.
y_pred_rf = rf.predict(X=X_test_vec)

In [114]:
# Full per-class report for the Random Forest predictions.
print("🌲 Random Forest Results")
print(classification_report(y_test, y_pred_rf))

# Additional metrics, printed unrounded.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_rf))

🌲 Random Forest Results
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       126
           1       0.69      0.68      0.69        53

    accuracy                           0.82       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.81      0.82      0.82       179

Accuracy: 0.8156424581005587
Precision: 0.6923076923076923
Recall: 0.6792452830188679
F1 Score: 0.6857142857142857


In [117]:
from sklearn.model_selection import RandomizedSearchCV

# Build the search in this cell, before fitting it, so the cell runs on a fresh
# kernel (Restart & Run All) instead of relying on a `random_search` object left
# over from an earlier session.

# Candidate values: 200 evenly spaced alphas in [0.01, 10] and both fit_prior options.
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False],
}

# Base estimator handed to the search.
# NOTE(review): assumes MultinomialNB is the model being tuned (it is the one
# instantiated with alpha/fit_prior further down) — confirm.
model = MultinomialNB()

# 50 random candidates, scored by accuracy with 5-fold cross-validation;
# the seed makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(
    model,
    hyperparams,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search.fit(X_train_vec, y_train)

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': np.float64(2.3192462311557787)}


In [119]:
# Candidate values: 200 evenly spaced alphas in [0.01, 10] and both fit_prior options.
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False],
}

# We initialize the random search.
# The base estimator is created here rather than read from a `model` variable,
# which is only assigned in a later cell and would be undefined on a fresh kernel.
# NOTE(review): assumes MultinomialNB is the intended estimator (it is the model
# tuned in the following cell) — confirm.
# This cell only builds the search object; it does not call fit.
random_search = RandomizedSearchCV(
    MultinomialNB(),
    hyperparams,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search

In [124]:
# Final MultinomialNB with hand-entered hyperparameters.
# NOTE(review): this alpha differs from the best alpha printed in the saved
# random-search output (about 2.319) — confirm which search run it came from.
model = MultinomialNB(alpha=1.917638190954774, fit_prior=False)

# Fit once; the original cell called fit twice with identical arguments.
model.fit(X_train_vec, y_train)

# Overwrites the baseline MultinomialNB predictions stored under the same name.
y_pred_mnb = model.predict(X_test_vec)
accuracy_score(y_test, y_pred_mnb)

0.8324022346368715

In [125]:

# Per-class report for the predictions currently stored in y_pred_mnb
# (at this point, those of the tuned `model`).
tuned_mnb_report = classification_report(y_test, y_pred_mnb)
print("♡ MultinomialNB:\n", tuned_mnb_report)

♡ MultinomialNB:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       126
           1       0.78      0.60      0.68        53

    accuracy                           0.83       179
   macro avg       0.81      0.77      0.78       179
weighted avg       0.83      0.83      0.83       179

