# Explore here

In [1]:
import pandas as pd

# Location of the raw Play Store reviews CSV used throughout this notebook.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

# Read the remote file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)

df.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [2]:
# Structural overview of the raw frame: dimensions, dtypes/null counts and the
# class balance of the target column.
# A bare `df.shape` in the middle of a cell is evaluated and then discarded
# (only the last expression of a cell is echoed), so it is printed explicitly.
print("Shape:", df.shape)
df.info()
df["polarity"].value_counts()

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   package_name  891 non-null    str  
 1   review        891 non-null    str  
 2   polarity      891 non-null    int64
dtypes: int64(1), str(2)
memory usage: 21.0 KB


polarity
0    584
1    307
Name: count, dtype: int64

In [3]:
# The package name is not used as a feature; only the review text and its
# polarity label are kept.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed would otherwise raise a KeyError.
df = df.drop(columns=["package_name"], errors="ignore")

In [4]:
# Normalise the review text: force string type, trim surrounding whitespace
# and lower-case everything.
df["review"] = (
    df["review"]
    .astype(str)
    .str.strip()
    .str.lower()
)

In [5]:
# Keep only rows whose review still contains at least one character, and take
# a copy so the result is an independent frame rather than a view.
has_text = df["review"].str.len() > 0
df = df.loc[has_text].copy()

In [6]:
# Feature (review text) and target (polarity label) used by the models below.
X, y = df["review"], df["polarity"]

In [7]:
# Sanity check after cleaning: frame dimensions and class balance, shown
# together as a single tuple.
(
    df.shape,
    df["polarity"].value_counts(),
)

((891, 2),
 polarity
 0    584
 1    307
 Name: count, dtype: int64)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# 1) Train/test split.
# stratify=y keeps the class proportions of `y` in both partitions; the target
# is imbalanced, and the later cells of this notebook split the same way.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2) Vectorisation: bag-of-words counts with English stop words removed.
vec_model = CountVectorizer(stop_words="english")

# The vocabulary is learned on the training text only and then applied to the
# test text. The matrices get their own names so that X_train / X_test keep
# holding the raw text instead of being rebound to a different kind of object.
X_train_vec = vec_model.fit_transform(X_train)
X_test_vec = vec_model.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1) Split, keeping the class proportions in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 2) Vectorisation (token counts); vocabulary is fitted on the training text only
vec = CountVectorizer(stop_words="english")
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Candidate Naive Bayes variants, evaluated in this order.
models = {
    # Count-based variant
    "MultinomialNB": MultinomialNB(),
    # Would suit binary features better, but it is tried anyway
    "BernoulliNB": BernoulliNB(),
    # Not ideal for counts, but it is tried anyway
    "GaussianNB": GaussianNB(),
}

for name, clf in models.items():
    # GaussianNB requires dense input; the other two take the sparse matrices as-is
    needs_dense = name == "GaussianNB"
    Xtr = X_train_vec.toarray() if needs_dense else X_train_vec
    Xte = X_test_vec.toarray() if needs_dense else X_test_vec

    # Fit on the training partition, then score on the held-out partition
    clf.fit(Xtr, y_train)
    y_pred = clf.predict(Xte)

    print(f"\n===== {name} =====")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
    print("Reporte:\n", classification_report(y_test, y_pred))


===== MultinomialNB =====
Accuracy: 0.8547486033519553
Matriz de confusión:
 [[112   5]
 [ 21  41]]
Reporte:
               precision    recall  f1-score   support

           0       0.84      0.96      0.90       117
           1       0.89      0.66      0.76        62

    accuracy                           0.85       179
   macro avg       0.87      0.81      0.83       179
weighted avg       0.86      0.85      0.85       179


===== BernoulliNB =====
Accuracy: 0.7821229050279329
Matriz de confusión:
 [[113   4]
 [ 35  27]]
Reporte:
               precision    recall  f1-score   support

           0       0.76      0.97      0.85       117
           1       0.87      0.44      0.58        62

    accuracy                           0.78       179
   macro avg       0.82      0.70      0.72       179
weighted avg       0.80      0.78      0.76       179


===== GaussianNB =====
Accuracy: 0.8156424581005587
Matriz de confusión:
 [[104  13]
 [ 20  42]]
Reporte:
               prec

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score

# 1) Stratified split (same parameters as the model-comparison cell)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 2) Vectorisation: token counts, vocabulary fitted on the training text only
vec = CountVectorizer(stop_words="english")
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 3) Values to try: smoothing strengths, plus the probability cut-off above
#    which a review is labelled as class 1
alphas = [0.01, 0.05, 0.1, 0.3, 0.5, 1.0, 2.0, 5.0]
threshold = 0.3

# NOTE(review): every alpha is scored on the test partition, so the winning
# alpha (and the fixed threshold) are selected on the same data used to report
# performance -- consider a validation split or cross-validation; confirm.

# One (alpha, accuracy, precision_1, recall_1, f1_1) tuple per alpha
results = []

for a in alphas:
    model = MultinomialNB(alpha=a)
    model.fit(X_train_vec, y_train)

    # Probability of class 1, turned into a hard label with the chosen threshold
    y_proba = model.predict_proba(X_test_vec)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    row = (
        a,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, pos_label=1),
        recall_score(y_test, y_pred, pos_label=1),
        f1_score(y_test, y_pred, pos_label=1),
    )
    results.append(row)

# 4) Ranking, best class-1 F1 first (stable sort, so ties keep the alpha order)
results_sorted = list(results)
results_sorted.sort(key=lambda row: row[4], reverse=True)

print("alpha | accuracy | precision_1 | recall_1 | f1_1")
for r in results_sorted:
    print(f"{r[0]:>5} | {r[1]:>8.3f} | {r[2]:>11.3f} | {r[3]:>8.3f} | {r[4]:>5.3f}")

alpha | accuracy | precision_1 | recall_1 | f1_1
  0.5 |    0.883 |       0.847 |    0.806 | 0.826
  0.3 |    0.877 |       0.833 |    0.806 | 0.820
  1.0 |    0.877 |       0.845 |    0.790 | 0.817
  2.0 |    0.872 |       0.882 |    0.726 | 0.796
  0.1 |    0.860 |       0.814 |    0.774 | 0.793
 0.05 |    0.849 |       0.807 |    0.742 | 0.773
 0.01 |    0.844 |       0.827 |    0.694 | 0.754
  5.0 |    0.821 |       0.875 |    0.565 | 0.686


In [12]:
import pickle

# Train the final model with the alpha that ranked first in the alpha search
final_model = MultinomialNB(alpha=0.5)
final_model.fit(X_train_vec, y_train)

# NOTE(review): the alpha ranking was computed with a 0.3 cut-off on
# predict_proba. That threshold is not part of the pickled estimator, whose
# plain .predict() does not apply it -- whoever loads the model must apply the
# same cut-off to reproduce those metrics.

# Save the model. The `with` block guarantees the file handle is flushed and
# closed even if pickling fails; a bare open() passed to pickle.dump is never
# closed explicitly.
with open("naive_bayes_alpha_0_5.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)

# Save the fitted vectorizer: the model can only score text transformed with
# the same vocabulary it was trained on.
with open("count_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vec, vectorizer_file)