# Explore here

In [1]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
df.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [2]:
df.drop(columns=['package_name'], inplace=True)

df['review'] = df['review'].str.strip().str.lower()

df.head()


Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [3]:
from sklearn.model_selection import train_test_split

X = df["review"]
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Tamaño de entrenamiento:", len(X_train))
print("Tamaño de prueba:", len(X_test))


Tamaño de entrenamiento: 712
Tamaño de prueba: 179


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words="english")

X_train_vec = vec_model.fit_transform(X_train).toarray()

X_test_vec = vec_model.transform(X_test).toarray()

print("Forma de X_train:", X_train_vec.shape)
print("Forma de X_test:", X_test_vec.shape)

Forma de X_train: (712, 3310)
Forma de X_test: (179, 3310)


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

y_pred = nb_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8156424581005587

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



In [6]:
from sklearn.naive_bayes import BernoulliNB

bnb_model = BernoulliNB()
bnb_model.fit(X_train_vec, y_train)
y_pred_bnb = bnb_model.predict(X_test_vec)

print("BernoulliNB - Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_bnb))


BernoulliNB - Accuracy: 0.770949720670391

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



In [7]:
from sklearn.naive_bayes import GaussianNB

# Entrenar GaussianNB
gnb_model = GaussianNB()
gnb_model.fit(X_train_vec, y_train)
y_pred_gnb = gnb_model.predict(X_test_vec)


print("GaussianNB - Accuracy:", accuracy_score(y_test, y_pred_gnb))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_gnb))


GaussianNB - Accuracy: 0.8044692737430168

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



Elegimos: MultinomialNB como nuestro modelo base.
Ahora, tal como indica el Paso 4 DEL EJERCICIO vamos a intentar optimizar con RANDOM FOREST

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vec, y_train)

y_pred_rf = rf_model.predict(X_test_vec)

print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_rf))


Random Forest - Accuracy: 0.7988826815642458

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



Random Forest - Accuracy: 0.7988826815642458

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179

Conclusión:

Nos quedamos con MultinomialNB como modelo principal.

Random Forest fue competitivo, pero no lo superó.

Explorar otras alternativas al modelo Naive Bayes.

In [9]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vec, y_train)

y_pred_lr = lr_model.predict(X_test_vec)

print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_lr))


Logistic Regression - Accuracy: 0.8324022346368715

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.91      0.84      0.88       126
           1       0.68      0.81      0.74        53

    accuracy                           0.83       179
   macro avg       0.80      0.83      0.81       179
weighted avg       0.85      0.83      0.84       179



In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Entrenar modelo XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_vec, y_train)

y_pred_xgb = xgb_model.predict(X_test_vec)

print("XGBoost - Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Accuracy: 0.8100558659217877

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       126
           1       0.68      0.68      0.68        53

    accuracy                           0.81       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.81      0.81      0.81       179



Comparativa Final de Modelos

Modelo	        Accuracy	Comentarios clave
MultinomialNB	81.6%	    Base robusto para texto

LogisticRegression	83.2%	Mejor en accuracy y recall para positivos.

Random Forest	    79.9%	Aceptable, pero con menor rendimiento.

GaussianNB	        80.4%	Funciona, pero no es ideal para texto.

BernoulliNB	        77.1%	El peor desempeño.

XGBoost	            81.0%	Muy parejo, pero no supera a regresión logística.

Conclusión y Recomendaciones

Modelo final recomendado: LogisticRegression

Mejor equilibrio entre precisión, recall y accuracy.