<a href="https://colab.research.google.com/github/4GeeksAcademy/Naive_Bayes/blob/main/Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import pandas as pd

data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [46]:
# Eliminar la columna package_name
data.drop(columns=['package_name'], inplace=True)

# Aplicar strip() y lower() a la columna review
data['review'] = data['review'].str.strip().str.lower()

# Dividir el conjunto de datos en train y test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['review'], data['polarity'], test_size=0.2, random_state=42)


In [47]:
data.head(10)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
5,idk i can't edit my posts? things such as my p...,0
6,major flaws constant updates and always gettin...,0
7,video issues since i was forced into this upda...,0
8,this update completely destroyed my facebook. ...,0
9,"posting issues for the last week, there's been...",0


In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Supongo que ya has cargado el dataset en el DataFrame 'data'

# Dividir el conjunto de datos en train y test
X = data["review"]
y = data["polarity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizar las reseñas de las aplicaciones
vec_model = CountVectorizer(stop_words="english")
X_train_vec = vec_model.fit_transform(X_train)
X_test_vec = vec_model.transform(X_test)

# Inicializar los modelos
gaussian_nb = GaussianNB()
multinomial_nb = MultinomialNB()
bernoulli_nb = BernoulliNB()

# Entrenar los modelos
gaussian_nb.fit(X_train_vec.toarray(), y_train)
multinomial_nb.fit(X_train_vec, y_train)
bernoulli_nb.fit(X_train_vec, y_train)

# Hacer predicciones
gaussian_pred = gaussian_nb.predict(X_test_vec.toarray())
multinomial_pred = multinomial_nb.predict(X_test_vec)
bernoulli_pred = bernoulli_nb.predict(X_test_vec)

# Evaluar los modelos con más decimales
print("GaussianNB:")
print(classification_report(y_test, gaussian_pred, digits=4))
print("MultinomialNB:")
print(classification_report(y_test, multinomial_pred, digits=4))
print("BernoulliNB:")
print(classification_report(y_test, bernoulli_pred, digits=4))


GaussianNB:
              precision    recall  f1-score   support

           0     0.8473    0.8810    0.8638       126
           1     0.6875    0.6226    0.6535        53

    accuracy                         0.8045       179
   macro avg     0.7674    0.7518    0.7586       179
weighted avg     0.8000    0.8045    0.8015       179

MultinomialNB:
              precision    recall  f1-score   support

           0     0.8444    0.9048    0.8736       126
           1     0.7273    0.6038    0.6598        53

    accuracy                         0.8156       179
   macro avg     0.7859    0.7543    0.7667       179
weighted avg     0.8098    0.8156    0.8103       179

BernoulliNB:
              precision    recall  f1-score   support

           0     0.7852    0.9286    0.8509       126
           1     0.7000    0.3962    0.5060        53

    accuracy                         0.7709       179
   macro avg     0.7426    0.6624    0.6785       179
weighted avg     0.7600    0.7709 

El mejor modelo es el Multinomial

Optimizamos con Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

# Inicializar y entrenar el modelo RandomForest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vec, y_train)

# Hacer predicciones
rf_pred = rf_model.predict(X_test_vec)

# Evaluar el modelo
print("RandomForestClassifier:")
print(classification_report(y_test, rf_pred, digits = 4))


RandomForestClassifier:
              precision    recall  f1-score   support

           0     0.8814    0.8254    0.8525       126
           1     0.6393    0.7358    0.6842        53

    accuracy                         0.7989       179
   macro avg     0.7604    0.7806    0.7683       179
weighted avg     0.8097    0.7989    0.8026       179



Sigue siendo mejor el modelo Multinomial, así que debemos intentarlo ahora con hiperparámetros, en éste caso Alpha y fit_prior.

In [56]:

import numpy as np
from sklearn.model_selection import RandomizedSearchCV

model = multinomial_nb

hyperparams = {
    "alpha": np.linspace(0.01, 20.0, 300),
    "fit_prior": [True, False]
}


random_search = RandomizedSearchCV(model, hyperparams, n_iter = 50, scoring = "accuracy", cv = 5, random_state = 42)
random_search

In [57]:
random_search.fit(X_train_vec, y_train)

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': 1.8151170568561872}


In [58]:
model = MultinomialNB(alpha = 1.8151170568561872, fit_prior = False)
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, multinomial_pred, digits = 4))


              precision    recall  f1-score   support

           0     0.8444    0.9048    0.8736       126
           1     0.7273    0.6038    0.6598        53

    accuracy                         0.8156       179
   macro avg     0.7859    0.7543    0.7667       179
weighted avg     0.8098    0.8156    0.8103       179



Sale igual la exactitud

In [62]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Supresión de advertencias
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn

In [63]:
# Modelo de Árbol de Decisión
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_vec, y_train)

# Predicciones
dt_pred = dt_model.predict(X_test_vec)

# Evaluación
print("Decision Tree Classifier:")
print(classification_report(y_test, dt_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))

Decision Tree Classifier:
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       126
           1       0.52      0.60      0.56        53

    accuracy                           0.72       179
   macro avg       0.67      0.68      0.67       179
weighted avg       0.73      0.72      0.72       179

Confusion Matrix:
 [[96 30]
 [21 32]]


In [64]:
# Modelo de Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_vec, y_train)

# Predicciones
gb_pred = gb_model.predict(X_test_vec)

# Evaluación
print("Gradient Boosting Classifier:")
print(classification_report(y_test, gb_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, gb_pred))

Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       126
           1       0.55      0.49      0.52        53

    accuracy                           0.73       179
   macro avg       0.67      0.66      0.67       179
weighted avg       0.72      0.73      0.73       179

Confusion Matrix:
 [[105  21]
 [ 27  26]]


In [66]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Modelo de XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train_vec, y_train)

# Mejor estimador
best_xgb = grid_search.best_estimator_

# Predicciones
xgb_pred = best_xgb.predict(X_test_vec)

# Evaluación
print("XGBoost Classifier:")
print(classification_report(y_test, xgb_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_pred))
print(f"Best hyperparameters: {grid_search.best_params_}")

XGBoost Classifier:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       126
           1       0.63      0.64      0.64        53

    accuracy                           0.78       179
   macro avg       0.74      0.74      0.74       179
weighted avg       0.78      0.78      0.78       179

Confusion Matrix:
 [[106  20]
 [ 19  34]]
Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 1.0}


In [67]:
from pickle import dump

dump(model, open("naive_bayes.sav", "wb"))
dump(model, open("naive_bayes.csv", "wb"))