In [1]:
from pathlib import Path

import joblib
import pandas as pd
from scipy.stats import uniform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# 1.1 Importamos el dataframe

In [2]:
# Read the raw Play Store reviews CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/playstore_reviews.csv')
df.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [3]:
# Report the dataset dimensions as (rows, columns).
print(f"({df.shape[0]}, {df.shape[1]})")

(891, 3)


In [4]:
# Drop the app identifier column; the cells below only read `review` and `polarity`.
# errors='ignore' makes this cell idempotent: re-running it after the column has
# already been removed no longer raises a KeyError.
df = df.drop(columns=['package_name'], errors='ignore')
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [5]:
# Normalize the review text: trim surrounding whitespace, then convert to lowercase.
df = df.assign(review=lambda frame: frame["review"].str.strip().str.lower())

# 2.1 División Train/Test

In [6]:
# Feature (review text) and target (polarity label) used for the train/test split.
review, polarity = df["review"], df["polarity"]

In [7]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
# NOTE(review): no `stratify` argument is passed, so the class proportions are not
# guaranteed to match between the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    review,
    polarity,
    test_size=0.2,
    random_state=42,
)

# 3.1 Vectorización

In [8]:
# Learn the bag-of-words vocabulary from the training reviews only, then encode
# both splits with that same fitted vocabulary.
vectorizer = CountVectorizer().fit(X_train)

X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4.1 Modelo Multinomial

In [9]:
# Baseline 1: Multinomial Naive Bayes trained on the vectorized training reviews.
bayes_multinomial = MultinomialNB()
bayes_multinomial.fit(X_train_vec, y_train)

# Predict on the held-out reviews and print per-class precision/recall/F1.
y_pred_multinomial = bayes_multinomial.predict(X_test_vec)
print(classification_report(y_test, y_pred_multinomial))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       126
           1       0.84      0.58      0.69        53

    accuracy                           0.84       179
   macro avg       0.84      0.77      0.79       179
weighted avg       0.84      0.84      0.83       179



# 4.2 Modelo Gaussian

In [10]:
# Baseline 2: Gaussian Naive Bayes. The sparse count matrices are converted to
# dense arrays before being handed to the estimator.
bayes_gaussian = GaussianNB()
bayes_gaussian.fit(X_train_vec.toarray(), y_train)

# Predict on the held-out reviews and print per-class precision/recall/F1.
y_pred_gaussian = bayes_gaussian.predict(X_test_vec.toarray())
print(classification_report(y_test, y_pred_gaussian))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       126
           1       0.70      0.60      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



# 4.3 Modelo Bernoulli

In [11]:
# Baseline 3: Bernoulli Naive Bayes trained on the vectorized training reviews.
bayes_bernoulli = BernoulliNB()
bayes_bernoulli.fit(X_train_vec, y_train)

# Predict on the held-out reviews and print per-class precision/recall/F1.
y_pred_bernoulli = bayes_bernoulli.predict(X_test_vec)
print(classification_report(y_test, y_pred_bernoulli))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       126
           1       0.81      0.64      0.72        53

    accuracy                           0.85       179
   macro avg       0.84      0.79      0.81       179
weighted avg       0.85      0.85      0.84       179



# 4.4 Comparativa modelos

In [12]:
# Score each baseline model's test-set predictions with accuracy and
# weighted-average precision, recall and F1 (all rounded to 4 decimals).
modelos = ['MultinomialNB', 'GaussianNB', 'BernoulliNB']
predicciones = [y_pred_multinomial, y_pred_gaussian, y_pred_bernoulli]

resultados = [
    {
        'Modelo': nombre,
        'Accuracy': round(accuracy_score(y_test, y_hat), 4),
        'Precision': round(precision_score(y_test, y_hat, average='weighted'), 4),
        'Recall': round(recall_score(y_test, y_hat, average='weighted'), 4),
        'F1-Score': round(f1_score(y_test, y_hat, average='weighted'), 4),
    }
    for nombre, y_hat in zip(modelos, predicciones)
]

# One row per model, one column per metric.
comparacion = pd.DataFrame(resultados)
print(comparacion)

# Pick the row with the highest accuracy (idxmax returns the first maximum).
# NOTE(review): the winner is chosen using test-set accuracy, so the test set
# is also acting as a model-selection set here.
fila_ganadora = comparacion['Accuracy'].idxmax()
mejor_modelo = comparacion.loc[fila_ganadora]
print(f"\nüèÜ Mejor modelo: {mejor_modelo['Modelo']} con Accuracy: {mejor_modelo['Accuracy']}")

          Modelo  Accuracy  Precision  Recall  F1-Score
0  MultinomialNB    0.8436     0.8429  0.8436    0.8343
1     GaussianNB    0.8045     0.7987  0.8045    0.8002
2    BernoulliNB    0.8492     0.8460  0.8492    0.8436

üèÜ Mejor modelo: BernoulliNB con Accuracy: 0.8492


# 5.1 Hiperparámetros

# 5.1.2 BÚSQUEDA EXTENSIVA CON GRIDSEARCHCV

In [13]:
# Hyperparameter grid explored for BernoulliNB.
# NOTE(review): the features come from a default CountVectorizer (presumably
# integer counts), in which case binarize=0.0 and binarize=0.5 would threshold
# them identically — confirm before keeping both values.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 1.5, 2.0],  # additive smoothing strength
    fit_prior=[True, False],          # whether class priors are learned from the data
    binarize=[0.0, 0.5, 1.0],         # threshold used to binarize the features
)

# Exhaustive search over the grid: 5-fold cross-validation scored by accuracy,
# running on every available processor.
grid_search = GridSearchCV(
    BernoulliNB(),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the vectorized training data.
print("Buscando mejores hiperpar√°metros para BernoulliNB...")
grid_search.fit(X_train_vec, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(f"\nüéØ Mejores hiperpar√°metros: {grid_search.best_params_}")
print(f"üìä Mejor score (accuracy): {grid_search.best_score_:.4f}")

# Evaluate the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_optimized = best_model.predict(X_test_vec)

reporte_optimizado = classification_report(y_test, y_pred_optimized)
print("\nüìà Classification Report del modelo optimizado:")
print(reporte_optimizado)

Buscando mejores hiperpar√°metros para BernoulliNB...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

üéØ Mejores hiperpar√°metros: {'alpha': 1.0, 'binarize': 0.0, 'fit_prior': False}
üìä Mejor score (accuracy): 0.8132

üìà Classification Report del modelo optimizado:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       126
           1       0.80      0.68      0.73        53

    accuracy                           0.85       179
   macro avg       0.84      0.80      0.82       179
weighted avg       0.85      0.85      0.85       179



# 5.1.3 Búsqueda intensiva con RandomizedSearchCV

In [14]:
# Sampling distributions for the BernoulliNB hyperparameters.
# scipy's uniform(loc, scale) is uniform on [loc, loc + scale].
param_distributions = dict(
    alpha=uniform(loc=0.01, scale=2.0),    # uniform on [0.01, 2.01]
    fit_prior=[True, False],
    binarize=uniform(loc=0.0, scale=1.5),  # uniform on [0.0, 1.5]
)

# Randomized search: 50 sampled candidates, 5-fold cross-validation scored by
# accuracy; the fixed random_state makes the sampled candidates reproducible.
random_search = RandomizedSearchCV(
    BernoulliNB(),
    param_distributions,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search on the vectorized training data.
print("Buscando mejores hiperpar√°metros con RandomizedSearchCV...")
random_search.fit(X_train_vec, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(f"\nüéØ Mejores hiperpar√°metros: {random_search.best_params_}")
print(f"üìä Mejor score (accuracy): {random_search.best_score_:.4f}")

# Evaluate the best estimator found by the search on the held-out test set.
best_model_random = random_search.best_estimator_
y_pred_random = best_model_random.predict(X_test_vec)

reporte_random = classification_report(y_test, y_pred_random)
print("\nüìà Classification Report del modelo optimizado:")
print(reporte_random)

Buscando mejores hiperpar√°metros con RandomizedSearchCV...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

üéØ Mejores hiperpar√°metros: {'alpha': np.float64(0.787354579378964), 'binarize': np.float64(0.40702354766084387), 'fit_prior': True}
üìä Mejor score (accuracy): 0.8258

üìà Classification Report del modelo optimizado:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       126
           1       0.79      0.70      0.74        53

    accuracy                           0.85       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.85      0.85      0.85       179



# 6.1 Comparación de modelos

In [15]:
# Compare the untuned BernoulliNB against both tuned variants on the test set,
# using accuracy and weighted-average precision, recall and F1 (4 decimals).
modelos_comparacion = ['BernoulliNB Base', 'BernoulliNB + GridSearch', 'BernoulliNB + RandomizedSearch']
predicciones_comparacion = [y_pred_bernoulli, y_pred_optimized, y_pred_random]

resultados_comparacion = [
    {
        'Modelo': etiqueta,
        'Accuracy': round(accuracy_score(y_test, y_hat), 4),
        'Precision': round(precision_score(y_test, y_hat, average='weighted'), 4),
        'Recall': round(recall_score(y_test, y_hat, average='weighted'), 4),
        'F1-Score': round(f1_score(y_test, y_hat, average='weighted'), 4),
    }
    for etiqueta, y_hat in zip(modelos_comparacion, predicciones_comparacion)
]

# One row per configuration, one column per metric.
comparacion_final = pd.DataFrame(resultados_comparacion)
print("üìä Comparaci√≥n de modelos BernoulliNB:")
print(comparacion_final)

# Pick the configuration with the highest accuracy.
# NOTE(review): idxmax returns the first row holding the maximum, so equal
# accuracies are resolved by list order rather than by a second metric.
fila_ganadora_final = comparacion_final['Accuracy'].idxmax()
mejor_modelo_final = comparacion_final.loc[fila_ganadora_final]
print(
    f"\nüèÜ Mejor configuraci√≥n: {mejor_modelo_final['Modelo']}",
    f"   Accuracy: {mejor_modelo_final['Accuracy']}",
    f"   F1-Score: {mejor_modelo_final['F1-Score']}",
    sep="\n",
)

# Accuracy gain of each tuned variant over the base model. The values are
# differences of the rounded accuracies multiplied by 100, i.e. percentage points.
accuracy_base, accuracy_grid, accuracy_random = comparacion_final['Accuracy'].iloc[:3]
mejora_grid = (accuracy_grid - accuracy_base) * 100
mejora_random = (accuracy_random - accuracy_base) * 100

print(f"\nüìà Mejora con GridSearch: {mejora_grid:+.2f}%")
print(f"üìà Mejora con RandomizedSearch: {mejora_random:+.2f}%")

üìä Comparaci√≥n de modelos BernoulliNB:
                           Modelo  Accuracy  Precision  Recall  F1-Score
0                BernoulliNB Base    0.8492     0.8460  0.8492    0.8436
1        BernoulliNB + GridSearch    0.8547     0.8515  0.8547    0.8511
2  BernoulliNB + RandomizedSearch    0.8547     0.8517  0.8547    0.8521

üèÜ Mejor configuraci√≥n: BernoulliNB + GridSearch
   Accuracy: 0.8547
   F1-Score: 0.8511

üìà Mejora con GridSearch: +0.55%
üìà Mejora con RandomizedSearch: +0.55%


# 7.1 Guardamos el modelo con las mejores métricas

In [16]:
# Persist the GridSearch-tuned BernoulliNB model together with the fitted
# vectorizer, so new reviews can be encoded with the same vocabulary at inference time.
models_dir = Path('../models')

# Create the output directory when it is missing (e.g. on a fresh clone);
# otherwise joblib.dump fails with FileNotFoundError.
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, models_dir / 'bernoulli_gridsearch.pkl')
joblib.dump(vectorizer, models_dir / 'vectorizer.pkl')

print("‚úÖ Modelo guardado: bernoulli_gridsearch.pkl")
print("‚úÖ Vectorizador guardado: vectorizer.pkl")

‚úÖ Modelo guardado: bernoulli_gridsearch.pkl
‚úÖ Vectorizador guardado: vectorizer.pkl
