# Explore here

In [24]:

import pandas as pd

# Load the raw Play Store reviews dataset
df = pd.read_csv('../data/raw/playstore_reviews.csv')

# Dataset dimensions as (rows, columns)
print(df.shape)

# Column names, non-null counts and dtypes.
# DataFrame.info() prints its summary and returns None, so the result is
# intentionally not stored in a variable.
df.info()

# Summary statistics for the numeric columns
print(df.describe())


(891, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
         polarity
count  891.000000
mean     0.344557
std      0.475490
min      0.000000
25%      0.000000
50%      0.000000
75%      1.000000
max      1.000000


In [25]:
# 1. Count rows that are exact duplicates of an earlier row
duplicados = df.duplicated().sum()
print(f"Número de filas duplicadas: {duplicados}")

# Rebuild the frame only when there is at least one duplicate to drop
if duplicados:
    df = df.drop_duplicates()
    print(f"Duplicados eliminados. El nuevo tamaño del dataset es: {df.shape}")

# 2. Missing values per column, as absolute counts
valores_nulos = df.isna().sum()
print("\nValores nulos por columna:", valores_nulos, sep="\n")

# The same counts expressed as a percentage of the number of rows
porcentaje_nulos = valores_nulos.div(len(df)).mul(100)
print("\nPorcentaje de valores nulos por columna:", porcentaje_nulos, sep="\n")


Número de filas duplicadas: 0

Valores nulos por columna:
package_name    0
review          0
polarity        0
dtype: int64

Porcentaje de valores nulos por columna:
package_name    0.0
review          0.0
polarity        0.0
dtype: float64


In [26]:
# Drop the 'package_name' column.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns='package_name', errors='ignore')

# Show the first rows to check the result
print(df.head())

# Normalise the review text: trim surrounding whitespace and lower-case it
df["review"] = df["review"].str.strip().str.lower()

# Show the first rows again to check the normalised text
print(df.head())

                                              review  polarity
0   privacy at least put some option appear offli...         0
1   messenger issues ever since the last update, ...         0
2   profile any time my wife or anybody has more ...         0
3   the new features suck for those of us who don...         0
4   forced reload on uploading pic on replying co...         0
                                              review  polarity
0  privacy at least put some option appear offlin...         0
1  messenger issues ever since the last update, i...         0
2  profile any time my wife or anybody has more t...         0
3  the new features suck for those of us who don'...         0
4  forced reload on uploading pic on replying com...         0


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

# Feature column (review text) and target column (polarity label)
X = df['review']
y = df['polarity']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sample counts before the vectoriser is applied
print("Tamaño antes del vectorizador:")
print("X_train:", len(X_train))
print("y_train:", len(y_train))

# Turn text into token counts, ignoring English stop words. The vocabulary is
# learned from the training text only and then reused for the test text.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Replace the text splits with dense arrays of the count matrices
X_train, X_test = train_counts.toarray(), test_counts.toarray()

# Shapes after the vectoriser is applied
print("Tamaño después del vectorizador:")
print("X_train:", X_train.shape)
print("y_train:", len(y_train))



Tamaño antes del vectorizador:
X_train: 623
y_train: 623
Tamaño después del vectorizador:
X_train: (623, 3090)
y_train: 623


In [36]:
# Train a Gaussian Naive Bayes classifier on the training features and labels
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [56]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score


# Predict on the held-out split: hard labels, plus the second predict_proba
# column, which is used as the positive-class probability
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Each row is (heading, metric function, predictions it scores); every metric
# is computed immediately before it is printed
evaluation_plan = (
    ("Accuracy:", accuracy_score, y_pred),
    ("Confusion Matrix:\n", confusion_matrix, y_pred),
    ("Classification Report:\n", classification_report, y_pred),
    ("AUC-ROC Score:", roc_auc_score, y_proba),
)
for heading, metric_fn, predictions in evaluation_plan:
    print(heading, metric_fn(y_test, predictions))

Accuracy: 0.7985074626865671
Confusion Matrix:
 [[161  27]
 [ 27  53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       188
           1       0.66      0.66      0.66        80

    accuracy                           0.80       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.80      0.80      0.80       268

AUC-ROC Score: 0.7594414893617022


Optimización de hiperparámetros mediante GridSearchCV y RandomizedSearchCV

In [43]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

# Hyperparameter search space: 100 var_smoothing values, log-spaced from 1 to 1e-9
param_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the best parameters and the accuracy on the held-out test split
print(
    "Mejores hiperparámetros encontrados con GridSearchCV:",
    grid_search.best_params_,
    sep="\n",
)
grid_test_accuracy = accuracy_score(y_test, grid_search.predict(X_test))
print("Precisión en el conjunto de prueba:", grid_test_accuracy)



Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ....................var_smoothing=0.657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ....................var_smoothing=0.6

In [66]:
# Randomised search: 20 candidates sampled from param_grid, 5-fold CV, fixed seed
gaussian_search_settings = dict(n_iter=20, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_gaussian = RandomizedSearchCV(model, param_grid, **gaussian_search_settings)
random_search_gaussian.fit(X_train, y_train)

# Report the best parameters and the accuracy on the held-out test split
print(
    "\nMejores hiperparámetros encontrados con RandomizedSearchCV:",
    random_search_gaussian.best_params_,
    sep="\n",
)
gaussian_test_accuracy = accuracy_score(y_test, random_search_gaussian.predict(X_test))
print("Precisión en el conjunto de prueba:", gaussian_test_accuracy)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   0.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   0.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   0.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   0.0s
[CV] END ...............var_smoothing=4.3287612810830526e-07; total time=   0.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   0.0s
[CV] END ...............var_smoothing=4.3287612810830526e-07; total time=   0.0s
[CV] END ...............var_smoothing=1.5199110

Vamos a observar qué pasa si entreno con las otras variantes de Naive Bayes (MultinomialNB y BernoulliNB).

In [53]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
# Train a multinomial Naive Bayes classifier on the training features and labels
model_multinomial = MultinomialNB()
model_multinomial.fit(X=X_train, y=y_train)

# Predict on the held-out split: hard labels, plus the second predict_proba
# column, which is used as the positive-class probability
y_pred = model_multinomial.predict(X_test)
y_proba = model_multinomial.predict_proba(X_test)[:, 1]

# Each row is (heading, metric function, predictions it scores); every metric
# is computed immediately before it is printed
evaluation_plan = (
    ("Accuracy:", accuracy_score, y_pred),
    ("Confusion Matrix:\n", confusion_matrix, y_pred),
    ("Classification Report:\n", classification_report, y_pred),
    ("AUC-ROC Score:", roc_auc_score, y_proba),
)
for heading, metric_fn, predictions in evaluation_plan:
    print(heading, metric_fn(y_test, predictions))

Accuracy: 0.8134328358208955
Confusion Matrix:
 [[167  21]
 [ 29  51]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       188
           1       0.71      0.64      0.67        80

    accuracy                           0.81       268
   macro avg       0.78      0.76      0.77       268
weighted avg       0.81      0.81      0.81       268

AUC-ROC Score: 0.8488696808510638


In [60]:

# Train a Bernoulli Naive Bayes classifier on the training features and labels
model_bernoulli = BernoulliNB()
model_bernoulli.fit(X=X_train, y=y_train)

# Predict on the held-out split: hard labels, plus the second predict_proba
# column, which is used as the positive-class probability
y_pred = model_bernoulli.predict(X_test)
y_proba = model_bernoulli.predict_proba(X_test)[:, 1]

# Each row is (heading, metric function, predictions it scores); every metric
# is computed immediately before it is printed
evaluation_plan = (
    ("Accuracy:", accuracy_score, y_pred),
    ("Confusion Matrix:\n", confusion_matrix, y_pred),
    ("Classification Report:\n", classification_report, y_pred),
    ("AUC-ROC Score:", roc_auc_score, y_proba),
)
for heading, metric_fn, predictions in evaluation_plan:
    print(heading, metric_fn(y_test, predictions))
      
      

Accuracy: 0.7686567164179104
Confusion Matrix:
 [[175  13]
 [ 49  31]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.93      0.85       188
           1       0.70      0.39      0.50        80

    accuracy                           0.77       268
   macro avg       0.74      0.66      0.67       268
weighted avg       0.76      0.77      0.75       268

AUC-ROC Score: 0.843218085106383


Optimización de MultinomialNB y BernoulliNB


In [62]:
# Search space for MultinomialNB: 10 alpha values, log-spaced from 1e-3 to 10
param_grid_multinomial = dict(alpha=np.logspace(-3, 1, 10))

# Randomised search: 10 candidates, 5-fold CV, fixed seed
multinomial_search_settings = dict(n_iter=10, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_multinomial = RandomizedSearchCV(
    model_multinomial,
    param_grid_multinomial,
    **multinomial_search_settings,
)
random_search_multinomial.fit(X_train, y_train)

# Report the best parameters and the accuracy on the held-out test split
print(
    "\nMejores hiperparámetros para MultinomialNB con RandomizedSearchCV:",
    random_search_multinomial.best_params_,
    sep="\n",
)
multinomial_test_accuracy = accuracy_score(y_test, random_search_multinomial.predict(X_test))
print("Precisión en el conjunto de prueba (MultinomialNB):", multinomial_test_accuracy)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................alpha=0.0027825594022071257; total time=   0.0s
[CV] END ........................alpha=0.0027825594022071257; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................alpha=0.0027825594022071257; total time=   0.0s
[CV] END ........................alpha=0.0027825594022071257; total time=   0.0s
[CV] END ........................alpha=0.0027825594022071257; total time=   0.0s
[CV] END .........................alpha=0.007742636826811269; total time=   0.0s
[CV] END .........................alpha=0.007742

In [64]:
# Search space for BernoulliNB: 10 log-spaced alpha values (1e-3 to 10) and
# 10 evenly spaced binarize thresholds between 0.0 and 1.0
param_grid_bernoulli = dict(
    alpha=np.logspace(-3, 1, 10),
    binarize=np.linspace(0.0, 1.0, 10),
)

# Randomised search: 10 candidates, 5-fold CV, fixed seed
bernoulli_search_settings = dict(n_iter=10, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_bernoulli = RandomizedSearchCV(
    model_bernoulli,
    param_grid_bernoulli,
    **bernoulli_search_settings,
)
random_search_bernoulli.fit(X_train, y_train)

# Report the best parameters and the accuracy on the held-out test split
print(
    "\nMejores hiperparámetros para BernoulliNB con RandomizedSearchCV:",
    random_search_bernoulli.best_params_,
    sep="\n",
)
bernoulli_test_accuracy = accuracy_score(y_test, random_search_bernoulli.predict(X_test))
print("Precisión en el conjunto de prueba (BernoulliNB):", bernoulli_test_accuracy)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END alpha=3.593813663804626, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=3.593813663804626, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=3.593813663804626, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=3.593813663804626, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=3.593813663804626, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=0.1668100537200059, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=0.1668100537200059, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=0.1668100537200059, binarize=0.3333333333333333; total time=   0.0s
[CV] END alpha=0.1668100537200059, binarize=0.3333333333333333; total time=   0.0s
[CV] END .............alpha=1.2915496650148828, binarize=0.0; total time=   0.1s
[CV] END alpha=0.1668100537200059, binarize=0.3333333333333333; total time=   0.0s
[CV] END .............alpha=1.291

In [68]:
### Final evaluation of the three randomised searches ###
# Print a detailed classification report for each one on the held-out test split

tuned_searches = (
    ("\nInforme de clasificación para Gaussian (RandomSearchCV):", random_search_gaussian),
    ("\nInforme de clasificación para MultinomialNB (RandomSearchCV):", random_search_multinomial),
    ("\nInforme de clasificación para BernoulliNB (RandomSearchCV):", random_search_bernoulli),
)
for report_heading, fitted_search in tuned_searches:
    print(report_heading)
    print(classification_report(y_test, fitted_search.predict(X_test)))


Informe de clasificación para Gaussian (RandomSearchCV):
              precision    recall  f1-score   support

           0       0.87      0.80      0.84       188
           1       0.61      0.72      0.66        80

    accuracy                           0.78       268
   macro avg       0.74      0.76      0.75       268
weighted avg       0.79      0.78      0.78       268


Informe de clasificación para MultinomialNB (RandomSearchCV):
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       188
           1       0.70      0.59      0.64        80

    accuracy                           0.80       268
   macro avg       0.77      0.74      0.75       268
weighted avg       0.80      0.80      0.80       268


Informe de clasificación para BernoulliNB (RandomSearchCV):
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       188
           1       0.70      0.70      0.70        80



In [69]:
from pickle import dump

# Persist the three fitted classifiers with pickle.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open for the life of the kernel.
# NOTE(review): these are the base estimators, not the RandomizedSearchCV
# results — confirm that the untuned models are the ones meant to be saved.
models_to_save = (
    (model, "naive_bayes_gaussian.sav"),
    (model_multinomial, "naive_bayes_multinomial.sav"),
    (model_bernoulli, "naive_bayes_bernoulli.sav"),
)
for fitted_model, output_path in models_to_save:
    with open(output_path, "wb") as model_file:
        dump(fitted_model, model_file)
