# ALGORITMO DE NAIVE BAYES

In [44]:
# Importaciones necesarias
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import joblib
import pickle

## Paso 1: Carga del conjunto de datos

In [9]:
# Load the raw Play Store reviews CSV (path is relative to the notebook's folder).
df = pd.read_csv('../data/raw/playstore_reviews.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


## Paso 2: Estudio de variables y su contenido

In [10]:
# Drop the package_name column: only the review text and the polarity label
# are used as feature and target further down.
df = df.drop(columns=['package_name'])
df.shape

(891, 2)

In [11]:
# Normalise the review text: trim surrounding whitespace, then lowercase it.
df = df.assign(review=df['review'].str.strip().str.lower())
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [13]:
# Separate the feature column (review text) from the label column (polarity),
# then hold out 20% of the rows for testing with a fixed seed.
X, y = df["review"], df["polarity"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X.shape

(891,)

In [15]:
# Turn the review text into a dense word-count matrix, ignoring English stop
# words. The vocabulary is fitted on the training split only and then reused
# to transform the test split.
# NOTE: X_train / X_test are rebound from raw text to count matrices here, so
# re-run the split cell before re-running this one.
vec_model = CountVectorizer(stop_words="english")
X_train, X_test = (
    vec_model.fit_transform(X_train).toarray(),
    vec_model.transform(X_test).toarray(),
)

In [20]:
# Show the vocabulary terms learned by the fitted vectorizer.
vec_model.get_feature_names_out()

array(['000', '04', '0x', ..., 'žŕ', 'žŕľ', 'ˇŕ'],
      shape=(3310,), dtype=object)

## Paso 3: Construye un Naive Bayes

> Observación: De las 3 implementaciones del modelo se elige la multinomial, que es la que está pensada para conteos de palabras; la implementación gaussiana está pensada para datos numéricos continuos y la de Bernoulli para datos binarios.

### Modelo Naive Bayes Multinomial

In [24]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
model_multi = MultinomialNB().fit(X_train, y_train)
model_multi

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [27]:
# Predict the polarity of the held-out reviews with the multinomial model.
y_pred_m = model_multi.predict(X_test)
y_pred_m

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [26]:
# Evaluate the multinomial model: per-class precision, recall and f1 on the test split.
print(classification_report(y_test, y_pred_m))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



### Modelo Naive Bayes Gaussiano

In [32]:
# Fit a Gaussian Naive Bayes classifier on the same training word counts.
model_gauss = GaussianNB().fit(X_train, y_train)
model_gauss

0,1,2
,priors,
,var_smoothing,1e-09


In [34]:
# Predict the polarity of the held-out reviews with the Gaussian model.
y_pred_g = model_gauss.predict(X_test)
y_pred_g

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [35]:
# Evaluate the Gaussian model: per-class precision, recall and f1 on the test split.
print(classification_report(y_test, y_pred_g))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



### Modelo Naive Bayes BernoulliNB

In [36]:
# Fit a Bernoulli Naive Bayes classifier on the same training word counts.
model_bern = BernoulliNB().fit(X_train, y_train)
model_bern

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [37]:
# Predict the polarity of the held-out reviews with the Bernoulli model.
y_pred_b = model_bern.predict(X_test)
y_pred_b

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [38]:
# Evaluate the Bernoulli model: per-class precision, recall and f1 on the test split.
print(classification_report(y_test, y_pred_b))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



> Observación: Como era de esperarse, la implementación multinomial es la que ha obtenido las mejores métricas; será entonces la escogida para optimizar en el siguiente paso.

## Paso 4: Optimiza el modelo anterior

In [None]:
# "Optimisation" step: train a RandomForestClassifier (100 trees, fixed seed)
# on the same vectorised dataset and predict the test split.
# NOTE(review): this fits a different classifier rather than tuning the
# MultinomialNB chosen in the previous step — confirm that is the intent.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [41]:
# Evaluate the random forest: per-class precision, recall and f1 on the test split.
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



### Comparacion de resultados:
|Clase| Modelo   | precision | recall   | f1-score |
|-----|----------|-----------|----------|----------|
|  0  | NB Multi | 0.84      | 0.90     | 0.87     |
|  0  |     RF   | 0.88      | 0.83     | 0.85     |
|  1  | NB Multi | 0.73      | 0.60     | 0.66     |
|  1  |     RF   | 0.64      | 0.74     | 0.68     |

Ambos modelos tienen fortalezas distintas:
- MultinomialNB es más fuerte detectando reseñas negativas (clase 0): recall de 0.90 frente a 0.83.
- Random Forest mejora el recall en reseñas positivas (clase 1), 0.74 frente a 0.60, a cambio de una precisión más baja (0.64 frente a 0.73).


## Paso 5: Guarda el modelo

In [None]:
# Persist the fitted CountVectorizer alongside the classifier: the model was
# trained on this vectorizer's count matrix, so raw text cannot be scored
# later without the same fitted vocabulary.
joblib.dump(vec_model, '../models/Naive_bayes_vectorizer.pkl')
# Save the multinomial model last so the cell still displays its output path.
joblib.dump(model_multi, '../models/Naive_bayes_multi.pkl')

['../models/Naive_bayes_multi.pkl']

## Paso 6: Explora otras alternativas

In [45]:
# Logistic regression is another classifier worth trying on the same count
# features; max_iter is set to 1000 for the solver.
model_rl = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model_rl

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [46]:
# Predict the polarity of the held-out reviews with the logistic regression model.
y_pred_rl = model_rl.predict(X_test)
y_pred_rl

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0])

In [49]:
# Print the three classification reports one after another, each under its
# own header, so the models can be compared on the same test split.
print(
    "=== Modelo: Regresión Logística ===",
    classification_report(y_test, y_pred_rl),
    "=== Modelo: Random Forest ===",
    classification_report(y_test, y_pred_rf),
    "=== Modelo: Naive Bayes Multinomial ===",
    classification_report(y_test, y_pred_m),
    sep="\n",
)


=== Modelo: Regresión Logística ===
              precision    recall  f1-score   support

           0       0.91      0.84      0.88       126
           1       0.68      0.81      0.74        53

    accuracy                           0.83       179
   macro avg       0.80      0.83      0.81       179
weighted avg       0.85      0.83      0.84       179

=== Modelo: Random Forest ===
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179

=== Modelo: Naive Bayes Multinomial ===
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.7

Observaciones por metrica:
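- Precisión: la Regresión Logística obtiene la precisión más alta en las reseñas negativas (0.91); en las positivas la mejor precisión es la de Naive Bayes Multinomial (0.73).
- Recall: la Regresión Logística es la más equilibrada (0.84 en negativas y 0.81 en positivas); Naive Bayes Multinomial tiene el mejor recall en negativas (0.90) pero el peor en positivas (0.60).
- f1-score: la Regresión Logística obtiene el mejor f1 en ambas clases (0.88 y 0.74), por encima de los otros dos modelos.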

Basados en el recall, que es una buena métrica para este tipo de casos, se concluye que para detectar reseñas positivas (clase 1) el modelo de Regresión Logística es el ideal (0.81), mientras que para reseñas negativas (clase 0) el mejor es Naive Bayes Multinomial (0.90).

