# Explore here

In [197]:
# IMPORTS
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# PASO 1: PLANTEAMIENTO DEL PROBLEMA

- OBJETIVO: Crear un clasificador de reseñas de la tienda de Google Play.

In [198]:
# CARGAR EL DATASET

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

total_data

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


# PASO 2: EXPLORACÓN Y LIMPIEZA DE DATOS

In [199]:
total_data.shape

(891, 3)

In [200]:
# OBTENER INFORMACIÓN SOBRE LOS TIPOS DE DATOS Y VALORES NO NULOS
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [201]:
total_data.describe()

Unnamed: 0,polarity
count,891.0
mean,0.344557
std,0.47549
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [202]:
print(total_data.isnull().sum())

package_name    0
review          0
polarity        0
dtype: int64


In [203]:
total_data.duplicated().sum()

np.int64(0)

ELIMINAR INFORMACIÓN IRRELEVANTE

In [204]:
total_data.drop(["package_name"], axis = 1, inplace = True)
total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [205]:
# Eliminar espacios y convertir a minúsculas el texto:
total_data["review"] = total_data["review"].str.strip().str.lower()

In [206]:
# Dividir el dataset en train y test: 
X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 80)

X_train.head()

795    trust worthy as always! have been a long time ...
94     facebook ripoff used to see my connections new...
759    ads? really? ads in the kantipur app? you guys...
834    well done nicely designed .....this app had co...
113    groups??? ** edit: i changed my rating from on...
Name: review, dtype: object

In [207]:
# Transformar el texto en una matriz de recuento de palabras
vec_model = CountVectorizer(stop_words = "english")

X_train = vec_model.fit_transform(X_train).toarray()

X_test = vec_model.transform(X_test).toarray()

# PASO 3: Construye un Naive Bayes

In [208]:
# Inicialización y entrenamiento del modelo con GaussianNB

model_Gaussian = GaussianNB()
model_Gaussian.fit(X_train, y_train)

In [209]:
# Predicción del modelo con GaussianNB

y_pred = model_Gaussian.predict(X_test)
y_pred

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1])

In [None]:
# Probar el Accuracy del modelo GaussianNB

accuracy_score(y_test, y_pred)

0.776536312849162

In [211]:
# Inicialización y entrenamiento del modelo con MultinomialNB

model_multinomial = MultinomialNB()
model_multinomial.fit(X_train, y_train)

In [212]:
# Predicción del modelo

y_pred = model_multinomial.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1])

In [213]:
# Probar el Accuracy del modelo

accuracy_score(y_test, y_pred)

0.8603351955307262

In [217]:
# Inicialización y entrenamiento del modelo con BernoulliNB

model_Bernoulli = BernoulliNB()
model_Bernoulli.fit(X_train, y_train)

In [218]:
# Predicción del modelo con BernoulliNB

y_pred = model_Bernoulli.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0])

In [219]:
# Probar el Accuracy del modelo BernoulliNB

accuracy_score(y_test, y_pred)

0.7932960893854749

## Despues de probar con las tres implementaciones del Naive Bayes, procedemos a optimizar el modelo con la implementación que dió mejores resultados, es decir, MultinomialNB con un 0.86 de accuracy.

# Optimización del modelo

In [214]:
from sklearn.model_selection import GridSearchCV

hyperparams = {
    "alpha": np.linspace(0.01, 3.0, 200),
    "force_alpha": [True, False],
    "fit_prior": [True, False]    
}

model_multinomial = MultinomialNB()
grid = GridSearchCV(model_multinomial, hyperparams, scoring = "accuracy", cv = None)
grid

In [215]:
grid.fit(X_train, y_train)

In [216]:
model_multinomial = MultinomialNB(force_alpha = True, alpha = 2.939899497487437, fit_prior = False)
model_multinomial.fit(X_train, y_train)
model_multinomial.fit(X_train, y_train)
y_pred = model_multinomial.predict(X_test)
accuracy_score(y_test, y_pred)

0.8715083798882681

In [221]:
from pickle import dump

ruta = "..\models/naive_bayes_force_alpha_True_alpha_2-93989949748_fit_prior_False_80.sav"
dump(model_multinomial, open(ruta, "wb"))

  ruta = "..\models/naive_bayes_force_alpha_True_alpha_2-93989949748_fit_prior_False_80.sav"
