# Explore here

In [12]:
# Your code here
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv')
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [4]:
df.drop('package_name', axis=1, inplace=True)
df


Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [5]:
# Limpiar el texto eliminando espacios en blanco y convirtiendo a minúsculas
df['review'] = df['review'].str.strip().str.lower()
df


Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [6]:


# Dividir los datos en variables predictoras y variable objetivo
X = df['review']
y = df['polarity']  # Suponiendo que la columna objetivo se llama 'target'

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()


331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [7]:


# Crear el vectorizador con stop words en inglés
vec_model = CountVectorizer(stop_words='english')

# Entrenar el vectorizador con el conjunto de entrenamiento y transformar los textos
X_train_vec = vec_model.fit_transform(X_train).toarray()

# Transformar el conjunto de prueba utilizando el mismo vectorizador entrenado
X_test_vec = vec_model.transform(X_test).toarray()

print(X_train_vec)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# Crear el modelo MultinomialNB 
modelo_MultinomialNB = MultinomialNB()

# Entrenar el modelo MultinomialNB
modelo_MultinomialNB.fit(X_train_vec, y_train)

# Predecir en el conjunto de entrenamiento y prueba
y_pred_train_modelo_MultinomialNB = modelo_MultinomialNB.predict(X_train_vec)
y_pred_test_modelo_MultinomialNB = modelo_MultinomialNB.predict(X_test_vec)

# Evaluar el modelo MultinomialNB
accuracy_test_modelo_MultinomialNB = accuracy_score(y_test, y_pred_test_modelo_MultinomialNB)
accuracy_train_modelo_MultinomialNB = accuracy_score(y_train, y_pred_train_modelo_MultinomialNB)
print("Exactitud del modelo MultinomialNB en test:", accuracy_test_modelo_MultinomialNB)
print("Exactitud del modelo MultinomialNB en train:", accuracy_train_modelo_MultinomialNB)
print("Reporte de Clasificación del modelos MultinomialNB:\n", classification_report(y_test, y_pred_test_modelo_MultinomialNB))

Exactitud del modelo MultinomialNB en test: 0.8156424581005587
Exactitud del modelo MultinomialNB en train: 0.9606741573033708
Reporte de Clasificación del modelos MultinomialNB:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



In [9]:
# Crear el modelo Random Forest
modelo_randomforest = RandomForestClassifier(random_state=42)

# Definir el grid de hiperparámetros para la búsqueda
parametros_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configurar la búsqueda de hiperparámetros
grid_search = GridSearchCV(estimator=modelo_randomforest, param_grid=parametros_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')


In [10]:
# Entrenar el modelo con la búsqueda de hiperparámetros
grid_search.fit(X_train_vec, y_train)



Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END m

In [11]:
# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_
print("LOS MEJORES HIPERPARAMETROS CON GRIDSEARCHCV SON :", best_params)

# Usar el mejor modelo encontrado
mejor_modelo_randomforest = grid_search.best_estimator_

# Predecir en el conjunto de prueba
y_pred_test_randomforest = mejor_modelo_randomforest.predict(X_test_vec)
y_pred_train_randomforest = mejor_modelo_randomforest.predict(X_train_vec)

# Evaluar el modelo Random Forest
accuracy_test_randomforest = accuracy_score(y_test, y_pred_test_randomforest)
accuracy_train_randomforest = accuracy_score(y_train, y_pred_train_randomforest)
print("Exactitud del test de Random Forest:", accuracy_test_randomforest)
print("Exactitud del train de Random Forest:", accuracy_train_randomforest)
print("Reporte de Clasificación de Random Forest:\n", classification_report(y_test, y_pred_test_randomforest))


LOS MEJORES HIPERPARAMETROS CON GRIDSEARCHCV SON : {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Exactitud del test de Random Forest: 0.8100558659217877
Exactitud del train de Random Forest: 1.0
Reporte de Clasificación de Random Forest:
               precision    recall  f1-score   support

           0       0.90      0.83      0.86       126
           1       0.65      0.77      0.71        53

    accuracy                           0.81       179
   macro avg       0.77      0.80      0.78       179
weighted avg       0.82      0.81      0.81       179



In [13]:
# Crear y entrenar el modelo Gradient Boosting
modelo_gradientboosting = GradientBoostingClassifier(random_state=42)
modelo_gradientboosting.fit(X_train_vec, y_train)

# Predecir y evaluar el modelo Gradient Boosting
y_pred_test_gradientboosting = modelo_gradientboosting.predict(X_test_vec)
y_pred_train_gradientboosting = modelo_gradientboosting.predict(X_train_vec)
accuracy_test_gradientboosting = accuracy_score(y_test, y_pred_test_gradientboosting)
accuracy_train_gradientboosting = accuracy_score(y_train, y_pred_train_gradientboosting)
print("Exactitud del test de Gradient Boosting:", accuracy_test_gradientboosting)
print("Exactitud del train de Gradient Boosting:", accuracy_train_gradientboosting)
print("Reporte de Clasificación de Gradient Boosting:\n", classification_report(y_test, y_pred_test_gradientboosting))


Exactitud del test de Gradient Boosting: 0.7318435754189944
Exactitud del train de Gradient Boosting: 0.9058988764044944
Reporte de Clasificación de Gradient Boosting:
               precision    recall  f1-score   support

           0       0.80      0.83      0.81       126
           1       0.55      0.49      0.52        53

    accuracy                           0.73       179
   macro avg       0.67      0.66      0.67       179
weighted avg       0.72      0.73      0.73       179



In [1]:
import os
import joblib

# Definir la ruta donde se guardará el modelo
ruta_modelo = '/workspaces/Proyecto-Naive-Bayes-Jorge3127/models/modelo_multinominal_nb.pkl'


# Verificar si el directorio existe, si no, crearlo
directorio = os.path.dirname(ruta_modelo)
if not os.path.exists(directorio):
    os.makedirs(directorio)

# Guardar el modelo MultinomialNB
joblib.dump(modelo_MultinomialNB, ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")


NameError: name 'modelo_MultinomialNB' is not defined

DESPUES DE EXPLORAR CON GRADIENT BOOSTING Y RANDOM FOREST EL MODELO QUE MEJOR SE COMPORTA ES EL MULTINOMINALNB, POR LO TANTO, GUARDO ESE MODELO EN LA CARPETA DE DATA/PROCESSED PARA PROXIMOS USOS.