# Explore here

In [306]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [307]:
# Download the Play Store reviews dataset; the rendered table below shows
# package_name, review and polarity columns.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [308]:
# Keep only the review text and its label; the package name is not used as a feature.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns="package_name", errors="ignore")

# Normalise the text: trim surrounding whitespace and lowercase everything.
df["review"] = df["review"].str.strip().str.lower()

df.head()


Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [309]:
# Features are the raw review strings; the target is the polarity label.
X, y = df["review"], df["polarity"]

# Hold out 20% of the reviews for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [310]:
# Count word occurrences in the texts with CountVectorizer, ignoring English stop words.
# The vocabulary is fitted on the training reviews only and reused for the test reviews.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the count matrices to dense arrays for the classifiers below.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()

X_train_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [311]:
# Baseline: multinomial Naive Bayes with default hyperparameters.
# fit() returns the estimator itself, so `model` is bound to the fitted classifier.
model = MultinomialNB().fit(X_train_vec, y_train)
model

In [312]:
# Predict the polarity label of every held-out review with the baseline model.
y_pred = model.predict(X_test_vec)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [313]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8156424581005587

In [314]:
# Create and train the multinomial Naive Bayes classifier.
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Predict on the test set.
y_pred = clf.predict(X_test_vec)

# Evaluate model performance (per-class precision, recall and F1).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



In [315]:
# Create and train the Bernoulli Naive Bayes classifier.
clf = BernoulliNB()
clf.fit(X_train_vec, y_train)

# Predict on the test set.
y_pred = clf.predict(X_test_vec)

# Evaluate model performance (per-class precision, recall and F1).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



In [316]:
# Create and train the Gaussian Naive Bayes classifier.
clf = GaussianNB()
clf.fit(X_train_vec, y_train)

# Predict on the test set.
y_pred = clf.predict(X_test_vec)

# Evaluate model performance (per-class precision, recall and F1).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



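EL CONJUNTO DE DATOS ESTÁ DESBALANCEADO (126 RESEÑAS DE CLASE 0 FRENTE A 53 DE CLASE 1 EN EL CONJUNTO DE PRUEBA), POR LO QUE NO ES RECOMENDABLE USAR 'accuracy' COMO MÉTRICA PRINCIPAL.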

SE TOMA 'recall' COMO SCORING POR LO ANTES MENCIONADO Y UN n_iter DE 300: MIENTRAS MÁS COMBINACIONES SE PRUEBEN, MEJORES SERÁN LOS RESULTADOS.

In [329]:
# Search space: smoothing strength (200 evenly spaced values) and whether to
# learn class priors from the data.
hyperparams = dict(
    alpha=np.linspace(0.01, 10.0, 200),
    fit_prior=[True, False],
)

# Set up the randomised search: 300 sampled configurations, 5-fold
# cross-validation, ranked by recall, with a fixed seed for reproducibility.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=hyperparams,
    n_iter=300,
    scoring="recall",
    cv=5,
    random_state=42,
)
random_search

In [330]:
# Run the randomised hyperparameter search on the training data.
random_search.fit(X_train_vec, y_train)

# Report the hyperparameter combination selected by the search.
print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': 0.3614070351758794}


SE INICIALIZA EL MODELO CON LOS MEJORES HIPERPARÁMETROS ENCONTRADOS EN LA BÚSQUEDA ANTERIOR.


In [331]:
# Refit MultinomialNB with the hyperparameters selected by the randomised search.
# Reading them from `random_search.best_params_` instead of pasting the printed
# values keeps this cell in sync if the search space, seed or data change.
clf = MultinomialNB(**random_search.best_params_)
clf.fit(X_train_vec, y_train)

# Evaluate the tuned model on the held-out test set.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       126
           1       0.69      0.68      0.69        53

    accuracy                           0.82       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.81      0.82      0.82       179



- Precisión: Indica la proporción de identificaciones positivas que fueron realmente correctas. Para la clase 0 (reseñas negativas), la precisión es del 87%, lo que significa que de todas las reseñas que el modelo predijo como negativas, el 87% de esas predicciones fueron correctas. Para la clase 1 (reseñas positivas), la precisión es del 69%.
- Recall (Exhaustividad): Mide la proporción de positivos reales que fueron identificados correctamente. Para la clase 0, el recall es del 87%, lo que indica que el modelo identificó correctamente el 87% de todas las reseñas negativas reales. Para la clase 1, el recall es del 68%.
- F1-Score: Es el promedio armónico de la precisión y el recall. Ofrece un equilibrio entre estas dos métricas. Un F1-score de 0.87 para la clase 0 y 0.69 para la clase 1 muestra un rendimiento generalmente bueno para la clase 0 y moderado para la clase 1.

In [332]:
# Train BernoulliNB with the hyperparameters selected by the randomised search.
# Reading them from `random_search.best_params_` avoids a pasted magic number.
# NOTE(review): the search was run on `model` (MultinomialNB), so these values
# were not tuned for BernoulliNB itself; they are reused here for comparison.
clf = BernoulliNB(**random_search.best_params_)
clf.fit(X_train_vec, y_train)

# Evaluate on the held-out test set.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



EN CONCLUSIÓN, EL MODELO PARECE SER MÁS EFECTIVO Y CONFIABLE AL IDENTIFICAR RESEÑAS NEGATIVAS QUE RESEÑAS POSITIVAS.

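TRAS LA OPTIMIZACIÓN DE HIPERPARÁMETROS NO SE PRESENTAN CAMBIOS SIGNIFICATIVOS EN LA EXACTITUD GLOBAL DE MultinomialNB (0.82 ANTES Y DESPUÉS), AUNQUE EL RECALL DE LA CLASE 1 MEJORA DE 0.60 A 0.68.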

In [333]:
from pickle import dump

# Persist the tuned classifier selected by the randomised search rather than
# the untuned baseline held in `model` (the search works on clones of the
# estimator it is given, so `model` keeps its default hyperparameters).
# The context manager guarantees the file handle is flushed and closed.
# NOTE(review): the fitted CountVectorizer (`vec_model`) is also needed to
# transform new text before predicting; consider saving it alongside.
with open("modelo_otto", "wb") as model_file:
    dump(random_search.best_estimator_, model_file)