### **Naive Bayes: GaussianNB, MultinomialNB, BernoulliNB**

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from utils import get_classifier_metrics

#### **Carga de datos**

In [3]:
df = pd.read_csv('/workspaces/GuilloMansa-MachineLearning/data/raw/playstore_reviews.csv', sep=',')
df.head(3)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0


In [4]:
df.drop(['package_name'],axis=1,inplace=True)

In [5]:
df.columns

Index(['review', 'polarity'], dtype='object')

In [6]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [7]:
df["review"] = df["review"].str.strip().str.lower()

#### **Split**

In [8]:
# Dividimos el conjunto de datos en muestras de train y test
X = df.drop(["polarity"], axis=1)
y = df["polarity"]

# Instancia de: from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=18)

In [9]:
len(X_train), len(y_train)

(712, 712)

In [10]:
X_train

Unnamed: 0,review
464,my texts aren't delivering. i've checked my in...
871,classic game ruined by ads. uninstalled this o...
506,brilliant! the best application i've found so ...
147,"suggestion. i given 5 stars to this game, bec..."
488,excellent my entire team would be lost without...
...,...
702,"easy to play and always running, jumping and d..."
837,love this app all in one nepalese app ...
120,not happy with this app not one of my life or ...
275,download stops the iteam iam downloading stop ...


In [11]:
y.unique()

array([0, 1])

In [12]:
len(X_train)

712

In [13]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train["review"]).toarray()
X_test = vec_model.transform(X_test["review"]).toarray()

In [14]:
len(X_train)

712

#### **Multinomial**

In [15]:
# Modelo y entrenamiento
mbmodel = MultinomialNB()
mbmodel.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [16]:
# Predicción
y_pred_train_mb = mbmodel.predict(X_train)

y_pred_test_mb = mbmodel.predict(X_test)
y_pred_test_mb

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0])

In [17]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test_mb),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train_mb)}

metrics

{'Accuracy Test ': 0.8156424581005587, 'Accuracy Train: ': 0.9550561797752809}

In [18]:
mb_met = get_classifier_metrics(y_pred_test_mb, y_test, y_pred_train_mb, y_train, average='weighted')
mb_met

Unnamed: 0,Accuracy,F1 Score,Precision,Recall
Train set,0.955056,0.954964,0.954939,0.955056
Test set,0.815642,0.813142,0.813341,0.815642


#### **Gaussian**

In [19]:
model = GaussianNB()
model.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [20]:
# Predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0])

In [21]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train)}

metrics

{'Accuracy Test ': 0.7877094972067039, 'Accuracy Train: ': 0.9845505617977528}

#### **Bernoulli**

In [22]:
model = BernoulliNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [23]:
# Predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [24]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train)}

metrics

{'Accuracy Test ': 0.7150837988826816, 'Accuracy Train: ': 0.9073033707865169}

#### Observaciones: luego de realizar los tres entrenamientos, para este tipo de dataset nos quedamos con el MultinomialNB que nos da mayor precisión.

#### **Hiperparametros**

In [25]:
hyperparams = {"alpha": np.linspace(0.01, 10.0, 200),
               "fit_prior": [True, False]}

# We initialize the random search
random_search = RandomizedSearchCV(mbmodel, hyperparams,
                                   n_iter=50,
                                   scoring="accuracy",
                                   cv=5,
                                   random_state=18)
random_search

0,1,2
,estimator,MultinomialNB()
,param_distributions,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,18

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [26]:
random_search.fit(X_train, y_train)

random_search.best_params_

{'fit_prior': True, 'alpha': np.float64(0.1606030150753769)}

#### **Best estimator**

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

param_grid_multi = {'alpha': [0.1, 15,200],'fit_prior': [True, False]}

grid_search_multi = GridSearchCV(MultinomialNB(),
    param_grid_multi,
    cv=5,
    n_jobs=-1,
    verbose=1)

grid_search_multi.fit(X_train, y_train)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


0,1,2
,estimator,MultinomialNB()
,param_grid,"{'alpha': [0.1, 15, ...], 'fit_prior': [True, False]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,False
,class_prior,


- Random Forest Classifier

In [28]:
rfc = RandomForestClassifier(random_state=21, class_weight='balanced')

param_grid = {'n_estimators': list(range(70, 90, 5)), 
              'criterion':['gini','entropy'],
              'max_depth' : [5, 7],
              'min_samples_leaf': [3, 4, 5]}

grid = GridSearchCV(rfc,
                    param_grid,
                    cv=5,
                    scoring='f1_macro') # Nos interesa las dos clases por igual, tanto reseñas negativas como positivas

In [29]:
# Entrenamos el grid con los hiperparametros
grid.fit(X_train, y_train)
# Devolvemos los mejores parametros despues de entrenarlo
grid.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'min_samples_leaf': 4,
 'n_estimators': 80}

In [30]:
# Modelos con los mejores parametros
rfc_grid = grid.best_estimator_

In [31]:
# Repetimos el entrenamiento pero ahora con el grid que tiene los hiperparametros establecidos
rfc_grid.fit(X_train, y_train)
# Predicciones
y_pred_test_grid = rfc_grid.predict(X_test)
y_pred_train_grid = rfc_grid.predict(X_train)

In [32]:
rfc_metrics = get_classifier_metrics(y_pred_test_grid, y_test, y_pred_train_grid, y_train, average='weighted')
rfc_metrics

Unnamed: 0,Accuracy,F1 Score,Precision,Recall
Train set,0.851124,0.853216,0.859292,0.851124
Test set,0.798883,0.802081,0.81503,0.798883


In [33]:
report_grid_rfc= classification_report(y_test, y_pred_test_grid)
report_grid_rfc

'              precision    recall  f1-score   support\n\n           0       0.89      0.78      0.83       114\n           1       0.68      0.83      0.75        65\n\n    accuracy                           0.80       179\n   macro avg       0.79      0.81      0.79       179\nweighted avg       0.82      0.80      0.80       179\n'

In [34]:
data = {'Métrica': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
        'MultinomialNB': [0.815, 0.813, 0.813, 0.815],
        'Random Forest': [0.798, 0.802, 0.815, 0.798]}

comparacion = pd.DataFrame(data)
comparacion

Unnamed: 0,Métrica,MultinomialNB,Random Forest
0,Accuracy,0.815,0.798
1,F1 Score,0.813,0.802
2,Precision,0.813,0.815
3,Recall,0.815,0.798


In [35]:
with open('/workspaces/GuilloMansa-MachineLearning/models/07-Naive-Bayes-playstore-reviews.pkl', 'wb') as file:
    pickle.dump(mbmodel, file)