### **Naive Bayes: GaussianNB, MultinomialNB, BernoulliNB**

In [265]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

#### **Carga de datos**

In [247]:
df = pd.read_csv('/workspaces/GuilloMansa-MachineLearning/data/raw/playstore_reviews.csv', sep=',')
df.head(3)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0


In [248]:
df.drop(['package_name'],axis=1,inplace=True)

In [249]:
df.columns

Index(['review', 'polarity'], dtype='object')

In [250]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [251]:
df["review"] = df["review"].str.strip().str.lower()

#### **Split**

In [252]:
# Dividimos el conjunto de datos en muestras de train y test
X = df.drop(["polarity"], axis=1)
y = df["polarity"]

# Instancia de: from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=18)

In [253]:
len(X_train), len(y_train)

(712, 712)

In [254]:
X_train

Unnamed: 0,review
464,my texts aren't delivering. i've checked my in...
871,classic game ruined by ads. uninstalled this o...
506,brilliant! the best application i've found so ...
147,"suggestion. i given 5 stars to this game, bec..."
488,excellent my entire team would be lost without...
...,...
702,"easy to play and always running, jumping and d..."
837,love this app all in one nepalese app ...
120,not happy with this app not one of my life or ...
275,download stops the iteam iam downloading stop ...


In [255]:
y.unique()

array([0, 1])

In [256]:
len(X_train)

712

In [257]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train["review"]).toarray()
X_test = vec_model.transform(X_test["review"]).toarray()

In [258]:
len(X_train)

712

#### **Multinomial**

In [259]:
# Modelo y entrenamiento
model = MultinomialNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [260]:
# Predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0])

In [261]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train)}

metrics

{'Accuracy Test ': 0.8156424581005587, 'Accuracy Train: ': 0.9550561797752809}

#### **Gaussian**

In [262]:
model = GaussianNB()
model.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [263]:
# Predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0])

In [264]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train)}

metrics

{'Accuracy Test ': 0.7877094972067039, 'Accuracy Train: ': 0.9845505617977528}

#### **Bernoulli**

In [266]:
model = BernoulliNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [267]:
# Predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [268]:
# Métricas
metrics = {"Accuracy Test ": accuracy_score(y_test, y_pred_test),
           "Accuracy Train: ": accuracy_score(y_train, y_pred_train)}

metrics

{'Accuracy Test ': 0.7150837988826816, 'Accuracy Train: ': 0.9073033707865169}

#### Observaciones: luego de realizar los tres entrenamientos, para este tipo de dataset nos quedamos con el MultinomialNB que nos da mayor precisión.

#### **Hiperparametros**

In [275]:
hyperparams = {"alpha": np.linspace(0.01, 10.0, 200),
               "fit_prior": [True, False]}

# We initialize the random search
random_search = RandomizedSearchCV(model, hyperparams,
                                   n_iter=50,
                                   scoring="accuracy",
                                   cv=5,
                                   random_state=18)
random_search

0,1,2
,estimator,BernoulliNB()
,param_distributions,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,18

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [276]:
random_search.fit(X_train, y_train)

random_search.best_params_

{'fit_prior': False, 'alpha': np.float64(0.01)}

#### **Best estimator**

In [284]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

param_grid_multi = {'alpha': [0.1, 15,200],'fit_prior': [True, False]}

grid_search_multi = GridSearchCV(MultinomialNB(),
    param_grid_multi,
    cv=5,
    n_jobs=-1,
    verbose=1)

grid_search_multi.fit(X_train, y_train)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


0,1,2
,estimator,MultinomialNB()
,param_grid,"{'alpha': [0.1, 15, ...], 'fit_prior': [True, False]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,False
,class_prior,


In [285]:
with open('/workspaces/GuilloMansa-MachineLearning/models/Naive-Bayes.pkl', 'wb') as file:
    pickle.dump(model, file)