In [166]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## Recopilación de datos

In [167]:
# Read the raw Play Store reviews CSV into a DataFrame and display it.
# NOTE(review): this is an absolute path, so the cell only runs where this
# exact directory layout exists — consider a configurable data directory.
data = pd.read_csv(
    filepath_or_buffer='/workspaces/Gupir11-machine-learning/data/raw/playstore_reviews.csv'
)
data

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


## Procesamiento de texto

In [168]:
# Discard the app identifier; only the review text and its polarity label are
# used from here on.
# The frame is rebound instead of mutated in place, and a missing column is
# tolerated, so re-running this cell does not raise KeyError once the column
# is already gone.
data = data.drop(columns=['package_name'], errors='ignore')
data

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [169]:
# Normalize the review text: trim surrounding whitespace, then lowercase it.
data["review"] = (
    data["review"]
    .str.strip()
    .str.lower()
)

## Split

In [170]:
# Separate the features (every column except the label) from the target.
X = data.drop(columns=['polarity'])
y = data['polarity']


In [171]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=18
)

## Vectorización

In [172]:
# Build TF-IDF features: the vocabulary is fitted on the training reviews only
# and then applied unchanged to the test reviews. Both results are converted
# from sparse matrices to dense arrays.
#
# The review text is looked up in `X` through the index labels carried by
# `y_train` / `y_test` rather than read from `X_train` / `X_test`. This cell
# rebinds those two names to arrays, so reading `X_train["review"]` would fail
# on a second run; looking the rows up in `X` keeps the cell re-runnable.
# Assumes the DataFrame index is unique (the default index produced by
# read_csv) so the lookup returns exactly the rows of each split, in order.
vec_model = TfidfVectorizer(stop_words="english")
X_train = vec_model.fit_transform(X.loc[y_train.index, "review"]).toarray()
X_test = vec_model.transform(X.loc[y_test.index, "review"]).toarray()

## Modelos

### GaussianNB

In [173]:
# First candidate: Gaussian Naive Bayes trained on the TF-IDF features.
# `fit` returns the estimator itself, so creation and training are chained and
# the fitted model is shown as the cell output.
model = GaussianNB().fit(X_train, y_train)
model

0,1,2
,priors,
,var_smoothing,1e-09


In [174]:
# Predict the polarity label of every held-out review with the fitted model
# and display the resulting array.
y_pred = model.predict(X=X_test)
y_pred

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0])

In [175]:
# Share of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7877094972067039

El resultado es una accuracy de 78.8 % sobre el conjunto de test. Vamos a probar con otro modelo para comparar resultados y elegir cuál es mejor.

### MultinomialNB

In [176]:
# Second candidate: Multinomial Naive Bayes with default settings, trained on
# the same features. Note that this rebinds `model`, replacing the previous
# classifier.
model = MultinomialNB().fit(X_train, y_train)
model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [177]:
# Predict on both splits so train and test accuracy can be compared.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [178]:
# Accuracy on the training split and on the held-out split, keyed by label.
metrics = {
    label: accuracy_score(y_true, y_hat)
    for label, y_true, y_hat in (
        ("Accuracy Train: ", y_train, y_pred_train),
        ("Accuracy Test ", y_test, y_pred_test),
    )
}

metrics

{'Accuracy Train: ': 0.8665730337078652, 'Accuracy Test ': 0.7206703910614525}

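Como podemos ver, la accuracy en entrenamiento es de 86.7 %, pero en test baja a 72.1 %, por debajo del 78.8 % que obtuvo GaussianNB en test. La diferencia entre entrenamiento y test indica sobreajuste, por lo cual vamos a intentar mejorar este modelo con optimización de hiperparámetros.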

### Randomized

In [179]:
# Search space: 200 evenly spaced smoothing values between 0.01 and 10, and
# both settings of `fit_prior`.
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False],
}

# Configure the random search: 50 sampled parameter combinations, each scored
# by accuracy with 5-fold cross-validation, with a fixed seed for the sampling.
# A fresh MultinomialNB is passed instead of the global `model`, because that
# name is reused for different classifiers in this notebook and the search
# space above only applies to MultinomialNB.
random_search = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=hyperparams,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=18,
)
random_search

0,1,2
,estimator,MultinomialNB()
,param_distributions,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,18

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [180]:
# Run the search on the training split only.
random_search.fit(X_train, y_train)

# Show the winning parameters, their mean cross-validated score, and the
# estimator configured with those parameters.
(
    random_search.best_params_,
    random_search.best_score_,
    random_search.best_estimator_,
)

({'fit_prior': False, 'alpha': np.float64(1.2650251256281408)},
 np.float64(0.8047867625332416),
 MultinomialNB(alpha=np.float64(1.2650251256281408), fit_prior=False))

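Realicé una búsqueda aleatoria de hiperparámetros para optimizar el clasificador Multinomial Naive Bayes. El mejor modelo utiliza alpha ≈ 1.265 y fit_prior = False, con una accuracy media de validación cruzada de 0.805. Para confirmar que esta configuración reduce el sobreajuste y mejora la capacidad de generalización, falta evaluar el mejor modelo sobre el conjunto de test.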