In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Load the raw Play Store reviews CSV into a DataFrame.
df = pd.read_csv(
    "../data/raw/playstore_reviews.csv",
)

In [27]:
# Print the frame's schema summary: row count, column names, non-null counts
# and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


# Eliminar la columna package_name

In [28]:
# Drop the app identifier column; only "review" and "polarity" are used below.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because the column is already gone.
df = df.drop(columns=["package_name"], errors="ignore")

# Limpiar texto: minúsculas y espacios


In [29]:
# Normalize the review text: trim surrounding whitespace, then lowercase it.
df["review"] = (
    df["review"]
    .str.strip()
    .str.lower()
)

# Separar variables predictoras y objetivo

In [30]:
# Predictor: the review text. Target: the polarity label.
X, y = df["review"], df["polarity"]

# Dividir en entrenamiento y test

In [31]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Transformar texto en matriz de conteo

In [32]:
# Bag-of-words token counts, using scikit-learn's built-in English stop-word list.
vec_model = CountVectorizer(stop_words="english")

# The vocabulary is learned from the training text only and then reused to
# encode the test text. Both matrices are densified with .toarray()
# (presumably because GaussianNB below needs dense input - confirm).
# NOTE(review): X_train / X_test are rebound from text to count arrays here, so
# re-running this cell without first re-running the split passes arrays, not
# text, back into the vectorizer.
X_train = (
    vec_model
    .fit_transform(X_train)
    .toarray()
)
X_test = (
    vec_model
    .transform(X_test)
    .toarray()
)

# Construcción de los modelos Naive Bayes

In [33]:
# Gaussian Naive Bayes with default settings, trained on the count features.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator repr as the cell output.
gnb = GaussianNB().fit(X_train, y_train)
gnb



0,1,2
,priors,
,var_smoothing,1e-09


In [52]:
# Predict on both splits so the train/test accuracy gap is visible.
y_pred_train_gnb = gnb.predict(X_train)
y_pred_test_gnb = gnb.predict(X_test)

metrics_gnb = {
    "Accuracy Train": accuracy_score(y_true=y_train, y_pred=y_pred_train_gnb),
    "Accuracy Test": accuracy_score(y_true=y_test, y_pred=y_pred_test_gnb),
}
metrics_gnb

{'Accuracy Train': 0.9850299401197605, 'Accuracy Test': 0.7937219730941704}

In [35]:
# Multinomial Naive Bayes with default settings, trained on the count features.
# fit() returns the estimator itself; the trailing bare name keeps the
# estimator repr as the cell output.
mnb = MultinomialNB().fit(X_train, y_train)
mnb

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [None]:
# Score the multinomial model on both splits. Variable names and dictionary
# keys follow the same pattern as the Gaussian and Bernoulli evaluation cells
# (the keys previously carried a stray colon and trailing spaces).
y_pred_train_mnb = mnb.predict(X_train)
y_pred_test_mnb = mnb.predict(X_test)

metrics_mnb = {
    "Accuracy Train": accuracy_score(y_train, y_pred_train_mnb),
    "Accuracy Test": accuracy_score(y_test, y_pred_test_mnb),
}

# Keep the original generic names bound so any cell that still reads them works.
y_pred_train, y_pred_test, metrics = y_pred_train_mnb, y_pred_test_mnb, metrics_mnb

metrics_mnb

{'Accuracy Train: ': 0.9655688622754491, 'Accuracy Test ': 0.7847533632286996}

In [37]:
# Bernoulli Naive Bayes with default settings, trained on the count features.
# fit() returns the estimator itself; the trailing bare name keeps the
# estimator repr as the cell output.
bnb = BernoulliNB().fit(X_train, y_train)
bnb

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [54]:
# Predict on both splits so the train/test accuracy gap is visible.
y_pred_train_bnb = bnb.predict(X_train)
y_pred_test_bnb = bnb.predict(X_test)

metrics_bnb = {
    "Accuracy Train": accuracy_score(y_true=y_train, y_pred=y_pred_train_bnb),
    "Accuracy Test": accuracy_score(y_true=y_test, y_pred=y_pred_test_bnb),
}
metrics_bnb

{'Accuracy Train': 0.9221556886227545, 'Accuracy Test': 0.7623318385650224}

# Intento de optimización con Random search

In [57]:
# Unfitted estimator that the search will clone for each candidate.
mnb_base = MultinomialNB()

# Candidate values: 7 smoothing strengths x 2 prior settings = 14 combinations.
param_dist = dict(
    alpha=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0],
    fit_prior=[True, False],
)

In [59]:
# Randomly sample 10 hyper-parameter settings from param_dist and score each
# with 5-fold cross-validated accuracy. The seed fixes which settings are drawn;
# n_jobs=-1 runs the fits in parallel on all available cores.
random_search = RandomizedSearchCV(
    mnb_base,
    param_dist,
    n_iter=10,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Last expression of the cell, so the fitted search object is displayed.
random_search.fit(X_train, y_train)

0,1,2
,estimator,MultinomialNB()
,param_distributions,"{'alpha': [0.01, 0.05, ...], 'fit_prior': [True, False]}"
,n_iter,10
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,False
,class_prior,


In [60]:
# Hyper-parameter setting with the highest mean cross-validated accuracy.
random_search.best_params_

{'fit_prior': False, 'alpha': 1.0}

In [61]:
# Mean cross-validated accuracy achieved by the best setting (computed on the
# training folds only, not on the held-out test split).
random_search.best_score_

np.float64(0.8202783077095723)

In [None]:
# The search was run with refit=True, so best_estimator_ is a MultinomialNB
# refit on the full training data with the best parameters found.
best_mnb = random_search.best_estimator_

# Score the tuned model on both splits, as was done for the untuned models.
y_pred_train_best = best_mnb.predict(X_train)
y_pred_test_best = best_mnb.predict(X_test)

metrics_best = {
    "Accuracy Train (best)": accuracy_score(y_true=y_train, y_pred=y_pred_train_best),
    "Accuracy Test (best)": accuracy_score(y_true=y_test, y_pred=y_pred_test_best),
}
metrics_best
 

{'Accuracy Train (best)': 0.9655688622754491,
 'Accuracy Test (best)': 0.7982062780269058}