# Clasificación de la polaridad de reseñas de Play Store con Naive Bayes

In [1]:
# Your code here
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from imblearn.metrics import specificity_score
from sklearn.metrics import *
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from pickle import dump
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier


In [2]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test, y_score_train=None, y_score_test=None):
    """Build a train/test comparison table of classification metrics.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the training and test splits.
    y_pred_train, y_pred_test : array-like
        Predicted class labels for each split.
    y_score_train, y_score_test : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When given, they are used for the
        AUC column. When omitted, the AUC is computed from the hard labels,
        exactly as this function always did; in that case the value is just
        the mean of recall and specificity rather than a ranking-quality AUC.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test' and 'Diferencia' (train minus test); columns
        'Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'.
    """
    train_row = _split_metrics(y_train, y_pred_train, y_score_train)
    test_row = _split_metrics(y_test, y_pred_test, y_score_test)

    # A large positive gap (train minus test) is the overfitting signal.
    diff_row = [train_value - test_value for train_value, test_value in zip(train_row, test_row)]

    return pd.DataFrame(
        [train_row, test_row, diff_row],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Diferencia'],
    )


def _split_metrics(y_true, y_pred, y_score=None):
    """Return [accuracy, F1, AUC, precision, recall, specificity] for one split."""
    # Fall back to the hard labels when no continuous score is supplied.
    auc_input = y_pred if y_score is None else y_score
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, auc_input),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        specificity_score(y_true, y_pred),
    ]

In [3]:
# Load the raw Play Store reviews. The path is relative to the notebook's
# working directory (like the "../models/" path used when saving the model),
# so the notebook is not tied to one machine's absolute directory layout.
df = pd.read_csv('../data/raw/playstore_reviews.csv')
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [4]:
# Drop the package_name column. errors='ignore' keeps this cell re-runnable:
# without it, a second run raises KeyError because the column is already gone.
df = df.drop(columns='package_name', errors='ignore')

# Trim surrounding whitespace and lower-case the review text
df['review'] = df['review'].str.strip().str.lower()
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [5]:
# Separate the review text (features) from the polarity label (target),
# then hold out 20% of the rows as the test set.
X, y = df['review'], df['polarity']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Turn the text into a word-count matrix. The vocabulary is learned from the
# training reviews only and then reused unchanged for the test reviews.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)
X_train_vec, X_test_vec = train_counts.toarray(), test_counts.toarray()

In [7]:
# Baseline: Multinomial Naive Bayes with default hyperparameters
model = MultinomialNB().fit(X_train_vec, y_train)
model

In [8]:
# Predict the polarity of the held-out test reviews with the fitted MultinomialNB
y_pred_test = model.predict(X_test_vec)
y_pred_test

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [9]:
# Predict on the training data too, so train and test metrics can be compared
y_pred_train = model.predict(X_train_vec)

In [10]:
# Train/test metric table for the baseline MultinomialNB
get_metrics(y_train, y_test, y_pred_train, y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.960674,0.944223,0.954527,0.955645,0.933071,0.975983
Test,0.815642,0.659794,0.754268,0.727273,0.603774,0.904762
Diferencia,0.145032,0.284429,0.200259,0.228372,0.329297,0.071221


In [11]:
# Gaussian Naive Bayes fitted on the same dense word-count matrix
model_gaussian = GaussianNB().fit(X_train_vec, y_train)
model_gaussian

In [12]:
# Test-set predictions from the GaussianNB model
y_pred_test_gaussian = model_gaussian.predict(X_test_vec)
y_pred_test_gaussian

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [13]:
# Training-set predictions from the GaussianNB model (for the train/test comparison)
y_pred_train_gaussian = model_gaussian.predict(X_train_vec)

In [14]:
# Train/test metric table for GaussianNB
get_metrics(y_train, y_test, y_pred_train_gaussian, y_pred_test_gaussian)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.985955,0.980695,0.989083,0.962121,1.0,0.978166
Test,0.804469,0.653465,0.751797,0.6875,0.622642,0.880952
Diferencia,0.181486,0.32723,0.237286,0.274621,0.377358,0.097214


In [15]:
# Bernoulli Naive Bayes fitted on the same word-count matrix
model_bernoulli = BernoulliNB().fit(X_train_vec, y_train)
model_bernoulli

In [16]:
# Test-set predictions from the BernoulliNB model
y_pred_test_bernoulli = model_bernoulli.predict(X_test_vec)
y_pred_test_bernoulli

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [17]:
# Training-set predictions from the BernoulliNB model (for the train/test comparison)
y_pred_train_bernoulli = model_bernoulli.predict(X_train_vec)

In [18]:
# Train/test metric table for BernoulliNB
get_metrics(y_train, y_test, y_pred_train_bernoulli, y_pred_test_bernoulli)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.919944,0.875817,0.891302,0.980488,0.791339,0.991266
Test,0.77095,0.506024,0.662399,0.7,0.396226,0.928571
Diferencia,0.148994,0.369793,0.228904,0.280488,0.395112,0.062695


El modelo Multinomial es, en teoría, el que mejor se adapta, ya que está pensado para datos discretos como los recuentos de palabras que genera `CountVectorizer`.

Las métricas en el conjunto de prueba lo corroboran: el Multinomial obtiene el mayor accuracy (0.816), el mayor F1 (0.660) y el AUC más alto (0.754) de los tres modelos.

In [19]:
# Hyperparameter optimisation: randomized search over the smoothing strength
# (alpha) and over whether class priors are learned from the data (fit_prior).

hyperparams = dict(
    alpha=np.linspace(0.01, 2.0, 200),
    fit_prior=[True, False],
)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=hyperparams,
    n_iter=100,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search

In [20]:
# Run the randomized search on the training data and report the winning combination
random_search.fit(X_train_vec, y_train)
best_nb_params = random_search.best_params_
print(f"Best hyperparameters: {best_nb_params}")

Best hyperparameters: {'fit_prior': False, 'alpha': np.float64(1.93)}


In [21]:
# Build the tuned model directly from the search result instead of hand-copied
# values, so it cannot drift from what the search selected if the grid, seed
# or data change. For the recorded run this is alpha=1.93, fit_prior=False.
model_opt = MultinomialNB(**random_search.best_params_)
model_opt.fit(X_train_vec, y_train)
y_pred_test_opt = model_opt.predict(X_test_vec)

In [22]:
# Training-set predictions from the tuned MultinomialNB (for the train/test comparison)
y_pred_train_opt = model_opt.predict(X_train_vec)

In [23]:
# Train/test metric table for the tuned MultinomialNB
get_metrics(y_train, y_test, y_pred_train_opt, y_pred_test_opt)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.950843,0.930417,0.944254,0.939759,0.92126,0.967249
Test,0.821229,0.68,0.769167,0.723404,0.641509,0.896825
Diferencia,0.129614,0.250417,0.175087,0.216355,0.27975,0.070424


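Los resultados en el conjunto de prueba han mejorado ligeramente con respecto al modelo sin optimizar: el accuracy pasa de 0.816 a 0.821, el F1 de 0.660 a 0.680 y el AUC de 0.754 a 0.769.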

In [24]:
# Save the tuned model. The context manager guarantees the file handle is
# flushed and closed; the bare open() used before never closed it.
# NOTE(review): the filename encodes alpha as "1-9176382", but the search
# selected alpha=1.93 — confirm whether the file should be renamed.
with open("../models/naive_bayes_alpha_1-9176382_fit_prior_False_42.sav", "wb") as model_file:
    dump(model_opt, model_file)

Se prueba un Random Forest para intentar mejorar los resultados.

In [25]:
# TF-IDF representation of the text (English stop words removed, at most
# 5000 features). Fitted on the training reviews only, then applied to test.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_tfidf_sparse = tfidf_vectorizer.fit_transform(X_train)
test_tfidf_sparse = tfidf_vectorizer.transform(X_test)
X_train_tfidf, X_test_tfidf = train_tfidf_sparse.toarray(), test_tfidf_sparse.toarray()

In [34]:
# Exhaustive grid search for the Random Forest, scored by 5-fold CV accuracy
model_rf = RandomForestClassifier(random_state=42)

hyperparams = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[5, 10, 15, 20],
    min_samples_leaf=[2, 4, 6, 8],
    bootstrap=[True, False],
)

grid = GridSearchCV(
    estimator=model_rf,
    param_grid=hyperparams,
    scoring="accuracy",
    cv=5,
)
grid

In [35]:
import warnings

# Silence warnings only while the grid search runs. The previous approach
# replaced warnings.warn with a no-op, which muted every later warning in the
# kernel; catch_warnings() restores the normal filters when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid.fit(X_train_tfidf, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 20, 'n_estimators': 200}


In [40]:
# Build the tuned forest from the grid-search result rather than hand-copied
# values, so the model always matches what the search selected. For the
# recorded run: bootstrap=False, max_depth=None, min_samples_leaf=2,
# min_samples_split=20, n_estimators=200.
rf_opt = RandomForestClassifier(**grid.best_params_, random_state=42)
rf_opt.fit(X_train_tfidf, y_train)

In [41]:
# Predict on both splits with the tuned Random Forest and tabulate the metrics
y_pred_train_rf, y_pred_test_rf = (
    rf_opt.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)
get_metrics(y_train, y_test, y_pred_train_rf, y_pred_test_rf)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.985955,0.98008,0.982069,0.991935,0.968504,0.995633
Test,0.804469,0.660194,0.757263,0.68,0.641509,0.873016
Diferencia,0.181486,0.319886,0.224806,0.311935,0.326995,0.122617


Los resultados muestran que el Random Forest está sobreajustado: el accuracy cae de 0.986 en entrenamiento a 0.804 en prueba.

El Multinomial optimizado es el que mejores resultados ha dado: generaliza mejor y obtiene el mayor accuracy, F1 y AUC en el conjunto de prueba.