# Explore here

In [8]:
import pandas as pd

# Load the Play Store reviews dataset straight from the tutorial repository.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(url)

# Preview the first rows to check which columns were loaded.
print(df.head())

          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  


In [9]:
def apply_preprocess(df):
    """Return a cleaned copy of the reviews frame, ready for vectorization.

    Drops the ``package_name`` column when it is present and normalizes the
    ``review`` text by stripping surrounding whitespace and lower-casing it.
    The input frame is left untouched.

    A missing ``package_name`` column is tolerated, which makes the function
    idempotent: calling it on an already-preprocessed frame (for example when
    the notebook cell is re-run) returns an equivalent frame instead of
    raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a string ``review`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``package_name`` and with normalized ``review`` text.
    """
    return (
        df
        # errors="ignore" keeps re-runs safe once the column is already gone.
        .drop(columns=["package_name"], errors="ignore")
        # assign() works on a copy, so the caller's frame is never modified.
        .assign(review=lambda frame: frame["review"].str.strip().str.lower())
    )

# Replace the raw frame with its preprocessed version.
df = apply_preprocess(df)

# Preview the cleaned rows.
print(df.head())

                                              review  polarity
0  privacy at least put some option appear offlin...         0
1  messenger issues ever since the last update, i...         0
2  profile any time my wife or anybody has more t...         0
3  the new features suck for those of us who don'...         0
4  forced reload on uploading pic on replying com...         0


In [10]:
from sklearn.model_selection import train_test_split

# Features are the review texts; the target is the polarity label.
X, y = df["review"], df["polarity"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Tamaño del conjunto de entrenamiento: {len(X_train)}')
print(f'Tamaño del conjunto de prueba: {len(X_test)}')

Tamaño del conjunto de entrenamiento: 712
Tamaño del conjunto de prueba: 179


In [11]:
from sklearn.model_selection import train_test_split

# The split is already produced by the identical cell directly above (same
# inputs, same test_size, same random_state), so calling train_test_split
# again here would only rebuild the same objects. This cell validates that
# existing split instead, then reports its sizes.
assert len(X) == len(y), "features and labels must be aligned"
assert len(X_train) == len(y_train), "training features/labels are misaligned"
assert len(X_test) == len(y_test), "test features/labels are misaligned"
assert len(X_train) + len(X_test) == len(X), "split must cover every row exactly once"

print(f'Tamaño del conjunto de entrenamiento: {len(X_train)}')
print(f'Tamaño del conjunto de prueba: {len(X_test)}')

Tamaño del conjunto de entrenamiento: 712
Tamaño del conjunto de prueba: 179


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words vectorizer that ignores English stop words. It is
# fitted on the training reviews only; the test reviews are merely
# transformed, so they are encoded with the vocabulary of the training split.
vec_model = CountVectorizer(stop_words='english')
# .toarray() converts the vectorizer output into dense arrays, which is the
# form the models in the following cells are trained and evaluated on.
X_train_vec = vec_model.fit_transform(X_train).toarray()
X_test_vec = vec_model.transform(X_test).toarray()

# Report (rows, features) for both matrices.
print(f'Tamaño de la matriz de características de entrenamiento: {X_train_vec.shape}')
print(f'Tamaño de la matriz de características de prueba: {X_test_vec.shape}')

Tamaño de la matriz de características de entrenamiento: (712, 3310)
Tamaño de la matriz de características de prueba: (179, 3310)


In [13]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# Train one classifier per Naive Bayes variant on the same training matrix.
# fit() returns the estimator itself, so construction and training are chained.
gnb = GaussianNB().fit(X_train_vec, y_train)
mnb = MultinomialNB().fit(X_train_vec, y_train)
bnb = BernoulliNB().fit(X_train_vec, y_train)

# Predict the held-out reviews with each fitted model.
y_pred_gnb = gnb.predict(X_test_vec)
y_pred_mnb = mnb.predict(X_test_vec)
y_pred_bnb = bnb.predict(X_test_vec)

# Score every set of predictions against the true test labels.
acc_gnb = accuracy_score(y_test, y_pred_gnb)
acc_mnb = accuracy_score(y_test, y_pred_mnb)
acc_bnb = accuracy_score(y_test, y_pred_bnb)

print(f'Exactitud de GaussianNB: {acc_gnb}')
print(f'Exactitud de MultinomialNB: {acc_mnb}')
print(f'Exactitud de BernoulliNB: {acc_bnb}')

Exactitud de GaussianNB: 0.8044692737430168
Exactitud de MultinomialNB: 0.8156424581005587
Exactitud de BernoulliNB: 0.770949720670391


In [14]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Search space for MultinomialNB: 200 evenly spaced smoothing strengths in
# [0.01, 10] and both settings of fit_prior.
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False]
}

# Sample 50 candidate settings and rank them by 5-fold cross-validated
# accuracy on the training data only; the fixed seed makes the sampling
# reproducible.
random_search = RandomizedSearchCV(mnb, hyperparams, n_iter=50, scoring="accuracy", cv=5, random_state=42)
random_search.fit(X_train_vec, y_train)

print(f"Mejores hiperparámetros: {random_search.best_params_}")

# With the default refit=True the search has already retrained a model with
# the winning hyperparameters on the full training set, so reuse it instead of
# building and fitting an identical MultinomialNB by hand.
best_mnb = random_search.best_estimator_
y_pred_best_mnb = best_mnb.predict(X_test_vec)
acc_best_mnb = accuracy_score(y_test, y_pred_best_mnb)

print(f'Exactitud del modelo optimizado MultinomialNB: {acc_best_mnb}')

Mejores hiperparámetros: {'fit_prior': False, 'alpha': np.float64(1.917638190954774)}
Exactitud del modelo optimizado MultinomialNB: 0.8212290502793296


In [15]:
import joblib

# Persist the tuned MultinomialNB model to disk.
# NOTE(review): only the classifier is written in this cell. It was trained on
# the columns produced by vec_model, so scoring new raw text after loading
# this file also needs that fitted vectorizer — confirm it is persisted too.
joblib.dump(best_mnb, 'best_multinomial_nb_model.pkl')

print("Modelo optimizado guardado exitosamente.")

Modelo optimizado guardado exitosamente.


In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Support vector classifier: build and train in one step (fit() returns the
# estimator), then predict and score the held-out split.
svm_model = SVC(random_state=42).fit(X_train_vec, y_train)
y_pred_svm = svm_model.predict(X_test_vec)
acc_svm = accuracy_score(y_test, y_pred_svm)

# Gradient boosting: same train / predict / score sequence.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_vec, y_train)
y_pred_gb = gb_model.predict(X_test_vec)
acc_gb = accuracy_score(y_test, y_pred_gb)

print(f'Exactitud de SVM: {acc_svm}')
print(f'Exactitud de Gradient Boosting: {acc_gb}')

Exactitud de SVM: 0.8044692737430168
Exactitud de Gradient Boosting: 0.7318435754189944


In [17]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Train ``model`` and measure its accuracy on an evaluation split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Trained in place.
    X_fit, y_fit
        Feature matrix and labels used for training.
    X_eval, y_eval
        Feature matrix and labels used for scoring.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` computed on the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)


# SVM
svm_model = SVC(random_state=42)
y_pred_svm, acc_svm = fit_and_score(svm_model, X_train_vec, y_train, X_test_vec, y_test)
print(f'Exactitud de SVM: {acc_svm}')

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
y_pred_rf, acc_rf = fit_and_score(rf_model, X_train_vec, y_train, X_test_vec, y_test)
print(f'Exactitud del modelo Random Forest: {acc_rf}')

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
y_pred_gb, acc_gb = fit_and_score(gb_model, X_train_vec, y_train, X_test_vec, y_test)
print(f'Exactitud de Gradient Boosting: {acc_gb}')

# Logistic Regression (max_iter is raised to 1000 iterations for the solver)
lr_model = LogisticRegression(max_iter=1000, random_state=42)
y_pred_lr, acc_lr = fit_and_score(lr_model, X_train_vec, y_train, X_test_vec, y_test)
print(f'Exactitud de Logistic Regression: {acc_lr}')

Exactitud de SVM: 0.8044692737430168
Exactitud del modelo Random Forest: 0.7988826815642458
Exactitud de Gradient Boosting: 0.7318435754189944
Exactitud de Logistic Regression: 0.8324022346368715


In [18]:
import joblib

# Persist the Logistic Regression model.
joblib.dump(lr_model, 'best_logistic_regression_model.pkl')

# Persist the fitted CountVectorizer as well: lr_model was trained on the
# columns vec_model produces, so without the same fitted vocabulary the saved
# model cannot be applied to new raw reviews.
joblib.dump(vec_model, 'count_vectorizer.pkl')

print("Modelo de Regresión Logística guardado exitosamente.")

Modelo de Regresión Logística guardado exitosamente.
