In [109]:
import pandas as pd
import numpy as np
import random
import joblib
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import CountVectorizer


from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import warnings
# Silence only deprecation-style noise. A blanket "ignore" would also hide any
# warning raised by the estimators fitted later in this notebook, and those
# warnings are useful diagnostics when comparing models.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)



# Location of the Play Store reviews CSV; passed to pd.read_csv when the data is loaded.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

Cargamos el conjunto de datos

In [84]:
# Read the remote CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=url)
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [85]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


Eliminamos la columna `package_name`, que no se usa en el modelo

In [86]:
# Drop the app identifier; only the review text and its polarity are used below.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['package_name'], errors='ignore')

Limpieza de texto

In [87]:
# Normalise the review text: trim surrounding whitespace, then lowercase it.
df = df.assign(review=lambda frame: frame["review"].str.strip().str.lower())

Definimos las variables X (reseñas) e y (polaridad)

In [88]:
# Features are the review texts; the target is the polarity label.
X, y = df["review"], df["polarity"]

Dividimos los datos en conjuntos de entrenamiento (80 %) y de prueba (20 %)

In [89]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [90]:
# Sanity check: number of labels in the training split.
y_train.shape

(712,)

Vectorización

In [91]:
# Learn the bag-of-words vocabulary from the training reviews only, then encode
# both splits with it. The count matrices are converted to dense arrays.
vec = CountVectorizer(stop_words="english")
vec.fit(X_train)
X_train_vec, X_test_vec = (
    vec.transform(split).toarray() for split in (X_train, X_test)
)

Entrenamos y comparamos los modelos de Naive Bayes

In [92]:
# Candidate Naive Bayes variants, keyed by the label used in the results dict.
modelos = dict(
    multinomialNB=MultinomialNB(),
    bernoulliNB=BernoulliNB(),
    gaussianNB=GaussianNB(),
)

In [93]:
# Test accuracy per model name; filled in by the comparison loop.
result = dict()

In [94]:
# Fit each candidate on the training split and record its test accuracy.
for name, model in modelos.items():
    # fit() returns the fitted estimator, so the prediction can be chained.
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)
    result[name] = accuracy_score(y_true=y_test, y_pred=y_pred)

In [95]:
result

{'multinomialNB': 0.8156424581005587,
 'bernoulliNB': 0.770949720670391,
 'gaussianNB': 0.8044692737430168}

In [96]:
# Baseline random forest with default hyper-parameters.
# The seed is fixed so the fit is reproducible, matching the seeded
# train/test split and the seeded 200-tree forest trained further down.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vec, y_train)

In [97]:
# Remove rows with a missing review or polarity.
# NOTE(review): X and y were taken from df and split before this cell, so this
# does not change X_train/X_test or any model fitted above or below. The earlier
# df.info() output reports no nulls in either column, so it is currently a no-op;
# if null handling is needed it belongs before X and y are defined.
df = df.dropna(subset=['review', 'polarity'])

In [98]:
# Seeded 200-tree random forest fitted on the training split.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train_vec, y_train)

# Accuracy is measured on the same rows used for fitting, so this is a
# training score, not an estimate of how well the model generalises.
rf_pred = rf.predict(X_train_vec)
acc_rf = accuracy_score(y_true=y_train, y_pred=rf_pred)
acc_rf

1.0

In [99]:
# Score the random forest on the held-out test split.
rf_predy = rf.predict(X_test_vec)
acc_rfy = accuracy_score(y_true=y_test, y_pred=rf_predy)
acc_rfy

0.8156424581005587

In [101]:
# Persist the fitted MultinomialNB and the vectorizer it was trained with.
os.makedirs("models", exist_ok=True)
for _artifact, _target in (
    (modelos["multinomialNB"], "models/multinomialnb_model.pkl"),
    (vec, "models/vectorizer.pkl"),
):
    joblib.dump(_artifact, _target)
print("✅ Modelo y vectorizador guardados correctamente.")

✅ Modelo y vectorizador guardados correctamente.


In [102]:
# Hyper-parameter grid for MultinomialNB.
# alpha is the additive smoothing strength. The smallest candidate is a small
# positive value rather than 0.0: zero smoothing gives zero probability to any
# word unseen in a class and is handled differently (clipped or used as-is)
# depending on the scikit-learn version, which makes the search unreliable.
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'fit_prior': [True, False]}


# 5-fold cross-validated search scored by accuracy.
grid = GridSearchCV(modelos["multinomialNB"], param_grid, scoring = "accuracy", cv = 5)
grid

In [103]:
# Run the cross-validated search over param_grid on the training split.
grid.fit(X_train_vec, y_train)

In [105]:
# New MultinomialNB configured with the hyper-parameters the search selected.
best_nb_params = dict(grid.best_params_)
grid_model = MultinomialNB(**best_nb_params)

In [107]:
# Fit the tuned MultinomialNB on the full training split.
grid_model.fit(X_train_vec, y_train)

In [108]:
# Predict on the held-out test split with the tuned MultinomialNB.
# The score gets its own name instead of reusing acc_rfy, so the random-forest
# test accuracy computed earlier is not overwritten and stays comparable.
y_pred_nb_tuned = grid_model.predict(X_test_vec)
acc_nb_tuned = accuracy_score(y_test, y_pred_nb_tuned)
acc_nb_tuned

0.8212290502793296

In [110]:

# Logistic regression with default hyper-parameters on the same count features.
# The name grid_model is kept because the cell that saves the model reads it,
# even though this estimator does not come from the grid search.
grid_model = LogisticRegression()

grid_model.fit(X_train_vec, y_train)

# Store predictions and score under their own names so the random-forest score
# held in acc_rfy is not overwritten by an unrelated model.
y_pred_logreg = grid_model.predict(X_test_vec)
acc_logreg = accuracy_score(y_test, y_pred_logreg)
acc_logreg

0.8324022346368715

Guardamos el modelo con mejor rendimiento para usarlo en el futuro

In [111]:
# Persist the logistic regression held in grid_model together with the
# vectorizer, so both can be reloaded for later predictions.
for _artifact, _target in (
    (grid_model, "models/logistic_regression_model.pkl"),
    (vec, "models/vectorizer.pkl"),
):
    joblib.dump(_artifact, _target)
print("✅ Modelo de regresión logística y vectorizador guardados correctamente.")

✅ Modelo de regresión logística y vectorizador guardados correctamente.
