# Explore here

In [1]:
import pandas as pd

# Remote location of the Play Store reviews CSV
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

# Read the CSV straight from the URL into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows
print(df.head(n=5))

          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  


In [2]:
# Drop the 'package_name' column.
# The result is rebound to `df` instead of mutating in place, and
# errors="ignore" keeps the cell idempotent: running it a second time no
# longer raises KeyError when the column has already been removed.
df = df.drop(columns=["package_name"], errors="ignore")

# Confirm the column is gone
print(df.head())

                                              review  polarity
0   privacy at least put some option appear offli...         0
1   messenger issues ever since the last update, ...         0
2   profile any time my wife or anybody has more ...         0
3   the new features suck for those of us who don...         0
4   forced reload on uploading pic on replying co...         0


In [3]:
# Normalize the 'review' text: lowercase it, then trim leading/trailing whitespace
df["review"] = (
    df["review"]
    .str.lower()
    .str.strip()
)

# Preview the first rows to check the result
print(df.head(n=5))

                                              review  polarity
0  privacy at least put some option appear offlin...         0
1  messenger issues ever since the last update, i...         0
2  profile any time my wife or anybody has more t...         0
3  the new features suck for those of us who don'...         0
4  forced reload on uploading pic on replying com...         0


In [4]:
# X holds the review text (predictor); y holds the polarity label (target)
X, y = df["review"], df["polarity"]

# Preview both series, one after the other
print(X.head(), y.head(), sep="\n")

0    privacy at least put some option appear offlin...
1    messenger issues ever since the last update, i...
2    profile any time my wife or anybody has more t...
3    the new features suck for those of us who don'...
4    forced reload on uploading pic on replying com...
Name: review, dtype: object
0    0
1    0
2    0
3    0
4    0
Name: polarity, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many samples ended up in each partition
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

Train: 712, Test: 179


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()

# Fit on the training reviews ONLY, and encode them in the same call
X_train_vectorized = vectorizer.fit_transform(X_train)

# Encode the test reviews with the already-fitted vectorizer
X_test_vectorized = vectorizer.transform(X_test)

# Show the shape of both resulting matrices
print(*(matrix.shape for matrix in (X_train_vectorized, X_test_vectorized)))

(712, 3553) (179, 3553)


### Implementar el modelo con MultinomialNB

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Build the MultinomialNB classifier and fit it on the vectorized training set
# (fit() returns the estimator itself, so construction and training can be chained)
nb_model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict labels for the held-out reviews
y_pred = nb_model.predict(X_test_vectorized)

# Overall accuracy, shown with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / f1 report
print(classification_report(y_test, y_pred))


Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.85      0.95      0.90       126
           1       0.84      0.58      0.69        53

    accuracy                           0.84       179
   macro avg       0.84      0.77      0.79       179
weighted avg       0.84      0.84      0.83       179



### Prueba con BernoulliNB

In [8]:
from sklearn.naive_bayes import BernoulliNB

# Build the BernoulliNB classifier and fit it on the vectorized training set
# (fit() returns the estimator itself)
bernoulli_nb = BernoulliNB().fit(X_train_vectorized, y_train)

# Predict labels for the held-out reviews
y_pred_bernoulli = bernoulli_nb.predict(X_test_vectorized)

# Accuracy followed by the per-class report
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bernoulli))
print(classification_report(y_test, y_pred_bernoulli))


BernoulliNB Accuracy: 0.8491620111731844
              precision    recall  f1-score   support

           0       0.86      0.94      0.90       126
           1       0.81      0.64      0.72        53

    accuracy                           0.85       179
   macro avg       0.84      0.79      0.81       179
weighted avg       0.85      0.85      0.84       179



### Prueba con GaussianNB 

In [9]:
from sklearn.naive_bayes import GaussianNB

# Densify the sparse count matrices before handing them to GaussianNB
# (scikit-learn's GaussianNB does not accept scipy sparse input).
X_train_dense = X_train_vectorized.toarray()
X_test_dense = X_test_vectorized.toarray()

# Build the GaussianNB classifier
gaussian_nb = GaussianNB()

# Train on the dense training matrix
gaussian_nb.fit(X_train_dense, y_train)

# Predict labels for the held-out reviews
y_pred_gaussian = gaussian_nb.predict(X_test_dense)

# Accuracy followed by the per-class report.
# The cell body used to be pasted twice, so the model was rebuilt, retrained
# and reported twice; it now runs exactly once.
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gaussian))
print(classification_report(y_test, y_pred_gaussian))


GaussianNB Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       126
           1       0.70      0.60      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

GaussianNB Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       126
           1       0.70      0.60      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



### Optimizamos con un Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the vectorized training set; fit() returns the estimator itself
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_vectorized, y_train)

# Predict labels for the held-out reviews
y_pred_rf = rf_model.predict(X_test_vectorized)

# Accuracy followed by the per-class report
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8268156424581006
              precision    recall  f1-score   support

           0       0.84      0.93      0.88       126
           1       0.78      0.58      0.67        53

    accuracy                           0.83       179
   macro avg       0.81      0.76      0.77       179
weighted avg       0.82      0.83      0.82       179

