# Explore here

PROYECTO NAIVE BAYES

In [124]:
import pandas as pd
import numpy as np
import os
import requests
from pickle import dump



import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.feature_extraction.text import CountVectorizer


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [100]:
archivo = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"


# 2Nombre y ruta de la carpeta para datos en crudo
raw_data_folder = "data/raw"
raw_data_path = os.path.join(raw_data_folder, "playstore_reviews.csv")

# Crear la carpeta si no existe
if not os.path.exists(raw_data_folder):
    os.makedirs(raw_data_folder)

# Petición para descargar el fichero
response = requests.get(archivo)

# Verificar si la descarga fue exitosa
if response.status_code == 200:
    # Guardar el contenido en la carpeta en crudo
    with open(raw_data_path, 'wb') as file:
        file.write(response.content)
    print("Archivo guardado en crudo exitosamente en:", raw_data_path)
else:
    print("Error al descargar el archivo:", response.status_code)

# Leer el archivo CSV descargado 
df = pd.read_csv(raw_data_path)

Archivo guardado en crudo exitosamente en: data/raw/playstore_reviews.csv


In [101]:
df.info

<bound method DataFrame.info of              package_name                                             review  \
0     com.facebook.katana   privacy at least put some option appear offli...   
1     com.facebook.katana   messenger issues ever since the last update, ...   
2     com.facebook.katana   profile any time my wife or anybody has more ...   
3     com.facebook.katana   the new features suck for those of us who don...   
4     com.facebook.katana   forced reload on uploading pic on replying co...   
..                    ...                                                ...   
886  com.rovio.angrybirds   loved it i loooooooooooooovvved it because it...   
887  com.rovio.angrybirds   all time legendary game the birthday party le...   
888  com.rovio.angrybirds   ads are way to heavy listen to the bad review...   
889  com.rovio.angrybirds   fun works perfectly well. ads aren't as annoy...   
890  com.rovio.angrybirds   they're everywhere i see angry birds everywhe...   

     po

In [102]:
df.shape

(891, 3)

In [103]:

#Eliminamos la variable package_name ya que no aporta
df.drop(["package_name"], axis =1, inplace = True)

In [104]:
df.shape

(891, 2)

In [105]:
#Eliminamos espacios y convertimos todo a minúsculas

df["review"] = df["review"].str.strip().str.lower()


In [106]:
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [108]:
# Dividimos el conjunto de datos en muestras de train y test
X = df["review"]
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [109]:
print(X_train)

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
                             ...                        
106    why can't i share my achievements? recently di...
270    beta is the best version of the chrome browser...
860    great little game. this is a great little game...
435    keeps crashing ever since i started using it m...
102    even though i am loving the new update, but th...
Name: review, Length: 712, dtype: object


In [110]:
print(y_train)

331    0
733    0
382    0
704    1
813    1
      ..
106    0
270    0
860    1
435    0
102    0
Name: polarity, Length: 712, dtype: int64


In [111]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()


In [112]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {len(y_train)}")

X_train shape: (712, 3310)
y_train shape: 712


MULTINOMIAL

In [116]:
# Entrenamiento del clasificador
model = MultinomialNB()

model.fit(X_train, y_train)

In [117]:
# predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [118]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Accuracy Test: ", accuracy_test)
print("Accuracy Train: ", accuracy_train)

Accuracy Test:  0.8156424581005587
Accuracy Train:  0.9606741573033708


In [126]:
dump(model, open("REVIEWS_MULTINOMIAL", "wb"))

In [181]:
model = MultinomialNB(alpha= 2.4, fit_prior= True)
model.fit(X_train, y_train)

In [182]:
# predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [183]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Accuracy Test: ", accuracy_test)
print("Accuracy Train: ", accuracy_train)

Accuracy Test:  0.8379888268156425
Accuracy Train:  0.9339887640449438


In [184]:
dump(model, open("REVIEWS_MULTINOMIAL_opt", "wb"))

A partir de un alpha superior a 2.4 perdemos accuracy en el train y apenas ganamos nada en el test

BERNOULLI

In [113]:
# Entrenamiento del clasificador
model = BernoulliNB()

model.fit(X_train, y_train)

In [114]:
# predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [115]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Accuracy Test: ", accuracy_test)
print("Accuracy Train: ", accuracy_train)

Accuracy Test:  0.770949720670391
Accuracy Train:  0.9199438202247191


In [125]:
dump(model, open("REVIEWS_BERNOULLI", "wb"))

GAUSSIANO

In [119]:
# Entrenamiento del clasificador
model = GaussianNB()

model.fit(X_train, y_train)

In [120]:
# predicción
y_pred_train = model.predict(X_train)

y_pred_test = model.predict(X_test)
y_pred_test

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [121]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Accuracy Test: ", accuracy_test)
print("Accuracy Train: ", accuracy_train)

Accuracy Test:  0.8044692737430168
Accuracy Train:  0.9859550561797753


In [127]:
dump(model, open("REVIEWS_GAUSSIANO", "wb"))