# Play Store review sentiment classification: Naive Bayes variants vs. Random Forest

In [124]:
# Your code here
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

warnings.filterwarnings('ignore')

In [125]:
# Load the Play Store reviews dataset directly from the course repository.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(DATA_URL)
# Optional: keep a local copy of the raw file.
# df.to_csv("../data/raw/data.csv", index=False)
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [126]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(891, 3)

In [127]:
# Column dtypes and non-null counts, to spot missing values or mis-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [128]:
# Check which distinct label values the target column contains (no stray classes expected).
df['polarity'].unique()

array([0, 1])

In [129]:
# Number of missing values per column.
df.isna().sum()

package_name    0
review          0
polarity        0
dtype: int64

In [130]:
# Count rows that exactly duplicate an earlier row.
df.duplicated().sum()

np.int64(0)

In [131]:
# Normalise the review text: trim surrounding whitespace, then lowercase.
# Both steps are idempotent, so re-running this cell leaves the column unchanged.
review_normalized = df["review"].str.strip().str.lower()
df["review"] = review_normalized
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offlin...,0
1,com.facebook.katana,"messenger issues ever since the last update, i...",0
2,com.facebook.katana,profile any time my wife or anybody has more t...,0
3,com.facebook.katana,the new features suck for those of us who don'...,0
4,com.facebook.katana,forced reload on uploading pic on replying com...,0


In [132]:
# Train/test split: hold out 20% of the reviews for evaluation.
# The fixed random_state keeps the partition reproducible across runs.
# (train_test_split is already imported in the notebook's import cell.)
X = df['review']
y = df['polarity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of every partition.
for message, part in (
    ('El tamaño de X train es de:', X_train),
    ('El tamaño de Y train es de:', y_train),
    ('El tamaño de X test es de:', X_test),
    ('El tamaño de y test es de:', y_test),
):
    print(message, part.shape)

El tamaño de X train es de: (712,)
El tamaño de Y train es de: (712,)
El tamaño de X test es de: (179,)
El tamaño de y test es de: (179,)


In [133]:
# Bag-of-words features with English stop words removed.
# The vocabulary is fitted on the training reviews only and then reused for the
# test reviews, so no information from the test split leaks into the features.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the vectorizer output to dense arrays for the models trained below.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()

In [134]:
# Both matrices should have the same number of columns (the training vocabulary size).
for message, matrix in (
    ('El tamaño de X train es de:', X_train_vec),
    ('El tamaño de X test es de:', X_test_vec),
):
    print(message, matrix.shape)

El tamaño de X train es de: (712, 3310)
El tamaño de X test es de: (179, 3310)


## GaussianNB

In [135]:
# Model: Gaussian Naive Bayes with default settings
model_gaus = GaussianNB()

# Training: fit on the dense training count matrix and its polarity labels
model_gaus.fit(X_train_vec, y_train)

In [136]:
# Predict polarity on both splits so train and test scores can be compared.
y_pred_train_gaus, y_pred_test_gaus = (
    model_gaus.predict(features) for features in (X_train_vec, X_test_vec)
)


In [137]:
# Accuracy of GaussianNB on each split; a wide train/test gap points to overfitting.
accuracy_train_gaus = accuracy_score(y_true=y_train, y_pred=y_pred_train_gaus)
accuracy_test_gaus = accuracy_score(y_true=y_test, y_pred=y_pred_test_gaus)

for label, score in (
    ("Accuracy Test: ", accuracy_test_gaus),
    ("Accuracy Train: ", accuracy_train_gaus),
):
    print(label, score)

Accuracy Test:  0.8044692737430168
Accuracy Train:  0.9859550561797753


## MultinomialNB

In [138]:
# Train the classifier: Multinomial Naive Bayes with default settings,
# fitted on the training word-count matrix and its polarity labels.
model_multi = MultinomialNB()

model_multi.fit(X_train_vec, y_train)

In [139]:
# Predict polarity on both splits so train and test scores can be compared.
y_pred_train_mul, y_pred_test_mul = (
    model_multi.predict(features) for features in (X_train_vec, X_test_vec)
)

In [140]:
# Accuracy of MultinomialNB on each split; a wide train/test gap points to overfitting.
accuracy_train_mul = accuracy_score(y_true=y_train, y_pred=y_pred_train_mul)
accuracy_test_mul = accuracy_score(y_true=y_test, y_pred=y_pred_test_mul)

for label, score in (
    ("Accuracy Test: ", accuracy_test_mul),
    ("Accuracy Train: ", accuracy_train_mul),
):
    print(label, score)

Accuracy Test:  0.8156424581005587
Accuracy Train:  0.9606741573033708


## BernoulliNB

In [141]:
# Train the classifier: Bernoulli Naive Bayes with default settings.
# NOTE(review): per the scikit-learn docs the default `binarize=0.0` thresholds the
# counts, so this model effectively sees word presence/absence — confirm that is intended.
model_ber = BernoulliNB()

model_ber.fit(X_train_vec, y_train)

In [142]:
# Predict polarity on both splits so train and test scores can be compared.
y_pred_train_ber, y_pred_test_ber = (
    model_ber.predict(features) for features in (X_train_vec, X_test_vec)
)

In [143]:
# Accuracy of BernoulliNB on each split; a wide train/test gap points to overfitting.
accuracy_train_ber = accuracy_score(y_true=y_train, y_pred=y_pred_train_ber)
accuracy_test_ber = accuracy_score(y_true=y_test, y_pred=y_pred_test_ber)

for label, score in (
    ("Accuracy Test: ", accuracy_test_ber),
    ("Accuracy Train: ", accuracy_train_ber),
):
    print(label, score)

Accuracy Test:  0.770949720670391
Accuracy Train:  0.9199438202247191


In [144]:
# Compare the three Naive Bayes variants side by side to pick the best one.
nb_scores = (
    ('GaussianNB', accuracy_train_gaus, accuracy_test_gaus),
    ('MultinomialNB', accuracy_train_mul, accuracy_test_mul),
    ('BernoulliNB', accuracy_train_ber, accuracy_test_ber),
)
for model_name, train_accuracy, test_accuracy in nb_scores:
    print(model_name, 'Train:', train_accuracy, 'Test:', test_accuracy)


GaussianNB Train: 0.9859550561797753 Test: 0.8044692737430168
MultinomialNB Train: 0.9606741573033708 Test: 0.8156424581005587
BernoulliNB Train: 0.9199438202247191 Test: 0.770949720670391


In [152]:
# Random forest with 200 trees, trained on the same bag-of-words features as the
# Naive Bayes models. The fixed random_state makes the forest reproducible.
# RandomForestClassifier is imported in the notebook's top import cell.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_vec, y_train)


In [153]:
# Test-set predictions from the 200-tree random forest.
y_pred_rf = rf_model.predict(X_test_vec)

In [147]:
# Test accuracy of the 200-tree random forest.
rf_test_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_test_accuracy)

Accuracy: 0.8156424581005587


In [148]:
# Re-print the MultinomialNB scores next to the random forest result for comparison.
print('MultinomialNB','Train:',accuracy_train_mul,'Test:',accuracy_test_mul)

MultinomialNB Train: 0.9606741573033708 Test: 0.8156424581005587


In [149]:
# Smaller random forest (100 trees) with the same seed, to see how the number of
# trees affects test accuracy.
# RandomForestClassifier is imported in the notebook's top import cell.
rf_model_100 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_100.fit(X_train_vec, y_train)

In [154]:
# Test-set predictions from the 100-tree random forest.
y_pred_rf_100 = rf_model_100.predict(X_test_vec)

In [156]:
# Final side-by-side test accuracy: both random forests, then the MultinomialNB
# baseline (the plain "Accuracy Test" line is the MultinomialNB score).
rf_predictions = (
    ("Accuracy 100:", y_pred_rf_100),
    ("Accuracy 200:", y_pred_rf),
)
for label, predictions in rf_predictions:
    print(label, accuracy_score(y_test, predictions))
print("Accuracy Test: ", accuracy_test_mul)


Accuracy 100: 0.7988826815642458
Accuracy 200: 0.8156424581005587
Accuracy Test:  0.8156424581005587
