# Explore here

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from imblearn.metrics import specificity_score
import requests
from sklearn.preprocessing import MinMaxScaler
from pickle import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB 
from sklearn.metrics import classification_report
import re
from lazypredict.Supervised import LazyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download the Play Store reviews dataset and keep a local copy under data/raw.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
http_response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
http_response.raise_for_status()
response = http_response.content.decode('utf-8')

file_name = '../data/raw/playstore_reviews.csv'

# Explicit UTF-8 keeps the write independent of the platform's default encoding;
# newline='' stops text mode from translating the payload's line endings.
# write() stores the text in one call (writelines() on a str goes character by character).
with open(file_name, 'w', encoding='utf-8', newline='') as temp_file:
    temp_file.write(response)

In [4]:
# Show every column when a frame is displayed, then load the downloaded CSV
# and preview its first rows.
pd.set_option('display.max_columns', None)
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [5]:
# Percentage of missing values per column (the output shows there are none).
print(df.isna().mean().mul(100))

package_name   0.00
review         0.00
polarity       0.00
dtype: float64


In [6]:
# Drop the package_name column, as the exercise instructions indicate:
# it is not relevant for this study.

df.drop(columns=['package_name'], inplace=True)

In [7]:
# Preview the first rows after dropping the package_name column.
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [8]:
# Share of positive (polarity == 1) and negative (polarity == 0) reviews,
# expressed as fractions of the whole dataset.

total_reviews = len(df)
pos_reviews = df['polarity'].eq(1).sum() / total_reviews
neg_reviews = df['polarity'].eq(0).sum() / total_reviews

print(pos_reviews)
print(neg_reviews)

0.3445566778900112
0.6554433221099888


In [9]:
# Trim leading/trailing whitespace and lower-case every review.

df["review"] = (
    df["review"]
    .str.strip()
    .str.lower()
)

In [10]:
# Keep only lowercase ASCII letters and whitespace; every other character
# (punctuation, digits, symbols) is removed from the review text.
df["review"] = [re.sub(r'[^a-z\s]', '', text) for text in df["review"]]


In [11]:
# Preview the first rows after the text clean-up steps.
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,messenger issues ever since the last update in...,0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who dont...,0
4,forced reload on uploading pic on replying com...,0


In [12]:
# Features are the cleaned review texts; the target is the polarity label.
X = df["review"].tolist()
y = df["polarity"].tolist()

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Learn the vocabulary from the training reviews only and encode them as a
# sparse matrix; the test reviews are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
# Display the vectorised test matrix (its repr reports dtype, stored elements and shape).
X_test_vec

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5205 stored elements and shape (179, 3630)>

In [15]:
# List the vocabulary terms learned by the vectorizer.
vectorizer.get_feature_names_out()

array(['aa', 'aafnaii', 'aakhirat', ..., 'zespole', 'zoom', 'zooming'],
      shape=(3630,), dtype=object)

In [16]:
# Sanity check based on the class example: print one test review together with
# the training-vocabulary words that occur in it and how often they appear.

print(f'Email de prueba: {X_test[1]}')
print('\nPalabras del conjunto de entrenamiento que también aparecen en el email de test junto con su aparición:\n')
# Look the vocabulary up once instead of rebuilding it for every matching word.
feature_names = vectorizer.get_feature_names_out()
# Densify only the row being inspected rather than the whole test matrix.
row_counts = X_test_vec[1].toarray().ravel()
for i, cont in enumerate(row_counts):
  if cont!=0:
    print(f'Palabra: "{feature_names[i]}"')
    print(f'--> Aparece {cont} veces en el email.')

Email de prueba: whatsapp i use this app now that blackberry messenger has basically gone away my friends  family live all over the world this really helps keep us in touch

Palabras del conjunto de entrenamiento que también aparecen en el email de test junto con su aparición:

Palabra: "all"
--> Aparece 1 veces en el email.
Palabra: "app"
--> Aparece 1 veces en el email.
Palabra: "away"
--> Aparece 1 veces en el email.
Palabra: "basically"
--> Aparece 1 veces en el email.
Palabra: "family"
--> Aparece 1 veces en el email.
Palabra: "friends"
--> Aparece 1 veces en el email.
Palabra: "gone"
--> Aparece 1 veces en el email.
Palabra: "has"
--> Aparece 1 veces en el email.
Palabra: "helps"
--> Aparece 1 veces en el email.
Palabra: "in"
--> Aparece 1 veces en el email.
Palabra: "keep"
--> Aparece 1 veces en el email.
Palabra: "live"
--> Aparece 1 veces en el email.
Palabra: "messenger"
--> Aparece 1 veces en el email.
Palabra: "my"
--> Aparece 1 veces en el email.
Palabra: "now"
--> Aparece

In [17]:
# Train a Bernoulli Naive Bayes classifier on the vectorised training reviews.
clf1 = BernoulliNB()
clf1.fit(X_train_vec, y_train)

# Predict the polarity of the test reviews.
y_pred1 = clf1.predict(X_test_vec)

# Report precision, recall and F1 per class on the test set.
print(classification_report(y_test, y_pred1))



              precision    recall  f1-score   support

           0       0.84      0.94      0.88       126
           1       0.79      0.57      0.66        53

    accuracy                           0.83       179
   macro avg       0.81      0.75      0.77       179
weighted avg       0.82      0.83      0.82       179



In [18]:
# Train a Multinomial Naive Bayes classifier on the vectorised training reviews.
clf2 = MultinomialNB()
clf2.fit(X_train_vec, y_train)

# Predict the polarity of the test reviews.
y_pred2 = clf2.predict(X_test_vec)

# Report precision, recall and F1 per class on the test set.
print(classification_report(y_test, y_pred2))




              precision    recall  f1-score   support

           0       0.84      0.94      0.89       126
           1       0.82      0.58      0.68        53

    accuracy                           0.84       179
   macro avg       0.83      0.76      0.79       179
weighted avg       0.84      0.84      0.83       179



Aunque inicialmente elegí BernoulliNB, al tratarse de un problema de clasificación binaria, el modelo mejora ligeramente con MultinomialNB.
GaussianNB no es viable porque las variables son conteos de palabras (datos discretos), no datos continuos.

In [19]:
# Random forest trained on the same vectorised reviews, for comparison with
# the Naive Bayes classifiers.
model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=20,
    max_features=X_train_vec.shape[1] // 2,  # half of the vocabulary columns
    random_state=42,
)
model.fit(X_train_vec, y_train)

y_pred3 = model.predict(X_test_vec)



In [20]:
# Report precision, recall and F1 per class for the random forest predictions.
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       126
           1       0.57      0.47      0.52        53

    accuracy                           0.74       179
   macro avg       0.68      0.66      0.67       179
weighted avg       0.73      0.74      0.73       179



Finalmente, MultinomialNB resulta ser el algoritmo con mejores métricas y, por tanto, el elegido.