# Explore here

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import *
import string
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw Play Store reviews and show the resulting frame.
# NOTE(review): the path is absolute and tied to one workspace layout.
raw_reviews_path = '/workspaces/Naive_Bayes_RDH/data/raw/playstore_reviews.csv'
df = pd.read_csv(raw_reviews_path)
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


Eliminaremos la columna `package_name` (nombre de la aplicación) porque no aporta información al modelo: lo que queremos es predecir, a partir del texto de los comentarios (por conteo de palabras), si la reseña es positiva o negativa.

In [3]:
# Keep only the columns needed for sentiment modelling by dropping 'package_name'
df_sentences = df.drop(columns=['package_name'])

In [4]:
# Display the frame that results from dropping 'package_name'
df_sentences

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [5]:
# Clean the 'review' text by deleting every ASCII punctuation character.
#
# A str.translate deletion table is used instead of a regex character class:
# when string.punctuation is interpolated unescaped into "[...]", the regex
# engine reads the `\]` pair as an escaped bracket, so the backslash itself
# is never removed from the text.
# The source column is read from df_sentences (not df) so this cell depends
# only on the frame it writes to.
punctuation_table = str.maketrans("", "", string.punctuation)
df_sentences['review_cleaned'] = df_sentences['review'].str.translate(punctuation_table)
df_sentences

Unnamed: 0,review,polarity,review_cleaned
0,privacy at least put some option appear offli...,0,privacy at least put some option appear offli...
1,"messenger issues ever since the last update, ...",0,messenger issues ever since the last update i...
2,profile any time my wife or anybody has more ...,0,profile any time my wife or anybody has more ...
3,the new features suck for those of us who don...,0,the new features suck for those of us who don...
4,forced reload on uploading pic on replying co...,0,forced reload on uploading pic on replying co...
...,...,...,...
886,loved it i loooooooooooooovvved it because it...,1,loved it i loooooooooooooovvved it because it...
887,all time legendary game the birthday party le...,1,all time legendary game the birthday party le...
888,ads are way to heavy listen to the bad review...,0,ads are way to heavy listen to the bad review...
889,fun works perfectly well. ads aren't as annoy...,1,fun works perfectly well ads arent as annoyin...


In [6]:
# Drop the original 'review' column (the text that still has punctuation),
# keeping only the polarity label and the cleaned text.
df_sentences_clean = df_sentences.drop(columns=['review'])

# Persist the cleaned frame to the current working directory.
# NOTE(review): this file is written before the strip/lowercase step in the
# next cell, so the saved text is not yet normalised — confirm that is intended.
df_sentences_clean.to_csv('df_sentences_clean.csv')

In [7]:
# Normalise the cleaned text: trim surrounding whitespace, then lowercase it
normalized_reviews = df_sentences_clean['review_cleaned'].str.strip()
normalized_reviews = normalized_reviews.str.lower()
df_sentences_clean['review_cleaned'] = normalized_reviews
df_sentences_clean

Unnamed: 0,polarity,review_cleaned
0,0,privacy at least put some option appear offlin...
1,0,messenger issues ever since the last update in...
2,0,profile any time my wife or anybody has more t...
3,0,the new features suck for those of us who dont...
4,0,forced reload on uploading pic on replying com...
...,...,...
886,1,loved it i loooooooooooooovvved it because it ...
887,1,all time legendary game the birthday party lev...
888,0,ads are way to heavy listen to the bad reviews...
889,1,fun works perfectly well ads arent as annoying...


## Partición de datos

In [8]:
# Features are the cleaned review texts; the target is the polarity label
X, y = df_sentences_clean['review_cleaned'], df_sentences_clean['polarity']

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Count word occurrences in the review texts with CountVectorizer.
# The vocabulary is fitted on the training reviews only; the test reviews are
# encoded with that same fitted vectorizer (transform, not fit_transform).
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# Inspect the tokens (words and other character sequences) that the fitted vectorizer extracted
vectorizer.get_feature_names_out()

array(['000', '10', '100', ..., 'žŕ', 'žŕľ', 'ˇŕ'],
      shape=(3770,), dtype=object)

In [11]:
# Fit a Multinomial Naive Bayes classifier on the training word counts
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict the polarity of the held-out reviews
y_pred = model.predict(X_test_vec)

# Report per-class precision, recall and F1 on the test split
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89       126
           1       0.83      0.57      0.67        53

    accuracy                           0.84       179
   macro avg       0.84      0.76      0.78       179
weighted avg       0.84      0.84      0.83       179



### **Observaciones**
 - En cuanto al 'accuracy', el modelo clasifica correctamente el 84% de las opiniones.
 - A priori, lo que podemos comentar es que el modelo predice y clasifica con mucha más facilidad y acierto los comentarios negativos que los positivos. Sin embargo, el objetivo es intentar identificar con mejor 'sensibilidad' (recall) los comentarios positivos, cuyo recall actual es de solo 0.57.

## Optimización de Hiperparámetros

In [26]:
# Search for the smoothing value (alpha) that maximises recall,
# i.e. the setting that yields the fewest false negatives.
alpha_candidates = [0.1, 0.5, 1, 3, 5]

model_nb = MultinomialNB()
param_grid = {'alpha': alpha_candidates}

# 5-fold cross-validated search on the training counts, scored by recall
grid_search = GridSearchCV(
    estimator=model_nb,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
)
grid_search.fit(X_train_vec, y_train)

# Keep the winning alpha and show it
best_hp = grid_search.best_params_['alpha']
print(best_hp)

0.1


### Probamos el modelo con los parámetros optimizados

In [29]:
# Train a Multinomial Naive Bayes classifier with the alpha chosen by the grid search.
# best_hp is used instead of a hard-coded 0.1 so this cell cannot drift out of
# sync with the search result if the data or the parameter grid changes.
model_hiper = MultinomialNB(alpha=best_hp).fit(X_train_vec, y_train)

# Predict the polarity of the held-out reviews
y_pred_hiper = model_hiper.predict(X_test_vec)

# Report per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred_hiper))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       126
           1       0.76      0.70      0.73        53

    accuracy                           0.84       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.84      0.84      0.84       179



### **Conclusiones**
Dado que consideramos importante mejorar la sensibilidad del modelo, he tomado el 'recall' como métrica de la optimización: queremos minimizar los falsos negativos, es decir, que sea lo menos probable posible que un comentario positivo se clasifique erróneamente como negativo, y así detectar la mayoría de los comentarios positivos. El recall de la clase positiva ha subido de 0.57 a 0.70 y el accuracy se ha mantenido en un muy buen porcentaje (84%), pero es verdad que hemos sacrificado un poco la precisión de los comentarios positivos (de 0.83 a 0.76), lo cual, desde mi punto de vista, no es de mayor gravedad: clasificar erróneamente algún comentario negativo como positivo (falso positivo) tiene menos importancia.