#### Importamos las cosas que necesitaremos

In [152]:
import pandas as pd
import numpy as np
import pandas as pd
from haversine import haversine, Unit
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB


#### Leemos el data set

In [153]:
df = pd.read_csv("/workspaces/Emiliano0041-IntroML/data/raw/playstore_reviews.csv")
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


#### Comenzamos a trabajar con el data set

- El ejercicio dice que deberiamos eliminar la columna "package_name" ya que no es relevante para nuestra prediccion

In [154]:
df.drop("package_name", axis=1, inplace=True)

- Eliminamos espacios dobles y convertimos todo el texto a minusculas

In [155]:
df["review"] = df["review"].str.strip().str.lower()
texto_ = df["review"]

Eliminamos todo lo que no sea texto, es decir, letra o espacio.

In [156]:
df["review"].apply(lambda x: re.sub(r'\d+', '', x))


0      privacy at least put some option appear offlin...
1      messenger issues ever since the last update, i...
2      profile any time my wife or anybody has more t...
3      the new features suck for those of us who don'...
4      forced reload on uploading pic on replying com...
                             ...                        
886    loved it i loooooooooooooovvved it because it ...
887    all time legendary game the birthday party lev...
888    ads are way to heavy listen to the bad reviews...
889    fun works perfectly well. ads aren't as annoyi...
890    they're everywhere i see angry birds everywher...
Name: review, Length: 891, dtype: object

- Hacemos el split, dividimos en train y test

In [157]:
X = df["review"]
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=22)

In [158]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()


#### Modelo Bernoulli

- Generamos el modelo, calculamos accuracy y el report.

In [159]:
modelo_B = BernoulliNB()
modelo_B.fit(X_train, y_train)
y_pred = modelo_B.predict(X_test)



In [160]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7541899441340782

In [161]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.96      0.83       112
           1       0.85      0.42      0.56        67

    accuracy                           0.75       179
   macro avg       0.79      0.69      0.69       179
weighted avg       0.78      0.75      0.73       179



#### Modelo Multinomial

- Generamos el modelo, calculamos accuracy y el report.

In [162]:
modelo_M = MultinomialNB()
modelo_M.fit(X_train, y_train)
y_pred_M = modelo_M.predict(X_test)


In [163]:
accuracy_M = accuracy_score(y_test, y_pred_M)
accuracy_M


0.8212290502793296

In [164]:
report_M = classification_report(y_test, y_pred_M)
print(report_M)

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       112
           1       0.83      0.66      0.73        67

    accuracy                           0.82       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.82      0.82      0.82       179



#### Modelo de Gauss

- Generamos el modelo, calculamos accuracy y el report.

In [165]:
modelo_G = GaussianNB()
modelo_G.fit(X_train, y_train)
y_pred_G = modelo_G.predict(X_test)

In [166]:
accuracy_G = accuracy_score(y_test, y_pred_G)
accuracy_G

0.7653631284916201

In [167]:
report_G = classification_report(y_test, y_pred_G)
print(report_G)

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       112
           1       0.77      0.54      0.63        67

    accuracy                           0.77       179
   macro avg       0.77      0.72      0.73       179
weighted avg       0.77      0.77      0.75       179

