# Explore here

It's recommended to use this notebook for exploration purposes.

For example: 

1. You could import the CSV generated by python into your notebook and explore it.
2. You could connect to your database using `pandas.read_sql` from this notebook and explore it.

In [None]:
!pip install pandas
!pip install numpy
!pip install sklearn

In [37]:
import pandas as pd 
import numpy as np
import unicodedata
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics
from sklearn.pipeline import Pipeline
import pickle

In [2]:
# Play Store reviews CSV hosted in the 4GeeksAcademy naive-bayes-project-tutorial repository.
reviews_csv_url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
df_raw = pd.read_csv(reviews_csv_url)

In [3]:
# Summarise the frame: number of rows, column dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [None]:
# Eyeball 10 randomly drawn rows; no random_state is passed, so the selection differs between runs.
df_raw.sample(10)

1. Transform dataframe

In [4]:
# Trim surrounding whitespace first, then lowercase every review.
trimmed_reviews = df_raw['review'].str.strip()
df_raw['review'] = trimmed_reviews.str.lower()

In [5]:
# Drop a few punctuation characters. `regex=False` pins literal matching, because the
# default value of `regex` in Series.str.replace differs between pandas versions.
for punctuation_mark in ('!', ',', '&'):
    df_raw['review'] = df_raw['review'].str.replace(punctuation_mark, '', regex=False)

# NFKC folds Unicode compatibility characters (full-width letters, ligatures, ...) into their
# standard equivalents.
df_raw['review'] = df_raw['review'].str.normalize('NFKC')

# Collapse any ASCII letter repeated three or more times in a row down to a single letter,
# e.g. "looooveeeee" -> "love". Note that this also turns "goooood" into "god" (not "good"),
# because the whole run is replaced by one occurrence of the letter.
df_raw['review'] = df_raw['review'].str.replace(r'([a-zA-Z])\1{2,}', r'\1', regex=True)


In [6]:
# Spot-check 10 random rows after the text clean-up (unseeded, so the rows change on every run).
df_raw.sample(10)

Unnamed: 0,package_name,review,polarity
410,com.facebook.orca,just a tool for them to throw ads at you they ...,0
363,com.google.android.talk,ok for the simple stuff feature lacking and po...,0
750,com.shirantech.kantipur,virus i think your site ìs infected as it is ...,0
760,com.shirantech.kantipur,good app all is good but sometime its cannot s...,1
14,com.facebook.katana,fix this please... i've tried editing two of m...,0
505,com.Slack,need swipe between images nice app but it woul...,0
397,com.facebook.orca,constantly erases text i have frequently lost ...,0
863,com.rovio.angrybirds,angry birds what looks easy can demand a bit o...,1
599,com.evernote,absolutely the best for saving documents forms...,1
115,com.linkedin.android,the old one is better the previous version kep...,0


In [7]:
def normalize_string(text_string):
    """Strip accents and every other non-ASCII character from a string.

    The text is decomposed with Unicode NFD, which splits accented letters into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks together with any other non-ASCII character.

    Args:
        text_string: The text to clean. Values that are not ``str`` (``None``, or
            the float ``NaN`` pandas uses for missing text) are passed through
            untouched instead of raising.

    Returns:
        The ASCII-only string, or the original value when it is not a ``str``.
    """
    # Missing values in a pandas object column are usually float NaN rather than None,
    # and unicodedata.normalize() would raise TypeError on them.
    if not isinstance(text_string, str):
        return text_string

    decomposed = unicodedata.normalize('NFD', text_string)
    return decomposed.encode('ascii', 'ignore').decode()

In [8]:
# Run the ASCII normalisation helper over every review, element by element.
df_raw['review'] = df_raw['review'].map(normalize_string)

In [None]:
# Spot-check 10 random rows after the ASCII normalisation (unseeded, so the rows change on every run).
df_raw.sample(10)

In [9]:
# Work on an independent (deep) copy so later steps never touch df_raw.
df = df_raw.copy(deep=True)

2. Separate the target from the feature, and split your data into train and test sets.

In [10]:
# Feature: the cleaned review text. Target: the polarity label.
X, y = df['review'], df['polarity']

In [11]:
# Stratified hold-out split with a fixed seed; 0.25 is scikit-learn's default test fraction,
# spelled out here so the split size is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=2007,
)

3. Vectorize your features and use Naive Bayes to classify the reviews as good or bad. We will not focus on hyperparameter tuning this time (only a small randomized search is run further below), since this is an introductory project on sentiment analysis using Naive Bayes.

In [12]:
# Baseline 1: raw token counts fed into a multinomial Naive Bayes classifier.
modelo_1 = Pipeline(steps=[
    ('cont_vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
# fit() returns the fitted pipeline, so predictions can be chained on it.
pred_1 = modelo_1.fit(X_train, y_train).predict(X_test)

In [15]:
# Baseline 2: TF-IDF weighted features (single-step vectorizer) + multinomial Naive Bayes.
modelo_2 = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
# fit() returns the fitted pipeline, so predictions can be chained on it.
pred_2 = modelo_2.fit(X_train, y_train).predict(X_test)

In [16]:
# Baseline 3: token counts followed by a separate TF-IDF transform + multinomial Naive Bayes.
modelo_3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# fit() returns the fitted pipeline, so predictions can be chained on it.
pred_3 = modelo_3.fit(X_train, y_train).predict(X_test)

In [17]:
# Accuracy of modelo_1 (CountVectorizer pipeline) on the training and the held-out data.
train_accuracy_1 = metrics.accuracy_score(y_train, modelo_1.predict(X_train))
test_accuracy_1 = metrics.accuracy_score(y_test, modelo_1.predict(X_test))
print('Naive Bayes Train Accuracy = ', train_accuracy_1)
print('Naive Bayes Test Accuracy = ', test_accuracy_1)

Naive Bayes Train Accuracy =  0.9625748502994012
Naive Bayes Test Accuracy =  0.820627802690583


In [18]:
# Accuracy of modelo_2 (TfidfVectorizer pipeline) on the training and the held-out data.
train_accuracy_2 = metrics.accuracy_score(y_train, modelo_2.predict(X_train))
test_accuracy_2 = metrics.accuracy_score(y_test, modelo_2.predict(X_test))
print('Naive Bayes Train Accuracy = ', train_accuracy_2)
print('Naive Bayes Test Accuracy = ', test_accuracy_2)

Naive Bayes Train Accuracy =  0.7949101796407185
Naive Bayes Test Accuracy =  0.6995515695067265


In [19]:
# Accuracy of modelo_3 (CountVectorizer + TfidfTransformer pipeline) on the training and the held-out data.
train_accuracy_3 = metrics.accuracy_score(y_train, modelo_3.predict(X_train))
test_accuracy_3 = metrics.accuracy_score(y_test, modelo_3.predict(X_test))
print('Naive Bayes Train Accuracy = ', train_accuracy_3)
print('Naive Bayes Test Accuracy = ', test_accuracy_3)

Naive Bayes Train Accuracy =  0.7949101796407185
Naive Bayes Test Accuracy =  0.6995515695067265


Hyperparameter search

In [24]:
# The grid below only has 2 x 2 = 4 combinations. Asking for more iterations than that makes
# scikit-learn emit a warning (older releases raised an error), so cap n_iter at the grid size.
n_iter_search = 4
parameters = {'cont_vect__ngram_range': [(1, 1), (1, 2)], 'clf__alpha': (1e-2, 1e-3)}
# random_state fixes the order in which candidates are drawn, so reruns rank (and break ties
# between) candidates identically. Same seed as the train/test split.
RS_CV_1 = RandomizedSearchCV(modelo_1, parameters, n_iter=n_iter_search, random_state=2007)
RS_CV_1.fit(X_train, y_train)
pred_1_grid = RS_CV_1.predict(X_test)



In [25]:
# Parameter combination with the best mean cross-validation score for the CountVectorizer pipeline.
RS_CV_1.best_params_

{'cont_vect__ngram_range': (1, 2), 'clf__alpha': 0.01}

In [27]:
# Only two candidate values are searched here. Asking for more iterations than candidates makes
# scikit-learn emit a warning (older releases raised an error), so cap n_iter at the grid size.
n_iter_search = 2
parameters = {'clf__alpha': (1e-2, 1e-3)}
# random_state fixes the order in which candidates are drawn, so reruns rank (and break ties
# between) candidates identically. Same seed as the train/test split.
RS_CV_2 = RandomizedSearchCV(modelo_2, parameters, n_iter=n_iter_search, random_state=2007)
RS_CV_2.fit(X_train, y_train)
pred_2_grid = RS_CV_2.predict(X_test)



In [28]:
# Parameter combination with the best mean cross-validation score for the TfidfVectorizer pipeline.
RS_CV_2.best_params_

{'clf__alpha': 0.01}

In [29]:
# 2 x 2 x 2 = 8 possible combinations, of which 5 are sampled at random.
n_iter_search = 5
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
# Without a seed a different subset of candidates is tried on every run, so the "best"
# parameters are not reproducible. Same seed as the train/test split.
RS_CV_3 = RandomizedSearchCV(modelo_3, parameters, n_iter=n_iter_search, random_state=2007)
RS_CV_3.fit(X_train, y_train)
pred_3_grid = RS_CV_3.predict(X_test)

In [30]:
# Parameter combination with the best mean cross-validation score for the CountVectorizer + TfidfTransformer pipeline.
RS_CV_3.best_params_

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.01}

In [33]:
# Held-out accuracy of each tuned search (predict() uses the refitted best estimator).
tuned_searches = (('RS_CV_1', RS_CV_1), ('RS_CV_2', RS_CV_2), ('RS_CV_3', RS_CV_3))
for search_label, fitted_search in tuned_searches:
    tuned_test_accuracy = metrics.accuracy_score(y_test, fitted_search.predict(X_test))
    print(f'{search_label} = ', tuned_test_accuracy)

RS_CV_1 =  0.8071748878923767
RS_CV_2 =  0.8071748878923767
RS_CV_3 =  0.7982062780269058


In [34]:
# Keep the refitted best pipeline from the first search (CountVectorizer + MultinomialNB).
# NOTE(review): the winner is hard-coded and was picked from the test-set accuracies printed
# above, where RS_CV_1 and RS_CV_2 tie; comparing the searches' cross-validated `best_score_`
# would avoid selecting on the test set - confirm before relying on this choice.
best_model = RS_CV_1.best_estimator_

In [35]:
# Show the selected pipeline together with its tuned parameters.
best_model_caption = 'The model with highest accuracy in the dataset, after hyperparameter tuning, is:'
print(best_model_caption, best_model)

The model with highest accuracy in the dataset, after hyperparameter tuning, is: Pipeline(steps=[('cont_vect', CountVectorizer(ngram_range=(1, 2))),
                ('clf', MultinomialNB(alpha=0.01))])


In [38]:
# Save the model. The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open('../models/best_model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# To read the model back in the future (only unpickle files you trust):
# with open('../models/best_model.pickle', 'rb') as model_file:
#     modelo = pickle.load(model_file)
# modelo.predict(X_test)  # use it to predict with new data