### Step 1: Loading the dataset

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from pickle import dump

In [39]:
# Download the Play Store reviews CSV from the tutorial repository and
# save a copy (without the index column) under data/raw.
url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
data_reviews = pd.read_csv(url)
data_reviews.to_csv(
    '/workspaces/JLL_Naive_Bayes_ML/data/raw/playstore_reviews.csv',
    index=False,
)

### Step 2: Study of variables and their content

In [40]:
# Read the reviews back from the raw copy on disk and preview the first rows.
data_reviews = pd.read_csv(
    '/workspaces/JLL_Naive_Bayes_ML/data/raw/playstore_reviews.csv',
)
data_reviews.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


***Delete non-relevant information***

In [41]:
# Drop the app identifier column; only the review text and polarity are used below.
# Reassigning instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data_reviews = data_reviews.drop(columns=["package_name"], errors="ignore")


***Removing spaces and converting the text to lowercase***

In [42]:
# Normalise the review text: trim surrounding whitespace, then lowercase it.
data_reviews["review"] = (
    data_reviews["review"]
    .str.strip()
    .str.lower()
)
data_reviews.head(5)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


***Divide the dataset into train and test:***

In [43]:
# The review text is the only feature; the polarity column is the target.
X = data_reviews["review"]
y = data_reviews["polarity"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

***Transform the text into a word count matrix***

In [44]:
# Bag-of-words encoding with English stop words removed. The vectorizer is
# fitted on the training reviews only and then applied to the test reviews.
# NOTE: X_train / X_test are overwritten with dense count arrays, so the split
# cell has to be re-run before this cell can be executed again.
vec_model = CountVectorizer(stop_words="english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

### Step 3: Build a naive bayes model

*I select MultinomialNB because it is designed for classification with features that represent discrete counts or frequencies, which is exactly what the word-count matrix contains.*

In [45]:
# Baseline classifier: MultinomialNB with its default settings, trained on
# the word-count matrix.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [46]:
# Predict the polarity of the held-out reviews with the baseline model
# and display the predicted labels.
y_pred = model.predict(X=X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [47]:
# Accuracy of the baseline predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8156424581005587

*I will test the other sklearn Naive Bayes models:*

In [48]:
# Fit the other scikit-learn Naive Bayes variants on the same split and print
# their test accuracy for comparison with MultinomialNB.
# GaussianNB and BernoulliNB are already imported in the notebook's import cell,
# so the import is not repeated here.
for model_aux in [GaussianNB(), BernoulliNB()]:
    model_aux.fit(X_train, y_train)
    y_pred_aux = model_aux.predict(X_test)
    print(f"{model_aux} with accuracy: {accuracy_score(y_test, y_pred_aux)}")

GaussianNB() with accuracy: 0.8044692737430168
BernoulliNB() with accuracy: 0.770949720670391


***I can confirm that the best model is the one I have chosen based on its theoretical foundation***

### Optimize the model

In [71]:
# Candidate smoothing values: 50 evenly spaced alphas from 0.01 to 50.
hyperparams = {"alpha": np.linspace(0.01, 50.0, num=50)}

# Randomized search over alpha with 5-fold cross-validation, scored on
# accuracy and seeded so the sampling is repeatable.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=hyperparams,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search

In [72]:
# Run the cross-validated search on the training data, then report the
# best hyperparameters it found.
random_search.fit(X=X_train, y=y_train)

print("Best hyperparameters: {}".format(random_search.best_params_))

Best hyperparameters: {'alpha': 2.050408163265306}


In [73]:
# Refit MultinomialNB using the best hyperparameters found by the search.
model_2 = MultinomialNB(**random_search.best_params_)
model_2.fit(X_train, y_train)

# Score the optimized model on the test split. The prediction must come from
# model_2; predicting with `model` would re-evaluate the baseline instead.
y_pred = model_2.predict(X_test)
accuracy_score(y_test, y_pred)


0.8212290502793296

***We improve the model score***

### Save the model

In [74]:
# Persist the optimized model (model_2), which is what the file name promises;
# dumping `model` would save the un-tuned baseline instead.
# The with-block closes the file handle once the pickle has been written.
with open("/workspaces/JLL_Naive_Bayes_ML/models/naive_bayes_optimized.sav", "wb") as model_file:
    dump(model_2, model_file)

### Exploring other options