# Naive Bayes Project Tutorial

In this project you will practice with a dataset to create a review classifier for the Google Play store.

In this case, we have only 3 variables: 2 predictors and a dichotomous label. Of the two predictors, we are really only interested in the comment part, since the fact of classifying a comment as positive or negative will depend on its content, not on the application from which it was written. Therefore, the package_name variable should be removed.

When we work with text, as in this case, it does not make sense to do an EDA, the process is different, since the only variable we are interested in is the one that contains the text. In other cases where the text is part of a complex set with other numeric predictor variables and the prediction objective is different, then it makes sense to apply an EDA.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score


from pickle import dump

In [2]:
resource_url = 'https://improved-yodel-7vr4wwrp4jv9cwp6q.github.dev/'

df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [3]:
# Initial Data Set up: dropping non-relevant columns "package_name"

df.drop(["package_name"], axis = 1, inplace = True)
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [4]:
# Removing spaces and converting the text to lowercase

df["review"] = df["review"].str.strip().str.lower()
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [None]:
# Predictor 'review' Text Data Vectorization
from sklearn.feature_extraction.text import CountVectorizer
vec_model = CountVectorizer(stop_words = "english")

# Vectorize the 'review' column before splitting
X_vec = vec_model.fit_transform(df['review']).toarray() 

In [6]:
# Data Split

import pandas as pd
from sklearn.model_selection import train_test_split

# STEP 4) Data Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vec, df['polarity'], test_size = 0.2, random_state = 42)

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
X_train.shape

(712, 3721)

In [10]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# STEP 5) GaussianNB model

from sklearn.naive_bayes import GaussianNB

gaussianNB_model = GaussianNB()
gaussianNB_model.fit(X_train, y_train)

gaussianNB_y_pred_test = gaussianNB_model.predict(X_test)

gaussianNB_y_pred_train = gaussianNB_model.predict(X_train) 

# Evaluation:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


print(f"\n \n gaussianNB Accuracy (test): {accuracy_score(y_test, gaussianNB_y_pred_test)}")
print(f"\n gaussianNB Accuracy (train): {accuracy_score(y_train, gaussianNB_y_pred_train)}") 
print(f"\n \n gaussianNB f1_score (test): {f1_score(y_test, gaussianNB_y_pred_test, average='micro')}") 
print(f"\n gaussianNB f1_score (train): {f1_score(y_train, gaussianNB_y_pred_train, average='micro')}") 
print(f"\n \n gaussianNB precision (test): {precision_score(y_test, gaussianNB_y_pred_test, average='micro')}") 
print(f"\n gaussianNB precision (train): {precision_score(y_train, gaussianNB_y_pred_train, average='micro')}") 
print(f"\n \n gaussianNB recall (test): {recall_score(y_test, gaussianNB_y_pred_test, average='micro')}") 
print(f"\n gaussianNB recall (train): {recall_score(y_train, gaussianNB_y_pred_train, average='micro')}")


from pickle import dump

dump(gaussianNB_model, open("gaussianNB_model.sav", "wb"))




 
 gaussianNB Accuracy (test): 0.8044692737430168

 gaussianNB Accuracy (train): 0.9859550561797753

 
 gaussianNB f1_score (test): 0.8044692737430168

 gaussianNB f1_score (train): 0.9859550561797753

 
 gaussianNB precision (test): 0.8044692737430168

 gaussianNB precision (train): 0.9859550561797753

 
 gaussianNB recall (test): 0.8044692737430168

 gaussianNB recall (train): 0.9859550561797753


In [13]:
# STEP 5.B) Optimized GaussianNB model


from sklearn.model_selection import GridSearchCV
import numpy as np

hyperparams = {
    "priors": ['n_classes', None],
    "var_smoothing": [float, 1e-9],
}


grid = GridSearchCV(gaussianNB_model, hyperparams, scoring=['accuracy', 'f1_micro', 'precision_micro', 'recall_micro'], refit='accuracy', cv=10) 
grid

# Identifying best hyperparams

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

grid.fit(X_train, y_train)

print(f"GaussianNB Best hyperparameters: {grid.best_params_}")


GaussianNB Best hyperparameters: {'priors': None, 'var_smoothing': 1e-09}


In [15]:
# STEP 5.C) Optimized GaussianNB model

# Running the Optimized GaussianNB model

optimized_gaussianNB_model = GaussianNB(priors = None, var_smoothing = 1e-09)

optimized_gaussianNB_model.fit(X_train, y_train)

optimized_gaussianNB_model_y_pred_test = gaussianNB_model.predict(X_test)

optimized_gaussianNB_model_y_pred_train = gaussianNB_model.predict(X_train) 


print(f"\n \n gaussianNB Accuracy (test): {accuracy_score(y_test, optimized_gaussianNB_model_y_pred_test)}")
print(f"\n gaussianNB Accuracy (train): {accuracy_score(y_train, optimized_gaussianNB_model_y_pred_train)}") 
print(f"\n \n gaussianNB f1_score (test): {f1_score(y_test, optimized_gaussianNB_model_y_pred_test, average='micro')}") 
print(f"\n gaussianNB f1_score (train): {f1_score(y_train, optimized_gaussianNB_model_y_pred_train, average='micro')}") 
print(f"\n \n gaussianNB precision (test): {precision_score(y_test, optimized_gaussianNB_model_y_pred_test, average='micro')}") 
print(f"\n gaussianNB precision (train): {precision_score(y_train, optimized_gaussianNB_model_y_pred_train, average='micro')}") 
print(f"\n \n gaussianNB recall (test): {recall_score(y_test, optimized_gaussianNB_model_y_pred_test, average='micro')}") 
print(f"\n gaussianNB recall (train): {recall_score(y_train, optimized_gaussianNB_model_y_pred_train, average='micro')}")

print("\n No improvement observed after gaussianNB optimization")

from pickle import dump

dump(optimized_gaussianNB_model, open("optimized_gaussianNB_model.sav", "wb"))



 
 gaussianNB Accuracy (test): 0.8044692737430168

 gaussianNB Accuracy (train): 0.9859550561797753

 
 gaussianNB f1_score (test): 0.8044692737430168

 gaussianNB f1_score (train): 0.9859550561797753

 
 gaussianNB precision (test): 0.8044692737430168

 gaussianNB precision (train): 0.9859550561797753

 
 gaussianNB recall (test): 0.8044692737430168

 gaussianNB recall (train): 0.9859550561797753

 No improvement observed after gaussianNB optimization


In [16]:
# Multinomial model

from sklearn.naive_bayes import MultinomialNB

multinomialNB_model = MultinomialNB()
multinomialNB_model.fit(X_train, y_train)

multinomialNB_y_pred_test = multinomialNB_model.predict(X_test)

multinomialNB_y_pred_train = multinomialNB_model.predict(X_train) 

# Evaluation:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


print(f"\n \n MultinomialNB Accuracy (test): {accuracy_score(y_test, multinomialNB_y_pred_test)}")
print(f"\n MultinomialNB Accuracy (train): {accuracy_score(y_train, multinomialNB_y_pred_train)}") 
print(f"\n \n MultinomialNB f1_score (test): {f1_score(y_test, multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB f1_score (train): {f1_score(y_train, multinomialNB_y_pred_train, average='micro')}") 
print(f"\n \n MultinomialNB precision (test): {precision_score(y_test, multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB precision (train): {precision_score(y_train, multinomialNB_y_pred_train, average='micro')}") 
print(f"\n \n MultinomialNB recall (test): {recall_score(y_test, multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB recall (train): {recall_score(y_train, multinomialNB_y_pred_train, average='micro')}")


from pickle import dump

dump(multinomialNB_model, open("multinomialNB_model.sav", "wb"))




 
 MultinomialNB Accuracy (test): 0.776536312849162

 MultinomialNB Accuracy (train): 0.9578651685393258

 
 MultinomialNB f1_score (test): 0.776536312849162

 MultinomialNB f1_score (train): 0.9578651685393258

 
 MultinomialNB precision (test): 0.776536312849162

 MultinomialNB precision (train): 0.9578651685393258

 
 MultinomialNB recall (test): 0.776536312849162

 MultinomialNB recall (train): 0.9578651685393258


In [17]:
# Optimized Multinomial model

from sklearn.model_selection import GridSearchCV
import numpy as np

hyperparams = {
    "alpha": np.linspace(0.00000000001,10),
    "fit_prior": [False, True],
}


grid = GridSearchCV(multinomialNB_model, hyperparams, scoring=['accuracy', 'f1_micro', 'precision_micro', 'recall_micro'], refit='accuracy', cv=10) 


# Identifying best hyperparams

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

grid.fit(X_train, y_train)

print(f"multinomialNB Model Best hyperparameters: {grid.best_params_}")


multinomialNB Model Best hyperparameters: {'alpha': np.float64(0.6122448979685715), 'fit_prior': False}


In [None]:
# Optimized Multinomial model

# Testing the optimized Multinomial model

optimized_multinomialNB_model = MultinomialNB(alpha = 0.6122448979685715, fit_prior = False)
optimized_multinomialNB_model.fit(X_train, y_train)

optimized_multinomialNB_y_pred_test = optimized_multinomialNB_model.predict(X_test)

optimized_multinomialNB_y_pred_train = optimized_multinomialNB_model.predict(X_train) 

# Evaluation:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


print(f"\n \n MultinomialNB Accuracy (test): {accuracy_score(y_test, optimized_multinomialNB_y_pred_test)}")
print(f"\n MultinomialNB Accuracy (train): {accuracy_score(y_train, optimized_multinomialNB_y_pred_train)}") 
print(f"\n \n MultinomialNB f1_score (test): {f1_score(y_test, optimized_multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB f1_score (train): {f1_score(y_train, optimized_multinomialNB_y_pred_train, average='micro')}") 
print(f"\n \n MultinomialNB precision (test): {precision_score(y_test, optimized_multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB precision (train): {precision_score(y_train, optimized_multinomialNB_y_pred_train, average='micro')}") 
print(f"\n \n MultinomialNB recall (test): {recall_score(y_test, optimized_multinomialNB_y_pred_test, average='micro')}") 
print(f"\n MultinomialNB recall (train): {recall_score(y_train, optimized_multinomialNB_y_pred_train, average='micro')}")

print("\n No improvement observed after gaussianNB optimization")


from pickle import dump

dump(multinomialNB_model, open("optimized_multinomialNB_model.sav", "wb"))




 
 MultinomialNB Accuracy (test): 0.776536312849162

 MultinomialNB Accuracy (train): 0.9662921348314607

 
 MultinomialNB f1_score (test): 0.776536312849162

 MultinomialNB f1_score (train): 0.9662921348314607

 
 MultinomialNB precision (test): 0.776536312849162

 MultinomialNB precision (train): 0.9662921348314607

 
 MultinomialNB recall (test): 0.776536312849162

 MultinomialNB recall (train): 0.9662921348314607


In [19]:
# STEP 7) BernoulliNB model

from sklearn.naive_bayes import BernoulliNB

BernoulliNB_model = BernoulliNB()
BernoulliNB_model.fit(X_train, y_train)

BernoulliNB_y_pred_test = BernoulliNB_model.predict(X_test)

BernoulliNB_y_pred_train = BernoulliNB_model.predict(X_train) 

# Evaluation:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


print(f"\n \n BernoulliNB_model Accuracy (test): {accuracy_score(y_test, BernoulliNB_y_pred_test)}")
print(f"\n BernoulliNB_model Accuracy (train): {accuracy_score(y_train, BernoulliNB_y_pred_train)}") 
print(f"\n \n BernoulliNB_model f1_score (test): {f1_score(y_test, BernoulliNB_y_pred_test, average='micro')}") 
print(f"\n BernoulliNB_model f1_score (train): {f1_score(y_train, BernoulliNB_y_pred_train, average='micro')}") 
print(f"\n \n BernoulliNB_model precision (test): {precision_score(y_test, BernoulliNB_y_pred_test, average='micro')}") 
print(f"\n BernoulliNB_model precision (train): {precision_score(y_train, BernoulliNB_y_pred_train, average='micro')}") 
print(f"\n \n BernoulliNB_model recall (test): {recall_score(y_test, BernoulliNB_y_pred_test, average='micro')}") 
print(f"\n BernoulliNB_model recall (train): {recall_score(y_train, BernoulliNB_y_pred_train, average='micro')}")


from pickle import dump

dump(BernoulliNB_model, open("BernoulliNB_model.sav", "wb"))


 
 BernoulliNB_model Accuracy (test): 0.7597765363128491

 BernoulliNB_model Accuracy (train): 0.9030898876404494

 
 BernoulliNB_model f1_score (test): 0.7597765363128491

 BernoulliNB_model f1_score (train): 0.9030898876404494

 
 BernoulliNB_model precision (test): 0.7597765363128491

 BernoulliNB_model precision (train): 0.9030898876404494

 
 BernoulliNB_model recall (test): 0.7597765363128491

 BernoulliNB_model recall (train): 0.9030898876404494
