# Naive Bayes - EDA and Model Training

In [114]:
# Import packages and DataSet  

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw reviews CSV into a DataFrame.
# NOTE(review): this is an absolute path inside one specific workspace; the
# notebook will not load the data anywhere else unless the path is adjusted.
total_data = pd.read_csv('/workspaces/gustavolima-naivebayes/data/raw/main_playstore_reviews.csv')

#### Dataset Exploration

In [115]:
# Dataset Shape: (number of rows, number of columns) of the raw frame

total_data.shape

(891, 3)

In [116]:
# Dataset Information: column names, non-null counts and dtypes

total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


#### Treat and Study The Variables

In [117]:
# Drop unwanted Features
# Only the review text and its polarity label are kept.

total_data = total_data.drop(columns='package_name')

In [118]:
# Normalise the review text: trim leading/trailing whitespace, then lower-case it
trimmed_reviews = total_data['review'].str.strip()
total_data['review'] = trimmed_reviews.str.lower()

In [119]:
# Re-inspect the frame after dropping the column and normalising the text
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.1+ KB


In [120]:
# Split the DataSet
from sklearn.model_selection import train_test_split

# Features: every column except the label. Target: the 'polarity' column.
X = total_data.drop('polarity', axis=1)
y = total_data['polarity']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on the label — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Pull the 'review' column out of each split as a plain list of strings.
# This is the form handed to the text vectorizer afterwards (it expects an
# iterable of raw documents, not a DataFrame).
X_train = list(X_train['review'])
X_test = list(X_test['review'])

In [121]:
# Convert the Data to a Word Count Matrix
from sklearn.feature_extraction.text import CountVectorizer

# English stop words are removed before counting.
vec_model = CountVectorizer(stop_words = "english")
# The vocabulary is fitted on the training reviews only and then reused to
# transform the test reviews; .toarray() converts each result to a dense array.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split would feed arrays, not text, to the vectorizer.
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [122]:
# Peek at the dense training matrix (the repr elides most entries)
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Dataset Naive Bayes Training

##### Test with Bernoulli (as our results are Binary)

In [123]:
# Create a Bernoulli Naive Bayes classifier with default settings and train it
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)

In [124]:
# Predict the polarity of the test reviews and display the predicted labels
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [125]:
# Accuracy Test
from sklearn.metrics import accuracy_score

# Share of test reviews whose predicted polarity equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {test_accuracy}')

Accuracy is: 0.770949720670391


In [126]:
# Save the Model
from pickle import dump

# Pickle the fitted default BernoulliNB to disk. The context manager makes
# sure the file is flushed and closed as soon as the dump finishes, instead
# of leaving an open handle for the garbage collector to clean up.
with open("/workspaces/gustavolima-naivebayes/models/nbayes_bernoulli_default.sav", "wb") as model_file:
    dump(model, model_file)

##### Optimize Bernoulli


In [127]:
# Optimize the Bernoulli Model
from sklearn.model_selection import GridSearchCV

# We define the parameters by hand that we want to adjust
# (4 x 3 x 2 = 24 combinations).
# NOTE(review): on integer word counts the 'binarize' thresholds 0.0 and 0.5
# presumably produce the same binarised features — confirm before keeping both.
hyperparams = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
    'binarize': [0.0, 0.5, 1.0],
    'fit_prior': [True, False],
}

# We initialize the grid: 5-fold cross-validation scored on accuracy.
# `model` is whatever estimator is currently bound to that name, i.e. the
# BernoulliNB created above when the notebook is run top to bottom.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)
grid

In [128]:
# Run the grid search on the training data, then report the best hyperparameter combination
grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'alpha': 0.1, 'binarize': 0.0, 'fit_prior': True}


In [129]:
# Retrain the Model
# Build a BernoulliNB with the hyperparameters reported by the grid search and fit it.
# The values are typed in by hand, so they must be updated if the search is
# re-run and reports a different best combination.
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB(alpha=0.1, binarize=0.0, fit_prior=True)
model.fit(X_train, y_train)

In [130]:
# Predict the polarity of the test reviews with the tuned Bernoulli model and display the labels
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [131]:
# Accuracy Test
from sklearn.metrics import accuracy_score

# Share of test reviews whose predicted polarity equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {test_accuracy}')

Accuracy is: 0.8324022346368715


In [None]:
# Save the Model
from pickle import dump

# Pickle the tuned BernoulliNB to disk. The context manager makes sure the
# file is flushed and closed as soon as the dump finishes, instead of leaving
# an open handle for the garbage collector to clean up.
with open("/workspaces/gustavolima-naivebayes/models/nbayes_bernoulli_opt_default.sav", "wb") as model_file:
    dump(model, model_file)

#### Test with Multinomial (Gaussian doesn't make sense here as it's not continuous data)

In [102]:
# Create a Multinomial Naive Bayes classifier with default settings and train it
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [103]:
# Predict the polarity of the test reviews with the default Multinomial model and display the labels
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [104]:
# Accuracy Test
from sklearn.metrics import accuracy_score

# Share of test reviews whose predicted polarity equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {test_accuracy}')

Accuracy is: 0.8156424581005587


In [105]:
# Save the Model
from pickle import dump

# Pickle the fitted default MultinomialNB to disk. The context manager makes
# sure the file is flushed and closed as soon as the dump finishes, instead
# of leaving an open handle for the garbage collector to clean up.
with open("/workspaces/gustavolima-naivebayes/models/nbayes_multinomial_default.sav", "wb") as model_file:
    dump(model, model_file)

#### Optimize Hyperparameters

##### Optimize Multinomial

In [106]:
# Optimize the Multinomial Model
from sklearn.model_selection import GridSearchCV

# We define the parameters by hand that we want to adjust
# (5 x 2 = 10 combinations).
hyperparams = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'fit_prior': [True, False]
}

# We initialize the grid: 5-fold cross-validation scored on accuracy.
# `model` is whatever estimator is currently bound to that name, i.e. the
# MultinomialNB created above when the notebook is run top to bottom.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)
grid

In [107]:
# Run the grid search on the training data, then report the best hyperparameter combination
grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'alpha': 0.01, 'fit_prior': False}


In [108]:
# Re-run the Model
# Build a MultinomialNB with the hyperparameters reported by the grid search and fit it.
# The values are typed in by hand, so they must be updated if the search is
# re-run and reports a different best combination.
model = MultinomialNB(alpha=0.01, fit_prior=False)
model.fit(X_train, y_train)

In [109]:
# Predict the polarity of the test reviews with the tuned Multinomial model and display the labels
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [110]:
# Accuracy Test
from sklearn.metrics import accuracy_score

# Share of test reviews whose predicted polarity equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {test_accuracy}')

Accuracy is: 0.8212290502793296


In [111]:
# Save the Model
from pickle import dump

# Pickle the tuned MultinomialNB to disk. The context manager makes sure the
# file is flushed and closed as soon as the dump finishes, instead of leaving
# an open handle for the garbage collector to clean up.
with open("/workspaces/gustavolima-naivebayes/models/nbayes_multinomial_opt_default.sav", "wb") as model_file:
    dump(model, model_file)

## Conclusion of the Analysis

Despite the Bernoulli model giving slightly better results, given the data we are working with it is best to go with the Multinomial model, because our data is in fact discrete (word counts).
We managed to achieve 82% accuracy by optimizing the Multinomial model.

### Boosting the Model

In [132]:
# Import XGBoost
from xgboost import XGBClassifier

In [133]:
# Train the New Model
model = XGBClassifier(random_state = 42)
model.fit(X_train, y_train)

In [134]:
# Predict the polarity of the test reviews with the boosted model and display the labels
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [135]:
# Check Accuracy
# accuracy_score is not imported in this cell; it relies on the import made in
# one of the earlier accuracy cells having been run first.
accuracy_score(y_test, y_pred)

0.8100558659217877

In [None]:
# Save the Model
from pickle import dump

# Pickle the fitted XGBClassifier to disk. The context manager makes sure the
# file is flushed and closed as soon as the dump finishes, instead of leaving
# an open handle for the garbage collector to clean up.
with open("/workspaces/gustavolima-naivebayes/models/nbayes_boosting_default.sav", "wb") as model_file:
    dump(model, model_file)