# Naive Bayes - EDA and Model Training

In [20]:
# Import packages and DataSet  

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw Play Store reviews CSV, named once so it is easy to spot and change
RAW_REVIEWS_CSV = '/workspaces/gustavolima-naivebayes/data/raw/main_playstore_reviews.csv'
total_data = pd.read_csv(RAW_REVIEWS_CSV)

#### Dataset Exploration

In [21]:
# Dataset shape as a (rows, columns) tuple

total_data.shape

(891, 3)

In [22]:
# Dataset information: column names, non-null counts and dtypes

total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


#### Treat and Study The Variables

In [23]:
# Discard the app identifier column; only the review text and its polarity are kept

total_data = total_data.drop(columns=['package_name'])

In [25]:
# Normalise the review text: trim leading/trailing whitespace, then lower-case it
review_text = total_data['review']
total_data['review'] = review_text.str.strip().str.lower()

In [27]:
# Re-check the frame after dropping package_name and normalising the review text
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.1+ KB


In [62]:
# Hold out 20% of the reviews for testing (fixed random_state keeps the split repeatable)
from sklearn.model_selection import train_test_split

X = total_data.drop(columns=['polarity'])
y = total_data['polarity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Flatten the single-column feature frames into plain lists of review strings,
# the form in which they are handed to the text vectorizer
X_train = X_train['review'].tolist()
X_test = X_test['review'].tolist()

In [63]:
# Turn the review text into a word-count matrix (English stop words removed).
# The vocabulary is fitted on the training reviews only and reused for the test reviews.
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Densify the vectorizer output into regular arrays
X_train = train_counts.toarray()
X_test = test_counts.toarray()

In [66]:
# Peek at the dense word-count matrix built from the training reviews
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Dataset Naive Bayes Training

##### Test with Bernoulli (treats each word as a binary present/absent feature)

In [67]:
# Train a Bernoulli Naive Bayes classifier (default hyperparameters) on the training word counts
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)

In [68]:
# Predict the polarity of the held-out test reviews with the fitted model
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [70]:
# Share of test reviews whose predicted polarity matches the true label
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {accuracy}')

Accuracy is: 0.770949720670391


In [71]:
# Save the Model
from pickle import dump

# Open the target inside a context manager so the handle is flushed and closed
# as soon as the pickle is written (a bare open(...) argument is never closed explicitly).
with open("/workspaces/gustavolima-naivebayes/models/nbayes_bernoulli_default.sav", "wb") as model_file:
    dump(model, model_file)

##### Test with Multinomial (Gaussian doesn't make sense here, as word counts are discrete rather than continuous data)

In [72]:
# Train a Multinomial Naive Bayes classifier (default hyperparameters) on the training word counts.
# Note: this rebinds `model`, replacing the classifier fitted in the previous section.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [73]:
# Predict the polarity of the held-out test reviews with the fitted model
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [74]:
# Share of test reviews whose predicted polarity matches the true label
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is: {accuracy}')

Accuracy is: 0.8156424581005587


In [75]:
# Save the Model
from pickle import dump

# Open the target inside a context manager so the handle is flushed and closed
# as soon as the pickle is written (a bare open(...) argument is never closed explicitly).
with open("/workspaces/gustavolima-naivebayes/models/nbayes_multinomial_default.sav", "wb") as model_file:
    dump(model, model_file)

#### Optimize Hyperparameters