# Naive Bayes

In [30]:
# General Libraries
import warnings

# Data Analysis & Visualization
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from pickle import dump

# Silence every warning for the rest of the session: "ignore" is given without
# a category or module filter, so it applies to all of them.
warnings.filterwarnings("ignore")

In [31]:
# Read the Play Store reviews CSV directly from the tutorial repository and
# preview the first five rows.
original_df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
)
original_df.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [32]:
# Print the column dtypes, non-null counts and memory usage of the raw frame.
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [33]:
# (rows, columns) of the raw frame.
original_df.shape

(891, 3)

In [34]:
# Number of rows that contain at least one missing value.
int(original_df.isna().any(axis=1).sum())

0

In [35]:
# For each column, print the distinct values themselves when there are at most
# 15 of them, and only their count otherwise.
for column_name in original_df.columns:
    distinct_values = original_df[column_name].unique()
    if len(distinct_values) <= 15:
        print(distinct_values)
    else:
        print(len(distinct_values))

23
891
[0 1]


In [36]:
# Report the minimum and maximum of every int64/float64 column.
for column_name in original_df.columns:
    column = original_df[column_name]
    if column.dtype != 'int64' and column.dtype != 'float64':
        continue
    print(column_name.capitalize())
    print('\tMin:', column.min())
    print('\tMax:', column.max())

Polarity
	Min: 0
	Max: 1


In [37]:
# Show the rows that repeat an earlier row in full.
original_df.loc[original_df.duplicated()]

Unnamed: 0,package_name,review,polarity


In [38]:
# Discard fully duplicated rows, then confirm the resulting (rows, columns).
original_df = original_df.drop_duplicates()
original_df.shape

(891, 3)

In [39]:
# Rows whose review text is missing. isnull() is an alias of isna(), and a cell
# renders only its last expression, so a single lookup is sufficient.
original_df[original_df['review'].isna()]

Unnamed: 0,package_name,review,polarity


### Processing the Data

In [40]:
# Look up the review stored exactly as ' aa nice' (note the leading space).
original_df.loc[original_df['review'].eq(' aa nice')]

Unnamed: 0,package_name,review,polarity
761,com.shirantech.kantipur,aa nice,1


In [41]:
# Continue on an independent (deep) copy so the loaded frame is not modified.
processed_data = original_df.copy(deep=True)
processed_data.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [42]:
# Keep only the review text and its label, and normalize the text by trimming
# surrounding whitespace and lower-casing it.
# Built as a chain from original_df instead of dropping in place: an in-place
# drop makes the cell fail with a KeyError when it is run a second time,
# because 'package_name' is already gone.
processed_data = (
    original_df
    .drop(columns='package_name')
    .assign(review=lambda frame: frame['review'].str.strip().str.lower())
)

In [43]:
# Print the column dtypes and non-null counts of the cleaned frame.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.1+ KB


#### Dividing the data

In [44]:
# Features are the review texts, the target is the 0/1 polarity label.
x, y = processed_data["review"], processed_data['polarity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size = 0.2,
    random_state = 42
)

# A cell renders only its last expression, so the former bare `xtrain.head()`
# ahead of this line was evaluated and thrown away; it has been removed.
xtest.head()

709    love/hate has bug and security issues. i tried...
439    whatsapp i use this app now that blackberry me...
840                             usefully verry  nice app
720    fonts why in the heck is this thing analysing ...
39     app doesn't work after latest upgrade the face...
Name: review, dtype: object

In [45]:
# Bag-of-words encoder that ignores English stop words.
vec_model = CountVectorizer(stop_words="english")

# The labels need no transformation; expose them under the X_/y_ naming.
y_train, y_test = ytrain, ytest

# Learn the vocabulary on the training texts only, then reuse it for the test
# texts. Both count matrices are converted to dense arrays.
X_train = vec_model.fit_transform(xtrain).toarray()
X_test = vec_model.transform(xtest).toarray()

X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Create Naive Bayes Model

In [46]:
def get_best_score(dictionary):
    """Return a ``'<Name>: <score>'`` label for the highest-scoring entry.

    Parameters
    ----------
    dictionary : dict
        Maps a name (str) to its score.

    Returns
    -------
    str
        The title-cased name of the entry with the largest value, followed by
        that value. On a tie the entry inserted first wins.

    Raises
    ------
    ValueError
        If ``dictionary`` is empty.
    """
    # Rank by value. A bare max(dictionary) compares the keys, so it returns
    # the alphabetically last name no matter what the scores are.
    best_name = max(dictionary, key=dictionary.get)
    return f'{best_name.title()}: {dictionary[best_name]}'

# Candidate Naive Bayes variants, all with their default settings.
model_dict = {
    'gaussian': GaussianNB(),
    'bernoulli': BernoulliNB(),
    'multinomial': MultinomialNB(),
}

# Fit every candidate on the training split and record its accuracy on the
# held-out split under the same key.
scores = {}
for model_name, estimator in model_dict.items():
    estimator.fit(X_train, y_train)
    scores[model_name] = accuracy_score(y_test, estimator.predict(X_test))

print(get_best_score(scores))


Multinomial: 0.8156424581005587


### Optimizing the Model

In [47]:
# Search space: 300 evenly spaced alpha values in [0.01, 10] and both settings
# of fit_prior.
hyperparams = dict(
    alpha=np.linspace(0.01, 10, num=300),
    fit_prior=[True, False],
)

# Sample 50 combinations and score each one by 10-fold cross-validated
# accuracy; the seed fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    estimator=model_dict['multinomial'],
    param_distributions=hyperparams,
    n_iter=50,
    scoring='accuracy',
    cv=10,
    random_state=42,
)

In [48]:
# Run the randomized hyperparameter search on the training split.
random_search.fit(X_train, y_train)

In [49]:
# Hyperparameter combination selected by the search.
random_search.best_params_

{'fit_prior': True, 'alpha': 0.5111705685618729}

In [50]:
# Fresh classifier configured with the selected hyperparameters; the search
# space only contains `alpha` and `fit_prior`, so those are the keys unpacked.
opt_model = MultinomialNB(**random_search.best_params_)

In [51]:
# Train the classifier built from the selected hyperparameters on the training split.
opt_model.fit(X_train, y_train)

In [52]:
# Score the tuned classifier on the held-out split.
test_predictions = opt_model.predict(X_test)
print(f'Model Accuracy after Optimization: {accuracy_score(y_test, test_predictions)}')

Model Accuracy after Optimization: 0.8268156424581006


### Save Model

In [53]:
# Pickle the tuned classifier. The context manager closes the file handle
# (flushing the pickle to disk) even if dump() raises; the bare open() call
# used before was never closed.
# NOTE(review): the digits after "alpha_" in the file name equal the test
# accuracy printed by the notebook, not the selected alpha. The name is kept
# unchanged in case other code loads this path; confirm before renaming.
# NOTE(review): only the classifier is saved here. The fitted `vec_model` is
# needed to encode new text and is not persisted in the cells visible here.
with open("../models/naive_bayes_multinomial-alpha_0-8268156424581006_fit_prior-True_rand_state-42.sav", "wb") as model_file:
    dump(opt_model, model_file)
