In [8]:
#importing the relevant libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [2]:
# Location of the dataset. The default is the original local path; set the
# NB_DIABETES_CSV environment variable to point at your own copy of the file
# so the notebook can run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'NB_DIABETES_CSV',
    r'/home/jeromemugita/Documents/Code/DataScience/Intermediate DS/Datasets/NB_diabetes.csv',
)
diabetes = pd.read_csv(DATA_PATH)
diabetes

Unnamed: 0,glucose,bloodpressure,diabetes
0,40,85,0
1,40,92,0
2,45,63,1
3,45,80,0
4,40,73,1
...,...,...,...
990,45,87,0
991,40,83,0
992,40,83,0
993,40,60,1


In [3]:
# Work on a separate copy so the frame loaded from disk is left as it was read.
df = diabetes.copy(deep=True)
df.shape

(995, 3)

In [6]:
# Features: the glucose and bloodpressure columns. Target: the diabetes column.
X = df.loc[:, ['glucose', 'bloodpressure']]
y = df.loc[:, 'diabetes']

In [7]:
# Checking that X and y are compatible (same number of rows)

print(X.shape, y.shape, sep='\n')

(995, 2)
(995,)


In [10]:
# Convert the feature table and the target column into plain NumPy arrays
# before splitting them into train and test sets.

X, y = np.array(X), np.array(y)

In [11]:
# Hold out 20% of the rows as a test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Shapes (rows, columns) of the train and test feature arrays
(X_train.shape, X_test.shape)

((796, 2), (199, 2))

In [13]:
# Shapes of the train and test target arrays
(y_train.shape, y_test.shape)

((796,), (199,))

## Gaussian Naive Bayes

In [14]:
# Create the Gaussian Naive Bayes classifier with its default settings.
model = GaussianNB()

In [16]:
# Train the Gaussian model on the training features and labels
model.fit(X=X_train, y=y_train)

In [17]:
# We first check the accuracy score on the test set, which is about 93%
# (0.9296) - pretty accurate.

model.score(X_test,y_test)

0.9296482412060302

In [18]:
# Predict labels for the test features, then show the true labels and the
# predictions side by side.
y_pred = model.predict(X=X_test)
(y_test, y_pred)

(array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
        0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        0]),
 array([1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 1, 0,

In [20]:
# We then evaluate the accuracy of the predictions we've made.
# accuracy_score is imported with the other libraries in the first cell, so
# every dependency of the notebook is declared in one place.
acc_sc = accuracy_score(y_test, y_pred)
acc_sc

0.9296482412060302

In [21]:
# Finally, we can use the fitted model to predict the diabetes status of a
# patient with glucose of 48 and blood pressure of 75.
#
# The predicted class is 1 (diabetic). Note that the ~93% test accuracy
# describes the model as a whole; it is NOT this patient's probability of
# being diabetic. For a per-patient class probability, use
# model.predict_proba(patient1).

patient1 = [[48, 75]]
status = model.predict(patient1)
status

array([1])

## Bernoulli Naive Bayes

In [22]:
# Create the Bernoulli Naive Bayes classifier with its default settings.
# NOTE(review): per the scikit-learn docs, BernoulliNB binarizes features with
# binarize=0.0 by default. The glucose / bloodpressure values displayed earlier
# are all positive, so every feature may collapse to 1 - confirm this model
# is appropriate for these continuous measurements.
modelBN = BernoulliNB()

In [23]:
# Train the Bernoulli model on the same training samples

modelBN.fit(X=X_train, y=y_train)

In [26]:
# First we check the accuracy on the held-out test set
# (X_test / y_test are scored here, not the training data)
modelBN.score(X_test, y_test)

# The model reaches only about 47% accuracy, which is too low to rely
# on for decision making.

0.46733668341708545

In [32]:
statusBN = modelBN.predict(patient1)
statusBN

# The Bernoulli Naive Bayes model predicts that our patient is non-diabetic.
# Given this model's ~47% test accuracy (an overall figure, not the probability
# that this particular prediction is correct), we may opt to use another model
# to make this prediction.

array([0])

## Multinomial Naive Bayes

In [27]:
# First we create the Multinomial Naive Bayes classifier with default settings.
# NOTE(review): scikit-learn documents MultinomialNB as intended for count-like
# features - confirm it is appropriate for glucose / bloodpressure readings.
modelMN = MultinomialNB()

In [28]:
# Train the multinomial model on the training features and labels
modelMN.fit(X=X_train, y=y_train)

In [29]:
# We then check the accuracy on the held-out test set (X_test / y_test)
modelMN.score(X_test, y_test)

0.7386934673366834

In [30]:
# Keep the multinomial predictions under their own name so the Gaussian
# model's y_pred (the input to acc_sc) is not silently overwritten when
# cells are re-run or inspected later.
y_predMN = modelMN.predict(X_test)
y_test, y_predMN

(array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
        0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        0]),
 array([1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0,

In [31]:
# If we were to use the multinomial naive Bayes model to predict the diabetes
# status for the same patient we had:

statusMN = modelMN.predict(patient1)
statusMN

# It predicts the same class as the Gaussian model. Its ~74% test accuracy is
# an overall figure (not this patient's probability) and is not very good,
# but given that the Gaussian Naive Bayes returns the same result, we
# can choose to accept the answer.

array([1])