# Predicting diabetes using Naive Bayes Classifier

# Import libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("/Users/amitnegi/Desktop/pima-data.csv")
df.shape


(768, 10)

In [2]:
df.head(5)


Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


In [3]:
df.tail(5)


Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
763,10,101,76,48,180,32.9,0.171,63,1.8912,False
764,2,122,70,27,0,36.8,0.34,27,1.0638,False
765,5,121,72,23,112,26.2,0.245,30,0.9062,False
766,1,126,60,0,0,30.1,0.349,47,0.0,True
767,1,93,70,31,0,30.4,0.315,23,1.2214,False


In [4]:
df.isnull().values.any()

False

In [5]:
del df['skin']

In [6]:
df.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


In [7]:
# Check datatypes, converting Booelean to numeric data

diabetes_map= {True:1, False:0}
df['diabetes'] = df['diabetes'].map(diabetes_map)
df.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# checking true false ratio

num_true = len(df.loc[df['diabetes']==True])
num_false = len(df.loc[df['diabetes']==False])

print("Number of true cases {0}({1:2.2f}%)".format(num_true, (num_true/(num_true+num_false))*100))
print("Number of false cases {0}({1:2.2f}%)".format(num_false, (num_false/(num_true+num_false))*100))


Number of true cases 268(34.90%)
Number of false cases 500(65.10%)


In [23]:
# Spliting  the data into 70% training data and 30% test data using scikit learn

from sklearn.model_selection import train_test_split
feature_col_names = ['num_preg','glucose_conc','diastolic_bp','thickness','insulin','bmi','diab_pred','age']
predicted_class_name =['diabetes']

x= df[feature_col_names].values
y= df[predicted_class_name].values

split_test_size = 0.30

X_train, X_test, Y_train, Y_test = train_test_split(x, y,test_size = split_test_size, random_state=42)


print("{0:.2f}% in training set".format(len(X_train)/len(df.index)*100))     # Checking the 70:30 ratio for train and test sets
print("{0:.2f}% in test set".format(len(X_test)/len(df.index)*100))

#Imputing the missing values with the mean values

from sklearn.impute import SimpleImputer


#Impute with mean all 0 values

fill_0 = SimpleImputer(missing_values=0,strategy ="mean")
X_train= fill_0.fit_transform(X_train)
X_test= fill_0.fit_transform(X_test)

#Train the model with Naives Bayes Algorithm

from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train.ravel())
                    





69.92% in training set
30.08% in test set


GaussianNB()

In [24]:
# Performance on test data

nb_predict_train =  nb_model.predict(X_train)

from sklearn import metrics
print("Accuracy: {0: .4f}".format(metrics.accuracy_score(Y_train, nb_predict_train)))
print()


Accuracy:  0.7542



In [25]:
nb_predict_test =  nb_model.predict(X_test)

from sklearn import metrics
print("Accuracy: {0: .4f}".format(metrics.accuracy_score(Y_test, nb_predict_test)))
print()

Accuracy:  0.7359



In [27]:
# Metrics

print("Confusion matrix")                                 # Confusion matrix that compares the predicted natural results for diabetes
print("{0}".format(metrics.confusion_matrix(Y_test, nb_predict_test)))
print("")

print("Classification report")                                 # Classification Report for the statistics based on confusion matrix
print(metrics.classification_report(Y_test, nb_predict_test))


Confusion matrix
[[118  33]
 [ 28  52]]

Classification report
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       151
           1       0.61      0.65      0.63        80

    accuracy                           0.74       231
   macro avg       0.71      0.72      0.71       231
weighted avg       0.74      0.74      0.74       231

