Importing the Dependencies

In [181]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

Data Collection and Analysis

Diabetes Prediction Dataset

In [182]:
# Load the diabetes dataset into a pandas DataFrame.
# The CSV path is relative, so the file must be in the notebook's working directory.
diabetes_dataset = pd.read_csv('diabetes_prediction_dataset.csv') 

In [183]:
# preview the first 30 rows of the dataset
diabetes_dataset.head(30)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [184]:
# number of rows and columns in the dataset, as a (rows, columns) tuple
diabetes_dataset.shape

(100000, 9)

In [185]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
diabetes_dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [186]:
# class balance of the target column: how many rows carry each diabetes label
diabetes_dataset['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [187]:
# Per-class feature means: one row per diabetes label.
# numeric_only=True restricts the aggregation to numeric columns explicitly. The
# string-valued 'gender' and 'smoking_history' columns cannot be averaged, and
# pandas 2.x raises a TypeError on them instead of silently dropping them.
diabetes_dataset.groupby('diabetes').mean(numeric_only=True)

  diabetes_dataset.groupby('diabetes').mean()


Unnamed: 0_level_0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,40.115187,0.058984,0.029235,26.887163,5.396761,132.85247
1,60.946588,0.245647,0.149059,31.988382,6.934953,194.094706


In [188]:
# Replace the two string-valued columns with integer codes.
# Each column gets its own fitted LabelEncoder so that both category -> code
# mappings stay available afterwards (via `.classes_` / `.transform`), e.g. for
# encoding a new record before prediction. Reusing a single encoder would
# overwrite the gender mapping when it is refit on smoking_history.
gender_encoder = LabelEncoder()
smoking_history_encoder = LabelEncoder()

diabetes_dataset['gender'] = gender_encoder.fit_transform(diabetes_dataset['gender'])
diabetes_dataset['smoking_history'] = smoking_history_encoder.fit_transform(diabetes_dataset['smoking_history'])

# Backward-compatible alias: this cell previously left a single encoder called
# `model`, last fitted on smoking_history. Prefer the explicit names above.
model = smoking_history_encoder


In [189]:
# NOTE(review): abandoned experiment, left disabled. It would drop the two
# categorical columns instead of encoding them. If revived, remove `axis=2`:
# a DataFrame only has axes 0 and 1, and `columns=` already selects the axis.
# z = diabetes_dataset.drop(columns=['gender', 'smoking_history'], axis=2)

In [190]:
# split the frame into the label vector (Y) and the feature matrix (X = every other column)
Y = diabetes_dataset['diabetes']
X = diabetes_dataset.drop(columns=['diabetes'])

In [191]:
# plain-text dump of the feature matrix (pandas elides the middle rows)
print(X)

       gender   age  hypertension  heart_disease  smoking_history    bmi  \
0           0  80.0             0              1                4  25.19   
1           0  54.0             0              0                0  27.32   
2           1  28.0             0              0                4  27.32   
3           0  36.0             0              0                1  23.45   
4           1  76.0             1              1                1  20.14   
...       ...   ...           ...            ...              ...    ...   
99995       0  80.0             0              0                0  27.32   
99996       0   2.0             0              0                0  17.37   
99997       1  66.0             0              0                3  27.83   
99998       0  24.0             0              0                4  35.42   
99999       0  57.0             0              0                1  22.43   

       HbA1c_level  blood_glucose_level  
0              6.6                  140  
1  

In [192]:
# plain-text dump of the label vector (pandas elides the middle rows)
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


Train Test Split

In [193]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the label
# proportions the same in both partitions; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [194]:
# sanity check: shapes of the full, training and test feature matrices
print(*(features.shape for features in (X, X_train, X_test)))

(100000, 8) (80000, 8) (20000, 8)


Training the Model

In [195]:
# Gaussian Naive Bayes classifier.
# Earlier alternative, left disabled: classifier = svm.SVC(kernel='linear')
classifier = GaussianNB()


In [196]:
# fit the classifier on the training split
classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [197]:
# accuracy score on the training data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [198]:
# Report the training accuracy. Read it against the class balance: the labels
# are heavily imbalanced (91,500 vs 8,500 rows), so always predicting 0 would
# already score 0.915.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9033875


In [199]:
# accuracy score on the test data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [200]:
# Report the held-out accuracy. Read it against the class balance: with 91,500
# of 100,000 rows labelled 0 (and a stratified split), always predicting 0 would
# score 0.915, so accuracy alone says little about how well diabetics are detected.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.90515


Making a Predictive System

In [201]:
# One hand-written record, with values in the same order as the columns of X.
# gender and smoking_history must be the LabelEncoder integer codes, not the raw
# strings (in the rows previewed earlier: Female -> 0, Male -> 1; No Info -> 0,
# current -> 1, never -> 4).
input_data = (0,56.0,0,0,3,60.03,9.0,240)

# Wrap the record in a one-row DataFrame that carries the training column names.
# The classifier was fitted on a DataFrame, and recent scikit-learn versions warn
# about missing feature names when given a bare NumPy array. Building the frame
# also fails loudly if the record has the wrong number of values.
input_data_reshaped = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_data_reshaped)
print(prediction)

# label 0 means non-diabetic, label 1 means diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic




Saving the trained model

In [202]:
import pickle

In [203]:
# Serialise the fitted classifier to disk.
# The `with` block closes the file handle (flushing the bytes) even if dumping fails.
filename = 'dp_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [204]:
# loading the saved model
# NOTE: unpickling can execute arbitrary code embedded in the file, so only load
# model files you created yourself or otherwise trust.
with open('dp_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [205]:
# The same hand-written record as before, this time scored by the model that was
# reloaded from disk. Values follow the column order of X; gender and
# smoking_history are the LabelEncoder integer codes, not the raw strings.
input_data = (0.00, 56.00, 0.00, 0.00, 3.00, 60.03, 9.00, 240.00)

# Wrap the record in a one-row DataFrame that carries the training column names.
# The model was fitted on a DataFrame, and recent scikit-learn versions warn about
# missing feature names when given a bare NumPy array. Building the frame also
# fails loudly if the record has the wrong number of values.
input_data_reshaped = pd.DataFrame([input_data], columns=X.columns)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

# label 0 means non-diabetic, label 1 means diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic




In [206]:
# list the feature names in the order the model expects them, one per line
for feature_name in X.columns:
    print(feature_name)

gender
age
hypertension
heart_disease
smoking_history
bmi
HbA1c_level
blood_glucose_level
