In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Read the diabetes dataset from its CSV file (path is relative to the notebook).
DATASET_CSV = '../dataset/diabetes_dataset.csv'
dia = pd.read_csv(DATASET_CSV)
# Bare last expression so the notebook renders the frame as a table.
dia

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,80.0,0,1,25.19,6.6,140,0
1,54.0,0,0,27.32,6.6,80,0
2,28.0,0,0,27.32,5.7,158,0
3,36.0,0,0,23.45,5.0,155,0
4,76.0,1,1,20.14,4.8,155,0
...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0
99996,2.0,0,0,17.37,6.5,100,0
99997,66.0,0,0,27.83,5.7,155,0
99998,24.0,0,0,35.42,4.0,100,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dia.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [4]:
# Number of rows per target class; shows how unevenly the two labels are represented.
dia['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [5]:
# Replace any missing values with 0. `fillna` returns a new DataFrame and
# leaves `dia` untouched, so the result must be assigned back to take effect.
dia = dia.fillna(0)
# Show the dtype of each column.
print(dia.dtypes)

age                    float64
hypertension             int64
heart_disease            int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object


In [6]:
# Convert the continuous columns to int64. The cast drops the fractional part
# instead of rounding.
# NOTE(review): this discards information (e.g. an HbA1c_level of 6.6 becomes 6);
# confirm the truncation is intended.
for column in ('age', 'HbA1c_level', 'bmi'):
    dia[column] = dia[column].astype('int64')

In [7]:
# Mean of every feature within each target class (0 and 1), for a quick comparison.
dia.groupby('diabetes').mean()

Unnamed: 0_level_0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,40.104044,0.058984,0.029235,26.442284,5.002448,132.85247
1,60.946588,0.245647,0.149059,31.528118,6.531765,194.094706


In [8]:
# Separate the target column from the features.
target_column = 'diabetes'
Y = dia[target_column]
X = dia.drop(columns=[target_column])
print(X)

       age  hypertension  heart_disease  bmi  HbA1c_level  blood_glucose_level
0       80             0              1   25            6                  140
1       54             0              0   27            6                   80
2       28             0              0   27            5                  158
3       36             0              0   23            5                  155
4       76             1              1   20            4                  155
...    ...           ...            ...  ...          ...                  ...
99995   80             0              0   27            6                   90
99996    2             0              0   17            6                  100
99997   66             0              0   27            5                  155
99998   24             0              0   35            4                  100
99999   57             0              0   22            6                   90

[100000 rows x 6 columns]


In [9]:
# Inspect the target vector (the `diabetes` column).
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


In [10]:
# Standardize the features: learn the scaling parameters from X, then apply
# them to X.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so the held-out rows influence the preprocessing.
scalar = StandardScaler()
scalar.fit(X)
Standardized_data = scalar.transform(X)
print(Standardized_data)

[[ 1.69176064 -0.28443945  4.93637859 -0.28223367  0.78789702  0.04770422]
 [ 0.53801538 -0.28443945 -0.20257766  0.01888303  0.78789702 -1.42620999]
 [-0.61572988 -0.28443945 -0.20257766  0.01888303 -0.12027881  0.48987848]
 ...
 [ 1.07051319 -0.28443945 -0.20257766  0.01888303 -0.12027881  0.41618277]
 [-0.79322915 -0.28443945 -0.20257766  1.22334981 -1.02845463 -0.93490525]
 [ 0.67113983 -0.28443945 -0.20257766 -0.73390871  0.78789702 -1.18055762]]


In [11]:
# From here on X refers to the standardized array, not the original DataFrame.
# NOTE(review): because X is rebound, re-running the scaling cell after this one
# would fit the scaler on already-standardized data; run the cells in order.
X = Standardized_data

Train Test Split

In [12]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [13]:
# Shapes of the full, training and test feature arrays, in that order.
print(*(split.shape for split in (X, X_train, X_test)))

(100000, 6) (80000, 6) (20000, 6)


In [14]:
# Support vector classifier with a linear kernel, trained on the training split.
classifier = svm.SVC(kernel='linear')
# Last expression of the cell, so the notebook also displays the fitted estimator.
classifier.fit(X_train, Y_train)

In [15]:
# Accuracy of the fitted classifier on the data it was trained on.
X_train_prediction = classifier.predict(X_train)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
# Accuracy itself is symmetric, but the correct order matters as soon as this
# call is swapped for an asymmetric metric such as precision or recall.
trained_accuracy = accuracy_score(Y_train, X_train_prediction)
print(trained_accuracy)

0.959625


In [16]:
# Accuracy of the fitted classifier on the held-out test split.
X_test_prediction = classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
# NOTE(review): 91,500 of the 100,000 rows are class 0, so always predicting 0
# would already score about 0.915; read this number against that baseline.
test_accuracy = accuracy_score(Y_test, X_test_prediction)
print(test_accuracy)

0.9622


In [17]:
# Predict for one person. Values follow the training column order:
# age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level.
input_data = (57.0, 0, 0, 27.32, 8.2, 126)

# Build a one-row frame carrying the same feature names the scaler was fitted
# on, instead of passing an unnamed array.
feature_names = dia.columns.drop('diabetes')
input_frame = pd.DataFrame([input_data], columns=feature_names)

# The scaler and the model were fitted on age, HbA1c_level and bmi truncated to
# int64, so apply the same cast here; otherwise the input is preprocessed
# differently from the training data.
input_frame = input_frame.astype(
    {'age': 'int64', 'HbA1c_level': 'int64', 'bmi': 'int64'}
)

# Scale with the already-fitted scaler (transform only, never re-fit here).
std_data = scalar.transform(input_frame)

prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not Diabetic')
else:
    print('The person is Diabetic')

[1]
The person is Diabetic


Saving the model

In [18]:
# Persist the trained classifier to disk. The `with` blocks make sure each file
# handle is closed (and the written data flushed) as soon as the operation ends.
# NOTE(review): only the classifier is saved; the fitted scaler `scalar` is not,
# although inputs must be scaled with it before calling predict.
filename = '../saved_models/diabetes_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Load the model back to check that the saved file is usable.
# pickle.load can execute arbitrary code, so only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [19]:
# Predict for one person with the model reloaded from disk. Values follow the
# training column order:
# age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level.
input_data = (75.0, 0, 0, 27.32, 6.1, 100)

# Build a one-row frame carrying the same feature names the scaler was fitted
# on, instead of passing an unnamed array.
feature_names = dia.columns.drop('diabetes')
input_frame = pd.DataFrame([input_data], columns=feature_names)

# The scaler and the model were fitted on age, HbA1c_level and bmi truncated to
# int64, so apply the same cast here; otherwise the input is preprocessed
# differently from the training data.
input_frame = input_frame.astype(
    {'age': 'int64', 'HbA1c_level': 'int64', 'bmi': 'int64'}
)

# Scale with the already-fitted scaler (transform only, never re-fit here).
std_data = scalar.transform(input_frame)

prediction = loaded_model.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not Diabetic')
else:
    print('The person is Diabetic')

[0]
The person is not Diabetic


In [20]:
# Column names of the frame, in order, as a plain Python list.
column_headers = dia.columns.tolist()
print("The Column Header :", column_headers)

The Column Header : ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
