Diabetes prediction using the Support Vector Machine algorithm

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data collection and analysis


In [None]:
# Location of the diabetes CSV inside the Colab runtime.
DATA_PATH = '/content/diabetes.csv'
# Load the raw records into a DataFrame (original author's note: the data covers females only).
diabetes_dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the columns were parsed as expected.
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# (rows, columns) of the loaded frame.
diabetes_dataset.shape


(768, 9)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): the minimum of 0 reported for Glucose, BloodPressure, SkinThickness,
# Insulin and BMI presumably encodes missing measurements — confirm before modelling.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
diabetes_dataset["Pregnancies"].idxmax() # index label of the row with the highest number of pregnancies (first occurrence if tied)

159

In [None]:
# Show the full record with the highest pregnancy count.
# idxmax() returns an index *label*, so the row is looked up with .loc (label-based)
# rather than hard-coding the position 159 with .iloc; this stays correct if the
# data or its index ever changes.
max_pregnancies_label = diabetes_dataset["Pregnancies"].idxmax()
print(diabetes_dataset.loc[max_pregnancies_label])

Pregnancies                  17.000
Glucose                     163.000
BloodPressure                72.000
SkinThickness                41.000
Insulin                     114.000
BMI                          40.900
DiabetesPedigreeFunction      0.817
Age                          47.000
Outcome                       1.000
Name: 159, dtype: float64


In [None]:
# Count missing (NaN) entries per column; isna() is the canonical spelling of isnull().
diabetes_dataset.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
diabetes_dataset["Outcome"].value_counts() # class balance: 0 = non-diabetic, 1 = diabetic

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
# Per-class mean of every feature, to see which measurements differ between the two outcomes.
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [None]:
# Separate the label column from the predictors.
Y = diabetes_dataset['Outcome']                 # target vector (0 / 1 outcome)
X = diabetes_dataset.drop(columns=['Outcome'])  # feature matrix: every remaining column

Data Standardization


In [None]:
# Scaler used to standardize the features (StandardScaler with its default settings).
scaler= StandardScaler()

In [None]:
# Learn the per-feature scaling statistics from X.
# fit() returns the scaler itself, so trained_scaler and scaler refer to the same object.
# NOTE(review): the statistics are computed on the full dataset, before the train/test
# split further down, so test rows influence the scaling — consider fitting on the
# training split only.
trained_scaler=scaler.fit(X)

In [None]:
# Bring every feature onto a common scale using the statistics already learned by
# scaler.fit(X) in the previous cell. transform() is used instead of fit_transform()
# so the scaler is not fitted a second time on the same data.
Standardized_Data = scaler.transform(X)

In [None]:
# Rebind X to the standardized feature array (from here on X is no longer the raw
# DataFrame) and take the labels straight from the frame again.
X, Y = Standardized_Data, diabetes_dataset['Outcome']

In [None]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions of Y
# similar in the training and test splits; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the model

In [None]:
# Support vector classifier with a linear kernel; all other hyperparameters keep their defaults.
model=svm.SVC(kernel='linear')

In [None]:
# Fit the classifier on the training split only; the test split is kept for evaluation.
model.fit(X_train,Y_train)

Model Evaluation

In [None]:
# Accuracy on the training split. accuracy_score is documented as (y_true, y_pred),
# so the ground-truth labels go first and the predictions second.
train_predictions = model.predict(X_train)
accuracy_score(Y_train, train_predictions)

0.7866449511400652

In [None]:
# Accuracy on the held-out test split. accuracy_score is documented as (y_true, y_pred),
# so the ground-truth labels go first and the predictions second.
test_predictions = model.predict(X_test)
accuracy_score(Y_test, test_predictions)

0.7727272727272727

Diabetes Prediction System

In [None]:
# Classify a single patient record with the fitted scaler and model.
# Values are listed in the same order as the feature columns of the dataset.
input_data = (6, 148, 72, 35, 0, 33.6, 0.627, 50)
# scikit-learn expects a 2-D (n_samples, n_features) input, so the flat vector is
# reshaped into a single-row matrix: [a, b, c] -> [[a, b, c]].
input_data = np.asarray(input_data).reshape(1, -1)
# The scaler was fitted on a DataFrame, so label the columns with the names it
# recorded; transforming an unlabelled array triggers scikit-learn's
# "X does not have valid feature names" warning. If no names were recorded,
# columns=None keeps the frame unlabelled.
input_frame = pd.DataFrame(input_data, columns=getattr(scaler, "feature_names_in_", None))
# Apply the same standardization that was used on the training data.
input_data = scaler.transform(input_frame)
prediction = model.predict(input_data)
print(prediction)
# Label 0 means non-diabetic; any other label is reported as diabetic.
if prediction[0] == 0:
  print("Non Diabetic")
else:
  print("Diabetic")

[1]
Diabetic




Saving the trained model

In [None]:
import pickle

In [None]:
# Save the fitted scaler to a file in Colab's local environment so the same
# standardization can be reapplied at prediction time.
file='scaler.sav'
# A context manager guarantees the handle is flushed and closed once the dump finishes.
with open(file,'wb') as scaler_file: # wb = write in binary mode
  pickle.dump(trained_scaler,scaler_file)

In [None]:
# Save the trained classifier next to the scaler.
file='trained_model.sav'
# A context manager guarantees the handle is flushed and closed once the dump finishes.
with open(file,'wb') as model_file: # wb = write in binary mode
  pickle.dump(model,model_file)

In [None]:
# Loading the saved scaler and model.
# NOTE: pickle.load can execute arbitrary code — only unpickle files you created or trust.
# Context managers close each file handle as soon as its object has been read.
with open('scaler.sav','rb') as scaler_file: # rb = read in binary mode
  loaded_scaler=pickle.load(scaler_file)
with open('trained_model.sav','rb') as model_file: # rb = read in binary mode
  loaded_model=pickle.load(model_file)

In [None]:
# Repeat the single-record prediction using ONLY the artifacts restored from disk,
# to confirm that the saved scaler and model reproduce the in-memory result.
input_data = (6, 148, 72, 35, 0, 33.6, 0.627, 50)
# Reshape the flat vector into a single-row (1, n_features) matrix.
input_data = np.asarray(input_data).reshape(1, -1)
# Label the columns with the feature names stored in the loaded scaler (if any), so
# scikit-learn does not warn about missing feature names.
input_frame = pd.DataFrame(input_data, columns=getattr(loaded_scaler, "feature_names_in_", None))
# Use the reloaded scaler (not the in-memory one) so the round trip is actually tested.
input_data = loaded_scaler.transform(input_frame)
prediction = loaded_model.predict(input_data)
print(prediction)
# Label 0 means non-diabetic; any other label is reported as diabetic.
if prediction[0] == 0:
  print("Non Diabetic")
else:
  print("Diabetic")

[1]
Diabetic


