In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes records from the CSV file into a pandas DataFrame,
# then show the frame as the cell output.
dia = pd.read_csv(filepath_or_buffer='../dataset/diabetes_dataset.csv')
dia

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the data
dia.describe()

In [None]:
# Count how many rows fall into each value of the target column 'diabetes'
# (shows how balanced the classes are).
dia['diabetes'].value_counts()

In [None]:
# Replace missing values with 0.
# DataFrame.fillna returns a NEW frame and leaves `dia` untouched, so the
# result must be assigned back; otherwise the nulls silently remain.
dia = dia.fillna(0)
# Display the data type of each column
print(dia.dtypes)

In [None]:
# Cast the listed columns to 64-bit integers, one column at a time.
# NOTE(review): if 'HbA1c_level' and 'bmi' hold fractional values, this cast
# discards the fractional part, while the sample records scored later in this
# notebook carry fractional values (e.g. 27.32, 8.2) — confirm this is intended.
for column_name in ('age', 'HbA1c_level', 'bmi'):
    dia[column_name] = dia[column_name].astype('int64')

In [None]:
# Group the rows by the 'diabetes' label and take the mean of each column per group
dia.groupby('diabetes').mean()

Splitting the Features and Target

In [None]:
# Target: the 'diabetes' column. Features: every other column of the frame.
Y = dia['diabetes']
X = dia.drop(columns='diabetes')
print(X)

In [None]:
# Show the target values taken from the 'diabetes' column
print(Y)

In [None]:
# Standardize the feature columns: learn the scaling statistics from X,
# then apply them to X.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed further down, so test rows contribute to the scaling
# statistics — consider fitting on the training split only.
scalar = StandardScaler()
scalar.fit(X)
Standardized_data = scalar.transform(X)
print(Standardized_data)

In [None]:
# From here on, X refers to the standardized feature values instead of the
# raw feature frame built earlier (re-running earlier cells restores the raw X).
X = Standardized_data

Train Test Split

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on Y so both
# partitions keep the same class proportions, and seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix and of the train / test partitions
print(X.shape, X_train.shape, X_test.shape)

In [None]:
# Build a support-vector classifier with a linear kernel and fit it on the
# training partition; the fitted estimator is shown as the cell output.
classifier = svm.SVC(kernel='linear')
classifier.fit(X=X_train, y=Y_train)

In [None]:
# Accuracy on the training data
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. Named arguments keep the roles explicit.
trained_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(trained_accuracy)

In [None]:
# Accuracy on the held-out test data
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. Named arguments keep the roles explicit.
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(test_accuracy)

In [None]:
# Score one hand-entered record with the freshly trained classifier.
# NOTE(review): the values are presumably in the same column order as the
# training features — confirm. Some training columns were cast to int64 earlier
# in this notebook, whereas this record keeps fractional values.
input_data = (57.0,0,0,27.32,8.2,126)

# Turn the tuple into a numpy array
input_data_as_numpy_array = np.array(input_data)

# Add a leading axis so the array is a single row, as we are predicting for only one instance
reshaped_input_data = input_data_as_numpy_array[np.newaxis, :]

# Apply the same scaling that was fitted on the training features
std_data = scalar.transform(reshaped_input_data)

prediction = classifier.predict(std_data)
print(prediction)

# Class 0 is reported as non-diabetic; any other predicted value as diabetic
print('The person is not Diabetic' if prediction[0] == 0 else 'The person is Diabetic')

Saving the trained model

In [None]:
import pickle

# Persist the trained classifier to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# before the model is read back below.
# NOTE(review): only the classifier is written here; the fitted `scalar` that
# inputs are passed through before prediction is not saved by this cell —
# confirm that consumers of this file can reproduce the same scaling.
filename = '../saved_models/diabetes_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Load the saved model back from the same path.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Score a second hand-entered record, this time with the model reloaded from disk
input_data = (75.0,0,0,27.32,6.1,100)

# View the tuple as a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Promote the 1-D array to a single-row 2-D array (one instance)
input_data_reshaped = np.atleast_2d(input_data_as_numpy_array)

# Scale the record with the scaler fitted earlier in the notebook
std_data = scalar.transform(input_data_reshaped)

prediction = loaded_model.predict(std_data)
print(prediction)

# Any non-zero class is reported as diabetic; class 0 as non-diabetic
if prediction[0] != 0:
  print('The person is Diabetic')
else:
  print('The person is not Diabetic')

In [None]:
# Collect the DataFrame's column labels into a plain Python list and print them
column_headers = dia.columns.tolist()
print("The Column Header :", column_headers)