In [223]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# SVC - Support Vector Classification
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')


In [224]:
# This will read the data from the csv file
data = pd.read_csv('/content/diabetes.csv')

In [225]:
# Finds the mean,median and mode of the dataset
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [237]:
data_copy = data.copy(deep=True)

In [238]:
# Removes data with 0 values from the dataset
data_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI']] = data_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI']].replace(0,np.nan)

In [239]:
# Replaces the zero data entries with the mean value of the dataset
data['Glucose'] = data['Glucose'].replace(0,data['Glucose'].median())
data['BloodPressure'] = data['BloodPressure'].replace(0,data['BloodPressure'].median())
data['SkinThickness'] = data['SkinThickness'].replace(0,data['SkinThickness'].median())
data['Insulin'] = data['Insulin'].replace(0,data['Insulin'].median())
data['BMI'] = data['BMI'].replace(0,data['BMI'].median())

In [240]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [241]:
# Training 90% of the dataset and tesing 10% of the dataset
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.10,random_state = 41)

In [242]:
# Feature scaling for Logistic Regression (Standard Scaler Output is input for logistic regression)
pipeline_lr = Pipeline([('scalar1',StandardScaler()),('lr_classifier',LogisticRegression())])
pipeline_knn = Pipeline([('scalar2',StandardScaler()),('lr_classifier',KNeighborsClassifier())])
pipeline_svc = Pipeline([('scalar3',StandardScaler()),('lr_classifier',SVC())])

In [243]:
# Feature Scaling is not required for non-machine learning algorithms like Decision Tree and Random forest
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier())])
pipeline_rf = Pipeline([('dt_classifier', RandomForestClassifier())])
pipeline_gbc = Pipeline([('dt_classifier', GradientBoostingClassifier())])

In [244]:
pipelines = [pipeline_lr,pipeline_knn, pipeline_svc, pipeline_dt, pipeline_rf, pipeline_gbc]

In [245]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [246]:
pipe_dict = {0:'LR',
             1:'KNN',
             2:'SVC',
             3:'DT',
             4: 'RF',
             5: 'GBC'}

In [247]:
# Predicts the accuracy of various ML algorithms
for i,model in enumerate(pipelines):
    print("{} Test Accuracy:{}".format(pipe_dict[i],model.score(X_test,y_test)*100))

LR Test Accuracy:81.81818181818183
KNN Test Accuracy:76.62337662337663
SVC Test Accuracy:83.11688311688312
DT Test Accuracy:80.51948051948052
RF Test Accuracy:88.31168831168831
GBC Test Accuracy:85.71428571428571


In [None]:
 # Predicting whether the patient is diabetic or non-diabetic using Random Forest Classifier model(Highest accuracy)

In [143]:
from sklearn.ensemble import RandomForestClassifier

In [144]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [146]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [147]:
rf = RandomForestClassifier()

In [148]:
rf.fit(X,y)

RandomForestClassifier()

In [20]:
new_data = pd.DataFrame({
'Pregnancies':1,
'Glucose':79.0,
'BloodPressure':60.0,
'SkinThickness':42.0,
'Insulin':48.0,
'BMI':43.5,
'DiabetesPedigreeFuncton':0.678,
'Age':23,},index=[0])

In [21]:
p = rf.predict(new_data)

In [22]:
if p[0] == 0:
    print('Non-Diabetic')
else:
    print('Diabetic')

Non-Diabetic
