In [174]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# SVC - Support Vector Classification
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline


In [175]:
data = pd.read_csv('/content/diabetes.csv')

In [176]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10,random_state=41)

In [199]:
# Feature scaling for Logistic Regression (Standard Scaler Output is input for logistic regression)
pipeline_lr = Pipeline([('scalar1',StandardScaler()),('lr_classifier',LogisticRegression())])
pipeline_knn = Pipeline([('scalar2',StandardScaler()),('lr_classifier',KNeighborsClassifier())])
pipeline_svc = Pipeline([('scalar3',StandardScaler()),('lr_classifier',SVC())])

In [200]:
# Feature Scaling is not required for non-machine learning algorithms like Decision Tree and Random forest
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier())])
pipeline_rf = Pipeline([('dt_classifier', RandomForestClassifier())])
pipeline_gbc = Pipeline([('dt_classifier', GradientBoostingClassifier())])

In [201]:
pipelines = [pipeline_lr,pipeline_knn, pipeline_svc, pipeline_dt, pipeline_rf, pipeline_gbc]

In [202]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [203]:
pipe_dict = {0:'LR',
             1:'KNN',
             2:'SVC',
             3:'DT',
             4: 'RF',
             5: 'GBC'}

In [204]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy:{}".format(pipe_dict[i],model.score(X_test,y_test)*100))

LR Test Accuracy:83.11688311688312
KNN Test Accuracy:80.51948051948052
SVC Test Accuracy:83.11688311688312
DT Test Accuracy:74.02597402597402
RF Test Accuracy:87.01298701298701
GBC Test Accuracy:84.4155844155844


In [115]:
 # Predicting whether the patient is diabetic or non-diabetic using Random Forest Classifier model

In [116]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [118]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [119]:
rf = RandomForestClassifier()

In [120]:
rf.fit(X,y)

RandomForestClassifier()

In [121]:
new_data = pd.DataFrame({
'Pregnancies':6,
'Glucose':148.0,
'BloodPressure':72.0,
'SkinThickness':35.0,
'Insulin':79.799479,
'BMI':33.6,
'DiabetesPedigreeFuncton':0.627,
'Age':50,},index=[0])

In [122]:
p = rf.predict(new_data)

Feature names unseen at fit time:
- DiabetesPedigreeFuncton
Feature names seen at fit time, yet now missing:
- DiabetesPedigreeFunction



In [123]:
if p[0] == 0:
    print('non-diabetic')
else:
    print('diabetic')

diabetic
