In [3]:
 # import libraries
import pandas as pd
import numpy as np

In [4]:
# set seed for reproductibility
SEED = 20
np.random.seed(SEED)

In [5]:
# Loading Data
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Replacing all 0 values with Null values
def replace_zero(df):
    df_nan=df.copy(deep=True)
    cols=["Glucose", "BloodPressure","SkinThickness","Insulin","BMI"]
    df_nan[cols] = df_nan[cols].replace({0:np.nan})
    return df_nan
df_nan=replace_zero(df)

In [7]:
# Copy pasting functions from previous notebook
def find_median(frame,var):
    temp = frame[frame[var].notnull()]
    temp = frame[[var,'Outcome']].groupby('Outcome')[[var]].median().reset_index()
    return temp

In [8]:
# Copy pasting functions from previous notebook
def replace_null(frame,var):
    median_df=find_median(frame,var)
    var_0=median_df[var].iloc[0]
    var_1=median_df[var].iloc[1]
    frame.loc[(frame['Outcome'] == 0) & (frame[var].isnull()), var] = var_0
    frame.loc[(frame['Outcome'] == 1) & (frame[var].isnull()), var] = var_1
    return frame[var].isnull().sum()

In [9]:
print(str(replace_null(df_nan,'Glucose'))+ ' Nulls for Glucose')
print(str(replace_null(df_nan,'SkinThickness'))+ ' Nulls for SkinThickness')
print(str(replace_null(df_nan,'Insulin'))+ ' Nulls for Insulin')
print(str(replace_null(df_nan,'BMI'))+ ' Nulls for BMI')
print(str(replace_null(df_nan,'BloodPressure'))+ ' Nulls for BloodPressure')
# We have successfully handled Nulls

0 Nulls for Glucose
0 Nulls for SkinThickness
0 Nulls for Insulin
0 Nulls for BMI
0 Nulls for BloodPressure


In [10]:
df_nan.isnull().sum()
# Just a confirmation
# Everything looks good

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
# We need to scale our data for uniformity.
from sklearn.preprocessing import StandardScaler
def std_scalar(df):
    std_X = StandardScaler()
    x =  pd.DataFrame(std_X.fit_transform(df.drop(["Outcome"],axis = 1),),
            columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'DiabetesPedigreeFunction', 'Age'])
    y=df["Outcome"]
    return x,y

In [12]:
X,Y=std_scalar(df_nan)
X.describe()
# Scaled data looks fine

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-6.476301e-17,1.480297e-16,-3.978299e-16,8.095376e-18,-3.469447e-18,1.31839e-16,2.451743e-16,1.931325e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-2.551447,-3.999727,-2.486187,-1.434747,-2.070186,-1.189553,-1.041549
25%,-0.8448851,-0.7202356,-0.6934382,-0.4603073,-0.440843,-0.717659,-0.6889685,-0.7862862
50%,-0.2509521,-0.1536274,-0.03218035,-0.1226607,-0.440843,-0.0559387,-0.3001282,-0.3608474
75%,0.6399473,0.6100618,0.6290775,0.3275348,0.3116039,0.6057816,0.4662269,0.6602056
max,3.906578,2.539814,4.100681,7.868309,7.909072,5.041489,5.883565,4.063716


In [13]:
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [14]:
#Keeping train  size as 0.8
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=20, stratify=Y)

In [15]:

# We are good to go with baseline model
# Let's first implement KNN
from sklearn.neighbors import KNeighborsClassifier
test_scores = []
train_scores = []
for i in range(5,15):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, Y_train)
    train_scores.append(neigh.score(X_train,Y_train))
    test_scores.append(neigh.score(X_test,Y_test))

In [16]:
print('Max train_scores is ' + str(max(train_scores)*100) + ' for k = '+ 
      str(train_scores.index(max(train_scores))+5))

Max train_scores is 85.66775244299674 for k = 5


In [17]:
print('Max test_scores is ' + str(max(test_scores)*100) + ' for k = '+ 
      str(test_scores.index(max(test_scores))+5))
# K=13 has generalized well for our data.

Max test_scores is 87.01298701298701 for k = 13


In [18]:

# Lets try Logistic regression now
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state=20, penalty='l2').fit(X_train, Y_train)
log_pred=log_model.predict(X_test)
log_model.score(X_test, Y_test)

0.8311688311688312

In [19]:
# Support Vector Machines
from sklearn import svm
svm_model = svm.SVC().fit(X_train, Y_train)
svm_pred=svm_model.predict(X_test)
svm_model.score(X_test, Y_test)
# Almost 89% Accuracy

0.8896103896103896

In [20]:
# Function to evaluate model performance
def model_perf(pred,Y_test):
    cmp_list=[]
    for i,j in zip(pred,Y_test):
        if i==j:
            cmp_list.append(1)
        else:
            cmp_list.append(0)
    return cmp_list

In [21]:
cmp_list=model_perf(svm_pred,Y_test)

In [22]:
print('Model Accuracy Confirmation :'+ str(cmp_list.count(1)/len(Y_test)))

Model Accuracy Confirmation :0.8896103896103896


In [23]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(max_depth=2, random_state=20).fit(X_train, Y_train)
rf_pred=rf_model.predict(X_test)
rf_model.score(X_test, Y_test)
# Almost 86% Accuracy

0.8571428571428571

In [24]:
import tensorflow as tf
def build_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=[len(X_train.keys())]),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(2, activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
  ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

neural_model = build_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
neural_model.summary()

In [28]:
# Keeping EPOCHs high as dataset is small.
EPOCHS = 1000
neural_pred = neural_model.fit(X_train, Y_train,epochs=EPOCHS, validation_split=0.1, verbose=2)

Epoch 1/1000
18/18 - 1s - 28ms/step - accuracy: 0.9167 - loss: 0.2144 - val_accuracy: 0.9032 - val_loss: 0.2397
Epoch 2/1000
18/18 - 0s - 14ms/step - accuracy: 0.9004 - loss: 0.2255 - val_accuracy: 0.9355 - val_loss: 0.1900
Epoch 3/1000
18/18 - 0s - 9ms/step - accuracy: 0.9185 - loss: 0.2032 - val_accuracy: 0.9194 - val_loss: 0.2078
Epoch 4/1000
18/18 - 0s - 8ms/step - accuracy: 0.9112 - loss: 0.2200 - val_accuracy: 0.9032 - val_loss: 0.2247
Epoch 5/1000
18/18 - 0s - 8ms/step - accuracy: 0.8895 - loss: 0.2159 - val_accuracy: 0.9194 - val_loss: 0.2491
Epoch 6/1000
18/18 - 0s - 8ms/step - accuracy: 0.9076 - loss: 0.2105 - val_accuracy: 0.9194 - val_loss: 0.2033
Epoch 7/1000
18/18 - 0s - 10ms/step - accuracy: 0.9076 - loss: 0.2105 - val_accuracy: 0.9194 - val_loss: 0.2300
Epoch 8/1000
18/18 - 0s - 8ms/step - accuracy: 0.9076 - loss: 0.2047 - val_accuracy: 0.8871 - val_loss: 0.2483
Epoch 9/1000
18/18 - 0s - 8ms/step - accuracy: 0.9040 - loss: 0.2116 - val_accuracy: 0.9032 - val_loss: 0.245

In [29]:
# Let's measure final performance
hist = pd.DataFrame(neural_pred.history)
hist['epoch'] = neural_pred.epoch
hist.tail()
# 91% accuracy on train

Unnamed: 0,accuracy,loss,val_accuracy,val_loss,epoch
995,0.945652,0.114348,0.870968,1.11765,995
996,0.943841,0.119509,0.903226,1.177031,996
997,0.940217,0.127334,0.887097,0.967216,997
998,0.940217,0.133113,0.854839,0.936585,998
999,0.942029,0.129932,0.903226,1.08874,999


In [30]:
neural_test=neural_model.predict(X_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


In [31]:
neural_test_converted=[]
for i in neural_test:
    if i>0.5:
        neural_test_converted.append(1)
    else:
        neural_test_converted.append(0)

In [32]:
cmp_list=model_perf(neural_test_converted,Y_test)

In [33]:
print('Test Accuracy :' + str(cmp_list.count(1)/len(Y_test)*100)+' %')
#~86% Accuracy.

Test Accuracy :87.01298701298701 %


In [34]:
import pickle
# Lets dump our SVM model
pickle.dump(svm_model, open('svm_model.pkl','wb'))