# Using ML Techniques for Detecting Type II Diabetes

**Eelis KOSTIAINEN, Khristen THORNBURG**

In [28]:
import numpy as np
import pandas as pd
MMOL_PER_LITRE_CONSTANT = 0.0555

In [32]:
pima_df = pd.read_csv('./data/pima.csv')
pima_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
pima_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [34]:
# The following fields cannot physically be zero: BMI, Glucose, BloodPressure, SkinThickness

def replace_with_median(df, key, value=0):
    column = df[key].copy()
    outer = column[~(column==value)]
    return column.replace(value, outer.mean())

def sanitize_data(df):
    data = df.copy()
    data['BMI'] = replace_with_median(data, 'BMI')
    data['BloodPressure'] = replace_with_median(data, 'BloodPressure')
    data['SkinThickness'] = replace_with_median(data, 'SkinThickness')
    data['Glucose'] = replace_with_median(data, 'Glucose')
    # Prefer mmol/l over mg/Dl
    data['Glucose'] = data['Glucose'] * MMOL_PER_LITRE_CONSTANT
    
    return data

pima_df = sanitize_data(pima_df)
pima_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,6.753615,72.405184,29.15342,79.799479,32.457464,0.471876,33.240885,0.348958
std,3.369578,1.689195,12.096346,8.790942,115.244002,6.875151,0.331329,11.760232,0.476951
min,0.0,2.442,24.0,7.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,5.536125,64.0,25.0,0.0,27.5,0.24375,24.0,0.0
50%,3.0,6.4935,72.202592,29.15342,30.5,32.4,0.3725,29.0,0.0
75%,6.0,7.783875,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,11.0445,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [35]:
german_df = pd.read_csv('./data/germany.csv')
german_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [36]:
german_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,121.1825,69.1455,20.935,80.254,32.193,0.47093,33.0905,0.342
std,3.306063,32.068636,19.188315,16.103243,111.180534,8.149901,0.323553,11.786423,0.474498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.5,0.0,0.0,27.375,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,40.0,32.3,0.376,29.0,0.0
75%,6.0,141.0,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,199.0,122.0,110.0,744.0,80.6,2.42,81.0,1.0


In [37]:
# Since the datasets have the same columns, we can use the same sanitization both
german_df = sanitize_data(german_df)
german_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,6.769631,72.403665,29.341275,80.254,32.650101,0.47093,33.0905,0.342
std,3.306063,1.694588,11.949531,9.123652,111.180534,7.190136,0.323553,11.786423,0.474498
min,0.0,2.442,24.0,7.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,5.4945,64.0,25.0,0.0,27.6,0.244,24.0,0.0
50%,3.0,6.549,72.0,29.341275,40.0,32.4,0.376,29.0,0.0
75%,6.0,7.8255,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,11.0445,122.0,110.0,744.0,80.6,2.42,81.0,1.0
