#                                Predicting Diabetes Using Tensorflow

![alt text](img/diabetes.jpg "Diabetes Prediction")

In this notebook, I try to predict the occurrence of diabetes for a given dataset. The Pima Indians Diabetes Dataset is located <a href="https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes">here</a>.

In [32]:
# importing the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn
%matplotlib inline

In [33]:
# Load the whole dataset CSV into a pandas DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.

diabetes = pd.read_csv("dataset/pima-indians-diabetes.csv")

In [34]:
# Take a look at the columns and how they are organized:
# column names, non-null counts and dtypes.

diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
Number_pregnant          768 non-null int64
Glucose_concentration    768 non-null float64
Blood_pressure           768 non-null float64
Triceps                  768 non-null float64
Insulin                  768 non-null float64
BMI                      768 non-null float64
Pedigree                 768 non-null float64
Age                      768 non-null int64
Class                    768 non-null int64
Group                    768 non-null object
dtypes: float64(6), int64(3), object(1)
memory usage: 60.1+ KB


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, to see how the data points are centered and spread out.

diabetes.describe()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,0.60751,0.566438,0.207439,0.094326,0.47679,0.168179,33.240885,0.348958
std,3.369578,0.160666,0.158654,0.161134,0.136222,0.117499,0.141473,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0
25%,1.0,0.497487,0.508197,0.0,0.0,0.406855,0.070773,24.0,0.0
50%,3.0,0.58794,0.590164,0.232323,0.036052,0.4769,0.125747,29.0,0.0
75%,6.0,0.704774,0.655738,0.323232,0.150414,0.545455,0.234095,41.0,1.0
max,17.0,1.0,1.0,1.0,1.0,1.0,1.0,81.0,1.0


In [36]:
# Preview the first five rows of the dataset.

diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,6,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,8,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,1,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


In [37]:
# List the names of all columns.

diabetes.columns

Index(['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps',
       'Insulin', 'BMI', 'Pedigree', 'Age', 'Class', 'Group'],
      dtype='object')

In [38]:
# The feature columns are not all on the same scale, so they need to be brought
# onto a uniform one. The columns listed here are the ones that get normalized;
# 'Age', 'Class' and 'Group' are not in the list.

cols_to_normalize = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]

In [39]:
# Bring in the three candidate scalers that are compared in the following cells.
# NOTE(review): notebook convention is to keep every import in the first import
# cell so a fresh "Restart & Run All" shows all dependencies up front.

from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

In [40]:
# Create one instance of each candidate scaler, all with default settings, so
# their outputs can be compared on the same columns.

std_scaler, minmax_scaler, robust_scaler = (
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler(),
)

In [41]:
# Preview the StandardScaler output for the selected columns.
# The result is only displayed here; it is not assigned back to `diabetes`.

std_scaler.fit_transform(diabetes[cols_to_normalize])

array([[ 0.63994726,  0.84832379,  0.14964075, ..., -0.69289057,
         0.20401277,  0.46849198],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.69289057,
        -0.68442195, -0.36506078],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -0.69289057,
        -1.10325546,  0.60439732],
       ..., 
       [ 0.3429808 ,  0.00330087,  0.14964075, ...,  0.27959377,
        -0.73518964, -0.68519336],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.69289057,
        -0.24020459, -0.37110101],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.69289057,
        -0.20212881, -0.47378505]])

In [42]:
# Preview the MinMaxScaler output for the selected columns.
# The result is only displayed here; it is not assigned back to `diabetes`.

minmax_scaler.fit_transform(diabetes[cols_to_normalize])

array([[ 0.35294118,  0.74371859,  0.59016393, ...,  0.        ,
         0.50074516,  0.23441503],
       [ 0.05882353,  0.42713568,  0.54098361, ...,  0.        ,
         0.39642325,  0.11656704],
       [ 0.47058824,  0.91959799,  0.52459016, ...,  0.        ,
         0.34724292,  0.25362938],
       ..., 
       [ 0.29411765,  0.6080402 ,  0.59016393, ...,  0.13238771,
         0.390462  ,  0.07130658],
       [ 0.05882353,  0.63316583,  0.49180328, ...,  0.        ,
         0.4485842 ,  0.11571307],
       [ 0.05882353,  0.46733668,  0.57377049, ...,  0.        ,
         0.45305514,  0.10119556]])

In [43]:
# Preview the RobustScaler output for the selected columns.
# The result is only displayed here; it is not assigned back to `diabetes`.

robust_scaler.fit_transform(diabetes[cols_to_normalize])

array([[ 0.6       ,  0.75151515,  0.        , ..., -0.23968566,
         0.17204301,  0.66535948],
       [-0.4       , -0.77575758, -0.33333333, ..., -0.23968566,
        -0.58064516, -0.05620915],
       [ 1.        ,  1.6       , -0.44444444, ..., -0.23968566,
        -0.93548387,  0.78300654],
       ..., 
       [ 0.4       ,  0.0969697 ,  0.        , ...,  0.64047151,
        -0.62365591, -0.33333333],
       [-0.4       ,  0.21818182, -0.66666667, ..., -0.23968566,
        -0.20430108, -0.06143791],
       [-0.4       , -0.58181818, -0.11111111, ..., -0.23968566,
        -0.17204301, -0.1503268 ]])

I'd like to continue with the Min-Max scaler because it seems to fit this data better: unlike the other two scalers, it produces no negative values.

In [44]:
# Apply the chosen Min-Max scaling for real: compute the scaled values for the
# selected columns, then overwrite those columns of `diabetes` with them.
scaled_features = minmax_scaler.fit_transform(diabetes[cols_to_normalize])
diabetes[cols_to_normalize] = scaled_features

In [46]:
# Show the first five rows of the scaled columns (`head()` defaults to five rows).


diabetes[cols_to_normalize].head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638


In [50]:
# Re-check the column names after scaling, before building the feature columns.
diabetes.columns

Index(['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps',
       'Insulin', 'BMI', 'Pedigree', 'Age', 'Class', 'Group'],
      dtype='object')

In [51]:
# Wrap each numeric input in a TensorFlow numeric feature column, keyed by the
# name of the DataFrame column it reads from.
# NOTE(review): `diabetes_pedgree` is misspelled; the name is kept as-is because
# cells outside this view may refer to it.

_numeric_column = tf.feature_column.numeric_column

num_preg = _numeric_column('Number_pregnant')
glucose_conc = _numeric_column('Glucose_concentration')
blood_pressure = _numeric_column('Blood_pressure')
triceps = _numeric_column('Triceps')
insulin = _numeric_column('Insulin')
bmi = _numeric_column('BMI')
diabetes_pedgree = _numeric_column('Pedigree')
age = _numeric_column('Age')

In [52]:
# Represent the string-valued 'Group' column as a TensorFlow categorical column
# whose vocabulary is the four listed labels.

assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Group',
    vocabulary_list=['A', 'B', 'C', 'D'],
)

In [None]:
# before we turn age to categorical tf variable lets turn it into a 