In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('heart.csv')
df.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
433,46,M,ASY,110,236,0,Normal,125,Y,2.0,Flat,1
888,52,M,ASY,128,204,1,Normal,156,Y,1.0,Flat,1
171,40,M,NAP,140,235,0,Normal,188,N,0.0,Up,0
62,45,M,ASY,140,224,0,Normal,144,N,0.0,Up,0
709,52,M,ASY,125,212,0,Normal,168,N,1.0,Up,1


In [2]:
#Need to split data into training set and testing set
from sklearn.model_selection import train_test_split

X = df.drop('HeartDisease', axis=1)
Y = df['HeartDisease']

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, 
    test_size=0.2, random_state=42
)

In [3]:
import functools
import tensorflow as tf

LABEL_COLUMN = 'HeartDisease'
LABELS = [0, 12]

def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=5, # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True, 
        **kwargs)
    return dataset

raw_training_data = get_dataset("heart.csv")
raw_testing_data = get_dataset("heart.csv")

In [4]:
def show_batch(dataset):
      for batch, label in dataset.take(1):
        for key, value in batch.items():
              print("{:20s}: {}".format(key,value.numpy()))

In [5]:
show_batch(raw_training_data)

Age                 : [63 65 53 68 42]
Sex                 : [b'M' b'M' b'M' b'M' b'M']
ChestPainType       : [b'ASY' b'ASY' b'ASY' b'NAP' b'ATA']
RestingBP           : [140 136 125 180 120]
Cholesterol         : [  0 248   0 274 196]
FastingBS           : [1 0 1 1 0]
RestingECG          : [b'LVH' b'Normal' b'Normal' b'LVH' b'Normal']
MaxHR               : [149 140 120 150 150]
ExerciseAngina      : [b'N' b'Y' b'N' b'Y' b'N']
Oldpeak             : [2.  4.  1.5 1.6 0. ]
ST_Slope            : [b'Up' b'Down' b'Up' b'Flat' b'Up']


In [6]:
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names

    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features

        return features, labels

In [7]:
NUMERIC_FEATURES = ['Age','RestingBP','Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

packed_train_data = raw_training_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_testing_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [8]:
show_batch(packed_train_data)

Sex                 : [b'F' b'M' b'M' b'M' b'F']
ChestPainType       : [b'ATA' b'ASY' b'ASY' b'NAP' b'ATA']
RestingECG          : [b'ST' b'LVH' b'LVH' b'Normal' b'Normal']
ExerciseAngina      : [b'N' b'Y' b'Y' b'N' b'N']
ST_Slope            : [b'Up' b'Flat' b'Down' b'Up' b'Flat']
numeric             : [[ 55.  110.  344.    0.  160.    0. ]
 [ 53.  130.    0.    0.  135.    1. ]
 [ 59.  170.  326.    0.  140.    3.4]
 [ 50.  129.  196.    0.  163.    0. ]
 [ 56.  120.  279.    0.  150.    1. ]]


In [9]:
example_batch, labels_batch = next(iter(packed_train_data)) 

In [10]:
CATEGORIES = {
    'Sex': ['M', 'F'],
    'ChestPainType' : ['ATA', 'ASY', 'NAP','TA'],
    'RestingECG' : ['LVH', 'Normal', 'ST'],
    'ExerciseAngina' : ['Y', 'N'],
    'ST_Slope' : ['Up', 'Flat','Down']
}


In [11]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [12]:
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('M', 'F'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='ChestPainType', vocabulary_list=('ATA', 'ASY', 'NAP', 'TA'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='RestingECG', vocabulary_list=('LVH', 'Normal', 'ST'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='ExerciseAngina', vocabulary_list=('Y', 'N'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='ST_Slope', vocabulary_list=('Up', 'Flat', 'Down'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [15]:
import pandas as pd
desc = pd.read_csv('heart.csv')[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
count,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657
min,28.0,0.0,0.0,0.0,60.0,-2.6
25%,47.0,120.0,173.25,0.0,120.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6
75%,60.0,140.0,267.0,0.0,156.0,1.5
max,77.0,200.0,603.0,1.0,202.0,6.2


In [16]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [17]:
def normalize_numeric_data(data, mean, std):
  # Center the data
  return (data-mean)/std

In [18]:
# See what you just created.
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

NumericColumn(key='numeric', shape=(6,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x00000233E29445E0>, mean=array([ 53.51089325, 132.39651416, 198.79956427,   0.23311547,
       136.80936819,   0.88736383]), std=array([  9.43261651,  18.51415412, 109.38414455,   0.42304562,
        25.46033414,   1.06657015])))

In [19]:
example_batch['numeric']

<tf.Tensor: shape=(5, 6), dtype=float32, numpy=
array([[ 59. , 154. ,   0. ,   0. , 131. ,   1.5],
       [ 50. , 170. , 209. ,   0. , 116. ,   0. ],
       [ 52. , 140. , 266. ,   0. , 134. ,   2. ],
       [ 42. , 150. , 268. ,   0. , 136. ,   0. ],
       [ 46. , 180. , 280. ,   0. , 120. ,   0. ]], dtype=float32)>

In [20]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

array([[ 0.5819282 ,  1.1668632 , -1.817444  , -0.55104095, -0.22817343,
         0.57439834],
       [-0.37220794,  2.031067  ,  0.09325336, -0.55104095, -0.8173252 ,
        -0.83197886],
       [-0.1601777 ,  0.41068497,  0.61435264, -0.55104095, -0.11034309,
         1.0431907 ],
       [-1.2203289 ,  0.9508123 ,  0.63263685, -0.55104095, -0.03178953,
        -0.83197886],
       [-0.79626846,  2.5711942 ,  0.74234194, -0.55104095, -0.660218  ,
        -0.83197886]], dtype=float32)

In [21]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

[0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]


In [22]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)

print(preprocessing_layer(example_batch).numpy()[0])

[ 0.          1.          0.          0.          1.          0.
  0.          0.          1.          1.          0.          0.
  1.          0.          0.5819282   1.1668632  -1.817444   -0.55104095
 -0.22817343  0.57439834]


In [23]:
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

In [24]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [25]:
model.fit(train_data, epochs=20)

Epoch 1/20
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x233e2ca4430>

In [26]:
test_loss, test_accuracy = model.evaluate(test_data)

Consider rewriting this model with the Functional API.


In [30]:
predictions = model.predict(test_data)

# Show some results
for prediction, HD in zip(predictions[:10], list(test_data)[0][1][:10]):
    prediction = tf.sigmoid(prediction).numpy()
    print("Predicted chance of having a Heart Disease: {:.2%}".format(prediction[0])," | Actual outcome: ",
          ("Has a Heart Disease" if bool(HD) else "Has no Heart Disease"))


Predicted chance of having a Heart Disease: 20.97%  | Actual outcome:  Has a Heart Disease
Predicted chance of having a Heart Disease: 100.00%  | Actual outcome:  Has a Heart Disease
Predicted chance of having a Heart Disease: 24.73%  | Actual outcome:  Has a Heart Disease
Predicted chance of having a Heart Disease: 0.01%  | Actual outcome:  Has no Heart Disease
Predicted chance of having a Heart Disease: 99.88%  | Actual outcome:  Has no Heart Disease
