# **TensorFlow 2.0 alpha - Structured Data Classification & Feature Column Type Selection**

### Cleveland Clinic Foundation for Heart Disease dataset
### 14 Features - Numerical and Categorical

In [1]:
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


## Create a dataframe - Pandas

In [2]:
# Download the heart-disease CSV into a DataFrame and preview the first rows.
# The HTTPS endpoint is used (not plain HTTP) so the transfer is encrypted
# and integrity-protected.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
df = pd.read_csv(URL)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


## Split data into Train, Validation, and Test sets

In [3]:
# Fixed seed so the train/validation/test membership is the same on every
# Restart & Run All; without it each run shuffles the rows differently.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

print(len(train), 'Training examples')
print(len(val), 'Validation examples')
print(len(test), 'Testing examples')

193 Training examples
49 Validation examples
61 Testing examples


## Create an Input Pipeline - tf.data
#### Wrapping the DataFrame with tf.data lets Feature Columns act as a bridge from the DataFrame's columns to the features used to train the model

In [4]:
# create a tf.data dataset from df

def df_to_dataset(df, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame containing a 'target' column plus the feature columns.
            It is copied first, so the caller's frame is left unmodified.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        The dataset built from ``(dict of feature columns, target column)``,
        optionally shuffled, then batched.
    """
    features = df.copy()
    target = features.pop('target')  # removes the label column from the features
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))

    row_count = len(features)
    maybe_shuffled = dataset.shuffle(buffer_size=row_count) if shuffle else dataset
    return maybe_shuffled.batch(batch_size)

In [5]:
# A small batch size keeps the demonstration batches in this exercise short.
batch_size = 5

# Only the training pipeline is shuffled; validation and test keep row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

### Explore the Input Pipeline

In [7]:
# Pull a single batch from the training pipeline and show what it contains.
for feature_batch, label_batch in train_ds.take(1):
    # Iterating the feature dict yields its keys, i.e. the column names.
    print('Every feature:', list(feature_batch))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch)

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([44 56 45 35 58], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 0 0 1], shape=(5,), dtype=int32)


#### The dataset returns a dict that maps each column name to a batch of that column's values taken from the DataFrame rows

## Explore the Feature Columns
### TF provides many types of Feature Columns. Each one can transform a column of *df*; most of them are used for Categorical Variables

In [8]:
# Take the first batch from the training pipeline and keep only its features
# (element [0] of the (features, labels) pair) to try each Feature column type on.

example_batch = next(iter(train_ds))[0]

# Helper: apply one feature column to the example batch and print the result.

def demo(feature_column):
    """Print the output of ``feature_column`` applied to ``example_batch``.

    The column is wrapped in a ``DenseFeatures`` layer, the layer is called on
    the module-level ``example_batch``, and the ``.numpy()`` form of the
    result is printed. Nothing is returned.

    Note: inside this function the parameter name shadows the imported
    ``feature_column`` module.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

#### Numeric Columns - used for real-valued features; the values pass through unchanged, so generally no transform is needed

In [9]:
# Numeric column over "age"; demo() prints what it produces for the example batch.
age = feature_column.numeric_column(key="age")
demo(age)

W0402 01:11:27.441462 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:2758: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[44.]
 [56.]
 [45.]
 [35.]
 [58.]]


#### Bucketized Column - one-hot encodes the range (bucket) that each numeric value falls into

In [11]:
# When numerical data should be treated as categories, bucketize it: the
# boundaries split the numeric axis into ranges, one bucket per range.
age_buckets = feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
demo(age_buckets)

W0402 01:15:36.945746 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:2902: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


### Categorical Columns
#### Categorical values must be mapped to numerical values before being fed to the model. A Categorical Vocabulary Column maps each string to a one-hot vector
#### *thal* is a categorical column in this dataset, with the values 'fixed', 'normal', and 'reversible'

In [12]:
# The vocabulary can be given inline as a list (categorical_column_with_vocabulary_list)
# or loaded from a file (categorical_column_with_vocabulary_file).
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)

# Wrap the categorical column in an indicator column to get one-hot vectors.
thal_one_hot = feature_column.indicator_column(categorical_column=thal)
demo(thal_one_hot)

W0402 01:24:24.975803 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4307: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0402 01:24:24.977220 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


## Embedding Columns
#### When a Categorical variable has a large number of possible values, one-hot encoding becomes infeasible for training a Neural Network. This can be dealt with by using an **Embedding Column**, which represents each value as a low-dimensional, dense vector whose cells can contain any number (one-hot encoding would instead give high-dimensional vectors whose cells are only 0 or 1).
#### This is used best when Categorical variables have many possible values. The parameter, size of embedding, can be tuned.

In [13]:
# Represent each 'thal' category as an 8-dimensional dense vector.
thal_embedding = feature_column.embedding_column(
    categorical_column=thal,
    dimension=8,
)
demo(thal_embedding)

[[-0.10887046 -0.01993302  0.42641217  0.37123096  0.07721975  0.00589128
  -0.0172096  -0.05719358]
 [-0.13204265 -0.09045451 -0.5227163  -0.30877015 -0.36057395 -0.18295632
  -0.02021505  0.31593645]
 [-0.13204265 -0.09045451 -0.5227163  -0.30877015 -0.36057395 -0.18295632
  -0.02021505  0.31593645]
 [-0.13204265 -0.09045451 -0.5227163  -0.30877015 -0.36057395 -0.18295632
  -0.02021505  0.31593645]
 [-0.10887046 -0.01993302  0.42641217  0.37123096  0.07721975  0.00589128
  -0.0172096  -0.05719358]]


## Hashed Feature Column
#### Another way to deal with a Categorical variable that has a large number of values is to use a Hash Bucket. This Feature Column computes a hash value of the input, then uses it to select one of the buckets to encode the string.
#### No vocabulary is needed, and the number of buckets can be smaller than the number of categories. However, this can be problematic because different strings can be mapped to the same bucket (a collision).

In [14]:
# categorical_column_with_hash_bucket hashes each 'thal' string into one of
# 1000 buckets, so no vocabulary has to be supplied.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(categorical_column=thal_hashed))

W0402 01:41:02.224721 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Feature Crosses
#### Combining features into a single feature (feature crosses), allows the model to place separate Weights on each combination of features. Crossed Columns are backed by Hashed columns (rather than building full tables of all possible combinations).
#### Combine *age* and *thal* to create a new Crossed Feature

In [15]:
# Cross the bucketized age with 'thal'; the combinations are hashed into 1000 buckets.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, thal],
    hash_bucket_size=1000,
)

# One-hot the crossed bucket id so it can be displayed as a dense row.
demo(feature_column.indicator_column(categorical_column=crossed_feature))

W0402 01:45:00.372318 140736985473984 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Choosing which types of columns to use
#### For model accuracy, a larger dataset would be needed and features used would need to be carefully selected
#### For the purpose of this exercise (a comparison of Feature Column types), the features were selected arbitrarily

In [19]:
# Collect every feature column the model will consume. All columns are built
# inside this cell so it does not depend on variables from the demo cells.
feature_columns = []

# numeric columns
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# bucketized columns
# The source "age" column is created here explicitly, so this cell does not
# rely on an `age` variable defined by an earlier demonstration cell.
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# categorical columns
thal = feature_column.categorical_column_with_vocabulary_list('thal', ['fixed',
                                                                       'normal',
                                                                       'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding columns
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed columns
# The raw cross gets its own name; `crossed_feature` is the indicator
# (one-hot) wrapper around it, which is what gets added to the list.
age_thal_cross = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(age_thal_cross)
feature_columns.append(crossed_feature)

### Create a feature Layer - DenseFeatures layer

In [21]:
# Bundle the selected Feature Columns into one DenseFeatures layer.
feature_layer = layers.DenseFeatures(feature_columns)

# Rebuild the three input pipelines, this time with a larger batch size.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

### Create, Compile, and Train the Model

In [22]:
# Feature layer first, then two 128-unit ReLU layers and a single sigmoid
# output unit for the binary target.
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(units=128, activation='relu'),
    layers.Dense(units=128, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# fit() stays the last expression so the cell displays the returned History object.
model.fit(x=train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x127ba1358>

In [23]:
# Score the trained model on the test pipeline; evaluate() yields the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=test_ds)
print("Accuracy", accuracy)

Accuracy 0.6721311


# Conclusion
#### The goal of this exercise is demonstrating the various Feature Column Types when handling Structured Data. In order to see the best results from the model, much more data would be necessary, particularly when using Deep Learning. If going with a small dataset, it is possible a Random Forest would provide a good baseline. 

In [24]:
# notes for this exercise provided by tensorflow.org - Permission granted

#
#@title MIT License
#
# Copyright (c) 2017 François Chollet