### Handling Mixed Data Types

In [56]:
from types import SimpleNamespace
import pandas as pd
import numpy as np
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import backend


def as_keras_inputs(*,items):
    """Create one symbolic Keras input per column.

    Args:
        items: Iterable of ``(name, column)`` pairs, e.g. ``DataFrame.items()``.
            Every column must expose a ``dtype`` attribute.

    Returns:
        dict mapping each name to a ``tf.keras.Input`` of shape ``(1,)``.
        Text columns get ``tf.string``; every other column gets ``tf.float32``.
    """
    def _is_text(dtype):
        # ``object`` is the classic pandas container for strings, but text may
        # also be stored in a dedicated string dtype; both need string inputs.
        return dtype == object or pd.api.types.is_string_dtype(dtype)

    return {
        name: tf.keras.Input(
            shape=(1,),
            name=name,
            dtype=tf.string if _is_text(column.dtype) else tf.float32,
        )
        for name, column in items
    }


def normalize_numeric_inputs(*,inputs,dataframe):
    """Concatenate the float32 inputs and pass them through a Normalization layer.

    Args:
        inputs: dict of name -> symbolic Keras input; only entries whose dtype
            is ``tf.float32`` are used.
        dataframe: table indexable by the numeric input names; the selected
            columns are what the normalization layer is adapted on.

    Returns:
        The symbolic output of the adapted normalization layer.
    """
    norm_layer = layers.Normalization()

    numeric = {
        key: tensor
        for key, tensor in inputs.items()
        if tensor.dtype == tf.float32
    }
    merged = layers.Concatenate()(list(numeric.values()))

    # Adapt on the same columns, in the same order, as the concatenated inputs.
    training_values = np.array(dataframe[numeric.keys()])
    norm_layer.adapt(training_values)

    return norm_layer(merged)

def strings_to_float32(*,inputs,feats):
    """Encode every string input as a float32 category vector.

    Args:
        inputs: dict of name -> symbolic Keras input. Entries whose dtype is
            ``tf.float32`` are skipped; all others are treated as strings.
        feats: mapping of name -> column values, used to build the vocabulary
            for each string input.

    Returns:
        ``None`` when there is no string input, the single encoded tensor when
        there is exactly one, otherwise all encoded tensors concatenated into
        one tensor (in the iteration order of ``inputs``).
    """
    encoded = []
    for name, tensor in inputs.items():
        if tensor.dtype == tf.float32:
            continue

        # Map each string to an integer index in the vocabulary built from
        # the distinct values seen in this column.
        lookup = layers.StringLookup(vocabulary=np.unique(feats[name]))

        # Turn the index into a float32 category vector sized to the
        # vocabulary (including the lookup layer's out-of-vocabulary slot).
        encoder = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

        # Collect instead of returning here: returning inside the loop would
        # drop every string column after the first one.
        encoded.append(encoder(lookup(tensor)))

    if not encoded:
        return None
    if len(encoded) == 1:
        return encoded[0]
    return layers.Concatenate()(encoded)

# Build the trainable model: a small sequential dense stack applied to the
# output of the preprocessing model, wired to the Keras inputs that were
# derived from the pandas DataFrame.
def build_model(*,preprocessing_head,inputs):
    """Return a compiled Keras model mapping ``inputs`` to a single logit.

    Args:
        preprocessing_head: callable (the preprocessing model) applied to
            ``inputs`` to produce the features fed to the dense stack.
        inputs: the symbolic Keras inputs of the resulting model.

    Returns:
        A ``tf.keras.Model`` compiled with binary cross-entropy on logits
        and the Adam optimizer.
    """
    # NOTE(review): Dense(64) is created without an activation argument —
    # confirm that a purely linear hidden layer is intended.
    body = tf.keras.Sequential()
    body.add(layers.Dense(64))
    body.add(layers.Dense(1))

    features = preprocessing_head(inputs)
    logits = body(features)

    model = tf.keras.Model(inputs,logits)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(),
    )
    return model
    
model_name = 'heart_health'
hh = pd.read_csv('./input/Heart_health.csv')
# TensorFlow is stricter about input names now, so replace "/" in column names.
hh.columns = [col.replace("/","_") for col in hh.columns]

# Keep features, labels and the tensor dict on one namespace object rather
# than as separate top-level variables.
dset = SimpleNamespace(feats=hh.copy(),labels=None,tdict=None)
dset.labels = dset.feats.pop("Heart Attack")

# The preprocessing model starts from a set of symbolic tf.keras.Input
# objects matching the names and data types of the CSV columns.
ktinputs = as_keras_inputs(items=dset.feats.items())

# Preprocessed pieces, in order:
#   1. the numeric inputs, concatenated and run through a normalization layer;
#   2. the string inputs, converted into float32 suitable for the model.
ppinputs = [
    normalize_numeric_inputs(inputs=ktinputs,dataframe=hh),
    strings_to_float32(inputs=ktinputs,feats=dset.feats),
]

# Join all preprocessed pieces and wrap them, together with the raw inputs,
# in a model that performs the preprocessing.
preprocessed_inputs = layers.Concatenate()(ppinputs)
dset_preprocessing_model = tf.keras.Model(
    inputs=ktinputs,
    outputs=preprocessed_inputs,
    name=model_name,
)

# Keras models don't automatically convert pandas DataFrames, because it is
# ambiguous whether the result should be one tensor or a dictionary of
# tensors. Convert explicitly to a dictionary of arrays keyed by column name.
dset.tdict = {col: np.array(series) for col, series in dset.feats.items()}

# Build the trainable model on top of the preprocessing model, fit it, and
# save the result.
dset_model = build_model(preprocessing_head=dset_preprocessing_model,inputs=ktinputs)
dset_model.fit(x=dset.tdict, y=dset.labels, epochs=10)

dset_model.save(f'{model_name}_test.keras')



Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 581us/step - loss: 0.5291
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 579us/step - loss: 0.2330
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step - loss: 0.1255
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 533us/step - loss: 0.0919
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 484us/step - loss: 0.0627
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451us/step - loss: 0.0437
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500us/step - loss: 0.0334
Epoch 8/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step - loss: 0.0293
Epoch 9/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step - loss: 0.0258
Epoch 10/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488us/step - lo