In [None]:
import os
import tensorflow as tf
import tensorflow.keras as keras

We assume that the dataset has already been split into a training set and a validation set (`train.csv` and `valid.csv`).

In [None]:
# Directory holding the pre-split housing CSV files.
HOUSING_PATH = './datasets/cali_housing'
# Training and validation splits, both located inside HOUSING_PATH.
TRAIN_PATH = os.path.join(HOUSING_PATH, 'train.csv')
VALID_PATH = os.path.join(HOUSING_PATH, 'valid.csv')

To normalize the data, we use the `keras.layers.Normalization` layer. It can work in two ways: either you explicitly pass in
the means and variances of the data, or you pass a data sample to its `adapt` method so that it computes those statistics from
the sample. We choose the latter.

In [None]:
def preprocess_only_numeric_attribs(line):
    """
    Parse one CSV line and keep only its numeric feature columns.

    The line is decoded as nine float fields followed by one string
    field. The last two fields are dropped and the remaining eight
    floats are stacked into a single tensor, which is what gets fed
    to the adapt() method of the Normalization layer.
    """
    # Empty float fields fall back to 0.; the empty string default marks
    # the final column as required.
    float_defaults = [0.] * 9
    string_default = [tf.constant([], dtype=tf.string)]
    fields = tf.io.decode_csv(line, float_defaults + string_default)
    numeric_fields = fields[:-2]
    return tf.stack(numeric_fields)


In [None]:
def numeric_attrib_dataset_factory(filepath):
    """
    Build a cached dataset of numeric feature tensors read from CSV.

    `filepath` is handed to tf.data.Dataset.list_files, the matching
    file(s) are read line by line, the first line is skipped
    (presumably the CSV header), and every remaining line is parsed
    with preprocess_only_numeric_attribs.
    """
    file_names = tf.data.Dataset.list_files(filepath)
    text_lines = tf.data.TextLineDataset(file_names)
    data_lines = text_lines.skip(1)
    numeric_rows = data_lines.map(preprocess_only_numeric_attribs)
    # Cache so the CSV is not re-read and re-parsed on every pass.
    return numeric_rows.cache()
    

In [None]:
# Numeric-only view of the training split, used solely to adapt the normalization layer.
numeric_attrib_dataset = numeric_attrib_dataset_factory(TRAIN_PATH)

In [None]:
# Create the normalization layer and let it compute its statistics
# from the numeric training data.
norm_layer = keras.layers.Normalization()
norm_layer.adapt(numeric_attrib_dataset)

The means computed by the normalization layer's `adapt` method will also be used to impute missing values; in this dataset, the `total_bedrooms` feature has missing values.

In [None]:
# Extract the adapted means as a plain Python list so they can be used as
# the per-column defaults passed to tf.io.decode_csv in preprocess().
# NOTE(review): this relies on norm_layer.mean being 1-D (one value per numeric
# column) so the list concatenates cleanly with the other defaults — confirm
# for the installed Keras version.
x_means = norm_layer.mean.numpy().tolist()

Note that this preprocess function returns a total of 3 values, two of them packaged into a single tuple. This is because our model takes two inputs: the 8 numerical values, and a categorical (string) input, `ocean_proximity`.
The Keras layers handle encoding the categorical input using embeddings, as shown later. Therefore we package those two inputs into a single tuple.

In [None]:
def preprocess(line):
    """
    Parse one CSV line into ((numeric features, category), target).

    Empty numeric feature fields are imputed with the corresponding
    entry of x_means, because those means are supplied as the
    decode_csv defaults. Returns a nested tuple: the stacked numeric
    features and the stacked category string form the model inputs,
    and the stacked ninth column is the target.
    """
    # Defaults: feature means, then 0. for the target column, then a
    # required string column.
    record_defaults = x_means + [0.] + [tf.constant([], dtype=tf.string)]
    fields = tf.io.decode_csv(line, record_defaults)

    numeric_fields = fields[:-2]
    target_fields = fields[8:9]
    category_fields = fields[-1:]

    features = (tf.stack(numeric_fields), tf.stack(category_fields))
    target = tf.stack(target_fields)
    return features, target

In [None]:
def csv_reader_dataset(filepath):
    """
    Build the batched input pipeline for training or evaluation.

    Reads the file(s) matched by `filepath` line by line, skips the
    first line (presumably the CSV header), parses each remaining line
    with preprocess, then groups the examples into batches of 32 and
    prefetches one batch ahead.
    """
    file_names = tf.data.Dataset.list_files(filepath)
    data_lines = tf.data.TextLineDataset(file_names).skip(1)
    examples = data_lines.map(preprocess)
    batches = examples.batch(32)
    return batches.prefetch(1)

In [None]:
# Batched ((numeric, categorical), target) datasets for fitting and validation.
train_set = csv_reader_dataset(TRAIN_PATH)
valid_set = csv_reader_dataset(VALID_PATH)

This segment creates the vocabulary table. Its `lookup` method takes a string tensor and returns, for each string, the index of the category it belongs to; an out-of-vocabulary string is instead mapped to one of the extra OOV bucket indices, which lie past the end of the vocab list.

In [None]:
# Known ocean_proximity categories; a string's position in this list is its index.
cats_vocab = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
# Number of extra buckets that absorb strings not present in cats_vocab.
num_oov_buckets = 2

# Pair every category string with its integer index ...
initializer = tf.lookup.KeyValueTensorInitializer(
    keys=cats_vocab,
    values=tf.range(len(cats_vocab), dtype=tf.int64),
)
# ... and wrap the mapping in a table that also handles unknown strings.
vocab_table = tf.lookup.StaticVocabularyTable(
    initializer,
    num_oov_buckets=num_oov_buckets,
)

The model is created using the Functional API. As expected, it has two inputs: one for the numeric attributes and the other for the categorical attribute, `ocean_proximity` in this case. The numeric inputs are passed into the normalization layer. The categorical inputs are first passed into a Lambda layer that encodes them ordinally, and then into the Embedding layer, which encodes them as 10-dimensional embeddings.

In [None]:
# Number of hidden Dense layers; each one is followed by batch normalization.
n_hidden = 20

# Numeric branch: 8 features, standardized by the adapted normalization layer.
normal_inputs = keras.layers.Input(shape=[8])
normalized_normal_inputs = norm_layer(normal_inputs)

# Categorical branch: one string per example.
cat_inputs = keras.layers.Input(shape=[], dtype=tf.string)

# Strings -> integer indices (unknown strings fall into the OOV buckets),
# then indices -> 10-dimensional embeddings.
cat_index_layer = keras.layers.Lambda(lambda cats: vocab_table.lookup(cats))(cat_inputs)
embedding_layer = keras.layers.Embedding(len(cats_vocab) + num_oov_buckets, 10)(cat_index_layer)

inputs_concatenated = keras.layers.concatenate([normalized_normal_inputs, embedding_layer])

# Stack of n_hidden Dense -> BatchNormalization pairs. `hidden` always holds the
# tensor that feeds the next layer, so every BatchNormalization output is
# actually consumed and the layers end up on the path to the model output.
hidden = inputs_concatenated
for _ in range(n_hidden):
    hidden = keras.layers.Dense(
        100,
        activation='elu',
        kernel_initializer='he_normal',
        kernel_regularizer=keras.regularizers.L1(0.02)
    )(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)

# Single linear unit for the regression target.
output = keras.layers.Dense(1)(hidden)

In [None]:
# Two-input model: the input order (numeric, categorical) matches the tuple
# order produced by preprocess().
model = keras.models.Model(inputs=[normal_inputs, cat_inputs], outputs=[output])

In [None]:
# Optimize mean squared error with Nadam; MSE is additionally tracked as a metric.
model.compile(loss='mse', optimizer='nadam', metrics=['mse'])

In [None]:
# Stop training once the monitored value has not improved for 10 epochs,
# and restore the weights from the best epoch.
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

In [None]:
# Train for up to 100 epochs, evaluating on the validation set after each epoch;
# the callbacks may end training earlier. `history` keeps the returned training record.
history = model.fit(
    train_set, 
    epochs=100, 
    validation_data=valid_set, 
    callbacks=callbacks
)