# CSC 74020 Machine Learning
# Week 9: Neural Networks on Tabular Data

#### We build a few Neural Networks on tabular data and show how to use the keras model class and layer classes for building Neural Networks (including non-sequential networks)

In [3]:
# !pip install tensorflow_addons

In [1]:
from typing import Any, Dict

import numpy as np
import pandas as pd
import math

import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf

import matplotlib.pyplot as plt

from tensorflow.keras.optimizers import Adam
# from official.nlp import optimization

import tensorflow_addons as tfa

from tensorflow.keras.layers import Dense, Activation, BatchNormalization, Dropout, ReLU, Add, PReLU

## Build out MLP (standard FF NN) and ResNet Block

In [2]:
class MLP(tf.keras.Model):
    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super(MLP, self).__init__(**kwargs )

        self.dense1 = Dense(d_main)#d_main, d_hidden, bias_first)
        self.dense2 = Dense(d_hidden)

        # self.normalization = BatchNormalization()
        self.activation = PReLU()
        self.output_layer = Dense(1)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.activation(x)
        x = self.dense2(inputs)
        x = self.activation(x)
        x = self.output_layer(x)
        return x

In [4]:
class ResNetBlock(tf.keras.layers.Layer):
    """The main building block of `ResNet`."""

    def __init__( self, d_main: int, d_hidden: int, **kwargs ) -> None:
        super(ResNetBlock, self).__init__( **kwargs)

        self.normalization = BatchNormalization()
        self.linear_first = Dense(d_hidden)#d_main, d_hidden, bias_first)
        self.activation = ReLU()
        self.dropout_first = Dropout(.2)
        self.linear_second = Dense(d_main)
        self.dropout_second = Dropout(0)
        self.skip_connection = True

    def call(self, x):
        x_input = x
        x = self.normalization(x)
        x = self.linear_first(x)
        x = self.activation(x)
        x = self.dropout_first(x)
        x = self.linear_second(x)
        x = self.dropout_second(x)
        if self.skip_connection:
            x = x_input + x
        return x

In [5]:
class ResNet(tf.keras.Model):
    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super(ResNet, self).__init__(**kwargs )

        self.linear_first = Dense(d_main)#d_main, d_hidden, bias_first)
        self.resnetblock1 = ResNetBlock(d_main,d_hidden)
        self.resnetblock2 = ResNetBlock(d_main,d_hidden)
        self.normalization = BatchNormalization()
        self.activation = PReLU()
        self.output_layer = Dense(1)
        # self.output_skip = Dense(1)
        # self.add_layer = Add()

    def call(self, inputs):
        x = self.linear_first(inputs)
        #x1 = self.output_skip(inputs)
        x = self.resnetblock1(x)
        x = self.resnetblock2(x)
        x = self.normalization(x)
        x = self.activation(x)
        x = self.output_layer(x)
        #x = self.add_layer([x,x1])
        return x

In [6]:
class ResNetDR(tf.keras.Model):
    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super(ResNetDR, self).__init__(**kwargs )

        self.dense1 = Dense(d_main)#d_main, d_hidden, bias_first)
        self.dense2 = Dense(d_hidden)

        # self.normalization = BatchNormalization()
        self.activation = PReLU()
        self.output_layer = Dense(1)
        self.output_skip = Dense(1)
        self.add_layer = Add()

    def call(self, inputs):
        x = self.dense1(inputs)
        x1 = self.output_skip(inputs)
        x = self.activation(x)
        x = self.dense2(inputs)
        x = self.activation(x)
        x = self.output_layer(x)
        x = self.add_layer([x,x1])
        return x

### Data

In [7]:
# !!! NOTE !!! The dataset splits, preprocessing and other details are
# significantly different from those used in the
# paper "Revisiting Deep Learning Models for Tabular Data",
# so the results will be different from the reported in the paper.

dataset = sklearn.datasets.fetch_california_housing()
task_type = 'regression'

# dataset = sklearn.datasets.fetch_covtype()
# task_type = 'multiclass'

assert task_type in ['binclass', 'multiclass', 'regression']

X_all = dataset['data'].astype('float32')
y_all = dataset['target'].astype('float32' if task_type == 'regression' else 'int64')
if task_type != 'regression':
    y_all = sklearn.preprocessing.LabelEncoder().fit_transform(y_all).astype('int64')
n_classes = int(max(y_all)) + 1 if task_type == 'multiclass' else None

X = {}
y = {}
X['train'], X['test'], y['train'], y['test'] = sklearn.model_selection.train_test_split(
    X_all, y_all, train_size=0.8
)
X['train'], X['val'], y['train'], y['val'] = sklearn.model_selection.train_test_split(
    X['train'], y['train'], train_size=0.8
)

X_orig=X.copy()

In [8]:
# not the best way to preprocess features, but enough for the demonstration
# preprocess = sklearn.preprocessing.StandardScaler().fit(X_orig['train'])
preprocess = sklearn.preprocessing.QuantileTransformer().fit(X_orig['train'])
# preprocess = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1),clip=True).fit(X_orig['train'])

X = {
    k: (2*preprocess.transform(v)-1)#preprocess.transform(v)
    for k, v in X_orig.items()
}
y = {k: v for k, v in y.items()}

# !!! CRUCIAL for neural networks when solving regression problems !!!
if task_type == 'regression':
    y_mean = y['train'].mean().item()
    y_std = y['train'].std().item()
    y = {k: (v - y_mean) / y_std for k, v in y.items()}
else:
    y_std = y_mean = None

# if task_type != 'multiclass':
#     y = {k: v.float() for k, v in y.items()}

### Side Notes: Neighborhood Components Analysis
Here we investigate which features help us relate the feature space to the target as if we were using kNN. 2 cells down, notice the last 2 columns are showing the highest weighting which indicates an optimal kNN distance to use should put more weight on Latitude and Longitude as opposed to other features in the models.

In [9]:

from sklearn.neighbors import NeighborhoodComponentsAnalysis

nca = NeighborhoodComponentsAnalysis(random_state=42)
nca.fit(X['train'], np.clip(np.round(1.5*y['train']),-2,3))

np.matmul(nca.components_,np.transpose(nca.components_))

array([[ 3555.92777338,   695.42934337,   319.90542887,  -223.64682757,
          238.37764503, -1374.48275465, -3503.45188749, -2863.51884507],
       [  695.42934337,   388.55171233,   -24.36365546,   -38.26317762,
           67.09014986,  -187.00878955,  -433.2993336 ,  -573.05953163],
       [  319.90542887,   -24.36365546,   810.10477386,   -96.01367228,
          -58.63373553,  -527.57552327,  1608.20532435,   689.62971105],
       [ -223.64682757,   -38.26317762,   -96.01367228,    90.23908814,
          -15.62713775,    94.41293912,   -36.7601109 ,   248.8586409 ],
       [  238.37764503,    67.09014986,   -58.63373553,   -15.62713775,
           58.57184347,   -99.15022294,  -561.05871631,  -293.23693598],
       [-1374.48275465,  -187.00878955,  -527.57552327,    94.41293912,
          -99.15022294,  1245.15702944,   319.09516799,  1525.22653504],
       [-3503.45188749,  -433.2993336 ,  1608.20532435,   -36.7601109 ,
         -561.05871631,   319.09516799, 14294.94018523,   

In [10]:
x_nca = np.matmul(nca.components_,np.transpose(nca.components_))
print(np.array_str(x_nca/1000., precision=3, suppress_small=True))

[[ 3.556  0.695  0.32  -0.224  0.238 -1.374 -3.503 -2.864]
 [ 0.695  0.389 -0.024 -0.038  0.067 -0.187 -0.433 -0.573]
 [ 0.32  -0.024  0.81  -0.096 -0.059 -0.528  1.608  0.69 ]
 [-0.224 -0.038 -0.096  0.09  -0.016  0.094 -0.037  0.249]
 [ 0.238  0.067 -0.059 -0.016  0.059 -0.099 -0.561 -0.293]
 [-1.374 -0.187 -0.528  0.094 -0.099  1.245  0.319  1.525]
 [-3.503 -0.433  1.608 -0.037 -0.561  0.319 14.295  0.919]
 [-2.864 -0.573  0.69   0.249 -0.293  1.525  0.919 14.291]]


In [11]:
#create categories for common numeric values (not useful here)

# min_support=15
# for ii in range(8):
#     values = pd.DataFrame(X['train'])[ii].value_counts()
#     values=np.sort(values[values>=min_support].index)
#     if len(values)>0:
#         enc = OneHotEncoder(categories=[list(values)],handle_unknown='ignore')
#         X['train']=np.concatenate([X['train'],enc.fit_transform(pd.DataFrame(X['train'])[[ii]]).toarray()],axis=1)
#         X['test']=np.concatenate([X['test'],enc.fit_transform(pd.DataFrame(X['test'])[[ii]]).toarray()],axis=1)
#         X['val']=np.concatenate([X['val'],enc.fit_transform(pd.DataFrame(X['val'])[[ii]]).toarray()],axis=1)

In [12]:
X['train'].shape, X['test'].shape, X['val'].shape

((13209, 8), (4128, 8), (3303, 8))

In [13]:
#we standardize the target based on the training portion
np.sqrt(np.mean(y['val']*y['val']))

1.0103194

### Model Training / Experiments

In [14]:
epochs = 25
batch_size=128
init_lr = 0.001

#we dont need these, but can be useful with certain learning rate schedulers
# steps_per_epoch = int(len(X['train'])/batch_size)
# num_train_steps = steps_per_epoch * epochs
# num_warmup_steps = 0

In [15]:
# Setup checkpoint path (to save the best weights / reduce overfitting)
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Create a ModelCheckpoint callback that saves the model's weights only
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True, # set to False to save the entire model
                                                         save_best_only=True, # set to True to save only the best model instead of a model every epoch
                                                         save_freq="epoch", # save every epoch
                                                         verbose=1)
this_model=MLP(512,512)
this_model.compile(optimizer=tfa.optimizers.AdamW(0,.003), loss='MSE',   metrics=['mse'])#tf.keras.losses.MSE
this_model.fit(X['train'], y['train'],validation_data=(X['test'], y['test']), batch_size=batch_size,epochs=epochs,  callbacks=[checkpoint_callback])
print(np.sqrt(this_model.evaluate(X['val'], y['val'])))
this_model.load_weights(checkpoint_path)
print(np.sqrt(this_model.evaluate(X['val'], y['val'])))

Epoch 1/25
Epoch 00001: val_loss improved from inf to 0.29426, saving model to model_checkpoint\checkpoint.ckpt
Epoch 2/25
Epoch 00002: val_loss improved from 0.29426 to 0.28008, saving model to model_checkpoint\checkpoint.ckpt
Epoch 3/25
Epoch 00003: val_loss improved from 0.28008 to 0.26538, saving model to model_checkpoint\checkpoint.ckpt
Epoch 4/25
Epoch 00004: val_loss did not improve from 0.26538
Epoch 5/25
Epoch 00005: val_loss improved from 0.26538 to 0.24427, saving model to model_checkpoint\checkpoint.ckpt
Epoch 6/25
Epoch 00006: val_loss did not improve from 0.24427
Epoch 7/25
Epoch 00007: val_loss improved from 0.24427 to 0.24039, saving model to model_checkpoint\checkpoint.ckpt
Epoch 8/25
Epoch 00008: val_loss did not improve from 0.24039
Epoch 9/25
Epoch 00009: val_loss improved from 0.24039 to 0.23250, saving model to model_checkpoint\checkpoint.ckpt
Epoch 10/25
Epoch 00010: val_loss improved from 0.23250 to 0.23111, saving model to model_checkpoint\checkpoint.ckpt
Epoch

In [16]:
epochs=35

def warmup_and_step_decay(epoch):
    initial_lrate = 0.01
    drop = 0.5
    epochs_drop = 8
    warmup=.5
    warmup_steps=2
    if epoch<=warmup_steps:
        lrate = pow(warmup,warmup_steps-epoch+1)*initial_lrate
    else:
        lrate = initial_lrate * math.pow(drop,  math.floor((1+epoch)/epochs_drop))
    print("Epoch: "+str(epoch)+" Learning Rate: "+str(lrate))
    return lrate
lr_callback = tf.keras.callbacks.LearningRateScheduler(warmup_and_step_decay)

# Setup checkpoint path
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Create a ModelCheckpoint callback that saves the model's weights only
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True, # set to False to save the entire model
                                                         save_best_only=True, # set to True to save only the best model instead of a model every epoch
                                                         save_freq="epoch", # save every epoch
                                                         verbose=1)

ResModelDR = ResNetDR(256,256)
ResModelDR.compile(optimizer=tfa.optimizers.AdamW(0,.003), loss='MSE',   metrics=['mse'])#tf.keras.losses.MSE
ResModelDR.fit(X['train'], y['train'],validation_data=(X['test'], y['test']), batch_size=batch_size,epochs=epochs,  callbacks=[checkpoint_callback,lr_callback])
print(np.sqrt(ResModelDR.evaluate(X['val'], y['val'])))
ResModelDR.load_weights(checkpoint_path)
print(np.sqrt(ResModelDR.evaluate(X['val'], y['val'])))

Epoch: 0 Learning Rate: 0.00125
Epoch 1/35
Epoch 00001: val_loss improved from inf to 0.32741, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 1 Learning Rate: 0.0025
Epoch 2/35
Epoch 00002: val_loss improved from 0.32741 to 0.29887, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 2 Learning Rate: 0.005
Epoch 3/35
Epoch 00003: val_loss improved from 0.29887 to 0.28551, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 3 Learning Rate: 0.01
Epoch 4/35
Epoch 00004: val_loss did not improve from 0.28551
Epoch: 4 Learning Rate: 0.01
Epoch 5/35
Epoch 00005: val_loss improved from 0.28551 to 0.27728, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 5 Learning Rate: 0.01
Epoch 6/35
Epoch 00006: val_loss improved from 0.27728 to 0.25807, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 6 Learning Rate: 0.01
Epoch 7/35
Epoch 00007: val_loss improved from 0.25807 to 0.25248, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 7 Learning Rate: 0.005
Epoch 8

In [17]:
epochs=35

def warmup_and_step_decay(epoch):
    initial_lrate = 0.01
    drop = 0.5
    epochs_drop = 8
    warmup=.5
    warmup_steps=2
    if epoch<=warmup_steps:
        lrate = pow(warmup,warmup_steps-epoch+1)*initial_lrate
    else:
        lrate = initial_lrate * math.pow(drop,  math.floor((1+epoch)/epochs_drop))
    print("Epoch: "+str(epoch)+" Learning Rate: "+str(lrate))
    return lrate
lr_callback = tf.keras.callbacks.LearningRateScheduler(warmup_and_step_decay)

# Setup checkpoint path
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Create a ModelCheckpoint callback that saves the model's weights only
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True, # set to False to save the entire model
                                                         save_best_only=True, # set to True to save only the best model instead of a model every epoch
                                                         save_freq="epoch", # save every epoch
                                                         verbose=1)

ResModel = ResNet(128,256)
ResModel.compile(optimizer=tfa.optimizers.AdamW(0,.003), loss='MSE',   metrics=['mse'])#tf.keras.losses.MSE
ResModel.fit(X['train'], y['train'],validation_data=(X['test'], y['test']), batch_size=batch_size,epochs=epochs,  callbacks=[checkpoint_callback,lr_callback])
print(np.sqrt(ResModel.evaluate(X['val'], y['val'])))
ResModel.load_weights(checkpoint_path)
print(np.sqrt(ResModel.evaluate(X['val'], y['val'])))

Epoch: 0 Learning Rate: 0.00125
Epoch 1/35
Epoch 00001: val_loss improved from inf to 0.49805, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 1 Learning Rate: 0.0025
Epoch 2/35
Epoch 00002: val_loss improved from 0.49805 to 0.44167, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 2 Learning Rate: 0.005
Epoch 3/35
Epoch 00003: val_loss improved from 0.44167 to 0.33805, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 3 Learning Rate: 0.01
Epoch 4/35
Epoch 00004: val_loss did not improve from 0.33805
Epoch: 4 Learning Rate: 0.01
Epoch 5/35
Epoch 00005: val_loss improved from 0.33805 to 0.32944, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 5 Learning Rate: 0.01
Epoch 6/35
Epoch 00006: val_loss improved from 0.32944 to 0.27286, saving model to model_checkpoint\checkpoint.ckpt
Epoch: 6 Learning Rate: 0.01
Epoch 7/35
Epoch 00007: val_loss did not improve from 0.27286
Epoch: 7 Learning Rate: 0.005
Epoch 8/35
Epoch 00008: val_loss improved from 0.27286 to 0.2

In [18]:
ResModel.summary()

Model: "res_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              multiple                  1152      
_________________________________________________________________
res_net_block (ResNetBlock)  multiple                  66432     
_________________________________________________________________
res_net_block_1 (ResNetBlock multiple                  66432     
_________________________________________________________________
batch_normalization_2 (Batch multiple                  512       
_________________________________________________________________
p_re_lu_2 (PReLU)            multiple                  128       
_________________________________________________________________
dense_12 (Dense)             multiple                  129       
Total params: 134,785
Trainable params: 134,017
Non-trainable params: 768
___________________________________________________