# CSC 74020 Machine Learning
# Week 9: Neural Networks on Tabular Data

#### We build a few neural networks on tabular data and show how to use the Keras `Model` and `Layer` classes to build them (including non-sequential networks).

In [None]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.3/612.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.22.0 typeguard-2.13.3


In [None]:
from typing import Any, Dict

import numpy as np
import pandas as pd
import math

import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf

import matplotlib.pyplot as plt

from tensorflow.keras.optimizers import Adam
# from official.nlp import optimization

import tensorflow_addons as tfa

from tensorflow.keras.layers import Dense, Activation, BatchNormalization, Dropout, ReLU, Add, PReLU


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



## Build out MLP (standard FF NN) and ResNet Block

In [None]:
class MLP(tf.keras.Model):
    """Feed-forward regressor: Dense(d_main) -> PReLU -> Dense(d_hidden) -> PReLU -> Dense(1).

    Args:
        d_main: Number of units in the first hidden layer.
        d_hidden: Number of units in the second hidden layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super().__init__(**kwargs)

        self.dense1 = Dense(d_main)
        self.dense2 = Dense(d_hidden)

        # PReLU learns its slopes per input unit, so each hidden layer gets its own
        # instance; sharing one would tie the slopes of both layers together and
        # fail whenever d_main != d_hidden.
        self.activation = PReLU()
        self.activation2 = PReLU()
        self.output_layer = Dense(1)

    def call(self, inputs):
        """Return one prediction per input row."""
        x = self.dense1(inputs)
        x = self.activation(x)
        # Feed the first hidden representation (not the raw inputs) into the second
        # layer; otherwise dense1 is computed and then thrown away.
        x = self.dense2(x)
        x = self.activation2(x)
        return self.output_layer(x)

In [None]:
class ResNetBlock(tf.keras.layers.Layer):
    """The main building block of `ResNet`.

    Applies ``BatchNorm -> Dense(d_hidden) -> ReLU -> Dropout -> Dense(d_main) -> Dropout``
    and, when ``skip_connection`` is true, adds the block input to the result.

    Args:
        d_main: Width of the block output. The block input must have the same
            width for the residual addition to be valid.
        d_hidden: Width of the inner (hidden) dense layer.
        dropout_first: Dropout rate applied after the activation.
        dropout_second: Dropout rate applied after the second dense layer.
        skip_connection: Whether to add the block input to the block output.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(
        self,
        d_main: int,
        d_hidden: int,
        dropout_first: float = 0.2,
        dropout_second: float = 0.0,
        skip_connection: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.normalization = BatchNormalization()
        self.linear_first = Dense(d_hidden)
        self.activation = ReLU()
        self.dropout_first = Dropout(dropout_first)
        self.linear_second = Dense(d_main)
        self.dropout_second = Dropout(dropout_second)
        self.skip_connection = skip_connection

    def call(self, x, training=None):
        """Run the block.

        Args:
            x: Input tensor whose last dimension equals ``d_main``.
            training: Forwarded to the BatchNormalization and Dropout sub-layers so
                they switch between training and inference behaviour explicitly.
                ``None`` lets Keras resolve the mode from the outer call.
        """
        x_input = x
        x = self.normalization(x, training=training)
        x = self.linear_first(x)
        x = self.activation(x)
        x = self.dropout_first(x, training=training)
        x = self.linear_second(x)
        x = self.dropout_second(x, training=training)
        if self.skip_connection:
            x = x_input + x
        return x

In [None]:
class ResNet(tf.keras.Model):
    """Tabular ResNet regressor.

    A dense stem projects the inputs to ``d_main`` units, two ``ResNetBlock``s
    refine that representation, and a BatchNorm -> PReLU -> Dense(1) head
    produces the prediction.

    Args:
        d_main: Width of the stem and of each residual block's output.
        d_hidden: Inner width passed to each ``ResNetBlock``.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super().__init__(**kwargs)

        # Stem: project the raw features to the residual-block width
        self.linear_first = Dense(d_main)
        self.resnetblock1 = ResNetBlock(d_main, d_hidden)
        self.resnetblock2 = ResNetBlock(d_main, d_hidden)

        # Prediction head
        self.normalization = BatchNormalization()
        self.activation = PReLU()
        self.output_layer = Dense(1)

    def call(self, inputs):
        """Return one prediction per input row."""
        hidden = self.linear_first(inputs)
        for block in (self.resnetblock1, self.resnetblock2):
            hidden = block(hidden)
        hidden = self.normalization(hidden)
        hidden = self.activation(hidden)
        return self.output_layer(hidden)

In [None]:
class ResNetDR(tf.keras.Model):
    """MLP with a direct linear skip from the inputs to the output.

    The prediction is the sum of two paths:
      * deep path:  Dense(d_main) -> PReLU -> Dense(d_hidden) -> PReLU -> Dense(1)
      * skip path:  Dense(1) applied directly to the inputs

    Args:
        d_main: Number of units in the first hidden layer.
        d_hidden: Number of units in the second hidden layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self,  d_main: int, d_hidden: int, **kwargs ) -> None:
        super().__init__(**kwargs)

        self.dense1 = Dense(d_main)
        self.dense2 = Dense(d_hidden)

        # PReLU learns its slopes per input unit, so each hidden layer gets its own
        # instance; sharing one would tie the slopes of both layers together and
        # fail whenever d_main != d_hidden.
        self.activation = PReLU()
        self.activation2 = PReLU()
        self.output_layer = Dense(1)
        self.output_skip = Dense(1)
        self.add_layer = Add()

    def call(self, inputs):
        """Return deep-path prediction plus linear skip-path prediction."""
        # Linear skip path straight from the raw inputs
        x1 = self.output_skip(inputs)

        x = self.dense1(inputs)
        x = self.activation(x)
        # Chain on the first hidden representation (not the raw inputs); otherwise
        # dense1 is computed and then thrown away.
        x = self.dense2(x)
        x = self.activation2(x)
        x = self.output_layer(x)
        return self.add_layer([x, x1])

### Data

In [None]:
# !!! NOTE !!! The dataset splits, preprocessing and other details are
# significantly different from those used in the
# paper "Revisiting Deep Learning Models for Tabular Data",
# so the results will be different from those reported in the paper.

dataset = sklearn.datasets.fetch_california_housing()
task_type = 'regression'

# dataset = sklearn.datasets.fetch_covtype()
# task_type = 'multiclass'

assert task_type in ['binclass', 'multiclass', 'regression']

X_all = dataset['data'].astype('float32')
y_all = dataset['target'].astype('float32' if task_type == 'regression' else 'int64')
if task_type != 'regression':
    # Map arbitrary class labels to 0..n_classes-1
    y_all = sklearn.preprocessing.LabelEncoder().fit_transform(y_all).astype('int64')
n_classes = int(y_all.max()) + 1 if task_type == 'multiclass' else None

# Fixed seed so the train/val/test partition is identical on every run
SPLIT_SEED = 42

# 80/20 split into (train+val)/test, then 80/20 again into train/val.
# NOTE: the training cells in this notebook pass the 'test' split as
# validation_data and report the final score on the 'val' split.
X = {}
y = {}
X['train'], X['test'], y['train'], y['test'] = sklearn.model_selection.train_test_split(
    X_all, y_all, train_size=0.8, random_state=SPLIT_SEED
)
X['train'], X['val'], y['train'], y['val'] = sklearn.model_selection.train_test_split(
    X['train'], y['train'], train_size=0.8, random_state=SPLIT_SEED
)

# Shallow copy: keeps references to the untransformed arrays so that the
# preprocessing cell can be re-run without compounding transformations.
X_orig=X.copy()

In [None]:
# Not the best way to preprocess features, but enough for the demonstration.
# Alternatives that can be swapped in:
# preprocess = sklearn.preprocessing.StandardScaler().fit(X_orig['train'])
# preprocess = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1),clip=True).fit(X_orig['train'])
preprocess = sklearn.preprocessing.QuantileTransformer()
preprocess.fit(X_orig['train'])

# The transformer is fitted on the training split only and applied to every split.
# Its default output is uniform on [0, 1]; 2*t - 1 rescales that to [-1, 1].
X = dict(
    (split, 2*preprocess.transform(features) - 1)
    for split, features in X_orig.items()
)
y = dict(y)

# !!! CRUCIAL for neural networks when solving regression problems !!!
# Standardize the target with statistics of the training split only.
if task_type != 'regression':
    y_mean = y_std = None
else:
    y_mean = y['train'].mean().item()
    y_std = y['train'].std().item()
    y = {split: (target - y_mean) / y_std for split, target in y.items()}

### Side Notes: Neighborhood Components Analysis
Here we investigate which features best relate the feature space to the target, as if we were using kNN. Two cells down, notice that the last two columns show the largest weights, which indicates that an optimal kNN distance should put more weight on Latitude and Longitude than on the other features in the models.

In [None]:

from sklearn.neighbors import NeighborhoodComponentsAnalysis

# NCA is a classifier-oriented method, so the standardized target is bucketed
# into integer pseudo-classes clipped to [-2, 3].
nca = NeighborhoodComponentsAnalysis(random_state=42)
nca.fit(X['train'], np.clip(np.round(1.5*y['train']),-2,3))

# components_ (call it L) has shape (n_components, n_features) and maps x -> L x.
# The distance it induces on the ORIGINAL features is (x - x')^T (L^T L) (x - x'),
# so L^T L is the matrix whose rows/columns line up with the input features.
np.matmul(np.transpose(nca.components_), nca.components_)

KeyboardInterrupt: ignored

In [None]:
# Feature-space metric learned by NCA: L^T L with L = nca.components_
# (shape (n_components, n_features)), so entry (i, j) refers to input features i and j.
x_nca = np.matmul(np.transpose(nca.components_), nca.components_)
# Scaled down by 1000 purely for readability of the printed matrix
print(np.array_str(x_nca/1000., precision=3, suppress_small=True))

In [None]:
#create categories for common numeric values (not useful here)

# min_support=15
# for ii in range(8):
#     values = pd.DataFrame(X['train'])[ii].value_counts()
#     values=np.sort(values[values>=min_support].index)
#     if len(values)>0:
#         enc = OneHotEncoder(categories=[list(values)],handle_unknown='ignore')
#         X['train']=np.concatenate([X['train'],enc.fit_transform(pd.DataFrame(X['train'])[[ii]]).toarray()],axis=1)
#         X['test']=np.concatenate([X['test'],enc.fit_transform(pd.DataFrame(X['test'])[[ii]]).toarray()],axis=1)
#         X['val']=np.concatenate([X['val'],enc.fit_transform(pd.DataFrame(X['val'])[[ii]]).toarray()],axis=1)

In [None]:
# (n_rows, n_features) of each split, in the order train, test, val
tuple(X[split].shape for split in ('train', 'test', 'val'))

In [None]:
# The target is standardized with the training mean/std, so the root-mean-square of
# the 'val' target is the RMSE of always predicting the training mean (i.e. 0).
np.sqrt(np.mean(np.square(y['val'])))

### Model Training / Experiments

In [None]:
# Shared training hyperparameters
epochs = 25  # epochs for the first (MLP) run; later cells reassign this to 35
batch_size=128  # mini-batch size passed to every fit() call below
init_lr = 0.001  # NOTE(review): not referenced by the training cells visible here, which hard-code their learning rates

# We don't need these, but they can be useful with certain learning rate schedulers
# steps_per_epoch = int(len(X['train'])/batch_size)
# num_train_steps = steps_per_epoch * epochs
# num_warmup_steps = 0

In [None]:
# Where the best weights are written (restored after training to reduce overfitting)
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Checkpoint callback: keeps only the weights of the best epoch seen so far
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # weights only; False would save the entire model
    save_best_only=True,     # overwrite only when the monitored loss improves
    save_freq="epoch",       # evaluate the save condition once per epoch
    verbose=1,
)

this_model = MLP(512, 512)
this_model.compile(
    optimizer=tfa.optimizers.AdamW(weight_decay=0, learning_rate=.003),
    loss='MSE',
    metrics=['mse'],
)
this_model.fit(
    X['train'],
    y['train'],
    validation_data=(X['test'], y['test']),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint_callback],
)

# RMSE on the held-out 'val' split with the last-epoch weights ...
print(np.sqrt(this_model.evaluate(X['val'], y['val'])))
# ... and again after restoring the best checkpointed weights
this_model.load_weights(checkpoint_path)
print(np.sqrt(this_model.evaluate(X['val'], y['val'])))

Epoch 1/25




Epoch 1: val_loss improved from inf to 0.31499, saving model to model_checkpoint/checkpoint.ckpt
Epoch 2/25
Epoch 2: val_loss improved from 0.31499 to 0.28951, saving model to model_checkpoint/checkpoint.ckpt
Epoch 3/25
Epoch 3: val_loss improved from 0.28951 to 0.28575, saving model to model_checkpoint/checkpoint.ckpt
Epoch 4/25
Epoch 4: val_loss improved from 0.28575 to 0.27148, saving model to model_checkpoint/checkpoint.ckpt
Epoch 5/25
Epoch 5: val_loss improved from 0.27148 to 0.26631, saving model to model_checkpoint/checkpoint.ckpt
Epoch 6/25
Epoch 6: val_loss improved from 0.26631 to 0.25303, saving model to model_checkpoint/checkpoint.ckpt
Epoch 7/25
Epoch 7: val_loss did not improve from 0.25303
Epoch 8/25
Epoch 8: val_loss improved from 0.25303 to 0.24787, saving model to model_checkpoint/checkpoint.ckpt
Epoch 9/25
Epoch 9: val_loss improved from 0.24787 to 0.24581, saving model to model_checkpoint/checkpoint.ckpt
Epoch 10/25
Epoch 10: val_loss did not improve from 0.24581
E

In [None]:
epochs=35  # number of epochs for the ResNetDR run in this cell (replaces the earlier value of 25)

def warmup_and_step_decay(epoch):
    """Learning-rate schedule: short exponential warm-up, then step decay.

    For epochs 0..2 the rate doubles each epoch (0.00125, 0.0025, 0.005).
    From epoch 3 on it is 0.01 halved once for every full multiple of 8
    contained in ``epoch + 1``.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate to use for that epoch (also printed).
    """
    initial_lrate, drop, epochs_drop = 0.01, 0.5, 8
    warmup, warmup_steps = .5, 2

    if epoch > warmup_steps:
        n_drops = math.floor((1 + epoch) / epochs_drop)
        lrate = initial_lrate * math.pow(drop, n_drops)
    else:
        # Number of halvings still to undo before reaching initial_lrate
        remaining = warmup_steps - epoch + 1
        lrate = pow(warmup, remaining) * initial_lrate

    print(f"Epoch: {epoch} Learning Rate: {lrate}")
    return lrate
# Apply the warm-up/step-decay schedule at the start of every epoch
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=warmup_and_step_decay)

# Where the best weights are written
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Checkpoint callback: keeps only the weights of the best epoch seen so far
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # weights only; False would save the entire model
    save_best_only=True,     # overwrite only when the monitored loss improves
    save_freq="epoch",       # evaluate the save condition once per epoch
    verbose=1,
)

ResModelDR = ResNetDR(256, 256)
ResModelDR.compile(
    optimizer=tfa.optimizers.AdamW(weight_decay=0, learning_rate=.003),
    loss='MSE',
    metrics=['mse'],
)
ResModelDR.fit(
    X['train'],
    y['train'],
    validation_data=(X['test'], y['test']),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint_callback, lr_callback],
)

# RMSE on the held-out 'val' split with the last-epoch weights ...
print(np.sqrt(ResModelDR.evaluate(X['val'], y['val'])))
# ... and again after restoring the best checkpointed weights
ResModelDR.load_weights(checkpoint_path)
print(np.sqrt(ResModelDR.evaluate(X['val'], y['val'])))

Epoch: 0 Learning Rate: 0.00125
Epoch 1/35




Epoch 1: val_loss improved from inf to 0.35566, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 1 Learning Rate: 0.0025
Epoch 2/35
Epoch 2: val_loss improved from 0.35566 to 0.30825, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 2 Learning Rate: 0.005
Epoch 3/35
Epoch 3: val_loss improved from 0.30825 to 0.29400, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 3 Learning Rate: 0.01
Epoch 4/35
Epoch 4: val_loss did not improve from 0.29400
Epoch: 4 Learning Rate: 0.01
Epoch 5/35
Epoch 5: val_loss improved from 0.29400 to 0.27009, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 5 Learning Rate: 0.01
Epoch 6/35
Epoch 6: val_loss did not improve from 0.27009
Epoch: 6 Learning Rate: 0.01
Epoch 7/35
Epoch 7: val_loss improved from 0.27009 to 0.26628, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 7 Learning Rate: 0.005
Epoch 8/35
Epoch 8: val_loss improved from 0.26628 to 0.24336, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 8 Learning Ra

In [None]:
epochs=35  # number of epochs for the ResNet run in this cell

def warmup_and_step_decay(epoch):
    """Learning-rate schedule: short exponential warm-up, then step decay.

    For epochs 0..2 the rate doubles each epoch (0.00125, 0.0025, 0.005).
    From epoch 3 on it is 0.01 halved once for every full multiple of 8
    contained in ``epoch + 1``.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate to use for that epoch (also printed).
    """
    initial_lrate, drop, epochs_drop = 0.01, 0.5, 8
    warmup, warmup_steps = .5, 2

    if epoch > warmup_steps:
        n_drops = math.floor((1 + epoch) / epochs_drop)
        lrate = initial_lrate * math.pow(drop, n_drops)
    else:
        # Number of halvings still to undo before reaching initial_lrate
        remaining = warmup_steps - epoch + 1
        lrate = pow(warmup, remaining) * initial_lrate

    print(f"Epoch: {epoch} Learning Rate: {lrate}")
    return lrate
# Apply the warm-up/step-decay schedule at the start of every epoch
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=warmup_and_step_decay)

# Where the best weights are written
checkpoint_path = "model_checkpoint/checkpoint.ckpt"

# Checkpoint callback: keeps only the weights of the best epoch seen so far
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # weights only; False would save the entire model
    save_best_only=True,     # overwrite only when the monitored loss improves
    save_freq="epoch",       # evaluate the save condition once per epoch
    verbose=1,
)

ResModel = ResNet(128, 256)
ResModel.compile(
    optimizer=tfa.optimizers.AdamW(weight_decay=0, learning_rate=.003),
    loss='MSE',
    metrics=['mse'],
)
ResModel.fit(
    X['train'],
    y['train'],
    validation_data=(X['test'], y['test']),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint_callback, lr_callback],
)

# RMSE on the held-out 'val' split with the last-epoch weights ...
print(np.sqrt(ResModel.evaluate(X['val'], y['val'])))
# ... and again after restoring the best checkpointed weights
ResModel.load_weights(checkpoint_path)
print(np.sqrt(ResModel.evaluate(X['val'], y['val'])))

Epoch: 0 Learning Rate: 0.00125
Epoch 1/35
Epoch 1: val_loss improved from inf to 0.58543, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 1 Learning Rate: 0.0025
Epoch 2/35
Epoch 2: val_loss improved from 0.58543 to 0.40567, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 2 Learning Rate: 0.005
Epoch 3/35
Epoch 3: val_loss improved from 0.40567 to 0.34031, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 3 Learning Rate: 0.01
Epoch 4/35
Epoch 4: val_loss did not improve from 0.34031
Epoch: 4 Learning Rate: 0.01
Epoch 5/35
Epoch 5: val_loss improved from 0.34031 to 0.32926, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 5 Learning Rate: 0.01
Epoch 6/35
Epoch 6: val_loss improved from 0.32926 to 0.32474, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 6 Learning Rate: 0.01
Epoch 7/35
Epoch 7: val_loss improved from 0.32474 to 0.27907, saving model to model_checkpoint/checkpoint.ckpt
Epoch: 7 Learning Rate: 0.005
Epoch 8/35
Epoch 8: val_loss improv

In [None]:
# Print the per-layer output shapes and parameter counts of the ResNet model
ResModel.summary()

Model: "res_net"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             multiple                  1152      
                                                                 
 res_net_block (ResNetBlock  multiple                  66432     
 )                                                               
                                                                 
 res_net_block_1 (ResNetBlo  multiple                  66432     
 ck)                                                             
                                                                 
 batch_normalization_2 (Bat  multiple                  512       
 chNormalization)                                                
                                                                 
 p_re_lu_2 (PReLU)           multiple                  128       
                                                           