Calvin's Scalable Deep Neural Network

Written by Calvin W.Y. Chan calvin.chan@bayer.com, June 2021 (Github: https://github.com/calvinwy, Linkedin: https://www.linkedin.com/in/calchan/)

# Initialization

In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from siuba import _, select, rename, left_join
from pytorch_forecasting.metrics import MAPE
import ray
from ray import tune
from ray.tune.schedulers import HyperBandScheduler
from ray.tune.suggest.basic_variant import BasicVariantGenerator
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.bohb import TuneBOHB


import os
import itertools
import warnings
import filelock

import string
import time
import random

import pdb

# Environment Setting

In [41]:
import sys
sys.version_info

sys.version_info(major=3, minor=9, micro=5, releaselevel='final', serial=0)

In [42]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Environmental Variables

In [43]:
# Fake Data Creation Parameters
N_SAMPLE = 5000   # number of rows generated for the fake dataset
N_FEATURE = 4     # number of fake feature columns (named feature_a, feature_b, ...)

In [44]:
# Data Handling Parameters
test_split_ratio = 0.2   # fraction of the samples held out as test set (80/20 split)
k = 5                    # number of folds for the K-Fold cross-validation

In [45]:
# Training Parameters
num_epochs = 5   # number of training epochs

# Learning Algorithm Parameters
lr_min = 1e-4   # lower bound of the learning rate range
lr_max = 1e-1   # upper bound of the learning rate range
# batch_size = [1,16,32,64]
batch_size = [64]   # list of candidate batch sizes (presumably a search grid -- confirm where it is consumed)

# Architecture Sampling Parameters
# NOTE(review): the h_total_* values look like a range(min, max, step) over the total
# number of hidden neurons -- the consuming code is not part of this section, confirm there.
h_total_min = 8
h_total_max = 11
h_total_step = 2

h_min_neuron_per_layer = 2   # smallest number of neurons allowed in one layer
h_max_neuron_per_layer = 5   # largest number of neurons allowed in one layer
h_max_layer = None           # presumably "no limit on the number of layers" -- TODO confirm

dropout_p_min = 0     # lower bound of the dropout probability range
dropout_p_max = 0.7   # upper bound of the dropout probability range

# Only use for Architecture Table Search
h_branch_n_samples = 2

# Ray Tune Hyperparameter Search
num_hp_search_samples = 1
# Checkpoint output directory (machine-specific absolute path; adapt before running elsewhere)
chkpt_dir = "/home/calvin_chan/data/output/checkpoint/testing"

In [46]:
# Setup output directories
# exist_ok=True creates the directory only when it is missing and avoids the
# race between a separate existence check and the creation.
os.makedirs(chkpt_dir, exist_ok=True)

# Data I/O

Fake Dataset

In [47]:
# Column names: feature_a, feature_b, ... (one per fake feature)
features_colname = [ 'feature_' + x for x in string.ascii_lowercase[:N_FEATURE] ]

# Features are standard-normal random numbers; outputs are random integers in [1000, 80000) stored as doubles.
# NOTE: no random seed is set here, so the fake dataset differs on every run.
sample_features = pd.DataFrame(np.random.randn(N_SAMPLE,N_FEATURE),columns=features_colname)
y_out = pd.DataFrame(np.random.randint(1000,80000,size=N_SAMPLE).astype(np.double),columns=['y_out'])

In [48]:
# Use the fake dataset as model input (x) and output (y)
x = sample_features
y = y_out

In [49]:
y.head()

Unnamed: 0,y_out
0,7824.0
1,32569.0
2,66965.0
3,17769.0
4,79209.0


---

# Neural Network Module

In [50]:
class dnn(nn.Module):
    '''
    Scalable fully connected neural network

    The network is a stack of double precision linear layers. Each hidden layer is
    followed by the activation function and dropout. A final linear layer maps the
    last hidden layer to a single output, which is also passed through the
    activation function.

    Args:
        in_feat: Number of input features
        layers: Sequence with the number of neurons of each hidden layer (eg. [3,3,4,5])
        dropout_p: Dropout probability applied after each hidden activation,
                   None (default) disables dropout
        act_fn: Activation function applied after every linear layer (including the output layer)
    '''

    # Constructor
    def __init__(self, in_feat, layers, dropout_p=None, act_fn=torch.relu):
        super(dnn, self).__init__()
        layers = [in_feat] + list(layers)   # Add input layer (list() also accepts tuples)
        self.hidden = nn.ModuleList()
        self.out = nn.Linear(layers[-1], 1).double()
        self.act_fn = act_fn
        # nn.Dropout does not accept p=None, map the default to "no dropout"
        self.dropout = nn.Dropout(p=0.0 if dropout_p is None else dropout_p)
        # --- Scalable Layers ---
        for input_size, output_size in zip(layers, layers[1:]):
            self.hidden.append(nn.Linear(input_size, output_size).double())

    # Prediction
    def forward(self, x):
        for single_layer in self.hidden:
            x = self.dropout(self.act_fn(single_layer(x)))
        x = self.act_fn(self.out(x))
        return x

In [51]:
def initialize_weights(m):
    '''
    Initialize a linear layer in place
    Weights are drawn with Kaiming uniform initialization and the bias is set to zero.
    Any module that is not a `nn.Linear` is left untouched, so the function can be
    passed to `model.apply`.

    Args:
        m: Pytorch module to initialize
    '''
    if not isinstance(m, nn.Linear):
        return
    weight, bias = m.weight.data, m.bias.data
    nn.init.kaiming_uniform_(weight)
    nn.init.constant_(bias, 0)

---

# Data Handling and Hyperparameter Tuning

### Overview

* The data is split into a modeling set (train + validation, 80%) and a test set (20%)
* The modeling set (train + validation, 80%) is used with K-Fold cross-validation for hyperparameter tuning

<ol>
    <li>Split the data into 80/20</li>
    <li>Use K-Fold cross-validation splitting given k (e.g. k=5 results in 80%/5 = 16% of the full dataset in each fold)</li>
    <ol>
        <li>For each fold, use the K-Fold training set to build a model for each hyperparameter set</li>
        <li>For each fold, evaluate all models with different hyperparameters on the K-Fold test set</li>
        <li>Summarize the results into a table of (hyperparameter index, K-Fold index)</li>
        <li>Find the best hyperparameter set</li>
    </ol>
    <li>Use the entire "Modeling = Train + Validation 80%" dataset to train a model</li>
    <li>Evaluate the model using the 20% Test Data</li>
</ol>

<center>
    <img src="./graphics/DataHandling.jpg" width="1074" alt="data_splitting"  />
</center>

### Implementation

Data Splitting
* Using `sklearn.model_selection.train_test_split` function to perform the 80/20 split
* Generate index for K-fold of the 80% train+validation dataset using `sklearn.model_selection.KFold`
* Create a list of pytorch dataloader for each of the K-fold for training
* Create a list of pytorch dataloader for each of the K-fold for validation

Hyperparameter Tuning
* Use K in K-Fold as a grid (must-run) hyperparameter
* Use a __custom sampling function__ to describe the hierarchical neuron distribution between:
 * total neuron: $H_{total}$
 * neuron per layer: $H_{branch}$

<p style="margin-left: 100px">$H_{total}=15\quad\longrightarrow\quad H_{branch}=\begin{bmatrix}[3,3,4,5] \end{bmatrix}$ </p>
    
* __Custom Sampling Function__
 * Using the total number of neurons from the previous level, create all possible combinations for the given number of elements
 * Sample one element from the list of combinations and return it


## Data Handling

Modeling-Testing 80/20 Split

In [52]:
def convert_multidimensional_labels(df,col):
    '''
    Convert Multiple Column Label into Single Column

    For a single column input, the column is duplicated under the name `col`.
    For a multi column input, the values of each row are joined with ',' into
    one string label (eg. row (1, 'x') becomes '1,x'), which is stored in `col`.

    Args:
        df: A pandas dataframe with row as samples, and column as N-dimensional subgroup to be encoded.
        col: Column name of the combined column

    Returns:
        df: A pandas dataframe with new label column
            (multi column input: the input dataframe itself, with `col` added in place)

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    if df.shape[1] == 1:
        df = pd.concat([df,df],axis=1)
        df.columns = [df.columns[0],col]
    else:
        # Build the joint labels from the original columns before the new column is added
        joint_labels = [ ','.join([str(c) for c in row]) for row in df.values.tolist() ]
        df[col] = joint_labels
    return(df)

def combine_multidimensional_ohe(s):
    '''
    One-Hot-Encoding (OHE) on the joint label of several columns
    Pandas and sklearn encode every column independently. Here, every combination
    of the unique values of all columns is treated as one category, and the OHE is
    built on that single joint category.

    Args:
        s: A pandas dataframe with row as samples, and column as N-dimensional subgroup to be encoded.

    Returns:
        s_ohe: A pandas dataframe with the joint OHE columns of each sample (subgroup columns removed)
        conversion_table: A pandas dataframe mapping each label combination to its OHE columns

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    group_cols = s.columns.tolist()

    # All combinations of the sorted unique values of each column
    levels = [ sorted(s[name].unique().tolist()) for name in group_cols ]
    combinations = list(itertools.product(*levels))

    # One joint label ('sgrp') per combination, then encode that joint label
    labels = pd.DataFrame(combinations, columns=group_cols)
    labels = convert_multidimensional_labels(labels,'sgrp')
    conversion_table = pd.get_dummies(labels, columns=['sgrp'])

    # Look up the encoding of each sample and keep only the OHE columns
    merged = pd.merge(s, conversion_table, on=group_cols, how='left')
    s_ohe = merged.drop(group_cols, axis=1)
    return(s_ohe,conversion_table)


def unique_list(ls_of_ls):
    '''
    Remove duplicated lists from a list of lists
    The order of the returned lists is not guaranteed.

    Args:
        ls_of_ls: List of list (eg. [[1,2,3],[1,3,2],[1,2,3]])

    Returns:
        unique_ls: The distinct lists of the input (eg. [[1,3,2],[1,2,3]])

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    # Lists are not hashable, so de-duplicate on their tuple form
    distinct = set(tuple(ls) for ls in ls_of_ls)
    unique_ls = []
    for entry in distinct:
        unique_ls.append(list(entry))
    return unique_ls


def model_test_split(*args, id_col=None, test_ratio=0.2, random_state=25, report_id=False):
    
    '''
    Split the dataset into modeling and test set
    
    Every input dataframe is first cut into a list of per-sample dataframes. The lists are
    then split jointly with `sklearn.model_selection.train_test_split`, so the same samples
    end up in the modeling (or test) part of every input.
    
    Args:
        *args: Pandas dataframes to be split together (eg. x, y)
        id_col: Column name used to group rows into samples. Each sample then contains all rows
                sharing the same ID (the ID column itself is dropped), and only IDs found in every
                input are kept. If None, every row is one sample and all inputs must have the same
                row index.
        test_ratio: The split ratio of the test set
        random_state: Random seed use by the `sklearn.model_selection.train_test_split` function
        report_id: If True, additionally return the sample IDs of the modeling and test set

    Returns:
        out: List of modeling/test sample lists in input order (eg. [x_model, x_test, y_model, y_test]),
             each sample is a pandas dataframe
        split_ids: (only if report_id is True) [ids_model, ids_test]

    Raises:
        Warning when the labels in id_col of the inputs do not match
        AssertionError when id_col is None and the row index of the inputs do not match
        
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''

    if id_col is not None:

        # One {ID: sub-dataframe} lookup table per input
        grouped = [ {key: frame for key, frame in arg.groupby(id_col)} for arg in args ]
        id_sets = [ set(groups) for groups in grouped ]

        # Determine if an ID entry is missing from any of the input dataset
        common_ids = set.intersection(*id_sets)
        if common_ids != set.union(*id_sets):
            warnings.warn("Unmatch ID entries in one or more data inputs (eg. x, y)!")

        # Extract the common IDs, in the group order of the first input
        # (a plain set has no defined order, which would make the split depend on more than random_state)
        select_ids = [ key for key in grouped[0] if key in common_ids ]

        # Split dataframes into sample list
        # (multi-resolution: each list element contains multiple x and single y based on id_col)
        dataset = [ [ groups[single_id].drop(id_col,axis=1) for single_id in select_ids ]
                    for groups in grouped ]

    else:
        # Determine index labels in each input dataset is the same
        index_lists = [ list(arg.index) for arg in args ]
        distinct_indices = unique_list(index_lists)
        
        assert len(distinct_indices) == 1, "Unmatch length in one or more data inputs (eg. x, y)!"
        select_ids = distinct_indices[0]
        
        # Split dataframes into sample list
        # (equal resolution: each list element contains one row in both x and y)
        dataset = [ [ arg.loc[[single_id]] for single_id in select_ids ] for arg in args ]

    # Including index as one of the splitting dataset
    dataset = dataset + [select_ids]
    out = train_test_split(*dataset, test_size=test_ratio, random_state=random_state)
    # The last two entries are the modeling/test split of the sample IDs
    split_ids = out[-2:]
    out = out[0:-2]

    if report_id:
        return(out, split_ids)
    else:
        return(out)

Data Loader

In [53]:
class NumericData(Dataset):
    '''
    Pytorch dataset of paired (x, y) numeric samples

    Both x and y are stored as a list with one 1-D tensor per sample.

    Args:
        x: Input features, either a pandas dataframe (one row per sample) or a list of single row dataframes
        y: Outputs, in the same format and with the same number of samples as x
        transform: Optional callable applied to each [x, y] sample when it is read
        dtype: Pytorch type of the sample tensors
        sample_ids: Optional list of sample IDs. For dataframe inputs it must be equal to the row index;
                    if not given, it is taken from the row index of the first dataframe input.
    '''
    def __init__(self, x, y, transform=None, dtype=torch.double, sample_ids=None):
        assert (len(y) == len(x)), "Number of x and y samples do not match!"
        self.len = len(y)
        self.transform = transform
        self.sample_ids = sample_ids
        self.x, self.x_col = self._format_dataset(x, dtype)
        self.y, self.y_col = self._format_dataset(y, dtype)
        
    def __getitem__(self, index):
        sample = [self.x[index],
                  self.y[index]]
        if self.transform:
            sample = self.transform(sample)
        return sample
    
    def __len__(self):
        return self.len
    
    def _format_dataset(self, d, dtype):
        # Returns the list of sample tensors and the column names of the input
        if isinstance(d, pd.DataFrame):
            # check to make sure that the sample_ids are the same as dataframe row index if sample_ids exist
            if self.sample_ids is not None:
                assert list(d.index) == list(self.sample_ids), "Input data rowname/index not equal to sample_ids!"
            else:
                self.sample_ids = list(d.index)
            # extract column names
            colname = d.columns
            # cut the dataframe into single row dataframes, the format expected by the conversion below
            # (iterating a dataframe directly would yield its column names, not its rows)
            samples = [ d.iloc[[row]] for row in range(len(d)) ]
        else:
            colname = d[0].columns
            samples = d

        # convert list of single row dataframes to list of single row tensors
        out = self._sample_type_convert(samples, dtype)

        return out, colname
        
    def _sample_type_convert(self, samples, dtype):
        # since the input samples are list of single-row-dataframe, with dimension of 1 x Features
        # to convert them into tensors, the row dimension is removed.
        # The row is handed to torch as a plain numpy array rather than as a pandas Series.
        samples_out = [ torch.tensor(sample_ele.iloc[0].to_numpy()).type(dtype) for sample_ele in samples ]
        return samples_out

K-Fold Data Preparation

In [54]:
def get_k_fold_indices(n_samples, k=5, shuffle=False):
    '''
    Draw the sample indices of every fold of a K-Fold split
    
    Args:
        n_samples: Number of samples in the dataset
        k: Number of folds
        shuffle: Shuffling of samples

    Returns:
        kfold_train_ind: Tuple with the training set indices of each fold
        kfold_valid_ind: Tuple with the validation set indices of each fold

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    splitter = KFold(n_splits=k, shuffle=shuffle)
    # Each fold is a (train indices, validation indices) pair
    fold_pairs = list(splitter.split(list(range(n_samples))))
    # Transpose the list of pairs into one tuple of train and one tuple of validation indices
    kfold_train_ind, kfold_valid_ind = zip(*fold_pairs)
    return(kfold_train_ind, kfold_valid_ind)

In [55]:
def select_ind(ls,ind):
    '''
    Select the elements of a list by position

    Args:
        ls: Indexable collection (eg. list of samples)
        ind: Array of positions providing `.tolist()` (eg. a numpy index array)

    Returns:
        List with the elements of ls at the given positions, in the order of ind
    '''
    picked = []
    for position in ind.tolist():
        picked.append(ls[position])
    return picked

Process and Split the Data

In [56]:
def patitioned_data_object_numeric(x, y, test_split_ratio, k, random_state=25):
    '''
    Split the data into modeling/test set and build the Pytorch datasets, including one per K-Fold

    Args:
        x: A pandas dataframe with row as samples, and column as features
        y: A pandas dataframe with row as samples, and column as output
        test_split_ratio: The split ratio of the test set
        k: Number of folds of the modeling set
        random_state: Random seed of the modeling/test split

    Returns:
        dataset_model: NumericData of the whole modeling set
        dataset_test: NumericData of the test set
        dataset_train_kfold: List of NumericData, the training part of each fold
        dataset_valid_kfold: List of NumericData, the validation part of each fold
    '''
    # Model/Test Splitting
    split_data, split_ids = model_test_split(x, y,
                                             test_ratio=test_split_ratio,
                                             report_id=True,
                                             random_state=random_state)
    x_model, x_test, y_model, y_test = split_data
    samples_id_model, samples_id_test = split_ids

    # K-Fold Index Sampling
    # Shuffle is NOT needed, since the samples were shuffled in the model/test split
    kfold_train_ind, kfold_valid_ind = get_k_fold_indices(n_samples=len(y_model), k=k, shuffle=False)

    def _fold_dataset(fold_ind):
        # Dataset restricted to the modeling samples at the given positions
        return NumericData(select_ind(x_model, fold_ind),
                           select_ind(y_model, fold_ind),
                           sample_ids=select_ind(samples_id_model, fold_ind))

    # Create K-set of datasets for Pytorch data loader
    dataset_train_kfold = [ _fold_dataset(fold_ind) for fold_ind in kfold_train_ind ]
    dataset_valid_kfold = [ _fold_dataset(fold_ind) for fold_ind in kfold_valid_ind ]

    # Create dataset for modeling and testing
    dataset_model = NumericData(x_model, y_model, sample_ids=samples_id_model)
    dataset_test = NumericData(x_test, y_test, sample_ids=samples_id_test)

    return dataset_model, dataset_test, dataset_train_kfold, dataset_valid_kfold

## Hyperparameter Sampling

#### Neuron Custom Sampling Function

In [57]:
def integer_partitions(n_ele, n_min=1, max_dim=None, recursion_level=1):
    '''
    Fast Integer Partitioning
    Dividing a single integer into a tuple of integers that sums up to the given number.
    The parts of each partition are in non-decreasing order, eg. 4 gives
    (4,), (1,3), (1,1,2), (1,1,1,1), (2,2).
    
    Args:
        n_ele: Total number of elements to be distributed
        n_min: Minimum number of elements per output dimension
               (the single part partition (n_ele,) is always generated, also for n_ele < n_min)
        max_dim: Maximum number of parts of a partition, None for no limit
        recursion_level: Current number of parts (internal use for the recursion, leave at 1)

    Returns:
        Iterator of tuples, each tuple is one partition of n_ele.
        No partition is generated if max_dim is smaller than 1.
        
    Original Source :
    (Modification made to speed up by skipping recursion exceeding max_dim)
        https://stackoverflow.com/questions/10035752/elegant-python-code-for-integer-partitioning
    
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    if (max_dim is not None) and (recursion_level > max_dim):
        # no part is allowed at this level
        return
    yield (n_ele,)
    if (max_dim is not None) and (recursion_level >= max_dim):
        # any further split would exceed max_dim, so the recursion is skipped entirely
        return
    for i in range(n_min, n_ele//2 + 1):
        # i is the smallest part, the remainder is split into parts of at least i
        for p in integer_partitions(n_ele-i, i, max_dim, recursion_level+1):
            yield (i,) + p

In [58]:
def split_sampling(num_ele, n_min=1, n_max=None, out_dim=None, n_samples=1, prepend=[], postpend=[], single_sample=False):
    '''
    Randomly split the elements into multiple dimensions
    This is use for neuron sampling the number of elements and layer for multibranch neural network
    
    Args:
        num_ele: Total number of elements to be distributed
        n_min: Minimum number of elements per output dimension
        n_max: Maximum number of elements per output dimension
        out_dim: Number of output dimensions to distribute the element, random dimensions will be given with None given
        n_samples: Number of samples to draw. Samples are drawn without replacement as long as
                   enough distinct splits exist, otherwise with replacement.
        prepend: Elements added in front of every sample
        postpend: Elements added at the end of every sample
        single_sample: If True, return the first sample itself instead of a list of samples

    Returns:
        sample: List of samples, each sample is a list of elements splitted into multiple dimensions
                (a single sample if single_sample is True)
        
    Raises:
        ValueError when samples are requested but no split fulfils n_min, n_max and out_dim
        
    Example:
        >>> split_sampling(14, n_min=2, out_dim=4, single_sample=True)
        [2, 5, 4, 3]
        
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    head = list(prepend)
    tail = list(postpend)

    # Generate the Integer Partitions and filter them by the maximum element and the number of dimensions
    # (the None check skips the sentinel a partition generator may emit for an invalid out_dim)
    splits = [ split for split in integer_partitions(num_ele, n_min=n_min, max_dim=out_dim)
               if split is not None
               and (n_max is None or max(split) <= n_max)
               and (out_dim is None or len(split) == out_dim) ]
    
    # A partition is an unordered split, expand each into all of its distinct orderings
    unique_splits_perm = [ perm for split in splits for perm in set(itertools.permutations(split)) ]

    if n_samples > 0 and not unique_splits_perm:
        raise ValueError(f"No valid split of {num_ele} elements for n_min={n_min}, n_max={n_max}, out_dim={out_dim}")
        
    # Randomly Sample the permutations
    if n_samples <= len(unique_splits_perm):
        drawn = random.sample(unique_splits_perm, k=n_samples)
    else:
        drawn = random.choices(unique_splits_perm, k=n_samples)
    sample = [ head + list(split) + tail for split in drawn ]
    if single_sample:
        sample = sample[0]
    
    return(sample)

---

# Training Procedure

Reporting Functions

In [59]:
def loss_fifo(y_est, y, sgrp=None, history=None, queue_len=1000):
    '''
    Record Loss of Output Data
    The latest predictions and ground truths are kept in a FIFO queue of limited length, such that
    error statistics can be computed over the most recent samples.
    Due to the variable input resolution, zero padding is required for batch gradient decent for cspd algorithm.  Therefore, the zero padded batches could introduce a bias in the loss metric computation.  To avoid this problem, the zero padded data with all zeros for the subgroup indicator is used to remove these entries during error computation.
    
    Args:
        y_est: Model prediction output
        y: Training data output ground truth
        sgrp: Subgrouping one-hot-encoded matrix for the batch data (B x O x S matrix, where B is batch size, O is output dimensions, S is number of subgroups).
              If given, zero padded entries are removed with `remove_zero_padded`
              (NOTE(review): this helper is not defined in the part of the file reviewed here, confirm it exists)
        history: Dictionary returned by the previous call, None or an empty dictionary to start a new queue
        queue_len: Maximum records to be stored in the history queue

    Returns:
        history: Dictionary with the keys 'y' and 'y_est', each a tensor holding the latest (at most queue_len)
                 records. The tensors are detached from the autograd graph.

    Raises:
        AssertionError when y_est and y have a different number of records (only checked if sgrp is None)

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''

    def _latest(records, n):
        # last n records along the first dimension (nothing for n <= 0, everything if fewer than n exist)
        return records[max(records.shape[0] - n, 0):]
       
    # remove zero-padded cases
    if sgrp is not None:
        y_est, y, num_pts = remove_zero_padded(y_est, y, sgrp, return_length=True)
    else:
        assert y_est.shape[0] == y.shape[0], "y and y_est has different shape"
        num_pts = y_est.shape[0]

    # The queue is only used for reporting, keeping the autograd graph of every
    # stored prediction would hold on to memory for as long as it stays in the queue
    y_est = y_est.detach()
    y = y.detach()
    
    num_pts = min(num_pts, queue_len)   # if queue is smaller than the number of results, truncate the front
    y_est = _latest(y_est, num_pts)
    y = _latest(y, num_pts)
        
    # managing the results FIFO queue
    # :: push new sample and remove older samples
    # :: keep the y, y_est in a FIFO for computing statistics
    if history is None or len(history) == 0:
        # initialize for the queue
        history = {'y': y, 'y_est': y_est}
    else:
        # append the new records and drop the oldest records exceeding the queue size
        history['y'] = _latest(torch.cat( (history['y'], y), dim=0), queue_len)
        history['y_est'] = _latest(torch.cat( (history['y_est'], y_est), dim=0), queue_len)
    
    return(history)

In [60]:
def compute_iqr(e):
    '''
    Interquartile range (IQR) of an error/loss tensor
    
    Args:
        e: Error/Loss

    Returns:
        iqr: Difference between the 75% and the 25% quantile of e

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    lower_quartile, upper_quartile = torch.quantile(e, 0.25), torch.quantile(e, 0.75)
    return upper_quartile - lower_quartile

In [61]:
def compute_l1_iqr(y_est, y):
    '''
    Interquartile range (IQR) of the L1 (absolute) error
    
    Args:
        y_est: Model prediction output
        y: Training data output ground truth

    Returns:
        iqr: Difference between the 75% and the 25% quantile of |y_est - y|

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    abs_error = (y_est - y).abs()
    quartiles = [ torch.quantile(abs_error, q) for q in (0.75, 0.25) ]
    return quartiles[0] - quartiles[1]

In [62]:
def compute_mape_iqr(y_est, y):
    '''
    Compute Absolute Percentage Error IQR
    The error of each sample is |y_est - y| / |y|, the same definition as the
    'mape_iqr' entry of the metric dictionaries of the training routines.
    
    Args:
        y_est: Model prediction output
        y: Training data output ground truth (an entry equal to zero gives an infinite or undefined error)

    Returns:
        iqr: Interquartile range of the absolute percentage error

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    # relative (percentage) error, the absolute error alone would be the L1 error IQR
    e = torch.abs(y_est - y) / torch.abs(y)
    q75 = torch.quantile(e, 0.75)
    q25 = torch.quantile(e, 0.25)
    iqr = q75 - q25
    return iqr

Ray Tune Training Procedure

In [63]:
# Training procedure
def train_dnn_raytune(config, 
                      num_in_feat,
                      criterion=nn.MSELoss(),
                      checkpoint_dir=None, 
                      num_epochs=100, 
                      train_dataset=None, 
                      valid_dataset=None,
                      metric_dict={'rmse':     lambda y_est,y: torch.sqrt(nn.MSELoss(reduction="mean")(y_est,y)),
                                   'mean_l1':  lambda y_est,y: nn.L1Loss(reduction="mean")(y_est,y),
                                   'l1_iqr':   lambda y_est,y: compute_iqr(nn.L1Loss(reduction="none")(y_est,y)),
                                   'med-ape':  lambda y_est,y: torch.median((y-y_est).abs()/y.abs()),
                                   'mape':     lambda y_est,y: torch.mean((y-y_est).abs()/y.abs()),
                                   'mape_iqr': lambda y_est,y: compute_iqr((y-y_est).abs()/y.abs())},
                      train_metric_samples=None,
                      force_cpu=False
                     ):
    '''
    Training procedure for dnn regression with Ray Tune hyperparameter tuning
    This function is to be used for training with hyperparameter tuning based on Ray Tune. The following hyperparameters are read from the Ray Tune configuration:
        lr: learning rate (required)
        h_layers: list with the number of neurons of each hidden layer (required)
        dropout_p: dropout probability of all the neurons in the network (optional, default 0)
        k: k-fold index k for the dataset (optional, only used if the datasets are given as lists)
        batch_size: the batch size use for the mini-batch use for batch gradient descent (optional, default 1)

    After every epoch, a checkpoint of the model and optimizer state is saved and all metrics are
    reported to Ray Tune as '<dataset>_<metric>' (eg. 'train_rmse', 'valid_mape').

    Args:
    (Note: This function is not meant to run directly by user, these arguments are passed indirectly by tune.run.)
        config: Ray Tune hyperparameter sampling configuration (for details, please refer to: https://docs.ray.io/en/master/tune/user-guide.html)
        num_in_feat: Number of input features for the network
        criterion: Training loss function
        checkpoint_dir: Directory of a checkpoint to be restored before training, passed by Ray Tune
        num_epochs: Number of training epochs
        train_dataset: (List of or single) Pytorch dataset for training
        valid_dataset: (List of or single) Pytorch dataset for validation
        metric_dict: Dictionary of loss function to be use for metric reporting (Attention: These are only used for reporting, not as training loss function!)
        train_metric_samples: Number of latest training samples used to compute the training metrics, None to use a whole epoch
        force_cpu: If True, the cpu is used even if a gpu is available

    Returns:
        result is return indirectly with tune.run

    Raises:
        -

    Example:
        -
    '''

    #====================== Ray Tune Parameters Setup ======================#

    _dropout_p = config['dropout_p'] if 'dropout_p' in config else 0
    _batch_size = config['batch_size'] if 'batch_size' in config else 1
        
    # determine if input is k-fold dataset or single dataset
    if isinstance(train_dataset, list) and isinstance(valid_dataset, list) and ('k' in config):
        _train_dataset = train_dataset[config['k']]
        _valid_dataset = valid_dataset[config['k']]
    else:
        _train_dataset = train_dataset
        _valid_dataset = valid_dataset
        
    # measure error metric across whole epoch if no sample length is given
    # (the latest progress might not be shown properly and error could be overestimated by earlier samples)
    if train_metric_samples is None:
        train_metric_samples = len(_train_dataset)
    
    # gpu usage
    if not force_cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device("cpu")

    def _collate_to_device(batch):
        # default batching, followed by moving every tensor of the batch to the training device
        return [ ele.to(device) for ele in default_collate(batch) ]

    train_loader = torch.utils.data.DataLoader(dataset=_train_dataset, batch_size=_batch_size, shuffle=True, 
                                               collate_fn=_collate_to_device)
    valid_loader = torch.utils.data.DataLoader(dataset=_valid_dataset, batch_size=_batch_size, shuffle=True,
                                               collate_fn=_collate_to_device)

    #====================== Model Setup ======================#

    # initialize ANN architecture
    model = dnn(in_feat = num_in_feat, 
                layers = config['h_layers'], 
                dropout_p = _dropout_p,
                act_fn = torch.relu)
    model.apply(initialize_weights)

    # gpu usage
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)   # for multiple GPUs
    model.to(device)

    
    # optimizer is controlled by ray tune hyperparameter
    optimizer = torch.optim.Adam(model.parameters(), lr = config["lr"])
    
    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
    # should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    
    # create loss metric dictionary to store results
    history = {'train': {}, 'valid': {}}
    metric_output = {}

    for epoch in range(num_epochs):

        #====================== Training ======================#

        # set the model to training mode (validation of the previous epoch switched it to evaluation mode)
        model.train()

        # training using all training samples
        for x, y in train_loader:
            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward + backward + optimize
            y_est = model(x)
            loss = criterion(y_est, y)
            loss.backward()
            optimizer.step()
            
            # record the prediction results
            # :: loss metrics are kept in a FIFO queue per latest samples in order to compute statistics
            # :: the prediction is detached, the queue is for reporting only and must not keep
            #    the autograd graph of every batch alive
            history['train'] = loss_fifo(y_est.detach(), y, history=history['train'], queue_len=train_metric_samples)
            
        #====================== Validation ======================#
        
        # set the model to evaluation mode
        model.eval()

        # prediction of all validation samples
        with torch.no_grad():
            for x, y in valid_loader:
                y_est = model(x)
                # record the prediction results
                # :: loss metrics are kept in a FIFO queue per latest samples in order to compute statistics
                history['valid'] = loss_fifo(y_est, y, history=history['valid'], queue_len=len(_valid_dataset))
                
        # compute every metric on the recorded training and validation results
        for metric, metric_fn in metric_dict.items():
            for dataset, records in history.items():
                metric_label = '_'.join([dataset,metric])
                metric_output[metric_label] = metric_fn(records['y_est'],records['y']).item()
                
        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and will potentially be passed as the `checkpoint_dir`
        # parameter in future iterations.
        with tune.checkpoint_dir(step=epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save( (model.state_dict(), optimizer.state_dict()), path )

        tune.report(**metric_output)

Training routine for manual training __without hyperparameter tuning__ (Ray Tune is not used)

In [64]:
# Training procedure
def train_dnn(model, train_dataset, valid_dataset, criterion, optimizer,
              epochs=100,
              batch_size=1,
              metric_dict=None,
              train_metric_samples=None,
              ):
    '''
    Train a model with a plain PyTorch loop (no Ray Tune) and print metrics after every epoch.

    This routine is meant for quick test runs, e.g. to check that a change to the
    network architecture works before submitting models to a hyperparameter search.
    For hyperparameter optimization use the Ray Tune routines of this file
    (`train_dnn_raytune` / `train_dnn_raytune_cpu_gpu_distributed`) instead.

    Args:
        model: PyTorch module to train (updated in place)
        train_dataset: Dataset yielding (x, y) pairs; it is wrapped in a shuffled DataLoader
        valid_dataset: Dataset yielding (x, y) pairs; it is wrapped in a shuffled DataLoader
        criterion: Training criterion to be used (eg. criterion = nn.MSELoss())
        optimizer: Training optimizer to be used (eg. optimizer = torch.optim.Adam(model.parameters(), lr = 0.1))
        epochs: Number of training epochs
        batch_size: Batch size used by both the training and the validation DataLoader
        metric_dict: Dictionary {name: callable(y_est, y)} used for metric reporting.
            Every callable must return a single-element tensor because `.item()` is
            called on its result. (Attention: these are only used for reporting, not
            as training loss function!) When None, the default set is used:
            rmse, mean_l1, l1_iqr, med-ape, mape and mape_iqr.
        train_metric_samples: Queue length handed to `loss_fifo` for the training
            history, i.e. the number of latest training samples kept for the
            statistics. Defaults to len(train_dataset).

    Returns:
        history: Dictionary with the entries 'train' and 'valid' as maintained by
            `loss_fifo`; each entry provides 'y_est' and 'y'.
        model: Implicitly updated in the model object

    Raises:
        -

    Example:
        layers = [4,3,2]
        model = dnn(N_FEATURE, layers, dropout_p=0.5).to(device)
        model.apply(initialize_weights)
        optimizer = torch.optim.Adam(model.parameters(), lr = 0.1)
        criterion = nn.MSELoss()
        metric_dict = {'rmse': lambda y_est,y: torch.sqrt(nn.MSELoss(reduction="mean")(y_est,y)),
                       'mape': lambda y_est,y: torch.mean((y-y_est).abs()/y.abs())}
        training_results = train_dnn(model=model,
                                     train_dataset=dataset_model,
                                     valid_dataset=dataset_test,
                                     criterion=criterion,
                                     optimizer=optimizer,
                                     metric_dict=metric_dict,
                                     epochs=num_epochs,
                                     batch_size=64)

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    # Build the default metrics per call instead of sharing one dict across calls
    if metric_dict is None:
        metric_dict = {'rmse':     lambda y_est,y: torch.sqrt(nn.MSELoss(reduction="mean")(y_est,y)),
                       'mean_l1':  lambda y_est,y: nn.L1Loss(reduction="mean")(y_est,y),
                       'l1_iqr':   lambda y_est,y: compute_iqr(nn.L1Loss(reduction="none")(y_est,y)),
                       'med-ape':  lambda y_est,y: torch.median((y-y_est).abs()/y.abs()),
                       'mape':     lambda y_est,y: torch.mean((y-y_est).abs()/y.abs()),
                       'mape_iqr': lambda y_est,y: compute_iqr((y-y_est).abs()/y.abs())}

    history = {'train': {}, 'valid': {}}
    metric_output = {}

    if train_metric_samples is None:
        train_metric_samples = len(train_dataset)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):

        #====================== Training ======================#

        # set the model to training mode (once per epoch; validation below switches it to eval)
        model.train()

        # training using all training samples
        for x, y in train_loader:

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            y_est = model(x)
            loss = criterion(y_est, y)
            loss.backward()
            optimizer.step()

            # keep the latest predictions/targets for the training statistics
            history['train'] = loss_fifo(y_est, y, history=history['train'], queue_len=train_metric_samples)

        #====================== Validation ======================#

        # set the model to evaluation mode
        model.eval()

        # evaluate on all validation samples (no gradients needed)
        with torch.no_grad():
            for x, y in valid_loader:
                y_est = model(x)
                history['valid'] = loss_fifo(y_est, y, history=history['valid'], queue_len=len(valid_dataset))

        #====================== Reporting ======================#

        # labels are '<dataset>_<metric>', e.g. 'train_rmse', 'valid_rmse'
        metric_labels = []
        for metric, metric_fn in metric_dict.items():
            for dataset, record in history.items():
                metric_label = '_'.join([dataset, metric])
                metric_output[metric_label] = metric_fn(record['y_est'], record['y']).item()
                metric_labels.append(metric_label)

        # one line per epoch: "[Epoch: n] label: value, label: value, "
        fields = [f"[Epoch: { epoch+1 }]"]
        fields.extend(f"{metric_label}: {metric_output[metric_label]:.3f}," for metric_label in metric_labels)
        print(" ".join(fields) + " ")

    return history

In [65]:
def train_dnn_raytune_cpu_gpu_distributed(config, 
                                          num_in_feat,
                                          criterion=nn.MSELoss(),
                                          checkpoint_dir=None, 
                                          num_epochs=100, 
                                          train_dataset=None, 
                                          valid_dataset=None,
                                          metric_dict={'rmse':     lambda y_est,y: torch.sqrt(nn.MSELoss(reduction="mean")(y_est,y)),
                                                       'mean_l1':  lambda y_est,y: nn.L1Loss(reduction="mean")(y_est,y),
                                                       'l1_iqr':   lambda y_est,y: compute_iqr(nn.L1Loss(reduction="none")(y_est,y)),
                                                       'med-ape':  lambda y_est,y: torch.median((y-y_est).abs()/y.abs()),
                                                       'mape':     lambda y_est,y: torch.mean((y-y_est).abs()/y.abs()),
                                                       'mape_iqr': lambda y_est,y: compute_iqr((y-y_est).abs()/y.abs())},
                                          train_metric_samples=None,
                                          ):
    '''
    CPU/GPU distributed wrapper around `train_dnn_raytune`.

    This function allows trials to run on both the CPU and the GPU of a single
    machine at the same time: the trial that obtains the file lock
    "/tmp/gpu.lock" within one second may use the GPU, every other trial is
    forced onto the CPU.

    Args:
        config: Ray Tune trial configuration, forwarded unchanged
        num_in_feat: Number of input features, forwarded unchanged
        criterion: Training criterion, forwarded unchanged
        checkpoint_dir: Checkpoint directory handed in by Ray Tune, forwarded unchanged
        num_epochs: Number of training epochs, forwarded unchanged
        train_dataset: Training dataset(s), forwarded unchanged
        valid_dataset: Validation dataset(s), forwarded unchanged
        metric_dict: Dictionary of reporting metrics, forwarded unchanged
        train_metric_samples: Number of training samples kept for the statistics, forwarded unchanged

    Returns:
        result: Whatever `train_dnn_raytune` returns

    Source:
        This code is modified from the following: https://discuss.ray.io/t/different-trial-on-cpu-and-gpu-separately/2883

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    gpu_lock = filelock.FileLock("/tmp/gpu.lock")

    # Only the lock acquisition is guarded: a Timeout here means another trial
    # currently owns the GPU, so this trial has to run on the CPU.
    try:
        gpu_lock.acquire(timeout=1)
        force_cpu = False
    except filelock.Timeout:
        force_cpu = True

    try:
        return train_dnn_raytune(config, 
                                 num_in_feat,
                                 criterion,
                                 checkpoint_dir, 
                                 num_epochs, 
                                 train_dataset, 
                                 valid_dataset,
                                 metric_dict,
                                 train_metric_samples,
                                 force_cpu=force_cpu
                                 )
    finally:
        # Release the lock after training is done, but only if this trial holds it.
        if not force_cpu:
            gpu_lock.release()


---

# Training

In [66]:
# Split x / y into a modelling part and a hold-out test part. With report_id=True the
# call also returns a second tuple, unpacked here as the sample ids of each part.
# random_state is fixed — presumably to make the split reproducible; confirm in model_test_split.
(x_model, x_test, 
 y_model, y_test), (samples_id_model, samples_id_test) = model_test_split(x, y, 
                                                         test_ratio=test_split_ratio, 
                                                         report_id=True,
                                                         random_state=15)

Data Preparation

In [68]:
# Build the dataset objects used below: the model/test pair for manual training and
# per-fold train/valid collections (indexed by fold) for the Ray Tune search.
# NOTE(review): this call takes x, y and the split ratio itself — presumably it splits
# independently of the model_test_split call above; confirm both use the same partition.
dataset_model, dataset_test, dataset_train_kfold, dataset_valid_kfold = patitioned_data_object_numeric(x, y, test_split_ratio, k)

In [69]:
# Sanity check: show the features of one validation sample (index 2) of fold 0.
dataset_valid_kfold[0].x[2]

tensor([-1.0293, -0.9659, -0.5685, -0.9649], dtype=torch.float64)

Functions for Joining Results and Architecture Table

In [70]:
def convert_nested_numeric_to_string(in_list):
    '''
    Serialize a nested list into a single string.

    The elements of every inner list are joined with a blank, and the inner
    lists are joined with ' ; ' (eg. [[1, 2], [3]] -> '1 2 ; 3').
    '''
    groups = []
    for inner in in_list:
        groups.append(' '.join(map(str, inner)))
    return ' ; '.join(groups)

In [71]:
def join_nested(left, right, on):
    '''
    Left-join `right` onto `left` using a column that holds nested lists.

    The nested lists in column `on` are serialized to strings with
    `convert_nested_numeric_to_string`, and the merge is done on that temporary
    string key. The key is built on copies, so the input frames are NOT modified.

    Args:
        left: Left DataFrame; its `on` column is dropped from the result
        right: Right DataFrame; its `on` column is kept in the result
        on: Name of the nested-list column present in both frames

    Returns:
        out: Left-joined DataFrame without the temporary 'key' column
    '''
    # assign() returns new frames, so the caller's DataFrames keep their columns
    left_keyed = left.assign(key=left[on].apply(convert_nested_numeric_to_string)).drop(columns=[on])
    right_keyed = right.assign(key=right[on].apply(convert_nested_numeric_to_string))

    out = pd.merge(left_keyed, right_keyed, on='key', how='left')
    return out.drop(columns=['key'])

---

### Hyperparameter Tuning (Ray Tune using Auto Network Architecture Tuning)

In [72]:
# Columns of the Ray Tune progress table: the iteration counter followed by the
# train/valid pair of every reported metric.
report_metrics = ["training_iteration"] + [
    f"{split}_{metric}"
    for metric in ("rmse", "mean_l1", "l1_iqr", "med-ape", "mape", "mape_iqr")
    for split in ("train", "valid")
]

reporter = tune.JupyterNotebookReporter(
    overwrite=False,
    max_progress_rows=35,
    metric_columns=report_metrics,
)
scheduler = HyperBandScheduler(metric="valid_mape", mode="min", max_t=num_epochs)
searchopt = BasicVariantGenerator(max_concurrent=15)

# Search space of the hyperparameter tuning
config = dict(
    # Learning rate (log-uniform)
    lr=tune.loguniform(lr_min, lr_max),
    # Dropout probability
    dropout_p=tune.uniform(dropout_p_min, dropout_p_max),
    # K-Fold index (every fold is run)
    k=tune.grid_search(list(range(k))),
    # 1: SGD; 2+: Zero-Filled BGD
    batch_size=tune.choice(batch_size),
    # Total number of hidden neurons
    h_total=tune.choice(list(range(h_total_min, h_total_max, h_total_step))),
    # Hidden layer sizes, sampled by split_sampling from the h_total drawn for the same trial
    h_layers=tune.sample_from(
        lambda spec: split_sampling(
            num_ele=spec.config.h_total,
            n_min=h_min_neuron_per_layer,
            n_max=h_max_neuron_per_layer,
            out_dim=h_max_layer,
            single_sample=True,
        )
    ),
)

In [73]:
# Run the hyperparameter search and measure its wall-clock duration.
t0 = time.time()

result = tune.run(
    # with_parameters binds the fixed arguments (feature count, epochs, k-fold datasets)
    # to the trainable; the per-trial hyperparameters arrive through `config`.
    tune.with_parameters(train_dnn_raytune_cpu_gpu_distributed, 
                         num_in_feat   = N_FEATURE,
                         num_epochs    = num_epochs, 
                         train_dataset = dataset_train_kfold, 
                         valid_dataset = dataset_valid_kfold,
                         # training metrics use a queue a tenth the size of fold 0's training set
                         train_metric_samples = round(len(dataset_train_kfold[0])/10),
                         ),
    config = config,
    # NOTE(review): every trial requests one GPU, while the trainable falls back to the CPU
    # when the GPU lock is taken — confirm this resource request is intended.
    resources_per_trial={"gpu": 1},
    num_samples = num_hp_search_samples,
    local_dir = chkpt_dir,
    progress_reporter = reporter,
    scheduler = scheduler,
    search_alg = searchopt,
)

t1 = time.time()
print(f"Time elapsed: {t1-t0}s")

Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,RUNNING,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,PENDING,,64,0.363724,"[2, 2, 4]",8,1,0.0988601
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,PENDING,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,PENDING,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,PENDING,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712


Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00000:
  date: 2022-03-25_16-00-12
  done: false
  experiment_id: 4706dda29d2349a793d93a2f472b9a8c
  hostname: ip-10-123-137-245
  iterations_since_restore: 1
  node_ip: 10.123.137.245
  pid: 979
  should_checkpoint: true
  time_since_restore: 2.626666784286499
  time_this_iter_s: 2.626666784286499
  time_total_s: 2.626666784286499
  timestamp: 1648224012
  timesteps_since_restore: 0
  train_l1_iqr: 36411.15439262513
  train_mape: 0.9998978541395598
  train_mape_iqr: 8.452889486720849e-05
  train_mean_l1: 38239.8253584498
  train_med-ape: 0.9999726874861796
  train_rmse: 44107.711333706655
  training_iteration: 1
  trial_id: a36cc_00000
  valid_l1_iqr: 39577.89798053116
  valid_mape: 0.9999145795114288
  valid_mape_iqr: 6.153741725434259e-05
  valid_mean_l1: 39761.06219371004
  valid_med-ape: 0.9999676453376549
  valid_rmse: 45778.74130960422
  


Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr,training_iteration,train_rmse,valid_rmse,train_mean_l1,valid_mean_l1,train_l1_iqr,valid_l1_iqr,train_med-ape,valid_med-ape,train_mape,valid_mape,train_mape_iqr,valid_mape_iqr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,RUNNING,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,PAUSED,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085,5.0,44601.4,45778.3,38378.5,39760.5,39254.0,39577.8,0.999958,0.999954,0.999851,0.999884,0.000103582,8.52046e-05
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,PENDING,,64,0.363724,"[2, 2, 4]",8,1,0.0988601,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,PENDING,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,PENDING,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712,,,,,,,,,,,,,


Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00002:
  date: 2022-03-25_16-00-16
  done: false
  experiment_id: 3d0773acb7a14be8936efcf6c5dc611b
  hostname: ip-10-123-137-245
  iterations_since_restore: 1
  node_ip: 10.123.137.245
  pid: 982
  should_checkpoint: true
  time_since_restore: 2.7220420837402344
  time_this_iter_s: 2.7220420837402344
  time_total_s: 2.7220420837402344
  timestamp: 1648224016
  timesteps_since_restore: 0
  train_l1_iqr: 37647.5
  train_mape: 1.0
  train_mape_iqr: 0.0
  train_mean_l1: 40618.221875
  train_med-ape: 1.0
  train_rmse: 46263.6169199472
  training_iteration: 1
  trial_id: a36cc_00002
  valid_l1_iqr: 41511.0
  valid_mape: 1.0
  valid_mape_iqr: 0.0
  valid_mean_l1: 39166.08125
  valid_med-ape: 1.0
  valid_rmse: 45520.772548159264
  


Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr,training_iteration,train_rmse,valid_rmse,train_mean_l1,valid_mean_l1,train_l1_iqr,valid_l1_iqr,train_med-ape,valid_med-ape,train_mape,valid_mape,train_mape_iqr,valid_mape_iqr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,RUNNING,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,PAUSED,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085,5.0,44601.4,45778.3,38378.5,39760.5,39254.0,39577.8,0.999958,0.999954,0.999851,0.999884,0.000103582,8.52046e-05
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,PAUSED,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188,1.0,46263.6,45520.8,40618.2,39166.1,37647.5,41511.0,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,PENDING,,64,0.363724,"[2, 2, 4]",8,1,0.0988601,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,PENDING,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712,,,,,,,,,,,,,


Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00003:
  date: 2022-03-25_16-00-20
  done: false
  experiment_id: eaee5fa59c044daf9abb1c0b9a9a1eb5
  hostname: ip-10-123-137-245
  iterations_since_restore: 1
  node_ip: 10.123.137.245
  pid: 983
  should_checkpoint: true
  time_since_restore: 2.529177665710449
  time_this_iter_s: 2.529177665710449
  time_total_s: 2.529177665710449
  timestamp: 1648224020
  timesteps_since_restore: 0
  train_l1_iqr: 41454.25
  train_mape: 1.0
  train_mape_iqr: 0.0
  train_mean_l1: 38703.4375
  train_med-ape: 1.0
  train_rmse: 44938.01115543778
  training_iteration: 1
  trial_id: a36cc_00003
  valid_l1_iqr: 37635.5
  valid_mape: 1.0
  valid_mape_iqr: 0.0
  valid_mean_l1: 41044.505
  valid_med-ape: 1.0
  valid_rmse: 46733.981796065695
  


Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr,training_iteration,train_rmse,valid_rmse,train_mean_l1,valid_mean_l1,train_l1_iqr,valid_l1_iqr,train_med-ape,valid_med-ape,train_mape,valid_mape,train_mape_iqr,valid_mape_iqr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,RUNNING,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712,,,,,,,,,,,,,
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,PAUSED,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085,5.0,44601.4,45778.3,38378.5,39760.5,39254.0,39577.8,0.999958,0.999954,0.999851,0.999884,0.000103582,8.52046e-05
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,PAUSED,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188,1.0,46263.6,45520.8,40618.2,39166.1,37647.5,41511.0,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,PAUSED,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841,1.0,44938.0,46734.0,38703.4,41044.5,41454.2,37635.5,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,PENDING,,64,0.363724,"[2, 2, 4]",8,1,0.0988601,,,,,,,,,,,,,


Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00002:
  date: 2022-03-25_16-00-16
  done: false
  experiment_id: 3d0773acb7a14be8936efcf6c5dc611b
  experiment_tag: 2_batch_size=64,dropout_p=0.58693,h_layers=[4, 2, 2, 2],h_total=10,k=2,lr=0.0056919
  hostname: ip-10-123-137-245
  iterations_since_restore: 1
  node_ip: 10.123.137.245
  pid: 982
  should_checkpoint: true
  time_since_restore: 2.7220420837402344
  time_this_iter_s: 2.7220420837402344
  time_total_s: 2.7220420837402344
  timestamp: 1648224016
  timesteps_since_restore: 0
  train_l1_iqr: 37647.5
  train_mape: 1.0
  train_mape_iqr: 0.0
  train_mean_l1: 40618.221875
  train_med-ape: 1.0
  train_rmse: 46263.6169199472
  training_iteration: 1
  trial_id: a36cc_00002
  valid_l1_iqr: 41511.0
  valid_mape: 1.0
  valid_mape_iqr: 0.0
  valid_mean_l1: 39166.08125
  valid_med-ape: 1.0
  valid_rmse: 45520.772548159264
  
Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00004:
  date: 2022-03-25_16-00-24
  done: false
  exp

Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr,training_iteration,train_rmse,valid_rmse,train_mean_l1,valid_mean_l1,train_l1_iqr,valid_l1_iqr,train_med-ape,valid_med-ape,train_mape,valid_mape,train_mape_iqr,valid_mape_iqr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,RUNNING,10.123.137.245:4120,64,0.363724,"[2, 2, 4]",8,1,0.0988601,4,37570.4,32454.0,31587.9,26320.0,34114.9,33418.5,0.994761,0.656561,1.22093,0.785931,0.394726,0.310706
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,PAUSED,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085,5,44601.4,45778.3,38378.5,39760.5,39254.0,39577.8,0.999958,0.999954,0.999851,0.999884,0.000103582,8.52046e-05
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,TERMINATED,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188,1,46263.6,45520.8,40618.2,39166.1,37647.5,41511.0,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,TERMINATED,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841,1,44938.0,46734.0,38703.4,41044.5,41454.2,37635.5,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,TERMINATED,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712,3,45399.3,44964.2,39656.4,38718.5,39077.7,39402.5,0.999998,0.999999,0.999996,0.999997,2.25195e-06,2.71487e-06


Result for train_dnn_raytune_cpu_gpu_distributed_a36cc_00000:
  date: 2022-03-25_16-00-12
  done: false
  experiment_id: 4706dda29d2349a793d93a2f472b9a8c
  experiment_tag: 0_batch_size=64,dropout_p=0.092176,h_layers=[3, 5, 2],h_total=10,k=0,lr=0.00016409
  hostname: ip-10-123-137-245
  iterations_since_restore: 5
  node_ip: 10.123.137.245
  pid: 979
  should_checkpoint: true
  time_since_restore: 2.9783012866973877
  time_this_iter_s: 0.08421802520751953
  time_total_s: 2.9783012866973877
  timestamp: 1648224012
  timesteps_since_restore: 0
  train_l1_iqr: 39253.98314173389
  train_mape: 0.9998508236609247
  train_mape_iqr: 0.00010358227859030844
  train_mean_l1: 38378.49073751628
  train_med-ape: 0.9999583010909148
  train_rmse: 44601.38062049611
  training_iteration: 5
  trial_id: a36cc_00000
  valid_l1_iqr: 39577.82135635793
  valid_mape: 0.999884156555035
  valid_mape_iqr: 8.520455768790125e-05
  valid_mean_l1: 39760.529412408294
  valid_med-ape: 0.999954384829557
  valid_rmse: 457

Trial name,status,loc,batch_size,dropout_p,h_layers,h_total,k,lr,training_iteration,train_rmse,valid_rmse,train_mean_l1,valid_mean_l1,train_l1_iqr,valid_l1_iqr,train_med-ape,valid_med-ape,train_mape,valid_mape,train_mape_iqr,valid_mape_iqr
train_dnn_raytune_cpu_gpu_distributed_a36cc_00000,TERMINATED,,64,0.0921763,"[3, 5, 2]",10,0,0.000164085,5,44601.4,45778.3,38378.5,39760.5,39254.0,39577.8,0.999958,0.999954,0.999851,0.999884,0.000103582,8.52046e-05
train_dnn_raytune_cpu_gpu_distributed_a36cc_00001,TERMINATED,,64,0.363724,"[2, 2, 4]",8,1,0.0988601,5,37536.0,32300.8,30784.2,26230.6,34086.4,33372.3,0.985753,0.648748,1.09371,0.786898,0.470246,0.297696
train_dnn_raytune_cpu_gpu_distributed_a36cc_00002,TERMINATED,,64,0.586927,"[4, 2, 2, 2]",10,2,0.00569188,1,46263.6,45520.8,40618.2,39166.1,37647.5,41511.0,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00003,TERMINATED,,64,0.00683811,"[2, 4, 2]",8,3,0.0185841,1,44938.0,46734.0,38703.4,41044.5,41454.2,37635.5,1.0,1.0,1.0,1.0,0.0,0.0
train_dnn_raytune_cpu_gpu_distributed_a36cc_00004,TERMINATED,,64,0.25734,"[2, 2, 2, 2]",8,4,0.000352712,3,45399.3,44964.2,39656.4,38718.5,39077.7,39402.5,0.999998,0.999999,0.999996,0.999997,2.25195e-06,2.71487e-06


2022-03-25 16:00:29,077	INFO tune.py:561 -- Total run time: 20.60 seconds (20.45 seconds for the tuning loop).


Time elapsed: 23.02700185775757s


In [74]:
# Select the trial with the lowest "valid_med-ape" of its last reported result.
# NOTE(review): the scheduler of this search optimizes "valid_mape", whereas the best
# trial is chosen by "valid_med-ape" — confirm the two metrics are meant to differ.
best_trial = result.get_best_trial("valid_med-ape", "min", "last")

all_trials = result.results_df
# Strip the literal "config." prefix so that hyperparameter columns can be addressed by name
all_trials.columns = all_trials.columns.str.replace('config.', '', regex=False).tolist()
# Last expression of the cell: displayed as the summary table of all trials
all_trials[['k','h_total','h_layers','dropout_p','lr','valid_med-ape']]

Unnamed: 0_level_0,k,h_total,h_layers,dropout_p,lr,valid_med-ape
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a36cc_00000,0,10,"[3, 5, 2]",0.092176,0.000164,0.999954
a36cc_00001,1,8,"[2, 2, 4]",0.363724,0.09886,0.648748
a36cc_00002,2,10,"[4, 2, 2, 2]",0.586927,0.005692,1.0
a36cc_00003,3,8,"[2, 4, 2]",0.006838,0.018584,1.0
a36cc_00004,4,8,"[2, 2, 2, 2]",0.25734,0.000353,0.999999


In [75]:
# Report the configuration and the final "valid_med-ape" of the selected trial
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['valid_med-ape']}")

Best trial config: {'lr': 0.0988600955127942, 'dropout_p': 0.3637244172903719, 'k': 1, 'batch_size': 64, 'h_total': 8, 'h_layers': [2, 2, 4]}
Best trial final validation loss: 0.6487475182900605


---

### Modeling (No Ray Tune)

In [76]:
# Manual training run (no Ray Tune) on the model/test split, timed end to end.
t0 = time.time()

# Fixed architecture: three hidden layers with 4, 3 and 2 neurons
layers = [4,3,2]
# NOTE(review): the model is moved to `device`, but train_dnn feeds the batches to the model
# as they come from the DataLoader — confirm the datasets live on the same device.
model = dnn(N_FEATURE,layers,dropout_p=0.5).to(device)
model.apply(initialize_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.1)
criterion = nn.MSELoss()
# Reporting-only metrics (the training loss is `criterion`); each one reduces to a single value
metric_dict = {'rmse': lambda y_est,y: torch.sqrt(nn.MSELoss(reduction="mean")(y_est,y)), 
               'mape': lambda y_est,y: torch.mean((y-y_est).abs()/y.abs()),
               'l1_iqr': lambda y_est,y: compute_iqr(nn.L1Loss(reduction="none")(y_est,y))}


# train_dnn prints one metric line per epoch and returns the train/valid history
training_results = train_dnn(model=model, 
                             train_dataset=dataset_model, 
                             valid_dataset=dataset_test, 
                             criterion=criterion,
                             optimizer=optimizer,
                             metric_dict=metric_dict,
                             epochs=num_epochs, 
                             batch_size=64)

t1 = time.time()
print(f"Time elapsed: {t1-t0}s")

[Epoch: 1] train_rmse: 44656.154, valid_rmse: 40081.069, train_mape: 1.013, valid_mape: 0.810, train_l1_iqr: 38822.949, valid_l1_iqr: 37998.633, 
[Epoch: 2] train_rmse: 42523.016, valid_rmse: 39647.244, train_mape: 1.131, valid_mape: 0.793, train_l1_iqr: 38067.950, valid_l1_iqr: 38640.221, 
[Epoch: 3] train_rmse: 41925.223, valid_rmse: 38268.487, train_mape: 1.123, valid_mape: 0.785, train_l1_iqr: 36779.380, valid_l1_iqr: 38560.708, 
[Epoch: 4] train_rmse: 41800.049, valid_rmse: 39570.515, train_mape: 1.104, valid_mape: 0.787, train_l1_iqr: 37233.037, valid_l1_iqr: 38687.136, 
[Epoch: 5] train_rmse: 42089.798, valid_rmse: 39741.708, train_mape: 1.107, valid_mape: 0.789, train_l1_iqr: 37493.652, valid_l1_iqr: 38367.757, 
Time elapsed: 0.46985721588134766s


---

# skorch

(see: https://skorch.readthedocs.io/en/stable/user/neuralnet.html)

In [77]:
from sklearn.datasets import make_regression
from skorch import NeuralNetRegressor, NeuralNet

In [78]:
# Settings of the skorch-wrapped network
# Hidden layer widths, dropout probability and activation handed to the dnn module
dnn_architecture = [2, 3, 3]
dnn_dropout_p = 0.2
dnn_act_fn = torch.relu

# Optimization settings; the criterion is given as the class itself, not as an instance
dnn_lr = 1e-4
dnn_criterion = nn.MSELoss
dnn_batch_size = 128

In [79]:
# Wrap the dnn module in a skorch net. The `module__*` arguments carry the constructor
# arguments of dnn (in_feat, layers, dropout_p, act_fn).
# NOTE(review): NeuralNetRegressor is imported as well but the generic NeuralNet is used —
# confirm this is intended.
dnn_net = NeuralNet(
    module = dnn,
    module__in_feat = N_FEATURE,
    module__layers = dnn_architecture,
    module__dropout_p = dnn_dropout_p,
    module__act_fn = dnn_act_fn,
    criterion = dnn_criterion,
    lr = dnn_lr,
    batch_size = dnn_batch_size,
)

In [80]:
# Fit the skorch net on the modelling split, converted to numpy arrays.
# pd.concat(...) implies x_model / y_model are collections of pandas objects — TODO confirm.
dnn_net.fit(pd.concat(x_model).to_numpy(), pd.concat(y_model).to_numpy())

  epoch           train_loss       valid_loss     dur
-------  -------------------  ---------------  ------
      1  [36m33241405015355.0195[0m  [32m2022710428.4325[0m  0.0783
      2  [36m2107227932.1366[0m  2022710428.4325  0.0717
      3  2107227932.1366  2022710428.4325  0.0713
      4  2107227932.1366  2022710428.4325  0.0705
      5  2107227932.1366  2022710428.4325  0.0724
      6  2107227932.1366  2022710428.4325  0.0711
      7  2107227932.1366  2022710428.4325  0.0699
      8  2107227932.1366  2022710428.4325  0.0815
      9  2107227932.1366  2022710428.4325  0.0698
     10  2107227932.1366  2022710428.4325  0.0732


<class 'skorch.net.NeuralNet'>[initialized](
  module_=dnn(
    (hidden): ModuleList(
      (0): Linear(in_features=4, out_features=2, bias=True)
      (1): Linear(in_features=2, out_features=3, bias=True)
      (2): Linear(in_features=3, out_features=3, bias=True)
    )
    (out): Linear(in_features=3, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [81]:
# Continue training the already fitted net: the epoch counter in the log resumes at 11.
# NOTE(review): 100 epochs are requested but the captured log stops at epoch 30 —
# presumably truncated or interrupted; confirm.
dnn_net.fit_loop(pd.concat(x_model).to_numpy(), pd.concat(y_model).to_numpy(), epochs=100)

     11  2107227932.1366  2022710428.4325  0.0784
     12  2107227932.1366  2022710428.4325  0.1142
     13  2107227932.1366  2022710428.4325  0.1137
     14  2107227932.1366  2022710428.4325  0.1146
     15  2107227932.1366  2022710428.4325  0.1230
     16  2107227932.1366  2022710428.4325  0.1223
     17  2107227932.1366  2022710428.4325  0.1218
     18  2107227932.1366  2022710428.4325  0.1210
     19  2107227932.1366  2022710428.4325  0.1224
     20  2107227932.1366  2022710428.4325  0.1248
     21  2107227932.1366  2022710428.4325  0.0931
     22  2107227932.1366  2022710428.4325  0.0742
     23  2107227932.1366  2022710428.4325  0.0889
     24  2107227932.1366  2022710428.4325  0.1168
     25  2107227932.1366  2022710428.4325  0.0766
     26  2107227932.1366  2022710428.4325  0.0748
     27  2107227932.1366  2022710428.4325  0.1148
     28  2107227932.1366  2022710428.4325  0.0749
     29  2107227932.1366  2022710428.4325  0.0772
     30  2107227932.1366  2022710428.4325  0.0800


<class 'skorch.net.NeuralNet'>[initialized](
  module_=dnn(
    (hidden): ModuleList(
      (0): Linear(in_features=4, out_features=2, bias=True)
      (1): Linear(in_features=2, out_features=3, bias=True)
      (2): Linear(in_features=3, out_features=3, bias=True)
    )
    (out): Linear(in_features=3, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)