Ordinal Hyperplane Loss Classifier - OHPL-All
- Re-implementation in Pytorch based Tensorflow version on https://github.com/ohpl/ohpl
- Algorithm: Bob Vanderheyden, Ying Xie, Mohan Rachumallu 2019 IEEE International Conference on Big Data

Copyright (C) Bayer Pharmaceutical - All Rights Reserved

Unauthorized copying of this file, via any medium is strictly prohibited
Proprietary and confidential
Written by Calvin W.Y. Chan calvin.chan@bayer.com, August 2021

# Initialization

In [46]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
import torch.nn.functional as F
import ray
from ray import tune
from ray.tune.schedulers import HyperBandScheduler
from ray.tune.suggest.basic_variant import BasicVariantGenerator
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.bohb import TuneBOHB


import os
import sys
import itertools
import warnings
import filelock

import string
import time
import random

import pdb

# Environmental Setting

In [47]:
sys.version_info

sys.version_info(major=3, minor=9, micro=5, releaselevel='final', serial=0)

In [48]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Environmental Variables

In [49]:
# Data Handling Parameters
test_split_ratio = 0.2
k = 5
random_state = 10

In [50]:
# Training Parameters
num_epochs = 100

# Learning Algorithm Parameters
lr_min = 1e-4
lr_max = 1e-1
# batch_size = [1,16,32,64]
batch_size = [5]

# Architecture Sampling Parameters
h_total_min = 4
h_total_max = 10
h_total_step = 2

h_fc_min_neuron_per_layer = 2
h_fc_max_neuron_per_layer = 5
h_fc_max_layer = None

dropout_p_min = 0
dropout_p_max = 0.7

# OHPL Parameters
margin_min = 0.5
margin_max = 1.0
alpha_min  = 0.5
alpha_max  = 1.0

# Ray Tune Hyperparameter Search
num_hp_search_samples = 1
chkpt_dir = "/home/calvin_chan/data/output/testing"

In [51]:
# Setup output directories
if not os.path.exists(chkpt_dir):
    os.makedirs(chkpt_dir)

# Data I/O

In [52]:
filepath = '/home/calvin_chan/package/ohpl/design/ohpl-master/world_happiness_2015_2019.csv'

In [53]:
data = pd.read_csv(filepath)
data.Score = data.Score.astype('int32')
data.drop(['Year'], axis=1, inplace=True)
data = data.dropna()
data.columns = data.columns.str.replace(' ','_').str.lower()

In [54]:
x_col = ['gdp_per_capita', 
         'social_support', 
         'healthy_life_expectancy',
         'freedom_to_make_life_choices', 
         'generosity',
         'perceptions_of_corruption']
y_col = ['score']

In [55]:
x = data[x_col]
y = data[y_col]

In [56]:
min_label = min(y.values.tolist())[0]
max_label = max(y.values.tolist())[0]

In [57]:
N_FEATURE = len(x_col)

---

# Neural Network Module

In [58]:
class MultiLayerFC(nn.Module):

    # Constructor
    def __init__(self, in_feat, layers, dropout_p=None, act_fn=torch.relu, dtype=torch.double):
        super(MultiLayerFC, self).__init__()
        layers = [in_feat] + layers   # Add input layer
        self.hidden = nn.ModuleList()
        self.out = nn.Linear(layers[-1], 1).type(dtype)
        self.act_fn = act_fn
        self.dropout = nn.Dropout(p=dropout_p)
        # --- Scalable Layers ---
        for input_size, output_size in zip(layers, layers[1:]):
            self.hidden.append(nn.Linear(input_size,output_size).type(dtype))
            
    # Prediction
    def forward(self, x):
        L = len(self.hidden)
        for (l, single_layer) in zip(range(L), self.hidden):
            x = single_layer(x)
            x = self.dropout(self.act_fn(x))
        x = self.out(x)
        return x

In [59]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight.data)
        nn.init.constant_(m.bias.data, 0)

---

# Data Handling and Hyperparameter Tunning

Modeling-Testing 80/20 Split

In [84]:
def convert_multidimensional_labels(df,col):
    '''
    Convert Multiple Column Label into Single Column
    
    Args:
        df: A pandas dataframe with row as samples, and column as N-dimensional segment to be encoded.
        col: Column name of the combined column
        
    Returns:
        df: A pandas dataframe with new label column

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    if df.shape[1] == 1:
        df = pd.concat([df,df],axis=1)
        df.columns = [df.columns[0],col]
    else:
        df[col] = tuple(labels.values.tolist())
        df[col] = labels[col].apply(lambda x: ','.join([str(c) for c in x ]))
    return(df)

def combine_multidimensional_ohe(s):
    '''
    One-Hot-Encoding (OHE) based on joint label of multiple columns
    The default OHE feature of Pandas and sklearn takes each column as independent OHE. 
    This function uses the 2D unique label combination as a single dimension for OHE.
    
    Args:
        s: A pandas dataframe with row as samples, and column as N-dimensional segment to be encoded.

    Returns:
        s_ohe: A pandas dataframe with N-D OHE
        conversion_table: The conversion table for N-D OHE

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    unique_labels = [ sorted(s[name].unique().tolist()) for name in s.columns.tolist() ]
    multidimensional_labels = [*itertools.product(*unique_labels)]
    labels = pd.DataFrame(multidimensional_labels, columns=s.columns.tolist())
    labels = convert_multidimensional_labels(labels,'seg')
    conversion_table = pd.get_dummies(labels, columns=['seg'])
    s_ohe = pd.merge(s,conversion_table,on=s.columns.tolist(),how='left').drop(s.columns.tolist(),axis=1)
    return(s_ohe,conversion_table)


def unique_list(ls_of_ls):
    '''
    Return the unique list in a list of list
    
    Args:
        ls_of_ls: List of list (eg. [[1,2,3],[1,3,2],[1,2,3]])

    Returns:
        unique_ls: Unique list within the input list (eg. [[1,3,2],[1,2,3]])

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    unique_ls = [list(ls_out) for ls_out in set(tuple(ls) for ls in ls_of_ls)]
    return unique_ls


def model_test_split(*args, id_col=None, test_ratio=0.2, random_state=25, report_id=False):
    
    '''
    Split the dataset into modeling and test set
    
    This function is to encapsulate the variying input feature size given the grouping by id_col,
    and this decompose the one-hot-encoding column into a separate feature set to be used in the
    deep learning model as separate input.
    
    Args:
        *args:
            x: A pandas dataframe with row as samples, and column as ID and feature type
            y: A pandas dataframe with row as samples, and column as output
        ohe_col: A list of column names indicating the one-hot-encoding columns in x
        id_col: Column name of the grouping column to be converted to one-hot-encoding
        test_size: The split ratio of the test set
        random_state: Random seed use by the `sklearn.model_selection.train_test_split` function
        retain_df: If this is 'True' and the input 'args' are dataframes, do not convert them to list of single row dataframe

    Returns:
        x_model, x_test: List of numpy matrix as model/test data split with from commond id_col labels of x and y
        s_model, s_test: List of numpy matrix as model/test data split with from commond id_col labels of x and y
        y_model, y_test: List of numpy matrix as model/test data split with from commond id_col labels of x and y

    Raises:
        Warning when the labels in id_col of x and y do not match
        
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''

    if id_col is not None:

        inds = []
        data = []
        for arg in args:
            (ind,dat) = zip(*list(arg.groupby(id_col)))
            inds.append(ind)
            data.append(dat)

        # Determine of ID entry is missing from any of the input dataset
        id_match_flag = (len(set.intersection(*[set(ind) for ind in inds])) == len(set.union(*[set(ind) for ind in inds])))
        if not id_match_flag:
            warnings.warn("Unmatch ID entries in one or more data inputs (eg. x, y)!")

        # Extract Common ID from x, s, y Samples
        select_ids = list(set.intersection(*[set(ind) for ind in inds]))

        # Split dataframes into sample list
        # (multi-resolution: each list element contains multiple x and single y based on id_col)
        dataset = []
        for i, dat in enumerate(data):
            dataset.append([ dat[inds[i].index(single_id)].drop(id_col,axis=1) for single_id in select_ids ])

    else:
        # Determine index labels in each input dataset is the same
        dataset_indices = [ list(dataset.index) for dataset in args ]
        select_ids = unique_list(dataset_indices)
        id_match_flag = (len(select_ids) == 1)
        assert id_match_flag, "Unmatch length in one or more data inputs (eg. x, y)!"
        select_ids = select_ids[0]
        
        # Split dataframes into sample list
        # (equal resolution: each list element contains one row in both x and y)
        dataset = []
        for i, dat in enumerate(args):
            dataset.append([ dat.loc[[single_id]] for single_id in select_ids ])

    # Including index as one of the splitting dataset
    assert all(test_ratio*len(data) >= 1 for data in dataset), "Number of samples resulting from ratio must be larger than 1 sample!"
    dataset = dataset + [select_ids]
    out = train_test_split(*dataset, test_size=test_ratio, random_state=random_state)
    split_ids = out[-2:]
    out = out[0:-2]

    if report_id:
        return(out, split_ids)
    else:
        return(out)

Data Loader

In [62]:
class NumericData(Dataset):
    def __init__(self, x, y, transform=None, dtype=torch.double, sample_ids=None):
        assert (len(y) == len(x)), "Number of x and y samples do not match!"
        self.len = len(y)
        self.transform = transform
        self.sample_ids = sample_ids

        self.x, self.x_col, self.x_min, self.x_max = self._format_dataset(x, dtype)
        self.y, self.y_col, self.y_min, self.y_max = self._format_dataset(y, dtype)

    def __getitem__(self, index):
        
        sample = [self.x[index],
                  self.y[index]]
        if self.transform:
            sample = self.transform(sample)
        return sample
    
    def __len__(self):
        return self.len
    
    def _format_dataset(self, d, dtype):
        if type(d) == pd.core.frame.DataFrame:
            # check to make sure that the sample_ids are the same as dataframe row index if sample_ids exist
            if self.sample_ids is not None:
                assert (len(unique_list([list(d.index),self.sample_ids])) == 1), "Input data rowname/index not equal to sample_ids!"
            else:
                self.sample_ids = list(d.index)
            
            # extract column names
            colname = d.columns
            
            # get y-min/max (required for OHPL)
            d_min = d.min().tolist()
            d_min = d.max().tolist()          
        else:
            # extract column names
            colname = d[0].columns

            # get y-min/max (required for OHPL)
            d_max = pd.concat(d).max().tolist()
            d_min = pd.concat(d).min().tolist()
            
        # convert dataframe to list of a single row tensor
        out = self._sample_type_convert(d, dtype)

        return out, colname, d_min, d_max
        
    def _sample_type_convert(self, samples, dtype):
        # since the input samples are list of single-row-dataframe, with dimension of 1 x Features
        # to convert them into tensors, the row dimension is removed.
        samples_out = [ torch.tensor(sample_ele.iloc[0]).type(dtype) for sample_ele in samples ]
        return samples_out

K-Fold Data Preparation

In [63]:
def get_k_fold_indices(n_samples, k=5, shuffle=False):
    '''
    Drawing sample indices for K-Fold
    
    Args:
        samples: Number of samples in the dataset
        shuffle: Shuffling of samples

    Returns:
        kfold_train_ind: Indices for training set
        kfold_valid_ind: Indices for validation set

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    kfold = KFold(n_splits=k, shuffle=shuffle).split([*range(n_samples)])
    i, kfold_ind = zip(*[*enumerate(kfold)])   # Expand the index obtained by the K-Fold function
    kfold_train_ind, kfold_valid_ind = zip(*kfold_ind)
    return(kfold_train_ind, kfold_valid_ind)

In [64]:
def select_ind(ls,ind):
    return [ ls[i] for i in ind.tolist() ]

Process and Split the Data

In [67]:
def patitioned_data_object_numeric(x, y, test_split_ratio, k, random_state=25):
    # Model/Test Splitting
    (x_model, x_test, 
     y_model, y_test), (samples_id_model, samples_id_test) = model_test_split(x, y, 
                                                             test_ratio=test_split_ratio, 
                                                             report_id=True,
                                                             random_state=random_state)
    
    # K-Fold Index Sampling
    [kfold_train_ind, kfold_valid_ind] = get_k_fold_indices(n_samples=len(y_model), k=k, shuffle=False)   # Shuffle is NOT needed, since the samples were shuffled in the model/test split

    # Create K-set of datasets for Pytorch data loader
    dataset_train_kfold = [ NumericData(select_ind(x_model,fold_ind), 
                                        select_ind(y_model,fold_ind),
                                        sample_ids = select_ind(samples_id_model, fold_ind)) 
                                           for fold_ind in kfold_train_ind ]
    dataset_valid_kfold = [ NumericData(select_ind(x_model,fold_ind), 
                                        select_ind(y_model,fold_ind),
                                        sample_ids = select_ind(samples_id_model, fold_ind)) 
                                           for fold_ind in kfold_valid_ind ]

    # Create dataset for modeling and testing
    dataset_model = NumericData(x_model, y_model, sample_ids = samples_id_model)
    dataset_test = NumericData(x_test, y_test, sample_ids = samples_id_test)
    
    return dataset_model, dataset_test, dataset_train_kfold, dataset_valid_kfold

### Hyperparameter Sampling

In [24]:
def integer_partitions(n_ele, n_min=1, max_dim=None, recursion_level=1):
    '''
    Fast Integer Partitioning
    Dividing a single integer into a list of integer that sums up to the given number
    
    Args:
        num_ele: Total number of elements to be distributed
        n_min: Minimum number of elements per output dimension

    Returns:
        Iterator as list of elements splitted into multiple dimensions
        
    Original Source :
    (Modification made to speed up by skpping recurrsion exceed max_dim)
        https://stackoverflow.com/questions/10035752/elegant-python-code-for-integer-partitioning
    
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    if (max_dim is not None) and (recursion_level > max_dim):
        yield None
    else:
        yield (n_ele,)
        for i in range(n_min, n_ele//2 + 1):
            for p in integer_partitions(n_ele-i, i, max_dim, recursion_level+1):
                if p is not None:
                    yield (i,) + p
                elif recursion_level != 1:
                    yield None

In [25]:
def split_sampling(num_ele, n_min=1, n_max=None, out_dim=None, n_samples=1, prepend=[], postpend=[], single_sample=False):
    '''
    Randomly split the elements into multiple dimensions
    This is use for neuron sampling the number of elements and layer for multibranch neural network
    
    Args:
        num_ele: Total number of elements to be distributed
        n_min: Minimum number of elements per output dimension
        n_max: Maximum number of elements per output dimension
        out_dim: Number of output dimensions to distribute the element, random dimensions will be given with None given

    Returns:
        sample: List of elements splitted into multiple dimensions
        
    Raises:
        -
        
    Example:
        >>> split_sampling(14, n_min=2, out_dim=4)
        [2, 5, 4, 3]
        
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    # !!! DEBUG !!!
    # print(f"num_ele: {num_ele}; n_min: {n_min}; out_dim: {out_dim}")
    
    # Generate the Integer Partitions
    splits = integer_partitions(num_ele, n_min=n_min, max_dim=out_dim)
    if n_max is not None:
        splits = [ split for split in splits if max(split) <= n_max ]
    if out_dim is not None:
        splits = [ split for split in splits if len(list(split)) == out_dim ]
    else:
        splits = [ split for split in splits ]
    
    # Filter with Number of Output Dimension
    splits_perm = [list(set(itertools.permutations(split))) for split in splits ]
    unique_splits_perm = list(itertools.chain.from_iterable(splits_perm))
        
    # Randomly Sample one of the permutation
    if n_samples <= len(unique_splits_perm):
        sample = list([ prepend+list(sample)+postpend for sample in random.sample(unique_splits_perm, k=n_samples)])
    else:
        sample = list([ prepend+list(sample)+postpend for sample in random.choices(unique_splits_perm, k=n_samples)])
    if single_sample:
        sample = sample[0]
    
    return(sample)

## OHPL-All Loss Function

In [26]:
def ohpl(y_true, pred, min_label, max_label, margin=1, ordering_loss_weight=1, loss_bound=1e9, ohpl_norm_order=2, dtype=torch.double):
    '''
    OHPL Hyperplane Loss Function
    (Modified from: https://github.com/ohpl/ohpl/blob/master/OHPLall.ipynb)
    
    Args:
        y_true: Ground Truth of output
        y_pred: Network output (w^T Phi(x)) - NOT CATEGORICAL OUTPUT!!!
        minlabel: Minimum ordinal categorical label of y
        maxlabel: Maximum ordinal categorical label of y
        margin:
        ordering_loss_weight: 
        
    Returns:
        mean_loss: Loss measure

    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com

    '''
    
    # === HCL: Hyperplane Centroid Loss ===
    # (To ensure hyperplane are ordered by rank)

    pred = pred.type(dtype)
    y_true = y_true.type(dtype)

    ords, idx = torch.unique(y_true, return_inverse=True)
    num_label = ords.shape[0]
    y_true_ohe = F.one_hot(idx,num_classes=num_label)

    # hyperplane intercept term
    yO = pred.type(dtype) @ y_true_ohe.type(dtype)
    yc = torch.sum(y_true_ohe, dim=0)
    class_mean = torch.div(yO,yc).type(dtype)

    # relative rank distance between centroids
    min_distance = torch.reshape(ords,(-1,1)) - torch.reshape(ords,(1,-1))
    min_distance = torch.relu(min_distance)

    # keeps min. distance (???)
    keep = torch.minimum(min_distance,torch.ones(min_distance.shape))
    
    # positive mean sample distance between centroids
    centroid_distance = torch.reshape(class_mean,(-1,1)) - torch.reshape(class_mean,(1,-1))
    centroid_distance = torch.relu(centroid_distance)   # zero loss for correct ordering
    centroid_distance = torch.multiply(keep, centroid_distance)

    hp_ordering_loss = torch.sum(torch.relu(min_distance - centroid_distance))

    # === HPL/HPPL: Hyperplane Point Loss ===
    # (To ensure transformation place the point near the correct centroid)
    mean_centroid_of_sample = y_true_ohe.type(dtype) @ torch.reshape(class_mean,(-1,1))

    # --- Limit Edge Case Loss ---
    # No reason to limit distance from edge cases:
    # 1. Positive edge case (max_label) for upper loss
    # 2. Negative edge case (min_label) for lower loss
    upper_bound = (y_true - max_label + 1) * loss_bound   # Select edge case and give a large loss_bound (we want to pull it back in case if it gets too big)
    upper_bound = torch.relu(upper_bound) + margin        # Add margin to non-edge cases
    lower_bound = (-(y_true - min_label) + 1) * loss_bound
    lower_bound = torch.relu(lower_bound) + margin   

    # -- Compute Loss ---
    upper_loss = pred[:,None] - mean_centroid_of_sample
    upper_loss_bounded = torch.relu(upper_loss - upper_bound[:,None])
    lower_loss = -(pred[:,None] - mean_centroid_of_sample)
    lower_loss_bounded = torch.relu(lower_loss - lower_bound[:,None])

    hp_point_loss = torch.mean(upper_loss_bounded + lower_loss_bounded)

        
    # === OHPL ===
    loss = torch.norm(torch.cat([hp_point_loss[None], (ordering_loss_weight * hp_ordering_loss)[None]]), p=ohpl_norm_order)
    
    return loss

# Training Procedure

Training Procedure

In [27]:
def ohpl_y_class_mean(y):
    '''
    Sample class mean calculation for computing centroid
    The training sample class mean matrix was previously part of the dataloader object.  However, to allow maximum flexibility of 
    random sampling, it was separated out.
    
    Args:
        y: Class label of training samples
    
    Return:
        class_mean: The mean value of each class label for each sample according to their class
    '''
    ohe_encoder = OneHotEncoder(sparse=False, categories='auto')
    y_ohe = ohe_encoder.fit_transform(y)
    y_ohe_inverse = 1/np.sum((y_ohe), axis=0)
    class_mean = (y_ohe * y_ohe_inverse).T
    return class_mean

In [28]:
def ohpl_predict(pred, centroid, min_label, delta=1e-9):
    '''
    OHPL Class Label prediction using training centroid
    The OHPL train the transformation function, but the class label prediction requires using the function as well as the
    centroid computed during training.  The output model only provide the sample projected dimension and distance between
    the model transformed output and the centroid is required to determine class.
    
    Args:
        pred: Model tranformed output at the ordinal hyperplane space
        centroid: Class associated centroid in the ordinal hyperplane space
        min_label: Lowest rank class label
    
    Return:
        y_pred: Predicted class label
        
    Raises:
        -

    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com

    '''
    y_dist = torch.abs(pred - centroid)
    y_prob = (1/(y_dist+delta))/torch.sum(1/(y_dist+delta),axis=1)[:,None]   # convert distance to probability for cross-entropy computation
    y_pred = torch.argmin(y_dist, axis=1) + min_label
    return y_pred, y_prob

In [86]:
def train_ohpl_raytune(config, 
                          num_in_feat,
                          train_dataset, 
                          valid_dataset,
                          criterion=ohpl,
                          checkpoint_dir=None, 
                          num_epochs=100, 
                          metric_dict = {'mae': lambda y_est,y: torch.mean(abs(y_est-y)), 
                                         'mze': lambda y_est,y: torch.mean((torch.abs(y_est-y) > 0).type(torch.double)),
                                         'f1-micro':  lambda y_est,y: f1_score(y,y_est,average='micro'),
                                         'f1-macro':  lambda y_est,y: f1_score(y,y_est,average='macro'),},
                          ohpl_norm_order=1,
                          dtype=torch.double,
                          force_cpu=False
                          ):
    '''
    '''
    
    # gpu usage
    if not force_cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    #====================== Ray Tune Parameters Setup ======================#

    if 'dropout_p' in config.keys():
        _dropout_p = config['dropout_p']
    else:
        _dropout_p = 0

    if 'batch_size' in config.keys():
        _batch_size = config['batch_size']
    else:
        _batch_size = 1
        
    if 'margin' in config.keys():
        _margin = config['margin']
    else:
        _margin = 1
        
    if 'ordering_loss_weight' in config.keys():
        _ordering_loss_weight = config['ordering_loss_weight']
    else:
        _ordering_loss_weight = 1
        
    if 'loss_bound' in config.keys():
        _loss_bound = config['loss_bound']
    else:
        _loss_bound = 1e9
        
    if (type(train_dataset) is list) and (type(valid_dataset) is list) and ('k' in config.keys()):
        _train_dataset = train_dataset[config['k']]
        _valid_dataset = valid_dataset[config['k']]
    else:
        _train_dataset = train_dataset
        _valid_dataset = valid_dataset
    
    train_loader = torch.utils.data.DataLoader(dataset=_train_dataset, batch_size=_batch_size, shuffle=True, 
                                               collate_fn=lambda x: [ x_ele.to(device) for x_ele in default_collate(x) ] )
    valid_loader = torch.utils.data.DataLoader(dataset=_valid_dataset, batch_size=_batch_size, shuffle=True,
                                               collate_fn=lambda x: [ x_ele.to(device) for x_ele in default_collate(x) ] )

    y_col_index = 0
    min_label = _train_dataset.y_min[y_col_index]
    max_label = _train_dataset.y_max[y_col_index]
    
    # initialize ANN architecture
    model = MultiLayerFC(in_feat = num_in_feat,
                         layers = config['h_fc'],
                         dropout_p = _dropout_p,
                         act_fn = torch.relu).to(device)
    model.apply(initialize_weights)
    
    # optimizer is controlled by ray tune hyperparameter
    optimizer = torch.optim.Adam(model.parameters(), lr = config["lr"])
    
    for epoch in range(num_epochs):

        #====================== Training ======================#

        running_loss = 0.0
        epoch_steps = 0

        # training using all training samples
        for i, (x, y) in enumerate(train_loader):

            # zero the parameter gradients
            optimizer.zero_grad()

            # set the model to training mode
            model.train()

            # forward + backward + optimize
            pred = model(x)

            ohpl_loss = criterion(y.squeeze(dim=1), 
                                  pred, 
                                  min_label, 
                                  max_label, 
                                  _margin, 
                                  _ordering_loss_weight, 
                                  _loss_bound,
                                  ohpl_norm_order,
                                  dtype)
            ohpl_loss.backward()
            optimizer.step()

        # set the model to evaluation mode
        model.eval()

        #====================== Compute Metrics ======================#
        # model can only be evaluated after finishing the complete dataset for OHPL

        train_pred = torch.tensor([])
        valid_pred = torch.tensor([])
        history = {'train': {}, 'valid': {}}
        history['train']['y'] = torch.tensor([])
        history['valid']['y'] = torch.tensor([])
        metric_output = {}
        
        with torch.no_grad():
            for  i, (x, y) in enumerate(train_loader):
                pred = model(x)
                train_pred = torch.cat([train_pred,pred])
                history['train']['y'] = torch.cat( [history['train']['y'], y.squeeze()], dim=0 )

            y_class_mean = ohpl_y_class_mean(history['train']['y'].reshape(-1,1))
            centroid = torch.reshape( torch.tensor(y_class_mean @ train_pred.numpy()), [1,-1] )
            history['train']['y_est'], train_prob = ohpl_predict(train_pred, centroid, min_label)

            for  i, (x, y) in enumerate(valid_loader):
                pred = model(x)
                valid_pred = torch.cat([valid_pred,pred])
                history['valid']['y'] = torch.cat( (history['valid']['y'], y.squeeze()), dim=0 )
            history['valid']['y_est'], valid_prob = ohpl_predict(valid_pred, centroid, min_label)

        for metric in metric_dict.keys():
            for dataset in history.keys():
                metric_label = '_'.join([dataset,metric])
                metric_output[metric_label] = metric_dict[metric](history[dataset]['y_est'],history[dataset]['y']).item()

        # ohpl metric append to output
        metric_output['ohpl'] = ohpl_loss.item()
                
        # cross-entropy-loss metric requires probability matrix
        # (not using y_pred & y_true for computation, therefore need to separate out)
        cross_entropy_loss = nn.CrossEntropyLoss()
        cel_train = cross_entropy_loss(train_prob, history['train']['y'].type(torch.int64) - min_label)
        cel_valid = cross_entropy_loss(valid_prob, history['valid']['y'].type(torch.int64) - min_label)
        metric_output['train_cross-entropy'] = cel_train.item()
        metric_output['valid_cross-entropy'] = cel_valid.item()
        
        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and will potentially be passed as the `checkpoint_dir`
        # parameter in future iterations.
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save( (model.state_dict(), optimizer.state_dict(), centroid), path )

        tune.report(**metric_output)

In [87]:
# Training procedure
def train_ohpl(model, train_dataset, valid_dataset, min_label, max_label, criterion, optimizer, 
                  num_epochs=100, 
                  batch_size=2, 
                  metric_dict = {'mae': lambda y_est,y: torch.mean(abs(y_est-y)), 
                                 'mze': lambda y_est,y: torch.mean((torch.abs(y_est-y) > 0).type(torch.double)),
                                 'f1-micro':  lambda y_est,y: f1_score(y,y_est,average='micro'),
                                 'f1-macro':  lambda y_est,y: f1_score(y,y_est,average='macro'),},
                  margin=1,
                  ordering_loss_weight=1, 
                  loss_bound=1e9,
                  ohpl_norm_order=1,
                  show_progress=True,
                  dtype=torch.double,
                  ):
    '''
    Training procedure for OHPL classifier
    
    Author:
        Dr. Calvin Chan
        calvin.chan@bayer.com
    '''
    history = {'train': {}, 'valid': {}}
    metric_output = {}

    # gpu usage
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # parallel gpu usage
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)   # for multiple GPUs
    model.to(device)

    # initialize dataloader
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
                                               collate_fn=lambda x: [ x_ele.to(device) for x_ele in default_collate(x) ])
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True,
                                               collate_fn=lambda x: [ x_ele.to(device) for x_ele in default_collate(x) ])
    
    for epoch in range(num_epochs):

        #====================== Training ======================#

        running_loss = 0.0
        epoch_steps = 0

        # training using all training samples
        for i, (x, y) in enumerate(train_loader):
            
            # zero the parameter gradients
            optimizer.zero_grad()

            # set the model to training mode
            model.train()

            # forward + backward + optimize
            pred = model(x)

            ohpl_loss = criterion(y.squeeze(), 
                                  pred.squeeze(), 
                                  min_label, 
                                  max_label, 
                                  margin, 
                                  ordering_loss_weight, 
                                  loss_bound,
                                  ohpl_norm_order,
                                  dtype)
            ohpl_loss.backward()
            optimizer.step()

        # set the model to evaluation mode
        model.eval()

        #====================== Compute Metrics ======================#
        # model can only be evaluated after finishing the complete dataset for OHPL

        train_pred = torch.tensor([])
        valid_pred = torch.tensor([])
        history['train']['y'] = torch.tensor([])
        history['valid']['y'] = torch.tensor([])

        with torch.no_grad():
            for  i, (x, y) in enumerate(train_loader):
                pred = model(x)
                train_pred = torch.cat([train_pred,pred])
                history['train']['y'] = torch.cat( [history['train']['y'], y.squeeze()], dim=0 )

            y_class_mean = ohpl_y_class_mean(history['train']['y'].reshape(-1,1))
            centroid = torch.reshape( torch.tensor(y_class_mean @ train_pred.numpy()), [1,-1] )
            history['train']['y_est'], train_prob = ohpl_predict(train_pred, centroid, min_label)

            for  i, (x, y) in enumerate(valid_loader):
                pred = model(x)
                valid_pred = torch.cat([valid_pred,pred])
                history['valid']['y'] = torch.cat( (history['valid']['y'], y.squeeze()), dim=0 )
            history['valid']['y_est'], valid_prob = ohpl_predict(valid_pred, centroid, min_label)

        # compute loss metrics based on y_pred & y_true
        metric_labels = []
        for metric in metric_dict.keys():
            for dataset in history.keys():
                metric_label = '_'.join([dataset,metric])
                metric_output[metric_label] = metric_dict[metric](history[dataset]['y_est'],history[dataset]['y'])
                metric_labels.append(metric_label)

        # cross-entropy-loss metric requires probability matrix
        # (not using y_pred & y_true for computation, therefore need to separate out)
        cross_entropy_loss = nn.CrossEntropyLoss()
        metric_labels.append("train_cross-entropy")
        metric_labels.append("valid_cross-entropy")
        cel_train = cross_entropy_loss(train_prob, history['train']['y'].type(torch.int64) - min_label)
        cel_valid = cross_entropy_loss(valid_prob, history['valid']['y'].type(torch.int64) - min_label)
        metric_output['train_cross-entropy'] = cel_train
        metric_output['valid_cross-entropy'] = cel_valid
        
         
        # display metrics
        if show_progress:
            print(f"[Epoch: { epoch+1 }]", end=" " )
            print(f"OHPL Loss: {ohpl_loss}")
            for metric_label in metric_labels:
                print(f"{metric_label}: {metric_output[metric_label].item():.3f},", end=" ")
            print(f"")

    return (centroid, history)


---

# Training

In [83]:
dataset_model, dataset_test, dataset_train_kfold, dataset_valid_kfold = patitioned_data_object_numeric(x, y, test_split_ratio, k)

> [0;32m/tmp/ipykernel_8004/448558442.py[0m(145)[0;36mmodel_test_split[0;34m()[0m
[0;32m    143 [0;31m    [0;31m# Including index as one of the splitting dataset[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    144 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 145 [0;31m    [0;32massert[0m [0mall[0m[0;34m([0m[0mtest_ratio[0m[0;34m*[0m[0mlen[0m[0;34m([0m[0mdata[0m[0;34m)[0m [0;34m>=[0m [0;36m1[0m [0;32mfor[0m [0mdata[0m [0;32min[0m [0mdataset[0m[0;34m)[0m[0;34m,[0m [0;34m"Number of samples resulting from ratio must be larger than 1 sample!"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    146 [0;31m    [0mdataset[0m [0;34m=[0m [0mdataset[0m [0;34m+[0m [0;34m[[0m[0mselect_ids[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    147 [0;31m    [0mout[0m [0;34m=[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0mdataset[0m[0;34m,[0m [0mtest_size

In [32]:
t0 = time.time()

N_FEATURE = x.shape[1]
architecture = [25, 30, 6]
dropout_p = 0.1
learning_rate = 1e-2

model = MultiLayerFC(N_FEATURE,architecture,dropout_p=dropout_p).to(device)
model.apply(initialize_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
criterion = ohpl
metric_dict = {
    'mae': lambda y_est,y: torch.mean(torch.abs(y_est-y)), 
    'mze': lambda y_est,y: torch.mean((torch.abs(y_est-y) > 0).type(torch.double)),
    'f1-micro':  lambda y_est,y: f1_score(y,y_est,average='micro'),
    'f1-macro':  lambda y_est,y: f1_score(y,y_est,average='macro'),
              }
centroid, history = train_ohpl(model=model, 
                               train_dataset=dataset_model, 
                               valid_dataset=dataset_test, 
                               min_label=min_label, 
                               max_label=max_label, 
                               criterion=criterion,
                               optimizer=optimizer,
                               metric_dict=metric_dict,
                               num_epochs=num_epochs, 
                               batch_size=64,
                               ohpl_norm_order=1)

t1 = time.time()
print(f"Time elapsed: {t1-t0}s")

confusion_matrix(history['valid']['y'], history['valid']['y_est'])

[Epoch: 1] OHPL Loss: 1.6218199452260271
train_mae: 0.692, valid_mae: 0.720, train_mze: 0.579, valid_mze: 0.605, train_f1-micro: 0.421, valid_f1-micro: 0.395, train_f1-macro: 0.388, valid_f1-macro: 0.358, train_cross-entropy: 1.648, valid_cross-entropy: 1.665, 
[Epoch: 2] OHPL Loss: 2.9365480031398343
train_mae: 0.654, valid_mae: 0.694, train_mze: 0.556, valid_mze: 0.592, train_f1-micro: 0.444, valid_f1-micro: 0.408, train_f1-macro: 0.408, valid_f1-macro: 0.366, train_cross-entropy: 1.641, valid_cross-entropy: 1.649, 
[Epoch: 3] OHPL Loss: 3.663002204848225
train_mae: 0.660, valid_mae: 0.675, train_mze: 0.558, valid_mze: 0.586, train_f1-micro: 0.442, valid_f1-micro: 0.414, train_f1-macro: 0.403, valid_f1-macro: 0.370, train_cross-entropy: 1.635, valid_cross-entropy: 1.641, 
[Epoch: 4] OHPL Loss: 6.567320220031508
train_mae: 0.652, valid_mae: 0.669, train_mze: 0.545, valid_mze: 0.567, train_f1-micro: 0.455, valid_f1-micro: 0.433, train_f1-macro: 0.411, valid_f1-macro: 0.379, train_cross

[Epoch: 34] OHPL Loss: 0.2948230253539015
train_mae: 0.538, valid_mae: 0.592, train_mze: 0.479, valid_mze: 0.548, train_f1-micro: 0.521, valid_f1-micro: 0.452, train_f1-macro: 0.477, valid_f1-macro: 0.376, train_cross-entropy: 1.616, valid_cross-entropy: 1.640, 
[Epoch: 35] OHPL Loss: 0.6943485964438424
train_mae: 0.534, valid_mae: 0.567, train_mze: 0.484, valid_mze: 0.529, train_f1-micro: 0.516, valid_f1-micro: 0.471, train_f1-macro: 0.475, valid_f1-macro: 0.420, train_cross-entropy: 1.603, valid_cross-entropy: 1.621, 
[Epoch: 36] OHPL Loss: 0.23945796907232283
train_mae: 0.481, valid_mae: 0.529, train_mze: 0.450, valid_mze: 0.503, train_f1-micro: 0.550, valid_f1-micro: 0.497, train_f1-macro: 0.536, valid_f1-macro: 0.439, train_cross-entropy: 1.586, valid_cross-entropy: 1.599, 
[Epoch: 37] OHPL Loss: 1.9278924146457976
train_mae: 0.481, valid_mae: 0.478, train_mze: 0.452, valid_mze: 0.452, train_f1-micro: 0.548, valid_f1-micro: 0.548, train_f1-macro: 0.552, valid_f1-macro: 0.480, trai

[Epoch: 67] OHPL Loss: 0.9295292689216992
train_mae: 0.465, valid_mae: 0.420, train_mze: 0.431, valid_mze: 0.414, train_f1-micro: 0.569, valid_f1-micro: 0.586, train_f1-macro: 0.600, valid_f1-macro: 0.521, train_cross-entropy: 1.567, valid_cross-entropy: 1.562, 
[Epoch: 68] OHPL Loss: 0.6215191874850737
train_mae: 0.486, valid_mae: 0.433, train_mze: 0.452, valid_mze: 0.414, train_f1-micro: 0.548, valid_f1-micro: 0.586, train_f1-macro: 0.580, valid_f1-macro: 0.535, train_cross-entropy: 1.573, valid_cross-entropy: 1.577, 
[Epoch: 69] OHPL Loss: 0.4070356306711942
train_mae: 0.463, valid_mae: 0.439, train_mze: 0.434, valid_mze: 0.427, train_f1-micro: 0.566, valid_f1-micro: 0.573, train_f1-macro: 0.597, valid_f1-macro: 0.548, train_cross-entropy: 1.564, valid_cross-entropy: 1.577, 
[Epoch: 70] OHPL Loss: 0.4600729735837023
train_mae: 0.462, valid_mae: 0.452, train_mze: 0.433, valid_mze: 0.439, train_f1-micro: 0.567, valid_f1-micro: 0.561, train_f1-macro: 0.592, valid_f1-macro: 0.522, train

array([[ 1,  0,  0,  0,  0,  0],
       [ 4, 11,  4,  0,  0,  0],
       [ 0, 18, 22,  7,  0,  0],
       [ 0,  0,  8, 33,  9,  1],
       [ 0,  0,  1,  6, 12,  8],
       [ 0,  0,  0,  0,  3,  9]])

Ray Tune Hyperparameter Tuning

In [41]:
report_metrics = ["training_iteration",
                  "ohpl",
                  "train_mae", 
                  "valid_mae",
                  "train_mze", 
                  "valid_mze",
                  "train_f1-micro",
                  "valid_f1-micro",
                  "train_f1-macro",
                  "valid_f1-marco",
                  "train_cross-entropy", 
                  "valid_cross-entropy",
                 ]

reporter = tune.JupyterNotebookReporter(overwrite=True, max_progress_rows=35, metric_columns= report_metrics)
searchopt = BasicVariantGenerator(max_concurrent=15)

config = {"lr": tune.loguniform(lr_min, lr_max),                       # Learning Rate
          "dropout_p": tune.uniform(dropout_p_min, dropout_p_max),     # Dropout On/Off
          "k": tune.grid_search([*range(k)]),                          # K-Fold Index
          "batch_size": tune.choice(batch_size),                       # 1: SGD; 2+: Zero-Filled BGD
          "margin": tune.uniform(margin_min, margin_max),              # OHPL margin
          "ordering_loss_weight": tune.uniform(alpha_min, alpha_max),  # OHPL ordering loss weight
          "h_total": tune.choice([*range(h_total_min, h_total_max, h_total_step)]),
          "h_fc": tune.sample_from(lambda spec: split_sampling(num_ele = spec.config.h_total, 
                                                               n_min = h_fc_min_neuron_per_layer,
                                                               n_max = h_fc_max_neuron_per_layer,
                                                               out_dim = h_fc_max_layer,
                                                               single_sample = True)),
         }

In [42]:
t0 = time.time()

result = tune.run(
        tune.with_parameters(train_ohpl_raytune, 
                             num_in_feat   = N_FEATURE,
                             num_epochs    = num_epochs, 
                             train_dataset = dataset_train_kfold, 
                             valid_dataset = dataset_valid_kfold,
                             ),
        config = config,
        resources_per_trial={"cpu": 1, "gpu": 0.1},
        num_samples = num_hp_search_samples,
        local_dir = chkpt_dir,
        progress_reporter = reporter,
#         scheduler = scheduler,
        search_alg = searchopt,
)

t1 = time.time()
print(f"Time elapsed: {t1-t0}s")

Trial name,status,loc,batch_size,dropout_p,h_fc,h_total,k,lr,margin,ordering_loss_weight,training_iteration,ohpl,train_mae,valid_mae,train_mze,valid_mze,train_f1-micro,valid_f1-micro,train_f1-macro,train_cross-entropy,valid_cross-entropy
train_ohpl_raytune_b89e7_00000,TERMINATED,,5,0.0693328,"[3, 2, 3]",8,0,0.000128489,0.850497,0.509621,100,1.55205,0.741483,0.816,0.609218,0.68,0.390782,0.32,0.379818,1.66549,1.69199
train_ohpl_raytune_b89e7_00001,TERMINATED,,5,0.221446,"[4, 2, 2]",8,1,0.0415138,0.872935,0.92912,100,5.57472,2.89579,2.952,0.987976,1.0,0.012024,0.0,0.0039604,1.79176,1.79176
train_ohpl_raytune_b89e7_00002,TERMINATED,,5,0.586613,"[3, 3]",6,2,0.013008,0.939486,0.978904,100,2.93671,0.963928,0.864,0.669339,0.632,0.330661,0.368,0.284331,1.74916,1.72539
train_ohpl_raytune_b89e7_00003,TERMINATED,,5,0.453275,"[2, 2, 2]",6,3,0.00250131,0.833,0.926098,100,2.53323,1.07014,1.088,0.725451,0.744,0.274549,0.256,0.265007,1.78118,1.75732
train_ohpl_raytune_b89e7_00004,TERMINATED,,5,0.145067,"[2, 2]",4,4,0.000199224,0.930827,0.844911,100,2.35893,0.654,0.629032,0.532,0.516129,0.468,0.483871,0.404811,1.64608,1.59583


2021-08-26 23:53:25,199	INFO tune.py:550 -- Total run time: 33.76 seconds (33.56 seconds for the tuning loop).


Time elapsed: 34.264498710632324s


In [44]:
result.results_df.columns

Index(['train_mae', 'valid_mae', 'train_mze', 'valid_mze', 'train_f1-micro',
       'valid_f1-micro', 'train_f1-macro', 'valid_f1-macro', 'ohpl',
       'train_cross-entropy', 'valid_cross-entropy', 'time_this_iter_s',
       'should_checkpoint', 'done', 'timesteps_total', 'episodes_total',
       'training_iteration', 'experiment_id', 'date', 'timestamp',
       'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore',
       'timesteps_since_restore', 'iterations_since_restore', 'experiment_tag',
       'config.lr', 'config.dropout_p', 'config.k', 'config.batch_size',
       'config.margin', 'config.ordering_loss_weight', 'config.h_total',
       'config.h_fc'],
      dtype='object')

In [45]:
result.results_df.valid_mae

trial_id
b89e7_00000    0.816000
b89e7_00001    2.952000
b89e7_00002    0.864000
b89e7_00003    1.088000
b89e7_00004    0.629032
Name: valid_mae, dtype: float64

# Experiment and Testing

In [None]:
# y_true = torch.tensor([4,1,2,0,4,2,1])
# y_pred = torch.tensor([6.0,3.1,5.2,1.0,4.0,2.2,3.7], dtype=torch.float32)
# minlabel = 0
# maxlabel = 4
# margin = 0.3
# ordering_loss_weight = 0.1   # Alpha
# loss_bound = 1e9

# # === HCL: Hyperplane Centroid Loss ===
# # (To ensure hyperplane are ordered by rank)

# min_label = torch.tensor(minlabel, dtype=torch.float32, requires_grad=False)
# max_label = torch.tensor(maxlabel, dtype=torch.float32, requires_grad=False)
# margin = torch.tensor(margin, dtype=torch.float32, requires_grad=False)
# ordering_loss_weight = torch.tensor(ordering_loss_weight, dtype=torch.float32, requires_grad=False)

# y_true = y_true.type(y_pred.dtype)
# ords, idx = torch.unique(y_true, return_inverse=True)
# num_label = ords.shape[0]
# y_true_ohe = F.one_hot(idx,num_classes=num_label)

# # hyperplane intercept term
# yO = y_pred.type(torch.float32) @ y_true_ohe.type(torch.float32)
# yc = torch.sum(y_true_ohe, dim=0)
# class_mean = torch.div(yO,yc)

# # relative rank distance between centroids
# min_distance = torch.reshape(ords,(-1,1)) - torch.reshape(ords,(1,-1))
# min_distance = torch.relu(min_distance)

# # keeps min. distance (???)
# keep = torch.minimum(min_distance,torch.ones(min_distance.shape))

# # positive mean sample distance between centroids
# centroid_distance = torch.reshape(class_mean,(-1,1)) - torch.reshape(class_mean,(1,-1))
# centroid_distance = torch.relu(centroid_distance)   # zero loss for correct ordering
# centroid_distance = torch.multiply(keep, centroid_distance)

# hp_ordering_loss = torch.sum(torch.relu(min_distance - centroid_distance))

# # === HPL/HPPL: Hyperplane Point Loss ===
# # (To ensure transformation place the point near the correct centroid)
# mean_centroid_of_sample = y_true_ohe.type(torch.float32) @ torch.reshape(class_mean,(-1,1))


# # --- Limit Edge Case Loss ---
# # No reason to limit distance from edge cases:
# # 1. Positive edge case (max_label) for upper loss
# # 2. Negative edge case (min_label) for lower loss
# upper_bound = (y_true - max_label + 1) * loss_bound   # Select edge case and give a large loss_bound (we want to pull it back in case if it gets too big)
# upper_bound = torch.relu(upper_bound) + margin        # Add margin to non-edge cases
# lower_bound = (-(y_true - min_label) + 1) * loss_bound
# lower_bound = torch.relu(lower_bound) + margin   

# # -- Compute Loss ---
# upper_loss = y_pred[:,None] - mean_centroid_of_sample
# upper_loss_bounded = torch.relu(upper_loss - upper_bound[:,None])
# lower_loss = -(y_pred[:,None] - mean_centroid_of_sample)
# lower_loss_bounded = torch.relu(lower_loss - lower_bound[:,None])

# hp_point_loss = torch.mean(upper_loss_bounded + lower_loss_bounded)

# # === OHPL ===
# loss = hp_point_loss + ordering_loss_weight * hp_ordering_loss
