## Dependencies

Install the dependencies that are not available by default on Google Colab.
Colab already provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt).

In [None]:
!pip install pinard
!pip install scikeras

## Benchmark details

The results aggregate the combinations of the following training configurations:
- estimation configuration: [regression, classification]
- datasets configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

Training is performed in two steps, (1) data transformation and (2) model fitting, because the sklearn pipeline does not natively handle test data.
This should change with a future pinard update.

In [48]:
import datetime
import json
import math
import numpy as np
import time
import os
from collections import OrderedDict

from contextlib import redirect_stdout
# import joblib
# import pickle

from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics \
    import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error,\
        r2_score, explained_variance_score, mean_squared_log_error, median_absolute_error
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
import tensorflow as tf

from data import load_data
from preprocessings import preprocessing_list, transform_test_data
from regressors import nn_list, ml_list, get_keras_model

# Restrict the TensorFlow logger to records of level ERROR.
tf.get_logger().setLevel('ERROR')
# Set the global Keras mixed-precision policy to 'mixed_float16'.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

def get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred):
    """Build the metrics record describing one evaluated model.

    Every value is stored as a string so the record can be dumped to JSON
    unchanged.

    Args:
        dataset_name: Name of the dataset, stored under "dataset".
        model_name: Name of the evaluated model, stored under "model".
        path: Dataset folder. The reference score ("targetRMSE") is parsed
            from the last "_"-separated token of the folder name, taking
            whatever follows "RMSE" in that token.
        SEED: Random seed of the run, stored under "seed".
        y_valid: Ground-truth targets.
        y_pred: Predicted targets, aligned with ``y_valid``.

    Returns:
        dict mapping metric/metadata names to their string representation.

    Raises:
        ValueError: If the folder-name suffix cannot be parsed as a float.
            The sklearn metric functions may also raise on inputs they do
            not support (e.g. MSLE on negative values -- see sklearn docs).
    """
    # Reference score encoded in the folder name, e.g. "<name>_RMSE0.42".
    last_token = os.path.split(path)[-1].split('_')[-1]
    target_rmse = float(last_token.split("RMSE")[-1])

    # Compute MSE once and derive RMSE from it. The previous code passed
    # squared=True for "RMSE" and squared=False for "MSE", which swapped
    # the two values. Taking the square root explicitly also avoids the
    # `squared` keyword, which is deprecated in recent scikit-learn releases.
    mse = mean_squared_error(y_valid, y_pred)
    rmse = math.sqrt(mse)

    return {
        "model": model_name,
        "dataset": dataset_name,
        "seed": str(SEED),
        "targetRMSE": str(target_rmse),
        "RMSE": str(rmse),
        "MAPE": str(mean_absolute_percentage_error(y_valid, y_pred)),
        "R2": str(r2_score(y_valid, y_pred)),
        "MAE": str(mean_absolute_error(y_valid, y_pred)),
        "MSE": str(mse),
        "MedAE": str(median_absolute_error(y_valid, y_pred)),
        "EVS": str(explained_variance_score(y_valid, y_pred)),
        "MSLE": str(mean_squared_log_error(y_valid, y_pred)),
        "run": datetime.datetime.now().strftime("%Y-%m-%d  %H:%M:%S")
    }

def log_run(dataset_name, model_name, path, SEED, y_valid, y_pred):
    """Compute the metrics of one run and persist its outputs on disk.

    Writes, under ``results/<dataset_name>/``:
      * ``<model_name>.csv``: two columns, ground truth then predictions;
      * ``_runs.txt``: one appended line with the RMSE, the model name and
        a timestamp.

    Args:
        dataset_name: Name of the dataset; also the results sub-folder name.
        model_name: Name of the evaluated model; also the CSV file stem.
        path: Dataset folder, forwarded to ``get_datasheet``.
        SEED: Random seed of the run, forwarded to ``get_datasheet``.
        y_valid: Ground-truth targets.
        y_pred: Predicted targets.

    Returns:
        The metrics dict produced by ``get_datasheet``.
    """
    datasheet = get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    folder = "results/" + dataset_name
    os.makedirs(folder, exist_ok=True)

    canon_name = folder + "/" + model_name

    ## save predictions
    np.savetxt(canon_name + '.csv', np.column_stack((y_valid, y_pred)))

    ## save main metrics globally
    log = datasheet["RMSE"] + "  ---  " + model_name + ' '*10 + datetime.datetime.now().strftime("%Y-%m-%d  %H:%M:%S") + '\n'
    # The context manager closes the file even if the write fails.
    with open(folder + "/_runs.txt", "a") as result_file:
        result_file.write(log)

    # NOTE: persisting the fitted pipeline (joblib.dump) is currently disabled.

    return datasheet


def evaluate_pipeline(desc, model_name, data, transformers):
    """Fit one model, score it on the validation data and save the results.

    Args:
        desc: Tuple ``(dataset_name, path, global_result_file, results, SEED)``.
            ``results`` is updated in place with this run's datasheet.
        model_name: Name of the run; used as the pipeline step name, as the
            key in ``results`` and forwarded to ``log_run``.
        data: Tuple ``(X_train, y_train, X_valid, y_valid)``.
        transformers: Tuple ``(y_scaler, transformer_pipeline, regressor)``.

    Returns:
        The predictions of the fitted estimator on ``X_valid``.
    """
    print("<", model_name, ">")
    t0 = time.time()

    # Unpack the argument bundles
    X_train, y_train, X_valid, y_valid = data
    dataset_name, path, global_result_file, results, SEED = desc
    y_scaler, transformer_pipeline, regressor = transformers

    # Feature transformation followed by the regressor, with the target
    # passed through y_scaler.
    steps = [
        ('transformation', transformer_pipeline),
        (model_name, regressor),
    ]
    estimator = TransformedTargetRegressor(regressor=Pipeline(steps), transformer=y_scaler)

    # Fit, then predict; the reported time covers both.
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_valid)
    duration = time.time() - t0

    datasheet = log_run(dataset_name, model_name, path, SEED, y_valid, y_pred)
    datasheet["training_time"] = time.strftime("%H:%M:%S", time.gmtime(duration))
    results[model_name] = datasheet

    # Rewrite the global result file with entries ordered by ascending RMSE.
    def _rmse_of(entry):
        return float(entry[1]['RMSE'])

    ranked = OrderedDict(sorted(results.items(), key=_rmse_of))
    with open(global_result_file, 'w') as fp:
        json.dump(ranked, fp, indent=4)

    print(datasheet["RMSE"], " (", datasheet["targetRMSE"], ") in", datasheet["training_time"])
    return y_pred


def benchmark_dataset(path, SEED):
    """Benchmark every preprocessing/model combination on one dataset folder.

    Only the single-run deep-learning section is currently active; the
    machine-learning and cross-validation sections below are commented out.
    Runs whose prediction file ``results/<dataset>/<model_name>.csv`` already
    exists are skipped, so an interrupted benchmark can be resumed.

    Args:
        path: Dataset folder passed to ``load_data``. The dataset name is the
            folder name without its last "_"-separated token.
        SEED: Random seed; appended to model names and passed to
            ``get_keras_model``.

    Returns:
        None. Results are written to disk by ``evaluate_pipeline``.
    """
    # Folder name minus its last "_" token (that token is parsed as the target RMSE in get_datasheet).
    dataset_name = ('_').join(os.path.split(path)[-1].split('_')[:-1])
    global_result_file = "results/" + dataset_name + '_results.json'
    # Start from previously saved results, if any, so earlier runs are kept.
    results = {}
    if os.path.isfile(global_result_file):
        with open(global_result_file) as json_file:
            results = json.load(json_file)
   
    # Bundle of run-level context expected by evaluate_pipeline.
    desc = (dataset_name, path, global_result_file, results, SEED)

    X, y, X_valid, y_valid = load_data(path)
    print("="*10, str(dataset_name).upper(), X.shape, y.shape, X_valid.shape, y_valid.shape, "="*10)
    

    #########################
    ### SINGLE RUN TRAINING
    # In single-run mode the "test" arrays given to the preprocessing step are the validation arrays themselves.
    X_train, y_train, X_test, y_test = X, y, X_valid, y_valid
    data = (X_train, y_train, X_valid, y_valid)
    for preprocessing in preprocessing_list():            
        ##### DEEP LEARNING #####
        X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="augmentation")
        for model_desc in nn_list():
            model_name = model_desc.__name__ + "-" + preprocessing.__name__ + "-" + str(SEED)
            # Skip runs whose predictions were already saved by log_run.
            if os.path.isfile(  "results/" + dataset_name + "/" + model_name + '.csv'):
                # print("Skipping", model_name)
                continue

            # NOTE(review): 7500 and 750 are presumably training-length settings (e.g. epochs and
            # batch size or patience) and the preprocessed test data is presumably used for
            # validation during fitting -- confirm against regressors.get_keras_model.
            regressor = get_keras_model(dataset_name + '_' + model_name, model_desc, 7500, 750, X_test_pp, y_test_pp, verbose=0, seed=SEED)
            transformers = (y_scaler, transformer_pipeline, regressor)
            evaluate_pipeline(desc, model_name, data, transformers)
        
        # ##### MACHINE LEARNING ##### (disabled)
        # X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
        # for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
        #     model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
        #     if os.path.isfile(  "results/" + dataset_name + "/" + model_name + '.csv'):
        #        # print("Skipping", model_name)
        #         continue
        #     transformers = (y_scaler, transformer_pipeline, regressor)
        #     evaluate_pipeline(desc, model_name, data, transformers)

    #########################



    #########################
    # ### CROSS VALIDATION TRAINING (disabled)
    # cv_predictions = {}
    # for preprocessing in preprocessing_list():
    #     fold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)
    #     fold_index = 0
    #     for train_index, test_index in fold.split(X):
    #         X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
    #         data = (X_train, y_train, X_valid, y_valid)
            
    #         ##### DEEP LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="augmentation")
    #         for model_desc in nn_list():
    #             model_name = model_desc.__name__ + "-" + preprocessing.__name__  + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #                # print("Skipping", model_name)
    #                 continue
    #             regressor = get_keras_model(dataset_name + '_' + fold_name, model_desc, 7500, 750, X_test_pp, y_test_pp, verbose=0, seed=SEED)
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred
            
    #         ##### MACHINE LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
    #         for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
    #             model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #              #  print("Skipping", model_name)
    #                 continue
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred
                
    #         fold_index +=1

    # for key, val in cv_predictions.items():
    #     y_pred = val / fold.get_n_splits()
    #     datasheet = get_datasheet(dataset_name, key, path, SEED, y_valid, y_pred)
    #     results[key +"_CV"] = datasheet
    #
    # results = OrderedDict(sorted(results.items(), key=lambda k_v: float(k_v[1]['RMSE'])))
    # with open(global_result_file, 'w') as fp:
    #     json.dump(results, fp, indent=4)

    # #########################


    


In [None]:
## Browse path and launch benchmark for every folders
%load_ext autoreload
%autoreload 2

from pathlib import Path

rootdir = Path('data/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

SEED = ord('D') + 31373
np.random.seed(SEED)
tf.random.set_seed(SEED)

for folder in folder_list:
    benchmark_dataset(folder, SEED)

In [None]:
### FAST GPU RESET ####
# Reset the current CUDA device through numba.
# NOTE(review): presumably run by hand to release GPU state left by the cells above -- confirm.
from numba import cuda 
device = cuda.get_current_device()
device.reset()