## Dependencies

Install the dependencies that are not available by default on Google Colab.
Colab already provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [None]:
!pip install pinard
!pip install scikeras

## Benchmark details

The results aggregate every combination of the following training configurations:
- estimation configuration: [regression, classification]
- datasets configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

Training is performed in two steps, (1) data transformation and (2) model fitting, because the sklearn pipeline does not natively handle test data.
This should change with a future pinard update.

In [24]:
import datetime
import json
import math
import numpy as np
import os

from contextlib import redirect_stdout
# import joblib
# import pickle

from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics \
    import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error,\
        r2_score, explained_variance_score, mean_squared_log_error, median_absolute_error
from sklearn.pipeline import Pipeline
import tensorflow as tf

from data import load_data
from preprocessings import preprocessing_list, transform_test_data, ml_transformer_pipeline
from regressors import nn_list, ml_list, get_keras_model

# Restrict TensorFlow's logger to the ERROR level.
tf.get_logger().setLevel('ERROR')
# Set the global Keras mixed-precision policy to 'mixed_float16'.
# NOTE(review): presumably chosen for GPU speed/memory; confirm it is intended
# for every model built in this notebook.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

def log_run(dataset_name, model_name, path, estimator, y_valid, y_pred):
    """Score one model's predictions, persist them and return the metrics.

    Parameters
    ----------
    dataset_name : str
        Dataset name; outputs are written under ``results/<dataset_name>/``.
    model_name : str
        Identifier of the evaluated model; used as the base name of the
        predictions file and in the run log.
    path : str or os.PathLike
        Dataset folder. The reference RMSE is parsed from the last
        ``_``-separated token of its final path component (the text that
        follows ``RMSE``).
    estimator : object
        Fitted estimator. Currently unused: pipeline persistence is disabled.
    y_valid, y_pred : array-like
        Ground-truth and predicted target values.

    Returns
    -------
    dict
        Metric name -> value, with every value serialised as a string.

    Raises
    ------
    ValueError
        If the last component of ``path`` does not end with a number that
        ``float`` can parse.

    Side effects
    ------------
    Prints the obtained and reference RMSE, writes
    ``results/<dataset_name>/<model_name>.csv`` (targets and predictions
    side by side) and appends one line to ``results/<dataset_name>/_runs.txt``.
    """
    # compute scores
    # mean_squared_error returns the MSE by default; the RMSE is its square
    # root. (The previous code had the `squared` flags swapped, so the value
    # labelled "RMSE" was the MSE and vice versa.) For a single target this
    # equals `squared=False` and does not rely on that keyword.
    mse = mean_squared_error(y_valid, y_pred)
    RMSE = str(math.sqrt(mse))
    target_RMSE = str(float(os.path.split(path)[-1].split('_')[-1].split("RMSE")[-1]))
    print(RMSE, " vs ", target_RMSE)

    # MSLE is undefined for negative values and scikit-learn raises ValueError
    # in that case; record "nan" instead of aborting the whole benchmark.
    try:
        msle = str(mean_squared_log_error(y_valid, y_pred))
    except ValueError:
        msle = "nan"

    datasheet = {
        "model":model_name,
        "dataset":dataset_name,
        "target_RMSE":target_RMSE,
        "RMSE":RMSE,
        "MAPE":str(mean_absolute_percentage_error(y_valid, y_pred)),
        "R2":str(r2_score(y_valid, y_pred)),
        "MAE":str(mean_absolute_error(y_valid, y_pred)),
        "MSE":str(mse),
        "MedAE":str(median_absolute_error(y_valid, y_pred)),
        "EVS":str(explained_variance_score(y_valid, y_pred)),
        "MSLE":msle,
    }

    ### Save data
    folder = "results/" + dataset_name
    os.makedirs(folder, exist_ok=True)

    canon_name = folder + "/" + model_name

    ## save predictions
    np.savetxt(canon_name + '.csv', np.column_stack((y_valid, y_pred)))

    ## save main metrics globally
    log = RMSE + "  ---  " + model_name + ' '*10 + datetime.datetime.now().strftime("%Y-%m-%d  %H:%M:%S") + '\n'
    # Context manager guarantees the log file is closed even if the write fails.
    with open(folder + "/_runs.txt", "a") as result_file:
        result_file.write(log)

    ## save pipeline
    # joblib.dump(estimator, canon_name + '.pkl')

    return datasheet


def evaluate_pipeline(dataset_name, model_name, path, pipeline, y_scaler, X_train, y_train, X_valid, y_valid):
    """Fit ``pipeline`` on the training split and log its validation scores.

    The pipeline is wrapped in a ``TransformedTargetRegressor`` that uses
    ``y_scaler`` as the target transformer, fitted on
    ``(X_train, y_train)`` and used to predict ``X_valid``. The first ten
    predictions and the first ten validation targets are printed, then the
    run is handed to ``log_run``, whose datasheet is returned.
    """
    # Train the target-scaled model
    wrapped_model = TransformedTargetRegressor(transformer=y_scaler, regressor=pipeline)
    wrapped_model.fit(X_train, y_train)

    # Predict the validation split and show a short preview of both vectors
    predictions = wrapped_model.predict(X_valid)
    for preview in (predictions, y_valid):
        print(preview[:10])

    datasheet = log_run(dataset_name, model_name, path, wrapped_model, y_valid, predictions)
    return datasheet


def benchmark_dataset(path, SEED):
    """Benchmark the configured regressors on one dataset folder.

    Parameters
    ----------
    path : str or os.PathLike
        Dataset folder, passed to ``load_data``. The dataset name is the
        folder name without its last ``_``-separated token.
    SEED : int
        Seed forwarded to ``ml_list`` and appended to every model name.

    Side effects
    ------------
    Each evaluated model is logged through ``evaluate_pipeline``. The
    collected datasheets are sorted by ascending RMSE and written to
    ``results/<dataset_name>_global_results.json``.

    Returns
    -------
    None
    """
    results = {}
    dataset_name = ('_').join(os.path.split(path)[-1].split('_')[:-1])
    print("="*10, str(dataset_name).upper(), "="*10)


    X_train, y_train, X_valid, y_valid = load_data(path)
    print("Data >", X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

    ### First benchmark in a single run
    X_test, y_test = X_valid, y_valid

    for preprocessing in preprocessing_list():
        # Only necessary for Kerasregressor and LWPLS in order to provide validation data to the model - should be replaced with pinard 2.0
        X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test)
        ##

        # ##### DEEP LEARNING #####
        # for model_desc in nn_list():
        #     model_name = model_desc.__name__ + "-" + preprocessing.__name__ + "-" + str(SEED)
        #     print("-", model_name, end=" ")

        #     # Get model
        #     regressor = get_keras_model(dataset_name + '_' + model_name, model_desc, 5, 500, X_test_pp, y_test_pp, verbose=0, seed=SEED)
        #     pipeline = Pipeline([
        #         ('transformation', transformer_pipeline),
        #         (model_name, regressor)
        #     ])

        #     datasheet = evaluate_pipeline(dataset_name, model_name, path, pipeline, y_scaler, X_train, y_train, X_valid, y_valid)
        #     results[model_name] = datasheet

        ##### MACHINE LEARNING #####
        for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
            model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
            pipeline = Pipeline([
                ('transformation', ml_transformer_pipeline(preprocessing)),
                (model_name, regressor)
            ])

            print("-", model_name, end=" ")
            datasheet = evaluate_pipeline(dataset_name, model_name, path, pipeline, y_scaler, X_train, y_train, X_valid, y_valid)
            results[model_name] = datasheet

        # NOTE(review): only the first preprocessing is evaluated because of
        # this break; remove it to cover the whole preprocessing list.
        break

    # The datasheet stores metrics as strings; convert to float so the ranking
    # is numeric ("10.5" must come after "2.3", not before it).
    ranked = sorted(results.items(), key=lambda k_v: float(k_v[1]['RMSE']))
    # The results folder may not exist yet if no model was logged.
    os.makedirs("results", exist_ok=True)
    with open("results/" + dataset_name + '_global_results.json', 'w') as fp:
        json.dump(ranked, fp, indent=4)
        


In [23]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

rootdir = Path('data/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

SEED = ord('D') + 31373
np.random.seed(SEED)
tf.random.set_seed(SEED)

for folder in folder_list:
    benchmark_dataset(folder, SEED)
    break

Data > (272, 2151) (272,) (152, 2151) (152,)
- LWPLS_2_0.25-id_preprocessing-31441 

ValueError: y_true and y_pred have different number of output (1!=2)

In [None]:
# Fetch the current CUDA device through numba and reset it.
# NOTE(review): presumably used to release GPU memory between runs; confirm.
# The kernel may need a restart before TensorFlow can use the GPU again.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [17]:
# NOTE(review): scratch cell. ord() accepts exactly one character, so this call
# always raises TypeError; remove it or pass a single character before Run All.
ord("test")

TypeError: ord() expected a character, but string of length 4 found