## Dependencies

Install the dependencies that are not available on Google Colab.
Colab provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [None]:
!pip install pinard
!pip install scikeras

## Benchmark details

The results aggregate the combinations of the following training configurations:
- estimation configuration: [regression, classification]
- datasets configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

We perform training in two steps, (1) data transformation and (2) training, because the sklearn pipeline does not use test data natively.
This should change with a future pinard update.

In [None]:
### FAST GPU RESET ####
# Optional cell: resets the CUDA device currently selected by numba.
# NOTE(review): presumably run to clear GPU state left over from a previous
# run without restarting the kernel - confirm. Requires numba and a CUDA device.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [1]:
import datetime
import json
import math
import numpy as np
import time
import os
from collections import OrderedDict

from contextlib import redirect_stdout
# import joblib
# import pickle

from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics \
    import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error,\
        r2_score, explained_variance_score, mean_squared_log_error, median_absolute_error
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
import tensorflow as tf

from data import load_data
from preprocessings import transform_test_data
from regressors import nn_list, ml_list, get_keras_model
from pinard import augmentation, sklearn

# Restrict the TensorFlow logger to messages of level ERROR.
tf.get_logger().setLevel('ERROR')
# Set the global Keras mixed-precision policy to 'mixed_float16'; this applies
# to every Keras model built after this line.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

def get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred):
    """Build the metrics record ("datasheet") describing one evaluation.

    Every value of the returned dict is a string so that the record can be
    dumped to JSON as-is.

    Args:
        dataset_name: Name stored under the "dataset" key.
        model_name: Name stored under the "model" key.
        path: Dataset folder path. Its last component must end with an
            "_"-separated token containing "RMSE<number>"
            (e.g. "ALPINE_Calpine_424_Murguzur_RMSE1.36"); that number is
            reported as "targetRMSE".
        SEED: Seed of the run, stored as a string.
        y_valid: Ground-truth targets.
        y_pred: Predicted targets.

    Returns:
        dict with the keys model, dataset, seed, targetRMSE, RMSE, MAPE, R2,
        MAE, MSE, MedAE, EVS and run (timestamp of the call).

    Raises:
        ValueError: if the text following "RMSE" in the folder name is not a
            valid float.
    """
    # The reference score is encoded in the trailing token of the folder name.
    trailing_token = os.path.split(path)[-1].split('_')[-1]
    target_rmse = float(trailing_token.split("RMSE")[-1])

    datasheet = {
        "model": model_name,
        "dataset": dataset_name,
        "seed": str(SEED),
        "targetRMSE": str(target_rmse),
    }

    # Metric values in the order they appear in the datasheet.
    computed_metrics = (
        ("RMSE", mean_squared_error(y_valid, y_pred, squared=False)),
        ("MAPE", mean_absolute_percentage_error(y_valid, y_pred)),
        ("R2", r2_score(y_valid, y_pred)),
        ("MAE", mean_absolute_error(y_valid, y_pred)),
        ("MSE", mean_squared_error(y_valid, y_pred, squared=True)),
        ("MedAE", median_absolute_error(y_valid, y_pred)),
        ("EVS", explained_variance_score(y_valid, y_pred)),
        # MSLE (mean_squared_log_error) is intentionally left out.
    )
    for metric_key, metric_value in computed_metrics:
        datasheet[metric_key] = str(metric_value)

    datasheet["run"] = datetime.datetime.now().strftime("%Y-%m-%d  %H:%M:%S")
    return datasheet

def log_run(dataset_name, model_name, path, SEED, y_valid, y_pred, elapsed_time):
    """Persist the outcome of one run on disk and return its datasheet.

    Two files are written below ``results/<dataset_name>/``:

    * ``<model_name>.csv``: ``y_valid`` and ``y_pred`` stacked column-wise.
    * ``_runs.txt``: one summary line appended per run (RMSE, model name,
      elapsed time and timestamp).

    Args:
        dataset_name: Dataset identifier; also the name of the result folder.
        model_name: Model identifier; also the stem of the prediction file.
        path: Dataset folder path, forwarded to ``get_datasheet``.
        SEED: Seed of the run, forwarded to ``get_datasheet``.
        y_valid: Ground-truth targets.
        y_pred: Predicted targets.
        elapsed_time: Duration of the run in seconds.

    Returns:
        The dict produced by ``get_datasheet`` for this run.
    """
    datasheet = get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred)

    ### Save data
    folder = "results/" + dataset_name
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(folder, exist_ok=True)

    canon_name = folder + "/" + model_name

    ## save predictions
    np.savetxt(canon_name + '.csv', np.column_stack((y_valid, y_pred)))

    ## save main metrics globally
    # Build the line before opening the file, and let the context manager
    # close the handle even if the write fails.
    log = datasheet["RMSE"] + "  ---  " + model_name + " in " + time.strftime("%H:%M:%S", time.gmtime(elapsed_time)) \
        + ' ('+ datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + ')\n'
    with open(folder + "/_runs.txt", "a") as result_file:
        result_file.write(log)

    ## save pipeline
    # joblib.dump(estimator, canon_name + '.pkl')

    return datasheet

# Module-level state shared between evaluate_pipeline (which sets these before
# fitting) and callback_predict (which reads them during training).
# While current_estimator is None, callback_predict does nothing.
current_estimator = None
current_X_test = None
current_y_test = None
current_path = None

def callback_predict(epoch, val_loss):
    """Print the score of the estimator currently being trained.

    Reads the module-level ``current_estimator``, ``current_X_test``,
    ``current_y_test`` and ``current_path``. Nothing is printed while no
    estimator is registered.

    Args:
        epoch: Epoch number, only echoed in the printed line.
        val_loss: Validation loss, only echoed in the printed line.
    """
    if current_estimator is not None:
        predictions = current_estimator.predict(current_X_test)
        # Dataset/model names and the seed are placeholders: only the metric
        # fields of the datasheet are used here.
        sheet = get_datasheet("", "", current_path, -1, current_y_test, predictions)
        print('Epoch:', epoch,'> RMSE:', sheet['RMSE'], '(', sheet['targetRMSE'], ') - R²:', sheet['R2'], ' val_loss', val_loss)

def evaluate_pipeline(desc, model_name, data, transformers):
    """Fit one model, score it on the validation data and record the result.

    The estimator and the validation data are published through the
    module-level ``current_*`` variables before fitting so that
    ``callback_predict`` can score the model while it trains.

    Args:
        desc: Tuple ``(dataset_name, path, global_result_file, results, SEED)``.
            ``results`` is updated in place with this run's datasheet.
        model_name: Name of the run; also the name of the pipeline's final step.
        data: Tuple ``(X_train, y_train, X_valid, y_valid)``.
        transformers: Tuple ``(y_scaler, transformer_pipeline, regressor)``.

    Returns:
        The predictions of the fitted estimator on ``X_valid``.
    """
    global current_estimator, current_X_test, current_y_test, current_path

    print("<", model_name, ">")
    started_at = time.time()

    # Unpack the grouped arguments.
    X_train, y_train, X_valid, y_valid = data
    dataset_name, path, global_result_file, results, SEED = desc
    current_path = path
    y_scaler, transformer_pipeline, regressor = transformers

    # X goes through the preprocessing then the regressor; y goes through y_scaler.
    steps = [
        ('transformation', transformer_pipeline),
        (model_name, regressor),
    ]
    estimator = TransformedTargetRegressor(regressor=Pipeline(steps), transformer=y_scaler)

    # Must be set before fit(): callback_predict reads these globals.
    current_estimator, current_X_test, current_y_test = estimator, X_valid, y_valid
    estimator.fit(X_train, y_train)

    # Score the fitted estimator and log the run.
    y_pred = estimator.predict(X_valid)
    elapsed_time = time.time() - started_at
    datasheet = log_run(dataset_name, model_name, path, SEED, y_valid, y_pred, elapsed_time)
    datasheet["training_time"] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    results[model_name] = datasheet

    # Rewrite the global result file with all runs ordered by increasing RMSE.
    ranked = OrderedDict(sorted(results.items(), key=lambda entry: float(entry[1]['RMSE'])))
    with open(global_result_file, 'w') as fp:
        json.dump(ranked, fp, indent=4)

    print(datasheet["RMSE"], " (", datasheet["targetRMSE"], ") in", datasheet["training_time"])
    return y_pred


def benchmark_dataset(path, SEED, preprocessing_list, augment=False):
    """Benchmark every neural-network model of ``nn_list()`` on one dataset.

    Only the single-run deep-learning training is active. The machine-learning
    and cross-validation variants are kept below as commented-out code.

    Args:
        path: Dataset folder. The dataset name is the folder name without its
            last "_"-separated token (the "RMSE<number>" suffix).
        SEED: Seed, used in the run names and forwarded to the model builder.
        preprocessing_list: Iterable of preprocessing objects; each one must
            expose ``__name__``.
        augment: When true, the training samples are passed through a pinard
            sample-augmentation pipeline before training.

    Runs whose prediction file ``results/<dataset>/<model_name>.csv`` already
    exists are skipped.
    """
    folder_name = os.path.split(path)[-1]
    dataset_name = '_'.join(folder_name.split('_')[:-1])
    print("="*10, str(dataset_name).upper(), end=" ")

    # Resume from previously saved results when the global file exists.
    global_result_file = "results/" + dataset_name + '_results.json'
    if os.path.isfile(global_result_file):
        with open(global_result_file) as json_file:
            results = json.load(json_file)
    else:
        results = {}

    desc = (dataset_name, path, global_result_file, results, SEED)

    X, y, X_valid, y_valid = load_data(path)
    print(X.shape, y.shape, X_valid.shape, y_valid.shape, "="*10)

    #########################
    ### SINGLE RUN TRAINING
    # The validation set doubles as the test set in the single-run setting.
    X_train, y_train = X, y
    X_test, y_test = X_valid, y_valid

    if augment:
        augmentation_pipeline = sklearn.SampleAugmentation([
            (2, 'rot_tr', augmentation.Rotate_Translate()),
            (1, 'rd_mult', augmentation.Random_X_Operation()),
            (1, 'simpl', augmentation.Random_Spline_Addition())
        ])
        # Shapes are printed before and after augmentation.
        print(X_train.shape, y_train.shape)
        X_train, y_train = augmentation_pipeline.transform(X_train, y_train)
        print(X_train.shape, y_train.shape)

    data = (X_train, y_train, X_valid, y_valid)
    for preprocessing in preprocessing_list:
        ##### DEEP LEARNING #####
        X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(
            preprocessing, X_train, y_train, X_test, y_test, type="augmentation"
        )
        for model_desc in nn_list():
            model_name = "-".join((model_desc.__name__, preprocessing.__name__, str(SEED)))
            prediction_file = "results/" + dataset_name + "/" + model_name + '.csv'
            if os.path.isfile(prediction_file):
                # print("Skipping", model_name)
                continue

            # The "dumb_set" preprocessing gets a smaller batch size.
            batch_size = 3 if preprocessing.__name__ == "dumb_set" else 20
            # NOTE(review): 4096 is presumably the epoch budget - confirm in regressors.get_keras_model.
            regressor = get_keras_model(
                dataset_name + '_' + model_name, model_desc, 4096, batch_size,
                X_test_pp, y_test_pp,
                transfer=True, callback_func=callback_predict, verbose=0, seed=SEED,
            )
            evaluate_pipeline(desc, model_name, data, (y_scaler, transformer_pipeline, regressor))

        # ##### MACHINE LEARNING #####
        # X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
        # for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
        #     model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
        #     if os.path.isfile(  "results/" + dataset_name + "/" + model_name + '.csv'):
        #        # print("Skipping", model_name)
        #         continue
        #     transformers = (y_scaler, transformer_pipeline, regressor)
        #     evaluate_pipeline(desc, model_name, data, transformers)

    #########################



    #########################
    # ### CROSS VALIDATION TRAINING
    # cv_predictions = {}
    # for preprocessing in preprocessing_list():
    #     fold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)
    #     fold_index = 0
    #     for train_index, test_index in fold.split(X):
    #         X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
    #         data = (X_train, y_train, X_valid, y_valid)

    #         ##### DEEP LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="augmentation")
    #         for model_desc in nn_list():
    #             model_name = model_desc.__name__ + "-" + preprocessing.__name__  + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #                # print("Skipping", model_name)
    #                 continue
    #             regressor = get_keras_model(dataset_name + '_' + fold_name, model_desc, 7500, 750, X_test_pp, y_test_pp, verbose=0, seed=SEED)
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred

    #         ##### MACHINE LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
    #         for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
    #             model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #              #  print("Skipping", model_name)
    #                 continue
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred

    #         fold_index +=1

    # for key, val in cv_predictions.items():
    #     y_pred = val / fold.get_n_splits()
    #     datasheet = get_datasheet(dataset_name, key, path, SEED, y_valid, y_pred)
    #     results[key +"_CV"] = datasheet
    #
    # results = OrderedDict(sorted(results.items(), key=lambda k_v: float(k_v[1]['RMSE'])))
    # with open(global_result_file, 'w') as fp:
    #     json.dump(results, fp, indent=4)

    # #########################


    


In [19]:
## Browse path and launch benchmark for every folders
%load_ext autoreload
%autoreload 2

from pathlib import Path
from preprocessings import preprocessing_list

rootdir = Path('data2/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

SEED = ord('D') + 31373
np.random.seed(SEED)
tf.random.set_seed(SEED)

# (preprocessing_list, nn_run, nn_cv, ml_single, ml_cv)

# benchmark_dataset("data2/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list())
# benchmark_dataset("data2/regression/Cassava_TBC_3556_Davrieux_RMSE1.02", SEED, preprocessing_list(), augment=True)
# benchmark_dataset("data2/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list())

for folder in folder_list:
    # print(ord(str(folder)[17]), ord('A'), ord('M'))
    if ord(str(folder)[17]) < ord("L") or ord(str(folder)[17]) > ord("M"):
        continue
    benchmark_dataset(folder, SEED, preprocessing_list())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
< decon-decon_set-31441 >




Epoch: 0 > RMSE: 23.853296 ( 2.33 ) - R²: -440.2343198841712  val_loss 0.6396484375
Epoch: 42 > RMSE: 23.34785 ( 2.33 ) - R²: -421.7331550854223  val_loss 0.6259765625
Epoch: 43 > RMSE: 21.077036 ( 2.33 ) - R²: -343.50185931758165  val_loss 0.56494140625
Epoch: 44 > RMSE: 20.060656 ( 2.33 ) - R²: -311.07772286978667  val_loss 0.5380859375
Epoch: 45 > RMSE: 19.321972 ( 2.33 ) - R²: -288.5178693401477  val_loss 0.51806640625
Epoch: 62 > RMSE: 19.0025 ( 2.33 ) - R²: -279.02321781107247  val_loss 0.509765625
Epoch: 63 > RMSE: 18.825274 ( 2.33 ) - R²: -273.82426283706  val_loss 0.5048828125
Epoch: 64 > RMSE: 17.701094 ( 2.33 ) - R²: -241.98121829018476  val_loss 0.474853515625
Epoch: 65 > RMSE: 15.357153 ( 2.33 ) - R²: -181.89166331983148  val_loss 0.41162109375
Epoch: 66 > RMSE: 12.877735 ( 2.33 ) - R²: -127.60311283606262  val_loss 0.34521484375
Epoch: 71 > RMSE: 12.307203 ( 2.33 ) - R²: -116.46033795346888  val_loss 0.330322265625
Epoch: 87 > RMSE: 7.5289607 ( 2.33 ) - R²: -42.9584717413



Epoch: 0 > RMSE: 23.88451 ( 2.33 ) - R²: -441.38988315740914  val_loss 0.63970947265625
Epoch: 16 > RMSE: 6.8204947 ( 2.33 ) - R²: -35.07482974181507  val_loss 0.618896484375
Epoch: 19 > RMSE: 10.246696 ( 2.33 ) - R²: -80.4217101598712  val_loss 0.60107421875
Epoch: 23 > RMSE: 5.764067 ( 2.33 ) - R²: -24.76502803224025  val_loss 0.265960693359375
Saved best 0.2660 at epoch 23
5.764067  ( 2.33 ) in 00:05:48
< decon-decon_set-31441 >




Epoch: 0 > RMSE: 14.131049 ( 0.44 ) - R²: -23.782662256934785  val_loss 0.37939453125
Epoch: 1 > RMSE: 14.100437 ( 0.44 ) - R²: -23.67540642896875  val_loss 0.3779296875
Epoch: 2 > RMSE: 14.008619 ( 0.44 ) - R²: -23.355094865203732  val_loss 0.37548828125
Epoch: 3 > RMSE: 13.886236 ( 0.44 ) - R²: -22.931405675540297  val_loss 0.372314453125
Epoch: 4 > RMSE: 13.611047 ( 0.44 ) - R²: -21.992285863913356  val_loss 0.364990234375
Epoch: 5 > RMSE: 13.03094 ( 0.44 ) - R²: -20.07417724397539  val_loss 0.349365234375
Epoch: 6 > RMSE: 11.814096 ( 0.44 ) - R²: -16.32208299642466  val_loss 0.31640625
Epoch: 7 > RMSE: 9.555612 ( 0.44 ) - R²: -10.332248610958542  val_loss 0.256103515625
Epoch: 8 > RMSE: 7.8434315 ( 0.44 ) - R²: -6.635039532892619  val_loss 0.209716796875
Epoch: 20 > RMSE: 7.813666 ( 0.44 ) - R²: -6.577200356208997  val_loss 0.20947265625
Epoch: 30 > RMSE: 7.3798256 ( 0.44 ) - R²: -5.759138163557929  val_loss 0.19775390625
Epoch: 32 > RMSE: 6.0134053 ( 0.44 ) - R²: -3.48786847025023



Epoch: 0 > RMSE: 14.131049 ( 0.44 ) - R²: -23.782662256934785  val_loss 0.37181180715560913
Epoch: 1 > RMSE: 14.039223 ( 0.44 ) - R²: -23.461622941971047  val_loss 0.36935603618621826
Epoch: 2 > RMSE: 13.397183 ( 0.44 ) - R²: -21.275433583221904  val_loss 0.35187843441963196
Epoch: 3 > RMSE: 7.4950795 ( 0.44 ) - R²: -5.971906650345718  val_loss 0.1876920759677887
Epoch: 6 > RMSE: 5.8275046 ( 0.44 ) - R²: -3.2146771952822384  val_loss 0.14877767860889435
Epoch: 12 > RMSE: 2.6190267 ( 0.44 ) - R²: 0.14870727961732588  val_loss 0.10953117907047272
Epoch: 13 > RMSE: 3.7868316 ( 0.44 ) - R²: -0.7797177365234425  val_loss 0.09331467747688293
Epoch: 14 > RMSE: 3.915358 ( 0.44 ) - R²: -0.9025764400470311  val_loss 0.0902772769331932
Epoch: 15 > RMSE: 4.6848054 ( 0.44 ) - R²: -1.723844115125854  val_loss 0.06366146355867386
Epoch: 47 > RMSE: 4.1429334 ( 0.44 ) - R²: -1.1301739741356758  val_loss 0.06247038021683693
Epoch: 48 > RMSE: 4.1235986 ( 0.44 ) - R²: -1.1103372189732532  val_loss 0.06089



Epoch: 0 > RMSE: 11.266404 ( 0.65 ) - R²: -0.26715438069405906  val_loss 0.2986464500427246
Epoch: 1 > RMSE: 11.2520895 ( 0.65 ) - R²: -0.2639364422025159  val_loss 0.29820358753204346
Epoch: 2 > RMSE: 11.2378435 ( 0.65 ) - R²: -0.2607380115108324  val_loss 0.29739734530448914
Epoch: 3 > RMSE: 11.19552 ( 0.65 ) - R²: -0.2512596998129717  val_loss 0.2960687577724457
Epoch: 4 > RMSE: 11.099218 ( 0.65 ) - R²: -0.22982605485733876  val_loss 0.29320719838142395
Epoch: 5 > RMSE: 10.880115 ( 0.65 ) - R²: -0.1817505212770334  val_loss 0.2872229218482971
Epoch: 6 > RMSE: 10.528209 ( 0.65 ) - R²: -0.10654177177692792  val_loss 0.27605491876602173
Epoch: 7 > RMSE: 10.069968 ( 0.65 ) - R²: -0.012313502300453694  val_loss 0.2601630687713623
Epoch: 8 > RMSE: 10.084217 ( 0.65 ) - R²: -0.015180414849607304  val_loss 0.25647255778312683
Epoch: 9 > RMSE: 10.009507 ( 0.65 ) - R²: -0.0001939551875531631  val_loss 0.2562738358974457
Epoch: 39 > RMSE: 9.625812 ( 0.65 ) - R²: 0.07501725422948435  val_loss 0.



Epoch: 0 > RMSE: 11.266404 ( 0.65 ) - R²: -0.26715438069405906  val_loss 0.27560141682624817
Epoch: 1 > RMSE: 11.140067 ( 0.65 ) - R²: -0.23889492524409195  val_loss 0.27275121212005615
Epoch: 2 > RMSE: 10.30508 ( 0.65 ) - R²: -0.060136118106337344  val_loss 0.25472384691238403
Epoch: 3 > RMSE: 10.191155 ( 0.65 ) - R²: -0.036825668505932274  val_loss 0.23789304494857788
Epoch: 5 > RMSE: 10.01072 ( 0.65 ) - R²: -0.0004364712574516094  val_loss 0.23747147619724274
Epoch: 6 > RMSE: 9.934973 ( 0.65 ) - R²: 0.014646004455420414  val_loss 0.23025228083133698
Epoch: 16 > RMSE: 4.2544694 ( 0.65 ) - R²: 0.8193034889333736  val_loss 0.1885787546634674
Epoch: 17 > RMSE: 3.3642476 ( 0.65 ) - R²: 0.8870113721770204  val_loss 0.15780071914196014
Epoch: 30 > RMSE: 3.6038568 ( 0.65 ) - R²: 0.8703436298736184  val_loss 0.15297822654247284
Epoch: 46 > RMSE: 3.2515001 ( 0.65 ) - R²: 0.8944577303158057  val_loss 0.11863744258880615
Epoch: 72 > RMSE: 4.386991 ( 0.65 ) - R²: 0.8078712786897292  val_loss 0.1



Epoch: 0 > RMSE: 28.076353 ( 10.267 ) - R²: -0.26147596678947127  val_loss 0.1434907466173172
Epoch: 1 > RMSE: 25.065573 ( 10.267 ) - R²: -0.0054324354945487485  val_loss 0.12340858578681946
Epoch: 4 > RMSE: 21.084618 ( 10.267 ) - R²: 0.2885751196296703  val_loss 0.10725518316030502
Epoch: 8 > RMSE: 18.098942 ( 10.267 ) - R²: 0.47579154681788194  val_loss 0.09189723432064056
Epoch: 9 > RMSE: 17.888037 ( 10.267 ) - R²: 0.4879375079614082  val_loss 0.09036963433027267
Epoch: 13 > RMSE: 16.931723 ( 10.267 ) - R²: 0.5412247960730328  val_loss 0.08581516146659851
Epoch: 20 > RMSE: 16.17534 ( 10.267 ) - R²: 0.5812984973334095  val_loss 0.0823463574051857
Epoch: 102 > RMSE: 15.737817 ( 10.267 ) - R²: 0.6036429729152074  val_loss 0.0805252343416214
Saved best 0.0805 at epoch 102
15.737817  ( 10.267 ) in 00:16:46
< decon-dumb_set-31441 >




Epoch: 0 > RMSE: 25.173283 ( 10.267 ) - R²: -0.014091846263132313  val_loss 0.11260968446731567
Epoch: 3 > RMSE: 18.525513 ( 10.267 ) - R²: 0.4507904956410106  val_loss 0.11185597628355026
Epoch: 36 > RMSE: 16.85721 ( 10.267 ) - R²: 0.5452538512021413  val_loss 0.10657645761966705
Epoch: 45 > RMSE: 16.465078 ( 10.267 ) - R²: 0.5661643666321429  val_loss 0.10575670748949051
epoch 00163