## Dependencies

Install the dependencies that are not available on Google Colab.
Colab provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [6]:
# pinard supplies the `augmentation` and `sklearn` helpers imported by the benchmark cell;
# scikeras is the wrapper through which the Keras models are fitted inside the sklearn pipeline.
!pip install pinard
!pip install scikeras









## Benchmark details

The results aggregate the combination of the following training configurations:
- estimation configuration: [regression, classification]
- dataset configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

We perform training in two steps, (1) data transformation and (2) training, because the sklearn pipeline does not use test data natively.
This will change with a future pinard update.

In [7]:
### FAST GPU RESET ####
# Fetch the CUDA device currently selected by numba and reset it.
# NOTE(review): presumably used to release GPU memory left allocated by a
# previous (e.g. out-of-memory) run without restarting the kernel — confirm.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [1]:
import datetime
import json
import math
import numpy as np
import time
import os
from collections import OrderedDict

from contextlib import redirect_stdout
# import joblib
# import pickle

from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics \
    import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error,\
        r2_score, explained_variance_score, mean_squared_log_error, median_absolute_error
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
import tensorflow as tf

from data import load_data
from preprocessings import transform_test_data
from regressors import nn_list, ml_list, get_keras_model
from pinard import augmentation, sklearn

# Only let the TensorFlow logger emit messages of level ERROR and above.
tf.get_logger().setLevel('ERROR')
# Install 'mixed_float16' as the global Keras dtype policy; this has to run
# before any model is built for the policy to apply to it.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

def get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred):
    """Build the metrics record of one evaluation run.

    Args:
        dataset_name: Stored under the ``"dataset"`` key.
        model_name: Stored under the ``"model"`` key.
        path: Dataset folder path whose last component ends with
            ``RMSE<value>`` (e.g. ``..._RMSE7.2``); ``<value>`` is reported as
            ``"targetRMSE"``. A trailing path separator is tolerated.
        SEED: Seed of the run, stored as a string.
        y_valid: Ground-truth targets.
        y_pred: Predicted targets, compared against ``y_valid``.

    Returns:
        A dict whose values are all strings: the identification fields, the
        regression metrics (RMSE, MAPE, R2, MAE, MSE, MedAE, EVS) and the
        timestamp of the call under ``"run"``.

    Raises:
        ValueError: If the text following ``RMSE`` in the folder name is not a
            float.
    """
    # normpath strips a trailing separator, so the basename is the dataset
    # folder itself instead of an empty string (which made float() fail).
    folder_name = os.path.basename(os.path.normpath(path))
    target_rmse = float(folder_name.split('_')[-1].split("RMSE")[-1])

    # NOTE(review): the `squared` keyword is deprecated in recent scikit-learn
    # releases — confirm against the pinned version before upgrading.
    return {
        "model": model_name,
        "dataset": dataset_name,
        "seed": str(SEED),
        "targetRMSE": str(target_rmse),
        "RMSE": str(mean_squared_error(y_valid, y_pred, squared=False)),
        "MAPE": str(mean_absolute_percentage_error(y_valid, y_pred)),
        "R2": str(r2_score(y_valid, y_pred)),
        "MAE": str(mean_absolute_error(y_valid, y_pred)),
        "MSE": str(mean_squared_error(y_valid, y_pred, squared=True)),
        "MedAE": str(median_absolute_error(y_valid, y_pred)),
        "EVS": str(explained_variance_score(y_valid, y_pred)),
        # "MSLE":str(mean_squared_log_error(y_valid, y_pred)),
        "run": datetime.datetime.now().strftime("%Y-%m-%d  %H:%M:%S")
    }

def log_run(dataset_name, model_name, path, SEED, y_valid, y_pred, elapsed_time):
    """Persist the outcome of one run under ``results/<dataset_name>/``.

    Two files are written: ``<model_name>.csv`` holding ``y_valid`` and
    ``y_pred`` stacked as columns, and one line appended to ``_runs.txt`` with
    the RMSE, the model name, the elapsed time and a timestamp.

    Args:
        dataset_name: Name of the results sub-folder.
        model_name: Base name of the prediction file.
        path: Dataset folder path, forwarded to ``get_datasheet``.
        SEED: Seed of the run, forwarded to ``get_datasheet``.
        y_valid: Ground-truth targets.
        y_pred: Predicted targets.
        elapsed_time: Run duration in seconds.

    Returns:
        The datasheet dict produced by ``get_datasheet``.
    """
    datasheet = get_datasheet(dataset_name, model_name, path, SEED, y_valid, y_pred)

    ### Save data
    folder = "results/" + dataset_name
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(folder, exist_ok=True)

    canon_name = folder + "/" + model_name

    ## save predictions
    np.savetxt(canon_name + '.csv', np.column_stack((y_valid, y_pred)))

    ## save main metrics globally
    log = datasheet["RMSE"] + "  ---  " + model_name + " in " + time.strftime("%H:%M:%S", time.gmtime(elapsed_time)) \
        + ' ('+ datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + ')\n'
    # The context manager closes the log file even if the write raises.
    with open(folder + "/_runs.txt", "a") as result_file:
        result_file.write(log)

    ## save pipeline
    # joblib.dump(estimator, canon_name + '.pkl')

    return datasheet

# State published by evaluate_pipeline for the run in progress and read by
# callback_predict; everything stays None until a run has started.
current_estimator = current_X_test = current_y_test = current_path = None


def callback_predict(epoch, val_loss):
    """Print the metrics of the estimator currently being trained.

    Does nothing while no estimator has been registered. Otherwise it predicts
    on the registered test data and prints the RMSE, the target RMSE and the
    R² together with the given epoch and validation loss.

    Args:
        epoch: Value printed after ``Epoch:``.
        val_loss: Value printed after ``val_loss``.
    """
    if current_estimator is not None:
        predictions = current_estimator.predict(current_X_test)
        sheet = get_datasheet("", "", current_path, -1, current_y_test, predictions)
        print('Epoch:', epoch,'> RMSE:', sheet['RMSE'], '(', sheet['targetRMSE'], ') - R²:', sheet['R2'], ' val_loss', val_loss)

def evaluate_pipeline(desc, model_name, data, transformers):
    """Fit one model, score it on the validation data and store the results.

    Args:
        desc: Tuple ``(dataset_name, path, global_result_file, results, SEED)``.
            ``results`` is updated in place with the new datasheet under
            ``model_name``.
        model_name: Name of the final pipeline step and key of the result.
        data: Tuple ``(X_train, y_train, X_valid, y_valid)``.
        transformers: Tuple ``(y_scaler, transformer_pipeline, regressor)``.

    Returns:
        The predictions of the fitted estimator on ``X_valid``.

    Side effects:
        Sets the module-level ``current_*`` variables read by
        ``callback_predict``, writes the run files through ``log_run`` and
        rewrites ``global_result_file`` with all results sorted by ascending
        RMSE.
    """
    global current_path, current_estimator, current_X_test, current_y_test

    print("<", model_name, ">")
    began = time.time()

    # Unpack the argument bundles
    X_train, y_train, X_valid, y_valid = data
    dataset_name, path, global_result_file, results, SEED = desc
    current_path = path
    y_scaler, transformer_pipeline, regressor = transformers

    # Preprocessing followed by the regressor, with the target scaled around both
    steps = [
        ('transformation', transformer_pipeline),
        (model_name, regressor),
    ]
    pipeline = Pipeline(steps)
    estimator = TransformedTargetRegressor(regressor=pipeline, transformer=y_scaler)

    # Expose the run to callback_predict before training starts
    current_estimator = estimator
    current_X_test = X_valid
    current_y_test = y_valid

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_valid)

    elapsed_time = time.time() - began
    datasheet = log_run(dataset_name, model_name, path, SEED, y_valid, y_pred, elapsed_time)
    datasheet["training_time"] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    results[model_name] = datasheet

    def rmse_of(entry):
        # entry is a (model name, datasheet) pair; RMSE is stored as a string
        return float(entry[1]['RMSE'])

    # Save every result, best RMSE first
    ranked = OrderedDict(sorted(results.items(), key=rmse_of))
    with open(global_result_file, 'w') as fp:
        json.dump(ranked, fp, indent=4)

    print(datasheet["RMSE"], " (", datasheet["targetRMSE"], ") in", datasheet["training_time"])
    return y_pred


def benchmark_dataset(path, SEED, preprocessing_list, batch_size=50, augment=False):
    """Train and evaluate every network of ``nn_list()`` on one dataset folder.

    The dataset name is the folder name without its last ``_``-separated token
    (the ``RMSE<value>`` suffix). Results already stored in
    ``results/<dataset_name>_results.json`` are reloaded, and a model whose
    prediction file ``results/<dataset_name>/<model_name>.csv`` exists is
    skipped, so an interrupted benchmark can be resumed.

    Args:
        path: Dataset folder, e.g. ``data/regression/<name>_RMSE<value>``. A
            trailing path separator is tolerated.
        SEED: Seed forwarded to the models and appended to the model names.
        preprocessing_list: Iterable of preprocessing callables; their
            ``__name__`` is used in the model names.
        batch_size: Forwarded to ``get_keras_model``.
        augment: When true, the training data goes through the pinard sample
            augmentation pipeline before training.
    """
    # normpath strips a trailing separator; without it the folder name was
    # empty and the results were filed under an empty dataset name.
    clean_path = os.path.normpath(path)
    dataset_name = '_'.join(os.path.basename(clean_path).split('_')[:-1])
    print("="*10, str(dataset_name).upper(), end=" ")
    global_result_file = "results/" + dataset_name + '_results.json'
    results = {}
    if os.path.isfile(global_result_file):
        with open(global_result_file) as json_file:
            results = json.load(json_file)

    # The normalised path is what the target RMSE is later parsed from.
    desc = (dataset_name, clean_path, global_result_file, results, SEED)

    X, y, X_valid, y_valid = load_data(path)
    print(X.shape, y.shape, X_valid.shape, y_valid.shape, "="*10)

    #########################
    ### SINGLE RUN TRAINING
    X_train, y_train, X_test, y_test = X, y, X_valid, y_valid

    if augment:
        augmentation_pipeline = sklearn.SampleAugmentation([
            (2, 'rot_tr', augmentation.Rotate_Translate()),
            (1, 'rd_mult', augmentation.Random_X_Operation()),
            (1, 'simpl', augmentation.Random_Spline_Addition())
        ])
        print(X_train.shape, y_train.shape)
        X_train, y_train = augmentation_pipeline.transform(X_train, y_train)
        print("augmented to:", X_train.shape, y_train.shape)

    data = (X_train, y_train, X_valid, y_valid)
    for preprocessing in preprocessing_list:
        ##### DEEP LEARNING #####
        X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="augmentation")
        for model_desc in nn_list():
            model_name = model_desc.__name__ + "-" + preprocessing.__name__ + "-" + str(SEED)
            # A prediction file means this configuration already ran: skip it.
            if os.path.isfile("results/" + dataset_name + "/" + model_name + '.csv'):
                # print("Skipping", model_name)
                continue

            # batch_size = 3000
            # if preprocessing.__name__ == "dumb_set":
            #     batch_size = 3
            # NOTE(review): 4096 is presumably the epoch budget — confirm in regressors.get_keras_model.
            regressor = get_keras_model(dataset_name + '_' + model_name, model_desc, 4096, batch_size, X_test_pp, y_test_pp, transfer=True, callback_func=callback_predict, verbose=0, seed=SEED)
            transformers = (y_scaler, transformer_pipeline, regressor)
            evaluate_pipeline(desc, model_name, data, transformers)
        
        # ##### MACHINE LEARNING #####
        # X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
        # for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
        #     model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
        #     if os.path.isfile(  "results/" + dataset_name + "/" + model_name + '.csv'):
        #        # print("Skipping", model_name)
        #         continue
        #     transformers = (y_scaler, transformer_pipeline, regressor)
        #     evaluate_pipeline(desc, model_name, data, transformers)

    #########################



    #########################
    # ### CROSS VALIDATION TRAINING
    # cv_predictions = {}
    # for preprocessing in preprocessing_list():
    #     fold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)
    #     fold_index = 0
    #     for train_index, test_index in fold.split(X):
    #         X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
    #         data = (X_train, y_train, X_valid, y_valid)
            
    #         ##### DEEP LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="augmentation")
    #         for model_desc in nn_list():
    #             model_name = model_desc.__name__ + "-" + preprocessing.__name__  + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #                # print("Skipping", model_name)
    #                 continue
    #             regressor = get_keras_model(dataset_name + '_' + fold_name, model_desc, 7500, 750, X_test_pp, y_test_pp, verbose=0, seed=SEED)
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred
            
    #         ##### MACHINE LEARNING #####
    #         X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(preprocessing, X_train, y_train, X_test, y_test, type="union")
    #         for regressor, mdl_name in ml_list(SEED, X_test_pp, y_test_pp):
    #             model_name = mdl_name + "-" + preprocessing.__name__ + "-" + str(SEED)
    #             fold_name = model_name + "-F" + str(fold_index)
    #             if os.path.isfile(  "results/" + dataset_name + "/" + fold_name + '.csv'):
    #              #  print("Skipping", model_name)
    #                 continue
    #             y_pred = evaluate_pipeline(desc, fold_name, data, (y_scaler, transformer_pipeline, regressor))
    #             cv_predictions[model_name] = cv_predictions[model_name] + y_pred if model_name in cv_predictions else y_pred
                
    #         fold_index +=1

    # for key, val in cv_predictions.items():
    #     y_pred = val / fold.get_n_splits()
    #     datasheet = get_datasheet(dataset_name, key, path, SEED, y_valid, y_pred)
    #     results[key +"_CV"] = datasheet
    #
    # results = OrderedDict(sorted(results.items(), key=lambda k_v: float(k_v[1]['RMSE'])))
    # with open(global_result_file, 'w') as fp:
    #     json.dump(results, fp, indent=4)

    # #########################


    


In [6]:
## Browse path and launch benchmark for every folders
%load_ext autoreload
%autoreload 2

from pathlib import Path
from preprocessings import preprocessing_list

rootdir = Path('data/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

SEED = ord('D') + 31373
np.random.seed(SEED)
tf.random.set_seed(SEED)

# (preprocessing_list, nn_run, nn_cv, ml_single, ml_cv)

# benchmark_dataset("data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list(), 200)
# benchmark_dataset("data/regression/Cassava_TBC_3556_Davrieux_RMSE1.02", SEED, preprocessing_list(), augment=True)
benchmark_dataset("data/regression/LUCAS_SOCgrassland_4096_Nocita_RMSE7.2", SEED, preprocessing_list(), 20, augment=False)
benchmark_dataset("data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list(), 100, augment=False)
benchmark_dataset("data/regression/Cassava_TBC_3556_Davrieux_RMSE1.02", SEED, preprocessing_list(), 20, augment=False)
benchmark_dataset("data/regression/Meat_FatE1_215_Borggaard_RMSE2.33", SEED, preprocessing_list(), 100, augment=False)

# for folder in folder_list:
    # # print(ord(str(folder)[17]), ord('A'), ord('M'))
    # if ord(str(folder)[16]) < ord("L") or ord(str(folder)[16]) > ord("M"):
    #     continue
    # benchmark_dataset(folder, SEED, preprocessing_list(), 20, augment=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
--- Trainable: 120945 - untrainable: 0.0 > 120945.0
< transformer_nirs-decon_set-31441 >




ResourceExhaustedError: Graph execution error:

Detected at node 'model_6/multi_head_attention_22/softmax_22/Softmax' defined at (most recent call last):
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 541, in run_forever
      self._run_once()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 1786, in _run_once
      handle._run()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 2915, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3186, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\grego\AppData\Local\Temp/ipykernel_27656/1583894892.py", line 19, in <module>
      benchmark_dataset("data/regression/LUCAS_SOCgrassland_4096_Nocita_RMSE7.2", SEED, preprocessing_list(), 100, augment=False)
    File "C:\Users\grego\AppData\Local\Temp/ipykernel_27656/2898103413.py", line 169, in benchmark_dataset
      evaluate_pipeline(desc, model_name, data, transformers)
    File "C:\Users\grego\AppData\Local\Temp/ipykernel_27656/2898103413.py", line 108, in evaluate_pipeline
      estimator.fit(X_train, y_train)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\sklearn\compose\_target.py", line 246, in fit
      self.regressor_.fit(X, y_trans, **fit_params)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\sklearn\pipeline.py", line 394, in fit
      self._final_estimator.fit(Xt, y, **fit_params_last_step)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 767, in fit
      **kwargs,
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 938, in _fit
      **kwargs,
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 526, in _fit_keras_model
      hist = self.model_.fit(x=X, y=y, **fit_args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\functional.py", line 459, in call
      inputs, training=training, mask=mask)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\layers\attention\multi_head_attention.py", line 510, in call
      query, key, value, attention_mask, training)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\layers\attention\multi_head_attention.py", line 474, in _compute_attention
      attention_scores = self._masked_softmax(attention_scores, attention_mask)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\layers\attention\multi_head_attention.py", line 438, in _masked_softmax
      return self._softmax(attention_scores, attention_mask)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\layers\activation\softmax.py", line 98, in call
      return backend.softmax(inputs, axis=self.axis[0])
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\backend.py", line 5039, in softmax
      return tf.nn.softmax(x, axis=axis)
Node: 'model_6/multi_head_attention_22/softmax_22/Softmax'
OOM when allocating tensor with shape[100,2,4198,4198] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_6/multi_head_attention_22/softmax_22/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_16698836]