## Dependencies

Install the dependencies that are not available by default on Google Colab.
Colab already provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [41]:
!pip install pinard
!pip install scikeras









## Benchmark details

The results aggregate the combinations of the following training configurations:
- estimation configuration: [regression, classification]
- datasets configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

We perform training in two steps, (1) data transformation and (2) model fitting, because the sklearn pipeline does not handle test data natively.
This is expected to change with a future pinard update.

In [7]:
### FAST GPU RESET ####
# Fetch the CUDA device currently selected by numba and reset it.
# NOTE(review): presumably used to release GPU memory between benchmark runs
# without restarting the kernel — confirm.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [6]:
## Browse path and launch benchmark for every folders
%load_ext autoreload
%autoreload 2

from pathlib import Path
from preprocessings import preprocessing_list

from benchmark_loop import benchmark_dataset

import tensorflow as tf

# Restrict the TensorFlow logger to the "ERROR" level.
tf.get_logger().setLevel("ERROR")
# Set the global Keras mixed-precision policy to "mixed_float16".
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Root of the regression datasets and every directory found recursively below it.
# In this cell, folder_list is only referenced by the commented-out loop at the bottom.
rootdir = Path('data/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

# Seed handed to benchmark_dataset() below; ord('D') is 68, so SEED == 31441.
SEED = ord('D') + 31373

# Global TensorFlow seeding / op determinism are left disabled here.
# tf.keras.utils.set_random_seed(SEED)
# tf.config.experimental.enable_op_determinism()


import preprocessings
import regressors
import pinard.preprocessing as pp
from pinard import augmentation, model_selection
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
import sys
import os.path

def str_to_class(classname):
    """Return the attribute named ``classname`` of ``pinard.preprocessing``.

    The module is taken from ``sys.modules``, so it must already have been
    imported (``KeyError`` otherwise). An unknown ``classname`` raises
    ``AttributeError``.
    """
    preprocessing_module = sys.modules['pinard.preprocessing']
    return getattr(preprocessing_module, classname)

# print(str_to_class('SavitzkyGolay'))




def get_dataset_list(path):
    """Return the paths of all directories found below ``path``.

    The tree is walked recursively with ``os.walk``; every sub-directory at
    any depth is reported (``path`` itself is not). Paths are returned as
    strings, in the order in which ``os.walk`` yields them.
    """
    found = []
    for parent, subdirs, _files in os.walk(path):
        for name in subdirs:
            candidate = os.path.join(parent, name)
            if not os.path.isdir(candidate):
                continue
            # if len(found) < 3:
            found.append(str(candidate))
    return found

# Train/test split configurations to benchmark; commented entries are alternative setups.
# NOTE(review): a None entry presumably means "no additional split" (the recorded
# run name contains "NoSpl") — confirm in benchmark_loop.
split_configs = [
    None,
    # {'test_size':None, 'method':"random", 'random_state':SEED},
    # {'test_size':None, 'method':"stratified", 'random_state':SEED, 'n_bins':5},
    # {'test_size':0.25, 'method':"spxy", 'random_state':SEED, 'metric':"euclidean", 'pca_components':250},
]

# Data augmentation configurations; the commented entries are lists of
# (int, augmenter) tuples.
# NOTE(review): None presumably disables augmentation ("NoAug" in the recorded run name) — confirm.
augmentations = [
    None,
    # [(6, augmentation.Rotate_Translate())],
    # [(3, augmentation.Rotate_Translate()),(2, augmentation.Random_X_Operation()),(1, augmentation.Random_Spline_Addition()),],
    # [(3, augmentation.Rotate_Translate()),(2, augmentation.Random_X_Operation()),(2, augmentation.Random_Spline_Addition()),]
]

# Preprocessing sets to benchmark; only preprocessings.decon_set() is currently enabled.
preprocessings_list = [
    # None,
    # preprocessings.id_preprocessing(),
    # [('haar', pp.Haar()), ('savgol', pp.SavitzkyGolay())],
    preprocessings.decon_set(),
    # preprocessings.small_set(),
    # preprocessings.transf_set(),
    # preprocessings.optimal_set_2D(),
    # preprocessings.fat_set(),
]



# Cross-validation configurations; dict entries carry 'n_splits' and 'n_repeats'.
# NOTE(review): None presumably means no cross-validation ("NoCV" in the recorded run name) — confirm.
cv_configs = [
    None,
    # {'n_splits':5, 'n_repeats':4},
    # {'n_splits':4, 'n_repeats':2},
    # {'n_splits':3, 'n_repeats':1},
]

# Collect every directory below the regression data folder and show the list.
# import os
folder = "data/regression"
folders = get_dataset_list(folder)
print(folders)
# Uncomment to restrict the benchmark to a single dataset folder.
# folders = ["data/regression/Cassava_TBC_3556_Davrieux_RMSE1.02"]

# Number of training runs contributed by the cross-validation configurations:
# a None entry counts as a single run, a dict entry as one run per fold and
# repeat (n_splits * n_repeats).
len_cv_configs = sum(
    1 if cv_config is None else cv_config['n_splits'] * cv_config['n_repeats']
    for cv_config in cv_configs
)

# Models to benchmark, as (model, parameter dict) tuples; only Decon_Sep is currently enabled.
# NOTE(review): the saved output of this cell ends with a ResourceExhaustedError
# (GPU OOM while allocating a [4000,1,128,4096] float tensor) raised inside
# benchmark_dataset — the batch_size below may be too large for the GPU used; confirm.
models = [
    # (regressors.ML_Regressor(XGBRegressor), {"n_estimators":200, "max_depth":50, "seed":SEED}),
    # (regressors.Transformer_NIRS(), {'batch_size':500, 'epoch':10000, 'verbose':0, 'patience':1000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Decon_SepPo(), {'batch_size':50, 'epoch':10000, 'verbose':0, 'patience':1000, 'optimizer':'adam', 'loss':'mse'}),
    (regressors.Decon_Sep(), {'batch_size':2000, 'epoch':20000, 'verbose':0, 'patience':2000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.CONV_LSTM(), {'batch_size':1000, 'epoch':20000, 'verbose':0, 'patience':2000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.XCeption1D(), {'batch_size':500, 'epoch':10000, 'verbose':0, 'patience':1200, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Transformer(), {'batch_size':2, 'epoch':200, 'verbose':0, 'patience':30, 'optimizer':'Adam', 'loss':'mse'}),
]

# Total number of runs: product of the number of dataset folders, split configs,
# cross-validation runs, augmentation configs, preprocessing sets and models.
benchmark_size = len(folders) * len(split_configs) * len_cv_configs * len(augmentations) * len(preprocessings_list) * len(models)
print("Benchmarking", benchmark_size, "runs.")


# Launch the benchmark over all configured combinations, passing SEED along.
benchmark_dataset(folders, split_configs, cv_configs, augmentations, preprocessings_list, models, SEED,)


# for folder in folder_list:
    # # print(ord(str(folder)[17]), ord('A'), ord('M'))
    # if ord(str(folder)[16]) < ord("L") or ord(str(folder)[16]) > ord("M"):
    #     continue
    # benchmark_dataset(folder, SEED, preprocessing_list(), 20, augment=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['data/regression\\LUCAS_SOCcropland_8731_Nocita_RMSE3.5']
Benchmarking 1 runs.
Decon_Sep-NoSpl-NoCV-Fold_1(1)-NoAug-PP_22_53857-31441-23-01-22_02-20-08 (6111, 512) (6111, 1) (2620, 512, 22) (2620, 1)
--- Trainable: 1199201 - untrainable: 4608 > 1203809




ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/sequential_4/separable_conv1d_19/separable_conv2d/DepthwiseConv2dNativeBackpropFilter' defined at (most recent call last):
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 541, in run_forever
      self._run_once()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 1786, in _run_once
      handle._run()
    File "C:\Users\grego\AppData\Local\Programs\Python\Python37\lib\asyncio\events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 2915, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3186, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\grego\AppData\Local\Temp/ipykernel_18624/1811637080.py", line 112, in <module>
      benchmark_dataset(folders, split_configs, cv_configs, augmentations, preprocessings_list, models, SEED,)
    File "d:\Workspace\ML\DECON\benchmark_loop.py", line 304, in benchmark_dataset
      y_pred, datasheet = evaluate_pipeline(desc, run_name, data, transformers)
    File "d:\Workspace\ML\DECON\benchmark_loop.py", line 158, in evaluate_pipeline
      estimator.fit(X_train, y_train)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\sklearn\compose\_target.py", line 246, in fit
      self.regressor_.fit(X, y_trans, **fit_params)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\sklearn\pipeline.py", line 394, in fit
      self._final_estimator.fit(Xt, y, **fit_params_last_step)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 767, in fit
      **kwargs,
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 938, in _fit
      **kwargs,
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\scikeras\wrappers.py", line 526, in _fit_keras_model
      hist = self.model_.fit(x=X, y=y, **fit_args)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\engine\training.py", line 893, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 538, in minimize
      loss, var_list=var_list, grad_loss=grad_loss, tape=tape)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\mixed_precision\loss_scale_optimizer.py", line 706, in _compute_gradients
      tape=tape)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 590, in _compute_gradients
      grads_and_vars = self._get_gradients(tape, loss, var_list, grad_loss)
    File "d:\Workspace\ML\pynirsENV\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 471, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/sequential_4/separable_conv1d_19/separable_conv2d/DepthwiseConv2dNativeBackpropFilter'
OOM when allocating tensor with shape[4000,1,128,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/sequential_4/separable_conv1d_19/separable_conv2d/DepthwiseConv2dNativeBackpropFilter}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_106748]

In [None]:
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict
import numpy as np

def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Draw a predicted-versus-measured scatter plot on ``ax``.

    Parameters
    ----------
    ax : axes object the plot is drawn on.
    y_true : measured targets; must provide ``min()`` and ``max()``.
    y_pred : predicted targets, plotted against ``y_true``.
    title : first line of the axes title.
    scores : text displayed as the single legend entry (upper left).
    elapsed_time : number written into the title with two decimals.
    """
    lowest, highest = y_true.min(), y_true.max()

    # Dashed red diagonal spanning the range of the measured values.
    ax.plot([lowest, highest], [lowest, highest], "--r", linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Hide the top/right spines and push the remaining ones outward.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 10))

    # Both axes share the range of the measured values.
    ax.set_xlim([lowest, highest])
    ax.set_ylim([lowest, highest])
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")

    # Zero-sized, unfilled rectangle used only as the handle of the legend
    # entry that carries the scores text.
    legend_handle = plt.Rectangle(
        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
    )
    ax.legend([legend_handle], [scores], loc="upper left")

    ax.set_title(title + "\n Evaluation in {:.2f} seconds".format(elapsed_time))

def plot_data(d, filepath):
    """Scatter the second column of ``d`` against the first and save the figure.

    ``d`` is indexed as a 2-D array (``d[:, 0]`` on the x axis, ``d[:, 1]`` on
    the y axis). The image is written to ``filepath + '.png'`` through the
    pyplot state machine, and the current figure is closed afterwards.
    """
    x_values, y_values = d[:, 0], d[:, 1]
    plt.scatter(x_values, y_values)
    plt.xlabel('test')
    plt.ylabel('predict')
    # The '.png' suffix is always appended to the given path.
    plt.savefig(filepath + '.png')
    plt.close()

import json
import os
from numpy import genfromtxt

# Walk the results folder and save a scatter plot next to every CSV file found.
path = 'results'
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.csv'):
            filepath = os.path.join(root, file)
            # The file is parsed as ';'-separated numeric data; plot_data()
            # plots column 1 against column 0.
            my_data = genfromtxt(filepath, delimiter=';')
            # print(my_data)
            # plot_data() appends '.png' itself, so only strip the '.csv'
            # extension here (avoids '*.png.png' names and leaves any other
            # 'csv' substring of the path untouched).
            plot_data(my_data, os.path.splitext(filepath)[0])
        # if file.endswith('.json'):
        #     print(file)
        #     dataset = file.replace('.json','')
        #     f = open(os.path.join(root, file))
        #     data = json.load(f)
        #     for key in data:
        #         print(key)

# returns JSON object as 
# a dictionary
            # filepath = os.path.join(root, file)
            # df = pd.read_csv(filepath)
            # y_res = df.iloc[:,0]
            # y_pred = df.iloc[:,1]
            # fig, axs = plt.subplots(1,1, figsize=(10,10))
            # axs = np.ravel(axs)

            # plot_regression_results(
            #     ax,
            #     y,
            #     y_pred,
            #     name,
            #     (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
            #         np.mean(score["test_r2"]),
            #         np.std(score["test_r2"]),
            #         -np.mean(score["test_neg_mean_absolute_error"]),
            #         np.std(score["test_neg_mean_absolute_error"]),
            #     ),
            #     elapsed_time,
            # )

    



# plt.suptitle("Single predictors versus stacked predictors")
# plt.tight_layout()
# plt.subplots_adjust(top=0.9)
# plt.show()

In [None]:
from pinard.utils import load_csv
from benchmark_loop import transform_test_data
import preprocessings
import numpy as np

# Candidate preprocessing set screened below for near-identical outputs.
dumb_set = preprocessings.dumb_and_dumber_set()

Xfile = "data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36/Xcal.csv.gz"
yfile = "data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36/Ycal.csv.gz"
X_train, y_train = load_csv(Xfile, yfile, x_hdr=0, y_hdr=0, sep=";")

# Train and test are deliberately the same first 100 rows of the calibration data.
first_rows = slice(0, 100)
X_train, y_train, X_test, y_test = (
    X_train[first_rows],
    y_train[first_rows],
    X_train[first_rows],
    y_train[first_rows],
)
X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(
    dumb_set, X_train, y_train, X_test, y_test, type="augmentation"
)

print(X_test_pp.shape)
sample = X_test_pp[0]
print(len(dumb_set))

# Scan the entries from last to first and keep dumb_set[i][0] only when
# sample[i] is not close (np.allclose) to any earlier entry sample[j], j < i.
# NOTE(review): presumably sample[i] is the output of the i-th preprocessing
# and dumb_set[i][0] its name — confirm against transform_test_data.
ok = []
for i in reversed(range(len(dumb_set))):
    has_earlier_twin = any(
        np.allclose(sample[i], sample[j], rtol=10e-3, atol=10e-3)
        for j in reversed(range(i))
    )
    if not has_earlier_twin:
        ok.append(dumb_set[i][0])

print(len(ok))
print(ok)