## Dependencies

Install the dependencies that are not available by default on Google Colab.
Colab already provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [6]:
!pip install pinard
!pip install scikeras









## Benchmark details

The results aggregate the combinations of the following training configurations:
- estimation configuration: [regression, classification]
- dataset configurations: [Single Train, Cross-validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small_set], [big_set]]
- models:
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

We perform training in two steps, (1) data transformation and (2) model training, because the sklearn pipeline does not handle test data natively.
This is expected to change with a future pinard update.

In [1]:
### FAST GPU RESET ####
# Resets the current CUDA device through numba.
# NOTE(review): presumably run to release GPU state left over from a previous
# run without restarting the kernel -- confirm the intent.
from numba import cuda 
# Handle on the CUDA device currently selected by numba.
device = cuda.get_current_device()
# NOTE(review): anything still held on the device is expected to be invalid
# after this call -- verify against the numba documentation.
device.reset()

In [27]:
## Browse the dataset path and launch the benchmark for every folder
%load_ext autoreload
%autoreload 2

from pathlib import Path
from preprocessings import preprocessing_list

from benchmark_loop import benchmark_dataset

# Root of the regression datasets, plus every directory found beneath it (recursive glob).
rootdir = Path('data/regression')
folder_list = list(filter(Path.is_dir, rootdir.glob('**/*')))

# Seed reused by the split configurations and passed to benchmark_dataset.
# ord('D') is 68, so SEED evaluates to 31441.
SEED = 31373 + ord('D')

import preprocessings
import regressors
import pinard.preprocessing as pp
from pinard import augmentation, model_selection

import sys
import os.path

def str_to_class(classname):
    """Look up `classname` as an attribute of the loaded `pinard.preprocessing` module.

    The module is fetched from `sys.modules`, so it must already have been imported.
    Raises KeyError when the module is not loaded and AttributeError when it has
    no attribute named `classname`.
    """
    preprocessing_module = sys.modules['pinard.preprocessing']
    return getattr(preprocessing_module, classname)

# print(str_to_class('SavitzkyGolay'))




def get_dataset_list(path):
    """Return every directory found beneath `path`, at any depth.

    The tree is walked top-down with `os.walk`; `path` itself is not included.

    Args:
        path: Root directory to scan (str or path-like).

    Returns:
        list[str]: Directory paths built as `os.path.join(parent, name)`.
        Within each parent the names are sorted, so the result does not depend
        on the order in which the filesystem lists its entries. The list is
        empty when `path` has no sub-directory or cannot be listed (`os.walk`
        ignores listing errors by default).
    """
    datasets = []
    for parent, dirnames, _ in os.walk(path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        # os.walk only puts directories in `dirnames`, so no isdir check is needed,
        # and the joined paths are already strings.
        datasets.extend(os.path.join(parent, name) for name in dirnames)
    return datasets

# Train/test split configurations handed to benchmark_dataset.
# Every dict entry uses SEED as its random_state and leaves test_size at None.
# NOTE(review): the leading None presumably means "keep the dataset's own split",
# and test_size=None presumably defers the ratio to downstream code -- confirm in benchmark_loop.
split_list = [
    None,
    {'test_size':None, 'method':"random", 'random_state':SEED},
    {'test_size':None, 'method':"stratified", 'random_state':SEED, 'n_bins':5},
    {'test_size':None, 'method':"kennard_stone", 'random_state':SEED, 'metric':"euclidean", 'pca_components':None},
]

# Augmentation configurations: either None or a list of (augmenter instance, integer) pairs.
# NOTE(review): the integer is presumably the number of augmented copies per sample -- confirm.
augmentations = [
    None,
    [(augmentation.Rotate_Translate(), 2),
    (augmentation.Random_X_Operation(), 2),
    (augmentation.Random_Spline_Addition(), 1),]
]

# Preprocessing configurations: results of helpers from the local `preprocessings`
# module, or an explicit list of objects built from pinard.preprocessing.
preprocessings_list = [
    preprocessings.id_preprocessing(),
    [pp.Haar(), pp.SavitzkyGolay()],
    preprocessings.dumb_set(),
]

# (model, training options) pairs.
# NOTE(review): the options use the key 'epoch' whereas Keras' fit() takes 'epochs' --
# check which spelling regressors / benchmark_loop expects.
models = [
    (regressors.VGG_1D(), {'batch_size':256, 'epoch':2000, 'verbose':0, 'optimizer':'Adam', 'loss':'mse'}),
    # (regressors.SKLEARN(), {'batch_size' = 256, 'epoch'=2000, 'optimizer'='Adam', 'loss'='mse'}),
]

# Extra training options; currently empty (the entries below are disabled drafts).
training_config = {
    #"splitting":"random",
    #"splitting_ratio":"0.2",
    #"cross_validation" = "stratified_kfold",
    #"cross_validation_size = (4,2)
}

# import os
# Directory scanned for datasets (same location as `rootdir`).
folder = "data/regression"

# Launch the benchmark on every directory found under `folder`.
# NOTE(review): the saved output of this cell is "TypeError: 'module' object is not callable",
# i.e. one of the names called in this cell (or inside benchmark_dataset) resolves to a module
# instead of a class/function. The traceback is not saved, so the offending call is unknown --
# check regressors.VGG_1D, the augmentation.* names and the preprocessings helpers.
benchmark_dataset(get_dataset_list(folder), split_list, training_config, augmentations, preprocessings_list, models, SEED,)



# (preprocessing_list, nn_run, nn_cv, ml_single, ml_cv)

# benchmark_dataset("data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list(), 200)
# benchmark_dataset("data/regression/Cassava_TBC_3556_Davrieux_RMSE1.02", SEED, preprocessing_list(), augment=True)
# benchmark_dataset("data/regression/LUCAS_SOCgrassland_4096_Nocita_RMSE7.2", SEED, preprocessing_list(), 20, augment=False)
# benchmark_dataset("data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36", SEED, preprocessing_list(), 100, augment=False)
# benchmark_dataset("data/regression/Cassava_TBC_3556_Davrieux_RMSE1.02", SEED, preprocessing_list(), 20, augment=False)
# benchmark_dataset("data/regression/Meat_FatE1_215_Borggaard_RMSE2.33", SEED, preprocessing_list(), 100, augment=False)

# for folder in folder_list:
    # # print(ord(str(folder)[17]), ord('A'), ord('M'))
    # if ord(str(folder)[16]) < ord("L") or ord(str(folder)[16]) > ord("M"):
    #     continue
    # benchmark_dataset(folder, SEED, preprocessing_list(), 20, augment=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


TypeError: 'module' object is not callable

In [None]:
# Candidate train/test split configurations, stored as (options, position) pairs.
# Every option set uses test_size=0.25 and random_state=42; the position is the
# entry's index in the list.
# NOTE(review): the benchmark cell assigns the same name with a different entry
# shape (plain dict or None) -- confirm which form downstream code expects.
split_list = [
    (options, position)
    for position, options in enumerate(
        [
            {"method": "random", "test_size": 0.25, "random_state": 42},
            {"method": "k_mean", "test_size": 0.25, "random_state": 42, "metric": "canberra"},
            {"pca_components": 4, "method": "k_mean", "test_size": 0.25, "random_state": 42, "metric": "canberra"},
            {"method": "kennard_stone", "test_size": 0.25, "random_state": 42},
            {"method": "kennard_stone", "test_size": 0.25, "random_state": 42, "metric": "correlation", "pca_components": 8},
            {"method": "kennard_stone", "test_size": 0.25, "random_state": 42, "metric": "correlation"},
            {"method": "spxy", "test_size": 0.25, "random_state": 42},
            {"method": "spxy", "test_size": 0.25, "random_state": 42, "pca_components": 2},
            {"method": "spxy", "test_size": 0.25, "random_state": 42, "metric": "canberra"},
            {"method": "stratified", "test_size": 0.25, "random_state": 42},
            {"method": "stratified", "test_size": 0.25, "random_state": 42, "n_bins": 4},
            {"method": "circular", "test_size": 0.25, "random_state": 42},
        ]
    )
]

In [18]:
# config = {
#     'augmentation':[],
#     'preprocessing':[],
#     'runs':[
#         {
#             'model':'PLS',
#         }
#     ]
# }


# def parse_json(config, name=''):
#     collection = []
#     if isinstance(config, list):
        
#         pass
#     elif isinstance(config, dict):
#         pass
    
#     return name, collection
# Scratch check: read the first element of a list through its iterator.
a = [5, 6]
next(iter(a))  # evaluates to 5, the first element

5