In [7]:
%reload_ext autoreload
%autoreload 2

import time

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import RepeatedKFold

from experiments.runner import ExperimentRunner
from experiments.config import Config
from operators.preprocessings import list_of_2D_sets


def get_dataset_list(path):
    datasets = []
    for r, d, _ in os.walk(path):
        for folder in d:
            path = os.path.join(r, folder)
            if os.path.isdir(path):
                datasets.append(str(path))
    return datasets


seed = 123459456
y_pipeline = MinMaxScaler()
list_of_x_transformations = list_of_2D_sets()
path = "FOOD_CHEM"
datasets = get_dataset_list(path)


pls_finetune_experiment = {
    "action": "finetune",
    "finetune_params": {
        'cv': 5,
        'n_trials': 35,
        'model_params': {
            'n_components': ('int', 5, 40),
        },
        'tuner': 'sklearn'
    }
}


configs = []
for dataset in datasets:
    # print(dataset)
    for x_tr in list_of_x_transformations:
        x_pipeline = [
            MinMaxScaler(),
            {"split": RepeatedKFold(n_splits=3, n_repeats=1)},
            x_tr,
            MinMaxScaler()
        ]
        
        configs.append(
            Config(
                dataset=dataset,
                model=PLSRegression,
                y_pipeline=y_pipeline,
                x_pipeline=x_pipeline,
                experiment=pls_finetune_experiment,
                seed=seed
            )
        )
        
print(f"Number of experiments: {len(configs)}")
start = time.time()
runner = ExperimentRunner(configs, resume_mode="restart")
dataset, model_manager = runner.run()
end = time.time()
print(f"Time elapsed: {end-start} seconds")

2024-10-16 00:57:11,423 - INFO - ### PREPARING DATA ###
2024-10-16 00:57:11,423 - INFO - ### LOADING DATASET ###


Number of experiments: 4
>> Browsing FOOD_CHEM\Rice_Amylose_313_Sampaio
No train_group file found for FOOD_CHEM\Rice_Amylose_313_Sampaio.
No test_group file found for FOOD_CHEM\Rice_Amylose_313_Sampaio.


2024-10-16 00:57:11,724 - INFO - Dataset(x_train:(203, 1154) - y_train:(203, 1), x_test:(110, 1154) - y_test:(110, 1))
2024-10-16 00:57:11,725 - INFO - ### PROCESSING DATASET ###
2024-10-16 00:57:11,729 - INFO - Dataset(x_train:(203, 1154) - y_train:(203, 1), x_test:(110, 1154) - y_test:(110, 1))
Folds size: 135-68, 135-68, 136-67
2024-10-16 00:57:11,730 - INFO - ### PREPARING MODEL ###
2024-10-16 00:57:11,731 - INFO - Running config > {'dataset': 'FOOD_CHEM\\Rice_Amylose_313_Sampaio', 'x_pipeline': [{'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}, {'split': {'class': 'sklearn.model_selection.RepeatedKFold', 'params': {'cv': {'class': 'sklearn.model_selection.KFold', 'params': None}, 'n_repeats': 1, 'random_state': None, 'cvargs': {'n_splits': 3}}}}, None, {'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}], 'y_pipeline': {'class': 'sklearn.preprocessing.MinM

{'initial_shape': (203, 1154), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (203, 1154), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
{'initial_shape': (110, 1154), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (110, 1154), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
Using framework: sklearn
Training fold 1, with shapes: (135, 1154) (135, 1) (68, 1154) (68, 1)
Training fold 2, with shapes: (135, 1154) (135, 1) (68, 1154) (68, 1)
Training fold 3, with shapes: (136, 1154) (136, 1) (67, 1154) (67, 1)
>> Browsing FOOD_CHEM\Wheat_Proteine_100_Kalivas1
No train_group file found for FOOD_CHEM\Wheat_Proteine_100_Kalivas1.
No test_group file found for FOOD_CHEM\Wheat_Proteine_100_Kalivas1.


2024-10-16 00:57:11,910 - INFO - Dataset(x_train:(50, 701) - y_train:(50, 1), x_test:(20, 701) - y_test:(20, 1))
2024-10-16 00:57:11,910 - INFO - ### PROCESSING DATASET ###
2024-10-16 00:57:11,912 - INFO - Dataset(x_train:(50, 701) - y_train:(50, 1), x_test:(20, 701) - y_test:(20, 1))
Folds size: 33-17, 33-17, 34-16
2024-10-16 00:57:11,912 - INFO - ### PREPARING MODEL ###
2024-10-16 00:57:11,913 - INFO - Running config > {'dataset': 'FOOD_CHEM\\Wheat_Proteine_100_Kalivas1', 'x_pipeline': [{'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}, {'split': {'class': 'sklearn.model_selection.RepeatedKFold', 'params': {'cv': {'class': 'sklearn.model_selection.KFold', 'params': None}, 'n_repeats': 1, 'random_state': None, 'cvargs': {'n_splits': 3}}}}, None, {'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}], 'y_pipeline': {'class': 'sklearn.preprocessing.MinMaxScaler', '

{'initial_shape': (50, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (50, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
{'initial_shape': (20, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (20, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
Using framework: sklearn
Training fold 1, with shapes: (33, 701) (33, 1) (17, 701) (17, 1)
Training fold 2, with shapes: (33, 701) (33, 1) (17, 701) (17, 1)
Training fold 3, with shapes: (34, 701) (34, 1) (16, 701) (16, 1)
>> Browsing FOOD_CHEM\Wheat_Proteine_100_Marx
No train_group file found for FOOD_CHEM\Wheat_Proteine_100_Marx.
No test_group file found for FOOD_CHEM\Wheat_Proteine_100_Marx.
{'initial_shape': (67, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (67, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': Non

2024-10-16 00:57:12,080 - INFO - Dataset(x_train:(67, 701) - y_train:(67, 1), x_test:(33, 701) - y_test:(33, 1))
2024-10-16 00:57:12,081 - INFO - ### PROCESSING DATASET ###
2024-10-16 00:57:12,083 - INFO - Dataset(x_train:(67, 701) - y_train:(67, 1), x_test:(33, 701) - y_test:(33, 1))
Folds size: 44-23, 45-22, 45-22
2024-10-16 00:57:12,084 - INFO - ### PREPARING MODEL ###
2024-10-16 00:57:12,084 - INFO - Running config > {'dataset': 'FOOD_CHEM\\Wheat_Proteine_100_Marx', 'x_pipeline': [{'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}, {'split': {'class': 'sklearn.model_selection.RepeatedKFold', 'params': {'cv': {'class': 'sklearn.model_selection.KFold', 'params': None}, 'n_repeats': 1, 'random_state': None, 'cvargs': {'n_splits': 3}}}}, None, {'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}], 'y_pipeline': {'class': 'sklearn.preprocessing.MinMaxScaler', 'para

{'initial_shape': (33, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (33, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
Using framework: sklearn
Training fold 1, with shapes: (44, 701) (44, 1) (23, 701) (23, 1)
Training fold 2, with shapes: (45, 701) (45, 1) (22, 701) (22, 1)
Training fold 3, with shapes: (45, 701) (45, 1) (22, 701) (22, 1)
>> Browsing FOOD_CHEM\Wheat_Proteine_100_Wang
No train_group file found for FOOD_CHEM\Wheat_Proteine_100_Wang.
No test_group file found for FOOD_CHEM\Wheat_Proteine_100_Wang.
{'initial_shape': (80, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (80, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}
{'initial_shape': (20, 701), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (20, 701), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': Non

2024-10-16 00:57:12,266 - INFO - Metrics saved to results\FOOD_CHEMWheat_Proteine_100_Wang\PLSRegression\experiment_4069ade2\metrics.json
2024-10-16 00:57:12,267 - INFO - Evaluation Metrics: {'mse': 0.3366562451867744, 'mae': 0.4582088099712861}
2024-10-16 00:57:12,270 - INFO - Metrics saved to results\FOOD_CHEMWheat_Proteine_100_Wang\PLSRegression\experiment_4069ade2\metrics.json
2024-10-16 00:57:12,271 - INFO - Evaluation Metrics: {'mse': 0.29992427105254343, 'mae': 0.44998654267176014}
2024-10-16 00:57:12,274 - INFO - Metrics saved to results\FOOD_CHEMWheat_Proteine_100_Wang\PLSRegression\experiment_4069ade2\metrics.json
2024-10-16 00:57:12,274 - INFO - Evaluation Metrics: {'mse': 0.3095706125334708, 'mae': 0.44181682138740197}
2024-10-16 00:57:12,279 - INFO - Predictions saved to results\FOOD_CHEMWheat_Proteine_100_Wang\PLSRegression\experiment_4069ade2\predictions.csv
2024-10-16 00:57:12,283 - INFO - Updated experiments at results\FOOD_CHEMWheat_Proteine_100_Wang\PLSRegression\exp

Time elapsed: 0.8678500652313232 seconds
