## Dependencies

Install dependencies not available on Google Colab.
Colab provides numpy, pandas, sklearn, tensorflow, scipy, etc. (see requirements.txt)

In [None]:
!pip install pinard
!pip install scikeras

## Benchmark details

The results aggregate the combinations of the following training configurations:
- estimation configuration: [regression, classification]
- datasets configurations: [Single Train, Cross validation with 5 folds and 2 repeats, Augmented Single Train]
- preprocessing configuration: [flat spectrum, savgol, haar, [small set], [big_set]]
- models: 
   - for all configurations: BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS
   - for single train + small_set : Stack > [ BACON, BACON-VG, DECON, PLS(components from 1 to 100), XGBoost, LW-PLS,
   f_PLSRegression,f_AdaBoostRegressor,f_BaggingRegressor,f_ExtraTreesRegressor, f_GradientBoostingRegressor,f_RandomForestRegressor,
   f_ARDRegression,f_BayesianRidge,f_ElasticNet,f_ElasticNetCV,f_HuberRegressor, f_LarsCV,f_LassoCV,f_Lasso,f_LassoLars,f_LassoLarsCV,
   f_LassoLarsIC,f_LinearRegression,f_OrthogonalMatchingPursuit,f_OrthogonalMatchingPursuitCV, f_PassiveAggressiveRegressor,f_RANSACRegressor,
   f_Ridge,f_RidgeCV,f_SGDRegressor,f_TheilSenRegressor,f_GaussianProcessRegressor,f_KNeighborsRegressor, f_Pipeline,f_MLPRegressor,f_LinearSVR,
   f_NuSVR,f_SVR,f_DecisionTreeRegressor,f_ExtraTreeRegressor,f_KernelRidge,f_XGBRegressor]

Training is performed in two steps, (1) data transformation and (2) model training, because the sklearn pipeline does not natively handle test data.
This should change with a future pinard update.

In [None]:
### FAST GPU RESET ####
# Reset the current CUDA device through numba. According to the numba
# documentation, reset() deletes the device context and with it the memory
# allocations, events and streams created within that context.
# NOTE(review): presumably run to release GPU memory left over from a previous
# benchmark run — confirm.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [None]:
## Browse path and launch benchmark for every folders
%load_ext autoreload
%autoreload 2

from pathlib import Path
from preprocessings import preprocessing_list

from benchmark_loop import benchmark_dataset, benchmark_dataset_multiple

import tensorflow as tf

# Only let TensorFlow log messages of level ERROR and above through.
tf.get_logger().setLevel("ERROR")
# Make "mixed_float16" the global Keras mixed-precision policy.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Every directory found recursively below data/regression.
# In this cell, folder_list is only referenced by the commented-out loop at the end.
rootdir = Path('data/regression')
folder_list = [f for f in rootdir.glob('**/*') if f.is_dir()]

# Seed handed to the benchmark call below: ord('D') + 31373 == 31441.
SEED = ord('D') + 31373

# Disabled: global TensorFlow seeding and deterministic ops.
# tf.keras.utils.set_random_seed(SEED)
# tf.config.experimental.enable_op_determinism()


import preprocessings
import regressors
import pinard.preprocessing as pp
from pinard import augmentation, model_selection
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
import sys
import os.path

def str_to_class(classname):
    """Return the attribute named ``classname`` of ``pinard.preprocessing``.

    The module is taken from ``sys.modules``, so it must already have been
    imported: a ``KeyError`` is raised otherwise, and an ``AttributeError``
    when the module does not define ``classname``.
    """
    preprocessing_module = sys.modules['pinard.preprocessing']
    return getattr(preprocessing_module, classname)

# print(str_to_class('SavitzkyGolay'))




def get_dataset_list(path):
    """Return every directory located below ``path``, at any depth.

    ``path`` itself is not part of the result. The directories are returned
    as strings, in the order in which ``os.walk`` reports them.
    """
    found = []
    for parent, subdirs, _files in os.walk(path):
        candidates = (os.path.join(parent, name) for name in subdirs)
        found.extend(
            str(candidate) for candidate in candidates if os.path.isdir(candidate)
        )
    return found

# Train/test split settings passed to benchmark_dataset; only the None entry is active.
# NOTE(review): None presumably means "use the split shipped with the dataset" —
# confirm against benchmark_loop.
split_configs = [
    None,
    # {'test_size':None, 'method':"random", 'random_state':SEED},
    # {'test_size':None, 'method':"stratified", 'random_state':SEED, 'n_bins':5},
    # {'test_size':0.25, 'method':"spxy", 'random_state':SEED, 'metric':"euclidean", 'pca_components':250},
]

# Data-augmentation settings; only the None entry is active.
# NOTE(review): the disabled entries look like lists of (count, augmenter) pairs —
# confirm the meaning of the integer against benchmark_loop.
augmentations = [
    None,
    # [(6, augmentation.Rotate_Translate())],
    # [(3, augmentation.Rotate_Translate()),(2, augmentation.Random_X_Operation()),(1, augmentation.Random_Spline_Addition()),],
    # [(3, augmentation.Rotate_Translate()),(2, augmentation.Random_X_Operation()),(2, augmentation.Random_Spline_Addition()),]
]

# Preprocessing sets to benchmark: three are active (identity, an inline
# identity/haar/savgol list of (name, transformer) pairs, and the "small" set).
preprocessings_list = [
    # None,
    preprocessings.id_preprocessing(),
    [("id", pp.IdentityTransformer()), ('haar', pp.Haar()), ('savgol', pp.SavitzkyGolay())],
    # preprocessings.decon_set(),
    # preprocessings.bacon_set(),
    preprocessings.small_set(),
    # preprocessings.transf_set(),
    # preprocessings.optimal_set_2D(),
    # preprocessings.fat_set(),
]



# Cross-validation settings; only the None entry is active.
# The disabled entries are dicts with 'n_splits' and 'n_repeats' keys, which is
# the shape the run-count computation further down in this cell expects.
cv_configs = [
    None,
    # {'n_splits':5, 'n_repeats':4},
    # {'n_splits':4, 'n_repeats':2},
    # {'n_splits':4, 'n_repeats':1},
]

# Root folder whose sub-directories are collected as the datasets to benchmark.
# import os
# folder = "data/regression"
# folder = "data/Paprica_2D"
folder = "data/_RefSet"
folders = get_dataset_list(folder)
# print(folders)

# Number of trainings implied by cv_configs: a None entry counts as a single
# run, a dict entry contributes n_splits * n_repeats runs.
len_cv_configs = 0
for c in cv_configs:
    if c is None:
        len_cv_configs += 1
    else:
        len_cv_configs += c['n_splits'] * c['n_repeats']

# Models to benchmark, as (regressor, training-parameters dict) tuples.
# Every entry below is commented out, so the list starts empty here; the active
# model is added by the models.append(...) call further down in this cell.
models = [
    # (regressors.ML_Regressor(XGBRegressor), {"n_estimators":200, "max_depth":50, "seed":SEED}),
    # (regressors.ML_Regressor(PLSRegression), {"n_components":50}),
    # (regressors.Transformer_NIRS(), {'batch_size':500, 'epoch':10000, 'verbose':0, 'patience':1000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Decon_SepPo(), {'batch_size':50, 'epoch':10000, 'verbose':0, 'patience':1000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.FFT_Conv(), {'batch_size':500, 'epoch':20000, 'verbose':0, 'patience':1000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Decon(), {'batch_size':100, 'epoch':20000, 'verbose':0, 'patience':400, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.ResNetV2(), {'batch_size':200, 'epoch':20000, 'verbose':0, 'patience':300, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.MLP(), {'batch_size':1000, 'epoch':20000, 'verbose':0, 'patience':2000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.CONV_LSTM(), {'batch_size':1000, 'epoch':20000, 'verbose':0, 'patience':2000, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.XCeption1D(), {'batch_size':500, 'epoch':10000, 'verbose':0, 'patience':1200, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Transformer(), {'batch_size':10, 'epoch':300, 'verbose':0, 'patience':30, 'optimizer':'Adam', 'loss':'mse'}),
    
    # (regressors.Decon_Sep_Multiple(), {'batch_size':200, 'epoch':20000, 'verbose':0, 'patience':200, 'optimizer':'adam', 'loss':'mse'}),
    # (regressors.Transformer_VG_Multiple(), {'batch_size':10, 'epoch':300, 'verbose':0, 'patience':30, 'optimizer':'Adam', 'loss':'mse'}),
]


from lwpls import LWPLS
from regressors import NonlinearPLSRegressor

# for i in range(5,150,5):
#     models. append(
#         (regressors.ML_Regressor(NonlinearPLSRegressor, name=f"NL_RBF_PLS_{i}"), {"n_components":i, "poly_degree":2, "gamma":0.1})
#     )
#     models. append(
#         (regressors.ML_Regressor(PLSRegression, name=f"PLS_{i}"), {"n_components":i})
#     )

# The LW-PLS configuration benchmarked by this run. The name appears to encode
# its parameters (lambda_in_similarity=0.05, max_component_number=45).
models.append(
    (regressors.ML_Regressor(LWPLS, name="LWPLS_0-05_45"), {"max_component_number": 45, "lambda_in_similarity": 0.05})
)

# for i in range(10,100,50):
#     models.append(
#         (regressors.ML_Regressor(LWPLS, name=f"LWPLS_0-1_{i}"), {"max_component_number":i, "lambda_in_similarity":0.05})
#     )
    # models.append(
    #     (regressors.ML_Regressor(LWPLS, name=f"LWPLS_0-5_{i}"), {"max_component_number":i, "lambda_in_similarity":0.5})
    # )
# for i in range(10,100,5):
    # models. append(
    #     (regressors.ML_Regressor(PLSRegression, name=f"PLS_{i}"), {"n_components":i})
    # )

# Total number of trainings: the product of the sizes of all configuration axes
# (datasets, splits, cross-validation runs, augmentations, preprocessings, models).
benchmark_size = len(folders) * len(split_configs) * len_cv_configs * len(augmentations) * len(preprocessings_list) * len(models)
print("Benchmarking", benchmark_size, "runs.")


# Launch the benchmark; the commented-out calls are alternative invocations
# (multi-dataset variant, resampled or cropped inputs).
# benchmark_dataset_multiple(folders, split_configs, cv_configs, augmentations, preprocessings_list, models, SEED)#, resampling='resample', resample_size=1024)
benchmark_dataset(folders, split_configs, cv_configs, augmentations, preprocessings_list, models, SEED) #, resampling='resample', resample_size=2048) #bins=5)
# benchmark_dataset(folders, split_configs, cv_configs, augmentations, preprocessings_list, models, SEED, resampling='crop', resample_size=2150)


# for folder in folder_list:
    # # print(ord(str(folder)[17]), ord('A'), ord('M'))
    # if ord(str(folder)[16]) < ord("L") or ord(str(folder)[16]) > ord("M"):
    #     continue
    # benchmark_dataset(folder, SEED, preprocessing_list(), 20, augment=False)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
# Toy example: discretize a single-column target into bins and one-hot encode it.
y_train = np.array([0,1,2,2.5,3,3.5,6,8,20]).reshape(-1,1)
bins = 4
# strategy='uniform' gives bins of identical width; encode='onehot-dense'
# returns a dense one-hot array (one column per bin).
discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='uniform')
discretizer.fit(y_train)
tt = discretizer.transform(y_train)
print(tt)


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import os

import re
from scipy import signal

# Print every entry directly inside the project directory whose name contains "Xcal".
projdir = Path("data/paprica_2D/Paprica_2D_XY_ag")
t = "*Xcal*"
for y in projdir.glob(t):
    print(y)

# files = tuple(next(projdir.glob(n)) for n in ["*Xcal*", "*Ycal*"])

In [None]:
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict
import numpy as np

def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Draw a predicted-versus-measured scatter plot on ``ax``.

    Args:
        ax: Axes object to draw on.
        y_true: Measured targets; must provide ``min()`` and ``max()``.
        y_pred: Predicted targets, plotted against ``y_true``.
        title: Plot title; the evaluation time is appended on a second line.
        scores: Text displayed in the legend box.
        elapsed_time: Evaluation time in seconds, shown with two decimals.
    """
    lower, upper = y_true.min(), y_true.max()

    # Dashed red diagonal spanning the range of the measured values.
    ax.plot([lower, upper], [lower, upper], "--r", linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Keep only the left and bottom spines, pushed slightly outwards.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 10))

    # Both axes cover the range of the measured values.
    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")

    # Invisible rectangle used as a legend handle so the legend only shows text.
    legend_handle = plt.Rectangle(
        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
    )
    ax.legend([legend_handle], [scores], loc="upper left")

    timing_line = "\n Evaluation in {:.2f} seconds".format(elapsed_time)
    ax.set_title(title + timing_line)

def plot_data(d, filepath):
    """Save a scatter plot of the first two columns of ``d`` as a PNG file.

    Column 0 goes on the x axis (labelled 'test') and column 1 on the y axis
    (labelled 'predict'). The figure is written to ``filepath + '.png'`` and
    closed afterwards.
    """
    xs, ys = d[:, 0], d[:, 1]
    plt.scatter(xs, ys)
    plt.xlabel('test')
    plt.ylabel('predict')
    plt.savefig(filepath + '.png')
    plt.close()

import json
from numpy import genfromtxt

# Render one scatter plot next to every ';'-delimited CSV file found under `path`.
path = 'results'
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.csv'):
            filepath = os.path.join(root, file)
            my_data = genfromtxt(filepath, delimiter=';')
            # print(my_data)
            # plot_data() appends '.png' itself, so pass the path without its
            # '.csv' extension. Replacing 'csv' by 'png' in the whole path
            # produced '*.png.png' files and also rewrote any 'csv' substring
            # in directory names.
            plot_data(my_data, os.path.splitext(filepath)[0])
        # if file.endswith('.json'):
        #     print(file)
        #     dataset = file.replace('.json','')
        #     f = open(os.path.join(root, file))
        #     data = json.load(f)
        #     for key in data:
        #         print(key)

# returns JSON object as 
# a dictionary
            # filepath = os.path.join(root, file)
            # df = pd.read_csv(filepath)
            # y_res = df.iloc[:,0]
            # y_pred = df.iloc[:,1]
            # fig, axs = plt.subplots(1,1, figsize=(10,10))
            # axs = np.ravel(axs)

            # plot_regression_results(
            #     ax,
            #     y,
            #     y_pred,
            #     name,
            #     (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
            #         np.mean(score["test_r2"]),
            #         np.std(score["test_r2"]),
            #         -np.mean(score["test_neg_mean_absolute_error"]),
            #         np.std(score["test_neg_mean_absolute_error"]),
            #     ),
            #     elapsed_time,
            # )

    



# plt.suptitle("Single predictors versus stacked predictors")
# plt.tight_layout()
# plt.subplots_adjust(top=0.9)
# plt.show()

In [None]:
from pinard.utils import load_csv
from benchmark_loop import transform_test_data
import preprocessings
import numpy as np

# Find which entries of the "dumb and dumber" preprocessing set produce distinct
# outputs on a sample spectrum.
dumb_set = preprocessings.dumb_and_dumber_set()
Xfile = "data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36/Xcal.csv.gz"
yfile = "data/regression/ALPINE_Calpine_424_Murguzur_RMSE1.36/Ycal.csv.gz"
X_train, y_train = load_csv(Xfile, yfile, x_hdr=0, y_hdr=0, sep=";")
# The first 100 rows are used both as the train and as the test split.
X_train, y_train, X_test, y_test = X_train[0:100], y_train[0:100], X_train[0:100], y_train[0:100]
X_test_pp, y_test_pp, transformer_pipeline, y_scaler = transform_test_data(dumb_set, X_train, y_train, X_test, y_test, type="augmentation")

print(X_test_pp.shape)
# NOTE(review): assumes sample[i] is the output of the i-th entry of dumb_set
# for the first test row — confirm against transform_test_data.
sample = X_test_pp[0]
print(len(dumb_set))
ok = []
# Walk the entries from last to first and keep an entry only when no entry with
# a lower index yields an (almost) identical output.
for i in range(len(dumb_set)-1, -1, -1):
    found = False
    for j in range(i-1, -1, -1):
        # NOTE(review): 10e-3 equals 0.01, not 1e-3 — confirm the intended tolerance.
        if np.allclose(sample[i], sample[j], rtol=10e-3, atol=10e-3):
            found = True
            break
    if not found:
        # First element of the entry; presumably its name — verify against preprocessings.
        ok.append(dumb_set[i][0])

print(len(ok))
print(ok)

In [None]:
import os
import csv

def count_columns(file):
    """Return the number of fields and the fields of the second row of a CSV file.

    The file is parsed as ';'-delimited. Its first row is skipped and the
    second row is the one that gets measured.

    Args:
        file: Path of the CSV file to inspect.

    Returns:
        A ``(n_fields, fields)`` tuple for the second row, or ``(0, [])`` when
        the file holds fewer than two rows (instead of leaking
        ``StopIteration`` to the caller).
    """
    # newline='' is the mode the csv module asks for, so that it handles line
    # endings (including those embedded in quoted fields) itself.
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=";")
        next(reader, None)  # skip the first row
        row = next(reader, [])
    return len(row), row

def walk_directory(directory):
    """Print the CSV files below ``directory`` that have a matching column count.

    Every file whose name ends with '.csv' is measured with ``count_columns``.
    When the reported count lies strictly between 2100 and 4000, the first
    five fields of the inspected row and the file path are printed.
    """
    for root, _dirs, files in os.walk(directory):
        for name in files:
            if not name.endswith('.csv'):
                continue
            path = os.path.join(root, name)
            columns, header = count_columns(path)
            if 2100 < columns < 4000:
                print(header[0:5])
                print(f"File: {path}, Columns: {columns}")

walk_directory('rawdata/regression') # Directory to scan; replace it with the path you want to traverse


In [None]:
import csv
import json
import os

def json_to_csv(json_folder, csv_file):
    """Flatten the JSON files of a folder into a single CSV file.

    Every file directly inside ``json_folder`` whose name ends with '.json'
    must contain an object whose values are themselves objects. Each inner
    object becomes one CSV row; the top-level keys are not written. The
    columns are the union of all inner keys, in order of first appearance,
    and a row lacking a column gets an empty cell.

    Args:
        json_folder: Folder holding the JSON files (not searched recursively).
        csv_file: Path of the CSV file to create or overwrite.
    """
    csv_rows = []
    for file in os.listdir(json_folder):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(json_folder, file), "r") as f:
            json_data = json.load(f)
        # One row per inner object; copy it so the rows do not alias the parsed data.
        csv_rows.extend(dict(obj) for obj in json_data.values())

    # dict.fromkeys keeps first-seen order while de-duplicating in linear time.
    fieldnames = list(dict.fromkeys(key for row in csv_rows for key in row))

    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' terminators turn into blank lines between rows on Windows.
    with open(csv_file, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(csv_rows)

# Merge every JSON file of the "results" folder into result.csv.
json_folder = "results"
csv_file = "result.csv"

json_to_csv(json_folder, csv_file)

In [None]:
import tensorflow as tf

# Check TensorFlow version
print("TensorFlow version:", tf.__version__)

# Check if TensorFlow can access the GPU
# (prints the list of physical GPU devices TensorFlow reports)
print("Is GPU available:", tf.config.list_physical_devices('GPU'))

# Run a simple matrix multiplication under the '/gpu:0' device scope
# (no timing is measured here)
with tf.device('/gpu:0'):
    a = tf.random.normal([1000, 1000])
    b = tf.random.normal([1000, 1000])
    c = tf.matmul(a, b)

# Print the result
print(c)
