In [None]:
%reload_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))

import time
from pinard.presets.ref_models import decon, nicon, customizable_nicon, nicon_classification
from pinard.presets.preprocessings import decon_set, nicon_set
from pinard.data_splitters import KennardStoneSplitter
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG, Gaussian as GS, Derivate as  Dv
from pinard.transformations import Rotate_Translate as RT, Spline_X_Simplification as SXS, Random_X_Operation as RXO
from pinard.transformations import CropTransformer
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold, ShuffleSplit, GroupKFold, StratifiedShuffleSplit, BaseCrossValidator, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

model_sklearn = {
    "class": "sklearn.cross_decomposition.PLSRegression",
    "model_params": {
        "n_components": 21,
    }
}
    
finetune_pls_experiment = {
    "action": "finetune",
    "finetune_params": {
        'model_params': {
            'n_components': ('int', 5, 20),
        },
        'training_params': {},
        'tuner': 'sklearn'
    }
}

bacon_train = {"action": "train", "training_params": {"epochs": 2000, "batch_size": 500, "patience": 200, "cyclic_lr": True, "base_lr": 1e-6, "max_lr": 1e-3, "step_size": 400}}
bacon_train_short = {"action": "train", "training_params": {"epochs": 10, "batch_size": 500, "patience": 20, "cyclic_lr": True, "base_lr": 1e-6, "max_lr": 1e-3, "step_size": 40}}
bacon_finetune = {
    "action": "finetune",
    "finetune_params": {
        "n_trials": 5,
        "model_params": {
            "filters_1": [8, 16, 32, 64], 
            "filters_2": [8, 16, 32, 64], 
            "filters_3": [8, 16, 32, 64]
        }
    },
    "training_params": {
        "epochs": 10,
        "verbose":0
    }
}

full_bacon_finetune = {
    "action": "finetune",
    "training_params": {
        "epochs": 500,
        "patience": 100,
    },
    "finetune_params": {
        "nb_trials": 150,
        "model_params": {
            'spatial_dropout': (float, 0.01, 0.5),
            'filters1': [4, 8, 16, 32, 64, 128, 256],
            'kernel_size1': [3, 5, 7, 9, 11, 13, 15],
            # 'strides1': [1, 2, 3, 4, 5],
            # 'activation1': ['relu', 'selu', 'elu', 'swish'],
            'dropout_rate': (float, 0.01, 0.5),
            'filters2': [4, 8, 16, 32, 64, 128, 256],
            # 'kernel_size2': [3, 5, 7, 9, 11, 13, 15],
            # 'strides2': [1, 2, 3, 4, 5],
            'activation2': ['relu', 'selu', 'elu', 'swish'],
            'normalization_method1': ['BatchNormalization', 'LayerNormalization'],
            'filters3': [4, 8, 16, 32, 64, 128, 256],
            # 'kernel_size3': [3, 5, 7, 9, 11, 13, 15],
            # 'strides3': [1, 2, 3, 4, 5],
            'activation3': ['relu', 'selu', 'elu', 'swish'],
            # 'normalization_method2': ['BatchNormalization', 'LayerNormalization'],
            # 'dense_units': [4, 8, 16, 32, 64, 128, 256],
            'dense_activation': ['relu', 'selu', 'elu', 'swish'],
        },
        # "training_params": {
        #     "batch_size": [32, 64, 128, 256, 512],
        #     "cyclic_lr": [True, False],
        #     "base_lr": (float, 1e-6, 1e-2),
        #     "max_lr": (float, 1e-3, 1e-1),
        #     "step_size": (int, 500, 5000),
        # },
    }
}


x_pipeline_full = [
    RobustScaler(),
    {"samples": [None, None,None,None,SXS, RXO]},
    {"split": RepeatedKFold(n_splits=3, n_repeats=1)},
    {"features": [None, GS(2,1), SG, SNV, Dv, [GS, SNV], [GS, GS],[GS, SG],[SG, SNV], [GS, Dv], [SG, Dv]]},
    MinMaxScaler()
]


bacon_finetune_classif = {
    "action": "finetune",
    "task": "classification",
    "finetune_params": {
        "n_trials": 5,
        "model_params": {
            "filters_1": [8, 16, 32, 64], 
            "filters_2": [8, 16, 32, 64], 
            "filters_3": [8, 16, 32, 64]
        }
    },
    "training_params": {
        "epochs": 5,
        "verbose":0
    }
}

finetune_randomForestclassifier = {
    "action": "finetune",
    "task": "classification",
    "finetune_params": {
        'model_params': {
            'n_estimators': ('int', 5, 20),
        },
        'training_params': {},
        'tuner': 'sklearn'
    }
}

x_pipeline_PLS = [
    RobustScaler(),
    {"split": RepeatedKFold(n_splits=3, n_repeats=1)},
    {"features": [None, GS(2,1), SG, SNV, Dv, [GS, SNV], [GS, GS],[GS, SG],[SG, SNV], [GS, Dv], [SG, Dv]]},
    MinMaxScaler()
]
            
            
x_pipeline = [
    RobustScaler(), 
    {"split": RepeatedKFold(n_splits=3, n_repeats=1)}, 
    # bacon_set(),
    MinMaxScaler()
]

x_pipelineb = [
    RobustScaler(), 
    {"samples": [RT(6)], "balance": True},
    
    # {"samples": [None, RT]},
    
    {"split": RepeatedKFold(n_splits=3, n_repeats=1)}, 
    
    # {"features": 
        
        # },
    
    MinMaxScaler()
]


seed = 123459456

datasets = "sample_data/mock_data3_classif"
y_pipeline = MinMaxScaler()
# processing only
config1 = Config("sample_data/Malaria2024", x_pipeline_full, y_pipeline, None, None, seed)
## TRAINING
# regression
config2 = Config("sample_data/mock_data2", x_pipeline, y_pipeline, nicon, bacon_train_short, seed)
config3 = Config("sample_data/mock_data3", x_pipeline_PLS, y_pipeline, model_sklearn, None, seed)
# classification
config4 = Config("sample_data/mock_data3_classif", x_pipeline, None, nicon_classification, {"task":"classification", "training_params":{"epochs":10, "patience": 100, "verbose":0}}, seed*2)
config4b = Config("sample_data/Malaria2024", x_pipelineb, None, nicon_classification, {"task":"classification", "training_params":{"epochs":10, "patience": 100, "verbose":0}}, seed*2)
config5 = Config("sample_data/mock_data3_binary", x_pipeline, None, nicon_classification, {"task":"classification", "training_params":{"epochs":5}, "verbose":0}, seed*2)
config6 = Config("sample_data/WhiskyConcentration", x_pipeline, None, RandomForestClassifier, {"task":"classification"}, seed*2)
config7 = Config("sample_data/Malaria2024", x_pipeline, None, RandomForestClassifier, {"task":"classification"}, seed*2)
## FINETUNING
# regression
config8 = Config("sample_data/mock_data3", x_pipeline, y_pipeline, nicon, bacon_finetune, seed)
config9 = Config("sample_data/mock_data3", x_pipeline, y_pipeline, model_sklearn, finetune_pls_experiment, seed)
# classification
config10 = Config("sample_data/Malaria2024", x_pipeline, None, nicon_classification, bacon_finetune_classif, seed*2)
config10b = Config("sample_data/YamMould", x_pipeline, None, nicon_classification, bacon_finetune_classif, seed*2)
config11 = Config("sample_data/Malaria2024", x_pipelineb, None, RandomForestClassifier, finetune_randomForestclassifier, seed*2)
config11b = Config("sample_data/YamMould", x_pipeline, None, RandomForestClassifier, finetune_randomForestclassifier, seed*2)


configs = [config1, config2, config3, config4, config4b, config5, config6, config7, config8, config9, config10, config10b, config11, config11b]
# configs = [config4b]


start = time.time()
runner = ExperimentRunner(configs, resume_mode="restart")
dataset, model_manager = runner.run()
end = time.time()
print(f"Time elapsed: {end-start} seconds")


2025-03-31 14:49:22,246 - INFO - ### PREPARING DATA ###
2025-03-31 14:49:22,247 - INFO - ### LOADING DATASET ###


>> Browsing sample_data/Malaria2024
No train_group file found for sample_data/Malaria2024.
No test_group file found for sample_data/Malaria2024.
{'initial_shape': (2996, 1665), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (2996, 1665), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}


2025-03-31 14:49:24,819 - INFO - Dataset(x_train:(2996, 1665) - y_train:(2996, 1), x_test:(1285, 1665) - y_test:(1285, 1))
2025-03-31 14:49:24,819 - INFO - ### PROCESSING DATASET ###


{'initial_shape': (1285, 1665), 'delimiter': ';', 'numeric_delimiter': '.', 'header_line': 0, 'final_shape': (1285, 1665), 'na_handling': {'strategy': 'abort', 'nb_removed_rows': None, 'removed_rows': None}}


2025-03-31 14:49:30,291 - INFO - Dataset(x_train:(5740, 1665) - y_train:(5740, 1), x_test:(1285, 1665) - y_test:(1285, 1))
Folds size: 3826-1914, 3827-1913, 3827-1913
2025-03-31 14:49:30,291 - INFO - ### PREPARING MODEL ###


(1, 2996, 1, 1665) (2996, 1)
{np.float32(0.0): 2870, np.float32(1.0): 126}
{np.float32(0.0): 0, np.float32(1.0): 2744}
(1, 5740, 1, 1665) (5740, 1)
{np.float32(0.0): 2870, np.float32(1.0): 2870}
Model cloned
Model cloned
Using framework: tensorflow


2025-03-31 14:49:30,407 - INFO - Running config > {'dataset': 'sample_data/Malaria2024', 'x_pipeline': [{'class': 'sklearn.preprocessing.RobustScaler', 'params': {'copy': True, 'quantile_range': [25.0, 75.0], 'unit_variance': False, 'with_centering': True, 'with_scaling': True}}, {'samples': [{'class': 'pinard.transformations.Rotate_Translate', 'params': {'apply_on': 6, 'copy': True, 'p_range': 2, 'random_state': None, 'y_factor': 3}}], 'balance': True}, {'split': {'class': 'sklearn.model_selection.RepeatedKFold', 'params': {'cv': {'class': 'sklearn.model_selection.KFold', 'params': None}, 'n_repeats': 1, 'random_state': None, 'cvargs': {'n_splits': 3}}}}, {'class': 'sklearn.preprocessing.MinMaxScaler', 'params': {'clip': False, 'copy': True, 'feature_range': [0, 1]}}], 'y_pipeline': None, 'model': {'function': 'pinard.presets.ref_models.nicon_classification'}, 'experiment': {'task': 'classification', 'training_params': {'epochs': 10, 'patience': 100, 'verbose': 0, 'loss': 'binary_cros

Training fold with shapes: (3826, 1665, 1) (3826, 1) (1914, 1665, 1) (1914, 1)
binary_crossentropy ['accuracy']
Training with shapes: (3826, 1665, 1) (3826, 1) (1914, 1665, 1) (1914, 1)
Training fold with shapes: (3827, 1665, 1) (3827, 1) (1913, 1665, 1) (1913, 1)
binary_crossentropy ['accuracy']
Training with shapes: (3827, 1665, 1) (3827, 1) (1913, 1665, 1) (1913, 1)
Training fold with shapes: (3827, 1665, 1) (3827, 1) (1913, 1665, 1) (1913, 1)
binary_crossentropy ['accuracy']
Training with shapes: (3827, 1665, 1) (3827, 1) (1913, 1665, 1) (1913, 1)


2025-03-31 14:49:52,015 - INFO - Saved model to results\sample_dataMalaria2024\nicon_classification\experiment_8ad77f9b


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


2025-03-31 14:49:52,642 - INFO - Evaluation Metrics fold_0: {'accuracy': 0.954863813229572}
2025-03-31 14:49:52,643 - INFO - Evaluation Metrics fold_1: {'accuracy': 0.9540856031128405}
2025-03-31 14:49:52,644 - INFO - Evaluation Metrics fold_2: {'accuracy': 0.954863813229572}
2025-03-31 14:49:52,645 - INFO - Evaluation Metrics mean: {'accuracy': 0.954863813229572}
2025-03-31 14:49:52,645 - INFO - Evaluation Metrics best: {'accuracy': 0.954863813229572}
2025-03-31 14:49:52,646 - INFO - Evaluation Metrics weighted: {'accuracy': 0.954863813229572}
2025-03-31 14:49:52,656 - INFO - Metrics saved to results\sample_dataMalaria2024\nicon_classification\experiment_8ad77f9b\metrics.json
2025-03-31 14:49:52,671 - INFO - Predictions saved to results\sample_dataMalaria2024\nicon_classification\experiment_8ad77f9b\predictions.csv
2025-03-31 14:49:52,688 - INFO - Updated experiments at results\sample_dataMalaria2024\nicon_classification\experiments.json
2025-03-31 14:49:52,702 - INFO - Updated experi

Weights: [0.33342391 0.33315217 0.33342391]
Time elapsed: 30.45787763595581 seconds


In [None]:
import pinard
print(pinard.__version__)

# load malaria manually and apply RT transformation manually and display chart of transformed samples

config1 = Config("sample_data/Malaria2024", x_pipeline, y_pipeline, None, None, seed)
runner = ExperimentRunner([config1], resume_mode="restart")
dataset, model_manager = runner.run()


In [8]:
import ace_tools as tools
import pandas as pd

# Sklearn metrics list
sklearn_metrics = [
    "explained_variance", "r2", "max_error", "matthews_corrcoef",
    "neg_median_absolute_error", "neg_mean_absolute_error",
    "neg_mean_absolute_percentage_error", "neg_mean_squared_error",
    "neg_mean_squared_log_error", "neg_root_mean_squared_error",
    "neg_root_mean_squared_log_error", "neg_mean_poisson_deviance",
    "neg_mean_gamma_deviance", "d2_absolute_error_score", "accuracy",
    "top_k_accuracy", "roc_auc", "roc_auc_ovr", "roc_auc_ovo",
    "roc_auc_ovr_weighted", "roc_auc_ovo_weighted", "balanced_accuracy",
    "average_precision", "neg_log_loss", "neg_brier_score",
    "positive_likelihood_ratio", "neg_negative_likelihood_ratio",
    "adjusted_rand_score", "rand_score", "homogeneity_score",
    "completeness_score", "v_measure_score", "mutual_info_score",
    "adjusted_mutual_info_score", "normalized_mutual_info_score",
    "fowlkes_mallows_score"
]

# Tensorflow/keras metrics list
tensorflow_metrics = [
    "MeanSquaredError", "RootMeanSquaredError", "MeanAbsoluteError",
    "MeanAbsolutePercentageError", "MeanSquaredLogarithmicError",
    "CosineSimilarity", "LogCoshError", "R2Score", "AUC",
    "FalseNegatives", "FalsePositives", "Precision", "PrecisionAtRecall",
    "Recall", "RecallAtPrecision", "SensitivityAtSpecificity",
    "SpecificityAtSensitivity", "TrueNegatives", "TruePositives",
    "Hinge", "SquaredHinge", "CategoricalHinge", "KLDivergence",
    "Poisson", "BinaryCrossentropy", "CategoricalCrossentropy",
    "SparseCategoricalCrossentropy", "Accuracy", "BinaryAccuracy",
    "CategoricalAccuracy", "SparseCategoricalAccuracy",
    "TopKCategoricalAccuracy", "SparseTopKCategoricalAccuracy",
    "F1Score", "FBetaScore", "IoU", "BinaryIoU", "MeanIoU",
    "OneHotIoU", "OneHotMeanIoU"
]

# Metric name mapping: (tensorflow_name, sklearn_name, abbreviation, method_name)
# Initialize with common names
metrics_mapping = [
    ("MeanSquaredError", "neg_mean_squared_error", "mse", "Mean Squared Error"),
    ("RootMeanSquaredError", "neg_root_mean_squared_error", "rmse", "Root Mean Squared Error"),
    ("MeanAbsoluteError", "neg_mean_absolute_error", "mae", "Mean Absolute Error"),
    ("MeanAbsolutePercentageError", "neg_mean_absolute_percentage_error", "mape", "Mean Absolute Percentage Error"),
    ("MeanSquaredLogarithmicError", "neg_mean_squared_log_error", "msle", "Mean Squared Logarithmic Error"),
    ("CosineSimilarity", None, "cos_sim", "Cosine Similarity"),
    ("LogCoshError", None, "log_cosh", "Log Cosh Error"),
    ("R2Score", "r2", "r2", "R2 Score"),
    ("AUC", "roc_auc", "auc", "Area Under the Curve"),
    ("Precision", None, "prec", "Precision"),
    ("Recall", None, "recall", "Recall"),
    ("Accuracy", "accuracy", "acc", "Accuracy"),
    ("TopKCategoricalAccuracy", "top_k_accuracy", "top_k_acc", "Top K Categorical Accuracy"),
    ("BinaryCrossentropy", None, "bin_crossentropy", "Binary Crossentropy"),
    ("CategoricalCrossentropy", None, "cat_crossentropy", "Categorical Crossentropy"),
    ("SparseCategoricalCrossentropy", None, "sparse_cat_crossentropy", "Sparse Categorical Crossentropy"),
    ("F1Score", None, "f1", "F1 Score"),
    ("IoU", None, "iou", "Intersection over Union")
]

# Add remaining metrics with None in the missing columns
for metric in sklearn_metrics:
    if not any(metric in row for row in metrics_mapping):
        metrics_mapping.append((None, metric, None, None))

for metric in tensorflow_metrics:
    if not any(metric in row for row in metrics_mapping):
        metrics_mapping.append((metric, None, None, None))

# Create dataframe
df = pd.DataFrame(metrics_mapping, columns=["tensorflow_name", "sklearn_name", "abbreviation", "method_name"])

# Display the dataframe to the user
tools.display_dataframe_to_user(name="Metric Comparison", dataframe=df)

ModuleNotFoundError: No module named 'ace_tools'

In [None]:
# Example 1: Applying a Simple Data Transformation using Pinard

from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline
# Here we apply Standard Normal Variate (SNV) transformation
x_pipeline = [
    SNV(),            # Apply SNV transformation
    MinMaxScaler()    # Scale features to [0,1]
]

# No model is used in this example; we focus on data transformation
config = Config(
    dataset_path,
    x_pipeline,
    None,
    None,
    None,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Access the transformed data
transformed_data = dataset.x_train

print("Transformed data shape:", transformed_data.shape)

In [None]:
# Example 2: Applying a Preprocessing Pipeline and Training a Simple Model

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline
x_pipeline = [
    SG(window_length=11, polyorder=2),  # Apply Savitzky-Golay filter
    SNV(),                              # Apply SNV transformation
    MinMaxScaler()                      # Scale features to [0,1]
]

# Define the model
model = {
    "class": "sklearn.linear_model.LinearRegression",
    "model_params": {}
}

# Training parameters
train_params = {
    "action": "train",
    "training_params": {}
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=train_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Access the trained model
trained_model = model_manager.models[0].model

# Print model coefficients
print("Model coefficients:", trained_model.coef_)

In [None]:
# Example 3: Using Cross-Validation with Pinard

from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline with cross-validation
x_pipeline = [
    {"split": RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)},
    SG(window_length=11, polyorder=2),
    SNV(),
    MinMaxScaler()
]

# Define the model
model = {
    "class": "sklearn.linear_model.Ridge",
    "model_params": {
        "alpha": 1.0
    }
}

# Training parameters
train_params = {
    "action": "train",
    "training_params": {}
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=train_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Evaluate the model
scores = model_manager.models[0].scores
print("Cross-validation scores:", scores)

In [None]:
# Example 4: Fine-tuning a Model with Pinard

from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline with cross-validation
x_pipeline = [
    {"split": RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)},
    SG(window_length=11, polyorder=2),
    SNV(),
    MinMaxScaler()
]

# Define the model
model = {
    "class": "sklearn.linear_model.Ridge",
    "model_params": {}
}

# Define the finetune parameters
finetune_params = {
    "action": "finetune",
    "finetune_params": {
        'model_params': {
            'alpha': ('float', 0.1, 10.0)
        },
        'training_params': {},
        'n_trials': 20,
        'tuner': 'sklearn'  # Use scikit-learn's GridSearchCV
    }
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=finetune_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Get the best model and its parameters
best_model = model_manager.models[0].best_model
best_params = model_manager.models[0].best_params

print("Best model parameters:", best_params)

In [None]:
# Example 4: Fine-tuning a Model with Pinard

from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline with cross-validation
x_pipeline = [
    {"split": RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)},
    SG(window_length=11, polyorder=2),
    SNV(),
    MinMaxScaler()
]

# Define the model
model = {
    "class": "sklearn.linear_model.Ridge",
    "model_params": {}
}

# Define the finetune parameters
finetune_params = {
    "action": "finetune",
    "finetune_params": {
        'model_params': {
            'alpha': ('float', 0.1, 10.0)
        },
        'training_params': {},
        'n_trials': 20,
        'tuner': 'sklearn'  # Use scikit-learn's GridSearchCV
    }
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=finetune_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Get the best model and its parameters
best_model = model_manager.models[0].best_model
best_params = model_manager.models[0].best_params

print("Best model parameters:", best_params)

In [None]:
# Example 6: Fine-tuning a Custom TensorFlow Model with Pinard

from kerastuner import HyperModel
import tensorflow as tf
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from pinard.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Define the dataset path
dataset_path = "sample_data/mock_data"

# Placeholder for input shape (to be defined later)
input_shape = None

# Define a hypermodel for Keras Tuner


class MyHyperModel(HyperModel):
    def build(self, hp):
        model = tf.keras.Sequential()
        units = hp.Int('units', min_value=32, max_value=128, step=32)
        model.add(tf.keras.layers.Dense(units=units, activation='relu', input_shape=(input_shape,)))
        model.add(tf.keras.layers.Dense(1))
        optimizer = hp.Choice('optimizer', ['adam', 'sgd'])
        model.compile(optimizer=optimizer, loss='mse')
        return model


# Define the model
model = {
    "class": "tensorflow.keras.models.Sequential",
    "model_params": {
        "build_fn": MyHyperModel()
    }
}

# Define the finetune parameters
finetune_params = {
    "action": "finetune",
    "finetune_params": {
        'model_params': {
            'units': ('int', 32, 128, 32),
            'optimizer': ['adam', 'sgd']
        },
        'training_params': {
            'epochs': 50,
            'batch_size': 32
        },
        'n_trials': 20,
        'tuner': 'keras'  # Use Keras Tuner
    }
}

# Define the data transformation pipeline with cross-validation
x_pipeline = [
    {"split": RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)},
    SG(window_length=11, polyorder=2),
    SNV(),
    MinMaxScaler()
]

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=finetune_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])

# Set the input shape after data is loaded
dataset, model_manager = runner.load_data_only()
input_shape = dataset.x_train.shape[1]
model['model_params']['build_fn'] = MyHyperModel()

# Run the finetuning
dataset, model_manager = runner.run()

# Get the best model and its parameters
best_model = model_manager.models[0].best_model
best_params = model_manager.models[0].best_params

print("Best model parameters:", best_params)

In [None]:
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator


class LogTransformer(TransformerMixin, BaseEstimator):
    """
    Applies a logarithmic transformation to the data.
    """

    def __init__(self, copy=True, offset=1e-6):
        """
        Parameters
        ----------
        copy : bool, default=True
            Set to False to perform inplace computation.
        offset : float, default=1e-6
            A small constant to add to the data to avoid log(0).
        """
        self.copy = copy
        self.offset = offset

    def fit(self, X, y=None):
        return self

    def transform(self, X, copy=None):
        """
        Apply the logarithmic transformation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.
        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_features)
            Transformed data.
        """
        if copy is None:
            copy = self.copy

        X = np.asarray(X)
        if copy:
            X = X.copy()

        X = np.log(X + self.offset)
        return X

In [None]:
from sklearn.model_selection import BaseCrossValidator
from sklearn.utils import check_random_state
import numpy as np


class StratifiedFeatureSplitter(BaseCrossValidator):
    """
    Stratified splitter based on a continuous feature.
    """

    def __init__(self, n_splits=5, feature_index=0, random_state=None):
        """
        Parameters
        ----------
        n_splits : int, default=5
            Number of folds.
        feature_index : int, default=0
            Index of the feature to stratify on.
        random_state : int or RandomState instance, default=None
            Random state for reproducibility.
        """
        self.n_splits = n_splits
        self.feature_index = feature_index
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        X = np.asarray(X)
        feature = X[:, self.feature_index]
        percentiles = np.percentile(feature, np.linspace(0, 100, self.n_splits + 1))

        indices = np.arange(len(X))
        rng = check_random_state(self.random_state)
        rng.shuffle(indices)

        bins = np.digitize(feature[indices], percentiles[1:-1], right=True)

        for fold in range(self.n_splits):
            test_mask = bins == fold
            train_mask = ~test_mask
            yield indices[train_mask], indices[test_mask]

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

In [None]:
# Example: Using Custom LogTransformer and StratifiedFeatureSplitter in Pinard

from sklearn.metrics import mean_squared_error
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Assuming LogTransformer and StratifiedFeatureSplitter are defined as above

# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline
x_pipeline = [
    {"split": StratifiedFeatureSplitter(n_splits=5, feature_index=0, random_state=42)},
    LogTransformer(offset=1e-6),
    MinMaxScaler()
]

# Define the model
model = {
    "class": "sklearn.linear_model.LinearRegression",
    "model_params": {}
}

# Training parameters
train_params = {
    "action": "train",
    "training_params": {}
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=train_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Access the trained model
trained_model = model_manager.models[0].model

# Evaluate the model

y_pred = trained_model.predict(dataset.x_test)
mse = mean_squared_error(dataset.y_test, y_pred)
print("Test MSE:", mse)

In [None]:
class RatioTransformer(TransformerMixin, BaseEstimator):
    """
    Creates a new feature by taking the ratio of two features.
    """

    def __init__(self, numerator_index=0, denominator_index=1, copy=True):
        """
        Parameters
        ----------
        numerator_index : int, default=0
            Index of the numerator feature.
        denominator_index : int, default=1
            Index of the denominator feature.
        copy : bool, default=True
            Set to False to perform inplace computation.
        """
        self.numerator_index = numerator_index
        self.denominator_index = denominator_index
        self.copy = copy

    def fit(self, X, y=None):
        return self

    def transform(self, X, copy=None):
        """
        Create a new feature as the ratio of two existing features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.
        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_features + 1)
            Transformed data with the new ratio feature added.
        """
        if copy is None:
            copy = self.copy

        X = np.asarray(X)
        if copy:
            X = X.copy()

        numerator = X[:, self.numerator_index]
        denominator = X[:, self.denominator_index] + 1e-6  # Avoid division by zero
        ratio_feature = (numerator / denominator).reshape(-1, 1)

        X_new = np.hstack((X, ratio_feature))
        return X_new

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import BaseCrossValidator


class ClusterBasedSplitter(BaseCrossValidator):
    """
    Splits data into training and testing sets based on clustering.
    """

    def __init__(self, n_splits=5, n_clusters=5, random_state=None):
        """
        Parameters
        ----------
        n_splits : int, default=5
            Number of splits/folds.
        n_clusters : int, default=5
            Number of clusters to form.
        random_state : int or RandomState instance, default=None
            Random state for reproducibility.
        """
        self.n_splits = n_splits
        self.n_clusters = n_clusters
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        X = np.asarray(X)
        kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        cluster_labels = kmeans.fit_predict(X)

        unique_clusters = np.unique(cluster_labels)
        rng = check_random_state(self.random_state)
        rng.shuffle(unique_clusters)

        clusters_per_split = np.array_split(unique_clusters, self.n_splits)

        for cluster_group in clusters_per_split:
            test_indices = np.where(np.isin(cluster_labels, cluster_group))[0]
            train_indices = np.setdiff1d(np.arange(len(X)), test_indices)
            yield train_indices, test_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

In [None]:
# Example: Using Custom ClusterBasedSplitter in Pinard

from sklearn.metrics import mean_squared_error
from pinard.core.runner import ExperimentRunner
from pinard.core.config import Config
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))


# Assuming ClusterBasedSplitter is defined as above

# Define the dataset path
dataset_path = "sample_data/mock_data"

# Define the data transformation pipeline
x_pipeline = [
    StandardScaler(),
    {"split": ClusterBasedSplitter(n_splits=5, n_clusters=5, random_state=42)}
]

# Define the model
model = {
    "class": "sklearn.svm.SVR",
    "model_params": {
        "kernel": "rbf",
        "C": 1.0,
        "epsilon": 0.1
    }
}

# Training parameters
train_params = {
    "action": "train",
    "training_params": {}
}

# Define the configuration
config = Config(
    dataset_path=dataset_path,
    x_pipeline=x_pipeline,
    y_pipeline=None,
    model=model,
    experiment_params=train_params,
    seed=42
)

# Run the experiment
runner = ExperimentRunner(configs=[config])
dataset, model_manager = runner.run()

# Access the trained model
trained_model = model_manager.models[0].model

# Evaluate the model

y_pred = trained_model.predict(dataset.x_test)
mse = mean_squared_error(dataset.y_test, y_pred)
print("Test MSE:", mse)