### Imports

In [1]:
import itertools
import random
from typing import Any, Dict, List, Tuple

import numpy as np
import openml
import pandas as pd
from pandas import DataFrame, Series
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

# from evaluate_pipeline import evaluate_pipeline
# from load_dataset import load_dataset_from_id
# from perform_optimazaion_of_pipeline_with_random_search import (
#     perform_optimazaion_of_pipeline_with_random_search,
# )
# from split_dataset import split_dataset

### Utility functions

In [2]:
def perform_optimazaion_of_pipeline_with_bayesian_method(
    pipeline: Pipeline,
    search_space: Dict[str, Any],
    X: DataFrame,
    y: DataFrame,
    n_iter=100,
) -> BayesSearchCV:
    """Run a ``BayesSearchCV`` over ``pipeline`` and return the fitted search.

    Args:
        pipeline: Estimator whose hyper-parameters are tuned.
        search_space: Forwarded unchanged as ``search_spaces``.
        X: Features passed to ``fit``.
        y: Target passed to ``fit``.
        n_iter: Forwarded unchanged as ``n_iter`` (default 100).

    Returns:
        The ``BayesSearchCV`` object after ``fit(X, y)`` has been called on it.
    """
    # Fixed search settings: 5-fold cross-validation, n_jobs=-1.
    search_settings = dict(
        search_spaces=search_space,
        n_iter=n_iter,
        n_jobs=-1,
        cv=5,
    )
    bayes_search: BayesSearchCV = BayesSearchCV(pipeline, **search_settings)
    bayes_search.fit(X, y)
    return bayes_search

In [3]:
def get_bayes_best_configuration(
        pipeline: Pipeline,
        search: List[Tuple[Dict[str, Any], int]],
        X_train,
        Y_train,
        X_test,
        Y_test
    ):
    """Run one Bayesian search per entry of ``search`` and keep the best one.

    Each entry of ``search`` is a ``(search_space, n_iter)`` tuple. It is
    handed to ``perform_optimazaion_of_pipeline_with_bayesian_method`` as a
    one-element list, fitted on ``(X_train, Y_train)`` and then scored on
    ``(X_test, Y_test)``.

    Args:
        pipeline: Estimator to tune.
        search: Sequence of ``(search_space, n_iter)`` tuples, one per run.
        X_train: Features used to fit every search.
        Y_train: Target used to fit every search.
        X_test: Features used to compare the fitted searches.
        Y_test: Target used to compare the fitted searches.

    Returns:
        ``(best_params, n_iter)`` of the run with the highest score; the first
        run wins ties. ``(None, None)`` when ``search`` is empty.
    """
    best_score = float("-inf")
    best_params = None
    best_n_iter = None
    for config in search:
        fitted_search = perform_optimazaion_of_pipeline_with_bayesian_method(
            pipeline,
            [config],
            X_train,
            Y_train
        )
        score = fitted_search.score(X_test, Y_test)
        if score > best_score:
            # Remember the running maximum; without this update every later
            # run would overwrite the result regardless of its score.
            best_score = score
            best_params = fitted_search.best_params_
            best_n_iter = config[1]
    return (best_params, best_n_iter)

In [4]:
def optimize_pipeline_over_params_combinations(
    pipeline: Pipeline,
    parameters_grid: List[dict],
    X: DataFrame,
    y: DataFrame,
    X_val: DataFrame,
    y_val: DataFrame,
) -> Pipeline:
    """Pick the best configuration from ``parameters_grid`` on a validation set.

    Every configuration is applied with ``set_params``, fitted on ``(X, y)``
    and scored on ``(X_val, y_val)``. The pipeline is mutated in place.

    Args:
        pipeline: Pipeline to tune; modified in place.
        parameters_grid: Candidate ``set_params`` keyword dictionaries.
        X: Training features.
        y: Training target.
        X_val: Validation features.
        y_val: Validation target.

    Returns:
        The same ``pipeline`` object, configured with the best-scoring
        parameters and refitted on ``(X, y)`` with them.

    Raises:
        ValueError: If no configuration could be selected (empty grid, or no
            score compared greater than ``-inf``, e.g. all NaN).
    """
    best_score = float("-inf")
    best_params = None

    for params in parameters_grid:
        candidate = {str(key): value for key, value in params.items()}
        pipeline.set_params(**candidate)

        pipeline.fit(X, y)
        score = pipeline.score(X_val, y_val)

        if score > best_score:
            best_score = score
            best_params = candidate

    if best_params is None:
        raise ValueError(
            "parameters_grid yielded no usable configuration "
            "(empty grid or no comparable score)"
        )

    pipeline.set_params(**best_params)
    # The last fit in the loop used the last candidate, not necessarily the
    # best one; refit so the fitted state matches the returned parameters.
    pipeline.fit(X, y)
    return pipeline

In [5]:
def split_dataset(
    data: pd.DataFrame,
    class_: str,
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split ``data`` into train/test features and target.

    Args:
        data: Full dataset including the target column.
        class_: Name of the target column; it is removed from the features.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    X: pd.DataFrame = data.drop(labels=class_, axis=1)
    y: pd.Series = data[class_]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [6]:
def load_dataset_from_id(id: int) -> DataFrame:
    """Fetch the OpenML dataset with the given id.

    Args:
        id: OpenML dataset id, forwarded as ``dataset_id``.

    Returns:
        The first element of what ``get_data(dataset_format="dataframe")``
        returns for that dataset.
    """
    openml_dataset = openml.datasets.get_dataset(dataset_id=id)
    fetched = openml_dataset.get_data(dataset_format="dataframe")
    return fetched[0]

In [7]:
def calculate_mse(model: Pipeline, X_test: pd.DataFrame, y_test: pd.Series):
    """Print the mean squared error of ``model`` on ``(X_test, y_test)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features; must be a pandas DataFrame or numpy array.
        y_test: True targets; must be a pandas Series or numpy array.

    Raises:
        ValueError: If ``X_test`` or ``y_test`` has an unsupported type.
    """
    # Validate the container types up front (features first, then target).
    features_ok = isinstance(X_test, (pd.DataFrame, np.ndarray))
    if not features_ok:
        raise ValueError("X_test must be a pandas DataFrame or numpy array")
    target_ok = isinstance(y_test, (pd.Series, np.ndarray))
    if not target_ok:
        raise ValueError("y_test must be a pandas Series or numpy array")

    # Predict, then report the error; nothing is returned.
    mse = mean_squared_error(y_test, model.predict(X_test))
    print(f"Mean Squared Error: {mse}")

In [8]:
def evaluate_pipeline(
    pipeline: Pipeline,
    X: pd.DataFrame,
    y: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
):
    """Fit ``pipeline`` on ``(X, y)`` and print its train/validation scores.

    Prints the ``"model"`` step of the pipeline, ``pipeline.score`` on the
    validation and on the training data, and the validation MSE (via
    ``calculate_mse``). Nothing is returned.

    Args:
        pipeline: Pipeline with a step named ``"model"``; fitted in place.
        X: Training features.
        y: Training target.
        X_val: Validation features; pandas DataFrame or numpy array.
        y_val: Validation target; pandas Series or numpy array.

    Raises:
        ValueError: If ``X_val`` or ``y_val`` has an unsupported type.
    """
    # The messages name the actual parameters so the caller can tell which
    # argument was rejected.
    if not isinstance(X_val, (pd.DataFrame, np.ndarray)):
        raise ValueError("X_val must be a pandas DataFrame or numpy array")
    if not isinstance(y_val, (pd.Series, np.ndarray)):
        raise ValueError("y_val must be a pandas Series or numpy array")

    pipeline.fit(X, y)

    test_score = pipeline.score(X_val, y_val)
    train_score = pipeline.score(X, y)
    print("Parameter set: " + str(pipeline.named_steps["model"]))
    print("Test score R^2: " + str(test_score))
    print("Train score R^2: " + str(train_score))
    calculate_mse(pipeline, X_val, y_val)

In [9]:
def evaluate_model_performance(model: Pipeline, X, y) -> float:
    """Fit ``model`` on ``(X, y)`` and return ``model.score`` on the same data.

    Note that the score is computed on the data the model was just fitted on,
    i.e. it is a training score rather than a held-out one.
    """
    model.fit(X=X, y=y)
    training_score = model.score(X=X, y=y)
    return training_score

In [10]:
def experiment(
    datasets: List[Tuple[DataFrame, Series]], model: Pipeline, config
) -> List[float]:
    """Score one configuration of ``model`` on every dataset.

    Args:
        datasets: ``(X, y)`` pairs to evaluate on.
        model: Pipeline to configure; mutated in place via ``set_params``.
        config: Keyword dictionary applied with ``model.set_params(**config)``.

    Returns:
        One ``evaluate_model_performance`` result per dataset, in input order.
    """

    def score_on(features, target) -> float:
        # The configuration is re-applied before every single evaluation.
        model.set_params(**config)
        return evaluate_model_performance(model=model, X=features, y=target)

    return [score_on(X, y) for X, y in datasets]

In [11]:
def find_optimal_configuration_accross_datasets(
    config_space,
    datasets: List[Tuple[DataFrame, Series]],
    model: Pipeline,
    summary_func,
):
    """Return the configuration with the best summarised score over datasets.

    Args:
        config_space: Iterable of ``set_params`` keyword dictionaries.
        datasets: ``(X, y)`` pairs each configuration is evaluated on.
        model: Pipeline that ``experiment`` configures and scores.
        summary_func: Callable reducing the per-dataset scores to one number
            (e.g. ``np.mean``).

    Returns:
        The configuration with the highest summary score (first one wins
        ties), or ``None`` when ``config_space`` is empty.
    """
    best_config = None
    # Start from -inf rather than 0: scores may be zero or negative, and such
    # a configuration must still be selectable when nothing better exists.
    best_summary_score = float("-inf")

    for config in config_space:
        performances = experiment(datasets=datasets, model=model, config=config)
        summary_score = summary_func(performances)

        if summary_score > best_summary_score:
            best_summary_score = summary_score
            best_config = config

    return best_config

In [12]:
def get_column_transformer() -> ColumnTransformer:
    """Build the preprocessing step shared by all model pipelines.

    Numeric columns are mean-imputed and min-max scaled; ``object`` columns
    are imputed with the most frequent value and one-hot encoded into a dense
    array. Columns matched by neither selector are dropped.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    num_pipeline = Pipeline(
        steps=[("impute", SimpleImputer(strategy="mean")), ("scale", MinMaxScaler())]
    )
    cat_pipeline = Pipeline(
        steps=[
            ("impute", SimpleImputer(strategy="most_frequent")),
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
            # the old keyword was removed in 1.4.
            (
                "one-hot",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ),
        ]
    )

    col_trans = ColumnTransformer(
        transformers=[
            (
                "num_pipeline",
                num_pipeline,
                make_column_selector(dtype_include=np.number),
            ),
            # NOTE(review): dtype_include=object does not match pandas
            # "category" columns; with remainder="drop" such columns would be
            # discarded. Confirm how the loaded frames encode categoricals.
            ("cat_pipeline", cat_pipeline, make_column_selector(dtype_include=object)),
        ],
        remainder="drop",
        n_jobs=-1,
    )
    return col_trans

In [13]:
def evaluate_pipeline_on_datasets(
    pipeline: Pipeline, optimal_config, datasets: List[Tuple[DataFrame, Series]]
):
    """Apply ``optimal_config`` and run ``evaluate_pipeline`` on every dataset.

    Each ``(X, y)`` pair is used both as training and as validation data, so
    the two scores ``evaluate_pipeline`` prints refer to the same data.

    Args:
        pipeline: Pipeline to configure and evaluate; mutated in place.
        optimal_config: Keyword dictionary applied via ``set_params``.
        datasets: ``(X, y)`` pairs to evaluate on.
    """
    for features, target in datasets:
        pipeline.set_params(**optimal_config)
        evaluation_kwargs = dict(
            pipeline=pipeline,
            X=features,
            y=target,
            X_val=features,
            y_val=target,
        )
        evaluate_pipeline(**evaluation_kwargs)

### Load datasets

In [14]:
# --- OpenML dataset ids (dataset page linked above each id) ---
# https://www.openml.org/search?type=data&id=43308&sort=runs&status=active
fish_market_dataset_id = 43308
# https://www.openml.org/search?type=data&id=8&sort=runs&status=active
liver_disorders_dataset_id = 8
# https://www.openml.org/search?type=data&id=44223&sort=runs&status=active
diabetes_dataset_id = 44223
# https://www.openml.org/search?type=data&id=43660&sort=runs&status=active
lisbona_house_prices_dataset_id = 43660

# --- Name of the regression target column in each dataset ---
fish_market_regression_class = "Weight"
liver_disorders_regression_class = "drinks"
diabetes_regression_class = "class"
lisbona_house_prices_regression_class = "Price"

# --- Download every dataset as a DataFrame ---
fish_market_dataset: DataFrame = load_dataset_from_id(id=fish_market_dataset_id)
liver_disorders_dataset: DataFrame = load_dataset_from_id(id=liver_disorders_dataset_id)
diabetes_dataset: DataFrame = load_dataset_from_id(id=diabetes_dataset_id)
lisbona_house_prices_dataset: DataFrame = load_dataset_from_id(
    id=lisbona_house_prices_dataset_id
)



### Split data into train and test sets

In [15]:
# One train/test split per dataset, always called with the same keyword form.
X_train_fish_market, X_test_fish_market, y_train_fish_market, y_test_fish_market = (
    split_dataset(data=fish_market_dataset, class_=fish_market_regression_class)
)

(
    X_train_liver_disorders,
    X_test_liver_disorders,
    y_train_liver_disorders,
    y_test_liver_disorders,
) = split_dataset(data=liver_disorders_dataset, class_=liver_disorders_regression_class)

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = split_dataset(
    data=diabetes_dataset, class_=diabetes_regression_class
)

(
    X_train_lisbona_house_prices,
    X_test_lisbona_house_prices,
    y_train_lisbona_house_prices,
    y_test_lisbona_house_prices,
) = split_dataset(
    data=lisbona_house_prices_dataset,
    class_=lisbona_house_prices_regression_class,
)

In [16]:
def get_datasets() -> List[Tuple[DataFrame, Series]]:
    """Return the training split of every dataset as ``(X, y)`` pairs.

    The pairs are read from the notebook-level ``X_train_*`` / ``y_train_*``
    variables, in the order fish market, liver disorders, diabetes, Lisbon
    house prices.
    """
    train_features = [
        X_train_fish_market,
        X_train_liver_disorders,
        X_train_diabetes,
        X_train_lisbona_house_prices,
    ]
    train_targets = [
        y_train_fish_market,
        y_train_liver_disorders,
        y_train_diabetes,
        y_train_lisbona_house_prices,
    ]
    return list(zip(train_features, train_targets))

### TODO: visualize the data

### Create generic column transformer

### Create pipelines

#### 1. Decision Tree

In [17]:
def get_decision_tree_pipeline() -> Pipeline:
    """Build an unfitted pipeline: column transformer + decision tree regressor.

    The steps are named ``"column_transformer"`` and ``"model"``.
    """
    steps = [
        ("column_transformer", get_column_transformer()),
        ("model", DecisionTreeRegressor()),
    ]
    return Pipeline(steps=steps)

In [18]:
def get_parameter_grid_decision_tree(n_samples: int = 100):
    """Draw random decision-tree configurations from the full parameter grid.

    The grid is the Cartesian product of the value ranges below. Instead of
    materialising every combination, ``n_samples`` distinct positions in that
    product are drawn and decoded. For a given ``random`` state this yields
    the same configurations as ``random.sample(list(product(...)), n)``.

    Args:
        n_samples: Number of distinct configurations to draw (default 100).

    Returns:
        A list of ``n_samples`` dictionaries keyed by ``model__*`` parameter
        names, suitable for ``Pipeline.set_params``.

    Raises:
        ValueError: If ``n_samples`` exceeds the number of combinations
            (raised by ``random.sample``).
    """
    # parameters space
    ccp_alpha_values = [i * 0.1 for i in range(11)]
    max_depth_values = range(1, 31, 1)
    min_samples_split_values = range(2, 61, 1)
    min_samples_leaf_values = range(1, 61, 1)

    parameter_names = [
        "model__ccp_alpha",
        "model__max_depth",
        "model__min_samples_split",
        "model__min_samples_leaf",
    ]
    axes = [
        ccp_alpha_values,
        max_depth_values,
        min_samples_split_values,
        min_samples_leaf_values,
    ]

    total_combinations = 1
    for values in axes:
        total_combinations *= len(values)

    def combination_at(index: int) -> tuple:
        # Decode a flat index the way itertools.product orders its output:
        # the last axis varies fastest.
        picked = []
        for values in reversed(axes):
            index, position = divmod(index, len(values))
            picked.append(values[position])
        picked.reverse()
        return tuple(picked)

    sampled_indices = random.sample(range(total_combinations), n_samples)
    parameters_grid = [
        dict(zip(parameter_names, combination_at(index)))
        for index in sampled_indices
    ]
    return parameters_grid

In [19]:
# Random search for the decision tree: every sampled configuration is scored
# on all training splits and the one with the best mean score is kept.
# NOTE: evaluate_model_performance scores each model on the data it was
# fitted on, so the selection criterion here is a training score.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
decison_tree_pipeline: Pipeline = get_decision_tree_pipeline()
parameters_grid_decision_tree = get_parameter_grid_decision_tree()

optimal_config_decision_tree = find_optimal_configuration_accross_datasets(
    config_space=parameters_grid_decision_tree,
    datasets=datasets,
    model=decison_tree_pipeline,
    summary_func=np.mean,  # Or np.median for a more robust approach
)

In [20]:
# Show the decision-tree configuration selected by the random search.
print(optimal_config_decision_tree)

{'model__ccp_alpha': 0.1, 'model__max_depth': 12, 'model__min_samples_split': 6, 'model__min_samples_leaf': 5}


In [21]:
# Fit a fresh decision-tree pipeline with the selected configuration on each
# dataset and print its scores. NOTE: `datasets` holds only the training
# splits and evaluate_pipeline_on_datasets validates on the data it fits on,
# so the "Test" and "Train" values printed for a dataset are the same number.
evaluate_pipeline_on_datasets(
    get_decision_tree_pipeline(), optimal_config_decision_tree, datasets
)

Parameter set: DecisionTreeRegressor(ccp_alpha=0.1, max_depth=12, min_samples_leaf=5,
                      min_samples_split=6)
Test score R^2: 0.9442644593770035
Train score R^2: 0.9442644593770035
Mean Squared Error: 6851.4708197725295
Parameter set: DecisionTreeRegressor(ccp_alpha=0.1, max_depth=12, min_samples_leaf=5,
                      min_samples_split=6)
Test score R^2: 0.4677313223280586
Train score R^2: 0.4677313223280586
Mean Squared Error: 5.9745003469969
Parameter set: DecisionTreeRegressor(ccp_alpha=0.1, max_depth=12, min_samples_leaf=5,
                      min_samples_split=6)
Test score R^2: 0.7948233807576879
Train score R^2: 0.7948233807576879
Mean Squared Error: 1246.7348014748864
Parameter set: DecisionTreeRegressor(ccp_alpha=0.1, max_depth=12, min_samples_leaf=5,
                      min_samples_split=6)
Test score R^2: 0.8189419523948761
Train score R^2: 0.8189419523948761
Mean Squared Error: 36995875727.45061


#### 2. ElasticNet

In [22]:
def get_elasticnet_pipeline() -> Pipeline:
    """Build an unfitted pipeline: column transformer + ``ElasticNet``.

    The regressor is created with ``max_iter=10000``; the steps are named
    ``"column_transformer"`` and ``"model"``.
    """
    steps = [
        ("column_transformer", get_column_transformer()),
        ("model", ElasticNet(max_iter=10000)),
    ]
    elasticnet_pipeline = Pipeline(steps=steps)
    return elasticnet_pipeline

In [23]:
def get_parameter_grid_for_elasticnet():
    """Draw 100 random ``ElasticNet`` configurations from a 21 x 21 grid.

    Both ``alpha`` and ``l1_ratio`` take the values ``i * 0.05`` for
    ``i = 0..20``; 100 distinct pairs are sampled with ``random.sample``.

    Returns:
        A list of 100 dictionaries with the keys ``model__alpha`` and
        ``model__l1_ratio``.
    """
    # parameters space
    # NOTE(review): the grid includes alpha == 0.0 - confirm this is intended
    # for ElasticNet.
    alpha_values = [step * 0.05 for step in range(21)]
    l1_ratio_values = [step * 0.05 for step in range(21)]

    # Same ordering as itertools.product: the second axis varies fastest.
    every_pair: List[Tuple[float, float]] = [
        (alpha, l1_ratio) for alpha in alpha_values for l1_ratio in l1_ratio_values
    ]
    sampled_pairs: List[Tuple[float, float]] = random.sample(every_pair, 100)

    return [
        {"model__alpha": alpha, "model__l1_ratio": l1_ratio}
        for alpha, l1_ratio in sampled_pairs
    ]

In [24]:
# Random search for ElasticNet: every sampled (alpha, l1_ratio) pair is scored
# on all training splits and the one with the best mean score is kept.
# NOTE: as for the other models, the score is computed on the data each model
# was fitted on.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
elastic_net_pipeline: Pipeline = get_elasticnet_pipeline()
parameters_grid_elasticnet = get_parameter_grid_for_elasticnet()
optimal_config_elasticnet = find_optimal_configuration_accross_datasets(
    config_space=parameters_grid_elasticnet,
    datasets=datasets,
    model=elastic_net_pipeline,
    summary_func=np.mean,
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_c

In [25]:
# Show the ElasticNet configuration selected by the random search.
print(optimal_config_elasticnet)

{'model__alpha': 0.0, 'model__l1_ratio': 0.30000000000000004}


In [26]:
# Fit a fresh ElasticNet pipeline with the selected configuration on each
# dataset and print its scores (training data is also used for validation).
evaluate_pipeline_on_datasets(
    get_elasticnet_pipeline(), optimal_config_elasticnet, datasets
)

Parameter set: ElasticNet(alpha=0.0, l1_ratio=0.30000000000000004, max_iter=10000)
Test score R^2: 0.9285956799012827
Train score R^2: 0.9285956799012827
Mean Squared Error: 8777.605995988224
Parameter set: ElasticNet(alpha=0.0, l1_ratio=0.30000000000000004, max_iter=10000)
Test score R^2: 0.22349020246245288
Train score R^2: 0.22349020246245288
Mean Squared Error: 8.716008003938809
Parameter set: ElasticNet(alpha=0.0, l1_ratio=0.30000000000000004, max_iter=10000)
Test score R^2: 0.5279193863361497
Train score R^2: 0.5279193863361497
Mean Squared Error: 2868.549702835578


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(


Parameter set: ElasticNet(alpha=0.0, l1_ratio=0.30000000000000004, max_iter=10000)
Test score R^2: 0.8448543637107147
Train score R^2: 0.8448543637107147
Mean Squared Error: 31701151955.04967


  model = cd_fast.enet_coordinate_descent(


#### 3. Random Forest

In [27]:
def get_random_forest_pipeline():
    """Build an unfitted pipeline: column transformer + random forest regressor.

    The steps are named ``"column_transformer"`` and ``"model"``.
    """
    steps = [
        ("column_transformer", get_column_transformer()),
        ("model", RandomForestRegressor()),
    ]
    return Pipeline(steps=steps)

In [28]:
def get_parameter_grid_for_random_forest(n_samples: int = 100):
    """Draw random random-forest configurations from the full parameter grid.

    The grid is the Cartesian product of the value ranges below (about 21
    million combinations). Instead of materialising it, ``n_samples`` distinct
    positions in the product are drawn and decoded. For a given ``random``
    state this yields the same configurations as
    ``random.sample(list(product(...)), n)``.

    Args:
        n_samples: Number of distinct configurations to draw (default 100).

    Returns:
        A list of ``n_samples`` dictionaries keyed by ``model__*`` parameter
        names, suitable for ``Pipeline.set_params``.

    Raises:
        ValueError: If ``n_samples`` exceeds the number of combinations
            (raised by ``random.sample``).
    """
    # parameters space
    max_depth_values = range(1, 31, 1)
    min_samples_split_values = range(2, 61, 1)
    min_samples_leaf_values = range(1, 61, 1)
    n_estimators_values = range(1, 200, 1)

    parameter_names = [
        "model__max_depth",
        "model__min_samples_split",
        "model__min_samples_leaf",
        "model__n_estimators",
    ]
    axes = [
        max_depth_values,
        min_samples_split_values,
        min_samples_leaf_values,
        n_estimators_values,
    ]

    total_combinations = 1
    for values in axes:
        total_combinations *= len(values)

    def combination_at(index: int) -> tuple:
        # Decode a flat index the way itertools.product orders its output:
        # the last axis varies fastest.
        picked = []
        for values in reversed(axes):
            index, position = divmod(index, len(values))
            picked.append(values[position])
        picked.reverse()
        return tuple(picked)

    sampled_indices = random.sample(range(total_combinations), n_samples)
    parameters_grid_random_forest = [
        dict(zip(parameter_names, combination_at(index)))
        for index in sampled_indices
    ]
    return parameters_grid_random_forest

In [29]:
# Random search for the random forest: every sampled configuration is scored
# on all training splits and the one with the best mean score is kept.
# NOTE: the score is computed on the data each model was fitted on.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
random_forest_pipeline: Pipeline = get_random_forest_pipeline()
parameters_grid_random_forest = get_parameter_grid_for_random_forest()

optimal_config_random_forest = find_optimal_configuration_accross_datasets(
    config_space=parameters_grid_random_forest,
    datasets=datasets,
    model=random_forest_pipeline,
    summary_func=np.mean,
)

In [None]:
# Show the random-forest configuration selected by the random search.
print(optimal_config_random_forest)

In [None]:
# Fit a fresh random-forest pipeline with the selected configuration on each
# dataset and print its scores (training data is also used for validation).
evaluate_pipeline_on_datasets(
    get_random_forest_pipeline(), optimal_config_random_forest, datasets
)

### Bayesian optimization - decision tree

In [None]:
# Search space for the decision-tree pipeline (defined once; the notebook
# previously assigned the identical dictionary twice).
decision_tree_params = {
    "model__ccp_alpha": Real(0.11, 1.21, prior="log-uniform"),
    "model__max_depth": Integer(1, 31, prior="log-uniform"),
    "model__min_samples_split": Integer(2, 61, prior="log-uniform"),
    "model__min_samples_leaf": Integer(2, 61, prior="log-uniform"),
}

# Four independent Bayesian searches over the same space, 30 iterations each.
bayes_runs = [(decision_tree_params, 30)] * 4

# NOTE: the winning run is chosen by its score on the fish-market test split.
out = get_bayes_best_configuration(
    get_decision_tree_pipeline(),
    bayes_runs,
    X_train_fish_market,
    y_train_fish_market,
    X_test_fish_market,
    y_test_fish_market
)

print(out[0])
print(out[1])
