### Imports

In [None]:
import itertools
import random
from typing import Any, Dict, List, Tuple

import numpy as np
from pandas import DataFrame, Series
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Integer, Real

from utills.dataset import load_dataset_from_id, split_dataset
from utills.pipeline import evaluate_pipeline_on_datasets, get_column_transformer

### Utility functions

In [None]:
def perform_optimazaion_of_pipeline_with_bayesian_method(
    pipeline: Pipeline,
    search_space: Dict[str, Any],
    X: DataFrame,
    y: DataFrame,
    n_iter=100,
) -> BayesSearchCV:
    """Run a cross-validated Bayesian hyper-parameter search and return it fitted.

    Args:
        pipeline: Estimator handed to ``BayesSearchCV`` as the model to tune.
        search_space: Forwarded unchanged as ``search_spaces``. Callers in this
            notebook also pass a list of ``(space, number_of_evaluations)``
            tuples instead of a single dict.
        X: Features the search is fitted on.
        y: Targets the search is fitted on.
        n_iter: Forwarded as ``n_iter`` to ``BayesSearchCV``.

    Returns:
        The ``BayesSearchCV`` instance after ``fit(X, y)`` has been called.
    """
    search_settings = {
        "search_spaces": search_space,
        "n_iter": n_iter,
        "n_jobs": -1,  # scikit-learn convention: use every available core
        "cv": 5,  # 5-fold cross-validation
    }
    bayes_search: BayesSearchCV = BayesSearchCV(pipeline, **search_settings)
    bayes_search.fit(X, y)
    return bayes_search

In [None]:
def get_bayes_best_configuration(
    pipeline: Pipeline,
    search,  #: List[(Dict[str, object], int)],
    X_train,
    Y_train,
    X_test,
    Y_test,
):
    """Run one Bayesian search per entry of ``search`` and keep the best one.

    Every entry of ``search`` is a ``(search_space, number_of_evaluations)``
    tuple. Each entry is optimised on the training data on its own, and the
    resulting search is scored on the test data.

    Args:
        pipeline: Pipeline to optimise.
        search: Iterable of ``(search_space, number_of_evaluations)`` tuples.
        X_train: Training features used to fit each search.
        Y_train: Training targets used to fit each search.
        X_test: Features used to score each fitted search.
        Y_test: Targets used to score each fitted search.

    Returns:
        ``(best_params, n_evaluations)`` of the entry with the highest test
        score; the first one wins on ties. ``(None, None)`` when ``search``
        is empty.
    """
    best_score = float("-inf")
    best_params = None
    best_n_iter = None
    for config in search:
        model = perform_optimazaion_of_pipeline_with_bayesian_method(
            pipeline, [config], X_train, Y_train
        )
        score = model.score(X_test, Y_test)
        if score > best_score:
            # Track the running maximum; without this update every later
            # entry would overwrite the result regardless of its score.
            best_score = score
            best_params = model.best_params_
            best_n_iter = config[1]
    return (best_params, best_n_iter)

In [None]:
def optimize_pipeline_over_params_combinations(
    pipeline: Pipeline,
    parameters_grid: List[dict],
    X: DataFrame,
    y: DataFrame,
    X_val: DataFrame,
    y_val: DataFrame,
) -> Pipeline:
    """Pick the best configuration from ``parameters_grid`` for one dataset.

    Each configuration is applied to ``pipeline``, fitted on ``(X, y)`` and
    scored on ``(X_val, y_val)``. The pipeline is then set to the best
    configuration and refitted on ``(X, y)``, so the returned estimator's
    fitted state matches its parameters.

    Args:
        pipeline: Pipeline to tune; it is modified in place.
        parameters_grid: Configurations to try, each a mapping of parameter
            name to value accepted by ``pipeline.set_params``.
        X: Training features.
        y: Training targets.
        X_val: Validation features used for scoring.
        y_val: Validation targets used for scoring.

    Returns:
        The same ``pipeline`` object, configured with and fitted on the
        best-scoring configuration (the first one wins on ties).

    Raises:
        ValueError: If ``parameters_grid`` is empty.
    """
    # best_params is the theta^(j)*: the best configuration for this dataset.
    best_score = float("-inf")
    best_params = None

    for params in parameters_grid:
        pipeline.set_params(**params)
        pipeline.fit(X, y)
        score = pipeline.score(X_val, y_val)

        # Always accept the first configuration so a NaN or -inf score cannot
        # leave best_params unset.
        if best_params is None or score > best_score:
            best_score = score
            best_params = dict(params)

    if best_params is None:
        raise ValueError("parameters_grid must contain at least one configuration")

    pipeline.set_params(**best_params)
    # Refit: the last fit in the loop used the last configuration, not the best.
    pipeline.fit(X, y)
    return pipeline

In [None]:
def evaluate_model_performance(model: Pipeline, X, y) -> float:
    """Fit ``model`` on ``(X, y)`` and return its score on that same data.

    The score is computed on the data the model was just fitted on, so it is
    a training score rather than a held-out estimate.

    Args:
        model: Pipeline to fit; it is fitted in place.
        X: Features used both for fitting and scoring.
        y: Targets used both for fitting and scoring.

    Returns:
        Whatever ``model.score(X=X, y=y)`` returns after fitting.
    """
    model.fit(X=X, y=y)
    training_score: float = model.score(X=X, y=y)
    return training_score

In [None]:
def experiment(
    datasets: List[Tuple[DataFrame, Series]], model: Pipeline, config
) -> List[float]:
    """Score one configuration of ``model`` on every dataset.

    Args:
        datasets: ``(X, y)`` pairs to evaluate on.
        model: Pipeline that is reconfigured and refitted for each dataset.
        config: Mapping of parameter names to values, applied through
            ``model.set_params``.

    Returns:
        One score per dataset, in the order of ``datasets``.
    """

    def score_on(features, target) -> float:
        # The configuration is re-applied before each fit, once per dataset.
        model.set_params(**config)
        return evaluate_model_performance(model=model, X=features, y=target)

    return [score_on(X, y) for X, y in datasets]

In [None]:
def find_optimal_configuration_accross_datasets(
    config_space,
    datasets: List[Tuple[DataFrame, Series]],
    model: Pipeline,
    summary_func,
):
    """Find the configuration with the best summarised score over all datasets.

    Each configuration is evaluated on every dataset with ``experiment``. The
    per-dataset scores are reduced to one number with ``summary_func``, and
    the configuration with the largest summary wins.

    Args:
        config_space: Iterable of configurations (parameter-name -> value
            mappings) to try.
        datasets: ``(X, y)`` pairs each configuration is evaluated on.
        model: Pipeline that is reconfigured and refitted for every run.
        summary_func: Callable reducing a list of scores to a single
            comparable number, e.g. ``np.mean``.

    Returns:
        The configuration with the highest summary score (the first one wins
        on ties). ``None`` if ``config_space`` is empty or no summary score
        compares greater than negative infinity (e.g. all NaN).
    """
    best_config = None
    # Start below every possible score. For scikit-learn regressors ``score``
    # is R^2, which can be negative, so starting at 0 would discard every
    # configuration of a poorly fitting model and return None.
    best_summary_score = float("-inf")

    for config in config_space:
        performances = experiment(datasets=datasets, model=model, config=config)
        summary_score = summary_func(performances)

        if summary_score > best_summary_score:
            best_summary_score = summary_score
            best_config = config

    return best_config

### Load datasets

In [None]:
# OpenML dataset ids (the dataset page is linked above each id).

# https://www.openml.org/search?type=data&id=43308&sort=runs&status=active
fish_market_dataset_id = 43308
# https://www.openml.org/search?type=data&id=8&sort=runs&status=active
liver_disorders_dataset_id = 8
# https://www.openml.org/search?type=data&id=44223&sort=runs&status=active
diabetes_dataset_id = 44223
# https://www.openml.org/search?type=data&id=43660&sort=runs&status=active
lisbona_house_prices_dataset_id = 43660


# Load each dataset and record the column that is later passed to
# split_dataset as the regression target.
fish_market_dataset: DataFrame = load_dataset_from_id(id=fish_market_dataset_id)
fish_market_regression_class = "Weight"

liver_disorders_dataset: DataFrame = load_dataset_from_id(id=liver_disorders_dataset_id)
liver_disorders_regression_class = "drinks"

diabetes_dataset: DataFrame = load_dataset_from_id(id=diabetes_dataset_id)
diabetes_regression_class = "class"

lisbona_house_prices_dataset: DataFrame = load_dataset_from_id(
    id=lisbona_house_prices_dataset_id
)
lisbona_house_prices_regression_class = "Price"

### Split data into train and test sets

In [None]:
# Split every dataset into train/test features and targets. Each call
# returns, in order: X_train, X_test, y_train, y_test.

# Fish market
X_train_fish_market, X_test_fish_market, y_train_fish_market, y_test_fish_market = (
    split_dataset(data=fish_market_dataset, class_=fish_market_regression_class)
)

# Liver disorders
(
    X_train_liver_disorders,
    X_test_liver_disorders,
    y_train_liver_disorders,
    y_test_liver_disorders,
) = split_dataset(data=liver_disorders_dataset, class_=liver_disorders_regression_class)

# Diabetes
(
    X_train_diabetes,
    X_test_diabetes,
    y_train_diabetes,
    y_test_diabetes,
) = split_dataset(diabetes_dataset, diabetes_regression_class)

# Lisbon house prices
(
    X_train_lisbona_house_prices,
    X_test_lisbona_house_prices,
    y_train_lisbona_house_prices,
    y_test_lisbona_house_prices,
) = split_dataset(lisbona_house_prices_dataset, lisbona_house_prices_regression_class)

In [None]:
def get_datasets() -> List[Tuple[DataFrame, Series]]:
    """Return the training split of every loaded dataset as ``(X, y)`` pairs.

    The order is: fish market, liver disorders, diabetes, Lisbon house prices.
    """
    train_features = [
        X_train_fish_market,
        X_train_liver_disorders,
        X_train_diabetes,
        X_train_lisbona_house_prices,
    ]
    train_targets = [
        y_train_fish_market,
        y_train_liver_disorders,
        y_train_diabetes,
        y_train_lisbona_house_prices,
    ]
    return list(zip(train_features, train_targets))

### TODO: visualize the data

### Create generic column transformer

### Create pipelines

#### 1. Decision Tree

In [None]:
def get_decision_tree_pipeline() -> Pipeline:
    """Build a fresh, unfitted pipeline: column transformer -> decision tree.

    Returns:
        A ``Pipeline`` with the steps ``"column_transformer"`` and ``"model"``;
        the model is a ``DecisionTreeRegressor`` with default parameters.
    """
    regressor = DecisionTreeRegressor()
    preprocessing: ColumnTransformer = get_column_transformer()
    steps = [
        ("column_transformer", preprocessing),
        ("model", regressor),
    ]
    return Pipeline(steps=steps)

In [None]:
def get_parameter_grid_decision_tree(n_samples: int = 100) -> List[Dict[str, Any]]:
    """Randomly sample decision-tree configurations from the full grid.

    The grid is the Cartesian product of the value lists below. It is never
    built in memory: distinct flat indices into the product are sampled and
    each index is decoded into its combination. With the same ``random`` seed
    this gives exactly the combinations that sampling the materialised
    product would give.

    Args:
        n_samples: Number of distinct configurations to draw.

    Returns:
        ``n_samples`` dicts mapping pipeline parameter names
        (``model__...``) to values.

    Raises:
        ValueError: If ``n_samples`` exceeds the number of combinations
            (raised by ``random.sample``).
    """
    parameter_names = [
        "model__ccp_alpha",
        "model__max_depth",
        "model__min_samples_split",
        "model__min_samples_leaf",
    ]
    # Value lists, in the same order as parameter_names.
    parameter_values = [
        [i * 0.1 for i in range(11)],
        range(1, 31, 1),
        range(2, 61, 1),
        range(1, 61, 1),
    ]

    total_combinations = 1
    for values in parameter_values:
        total_combinations *= len(values)

    parameters_grid = []
    for flat_index in random.sample(range(total_combinations), n_samples):
        # Decode the flat index as a mixed-radix number. The last parameter
        # varies fastest, matching the order of itertools.product.
        remainder = flat_index
        combination = []
        for values in reversed(parameter_values):
            remainder, position = divmod(remainder, len(values))
            combination.append(values[position])
        combination.reverse()
        parameters_grid.append(dict(zip(parameter_names, combination)))
    return parameters_grid

In [None]:
# Search the sampled decision-tree grid for the configuration whose mean
# score over all training datasets is highest.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
decison_tree_pipeline: Pipeline = get_decision_tree_pipeline()
parameters_grid_decision_tree = get_parameter_grid_decision_tree()

optimal_config_decision_tree = find_optimal_configuration_accross_datasets(
    model=decison_tree_pipeline,
    datasets=datasets,
    config_space=parameters_grid_decision_tree,
    summary_func=np.mean,  # Or np.median for a more robust approach
)

In [None]:
# Show the decision-tree configuration selected by the search above.
print(optimal_config_decision_tree)

In [None]:
# Evaluate a freshly built decision-tree pipeline with the selected
# configuration on `datasets` (the training splits from get_datasets()).
# NOTE(review): what evaluate_pipeline_on_datasets computes and reports is
# defined in utills.pipeline — confirm there.
evaluate_pipeline_on_datasets(
    get_decision_tree_pipeline(), optimal_config_decision_tree, datasets
)

#### 2. ElasticNet

In [None]:
def get_elasticnet_pipeline() -> Pipeline:
    """Build a fresh, unfitted pipeline: column transformer -> ElasticNet.

    Returns:
        A ``Pipeline`` with the steps ``"column_transformer"`` and ``"model"``;
        the model is an ``ElasticNet`` created with ``max_iter=10000``.
    """
    regressor = ElasticNet(max_iter=10000)
    preprocessing: ColumnTransformer = get_column_transformer()
    steps = [
        ("column_transformer", preprocessing),
        ("model", regressor),
    ]
    return Pipeline(steps=steps)

In [None]:
def get_parameter_grid_for_elasticnet(n_samples: int = 100) -> List[Dict[str, float]]:
    """Randomly sample ElasticNet configurations from the full grid.

    The grid is the Cartesian product of 21 ``alpha`` values and 21
    ``l1_ratio`` values, each ``0.0, 0.05, ..., 1.0`` (441 combinations).

    Args:
        n_samples: Number of distinct configurations to draw.

    Returns:
        ``n_samples`` dicts with the keys ``"model__alpha"`` and
        ``"model__l1_ratio"``.

    Raises:
        ValueError: If ``n_samples`` exceeds the number of combinations
            (raised by ``random.sample``).
    """
    # NOTE(review): the grid includes alpha == 0.0; the scikit-learn docs
    # advise against alpha=0 for ElasticNet — confirm this is intended.
    alpha_values = [i * 0.05 for i in range(21)]
    l1_ratio_values = [i * 0.05 for i in range(21)]

    all_combinations: List[Tuple[float, float]] = list(
        itertools.product(alpha_values, l1_ratio_values)
    )
    selected_combinations = random.sample(all_combinations, n_samples)

    parameter_names = [
        "model__alpha",
        "model__l1_ratio",
    ]
    return [
        dict(zip(parameter_names, combination)) for combination in selected_combinations
    ]

In [None]:
# Search the sampled ElasticNet grid for the configuration whose mean score
# over all training datasets is highest.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
elastic_net_pipeline: Pipeline = get_elasticnet_pipeline()
parameters_grid_elasticnet = get_parameter_grid_for_elasticnet()

optimal_config_elasticnet = find_optimal_configuration_accross_datasets(
    model=elastic_net_pipeline,
    datasets=datasets,
    config_space=parameters_grid_elasticnet,
    summary_func=np.mean,
)

In [None]:
# Show the ElasticNet configuration selected by the search above.
print(optimal_config_elasticnet)

In [None]:
# Evaluate a freshly built ElasticNet pipeline with the selected
# configuration on `datasets` (the training splits from get_datasets()).
# NOTE(review): what evaluate_pipeline_on_datasets computes and reports is
# defined in utills.pipeline — confirm there.
evaluate_pipeline_on_datasets(
    get_elasticnet_pipeline(), optimal_config_elasticnet, datasets
)

#### 3. Random Forest

In [None]:
def get_random_forest_pipeline() -> Pipeline:
    """Build a fresh, unfitted pipeline: column transformer -> random forest.

    Returns:
        A ``Pipeline`` with the steps ``"column_transformer"`` and ``"model"``;
        the model is a ``RandomForestRegressor`` with default parameters.
    """
    random_forest = RandomForestRegressor()
    col_trans: ColumnTransformer = get_column_transformer()
    random_forest_pipeline = Pipeline(
        steps=[("column_transformer", col_trans), ("model", random_forest)]
    )
    return random_forest_pipeline

In [None]:
def get_parameter_grid_for_random_forest(n_samples: int = 100) -> List[Dict[str, int]]:
    """Randomly sample random-forest configurations from the full grid.

    The grid is the Cartesian product of the value ranges below, roughly 21
    million combinations. It is never built in memory: distinct flat indices
    into the product are sampled and each index is decoded into its
    combination. With the same ``random`` seed this gives exactly the
    combinations that sampling the materialised product would give.

    Args:
        n_samples: Number of distinct configurations to draw.

    Returns:
        ``n_samples`` dicts mapping pipeline parameter names
        (``model__...``) to integer values.

    Raises:
        ValueError: If ``n_samples`` exceeds the number of combinations
            (raised by ``random.sample``).
    """
    parameter_names = [
        "model__max_depth",
        "model__min_samples_split",
        "model__min_samples_leaf",
        "model__n_estimators",
    ]
    # Value ranges, in the same order as parameter_names.
    parameter_values = [
        range(1, 31, 1),
        range(2, 61, 1),
        range(1, 61, 1),
        range(1, 200, 1),
    ]

    total_combinations = 1
    for values in parameter_values:
        total_combinations *= len(values)

    parameters_grid_random_forest = []
    for flat_index in random.sample(range(total_combinations), n_samples):
        # Decode the flat index as a mixed-radix number. The last parameter
        # varies fastest, matching the order of itertools.product.
        remainder = flat_index
        combination = []
        for values in reversed(parameter_values):
            remainder, position = divmod(remainder, len(values))
            combination.append(values[position])
        combination.reverse()
        parameters_grid_random_forest.append(dict(zip(parameter_names, combination)))
    return parameters_grid_random_forest

In [None]:
# Search the sampled random-forest grid for the configuration whose mean
# score over all training datasets is highest.
datasets: List[Tuple[DataFrame, Series]] = get_datasets()
random_forest_pipeline: Pipeline = get_random_forest_pipeline()
parameters_grid_random_forest = get_parameter_grid_for_random_forest()

optimal_config_random_forest = find_optimal_configuration_accross_datasets(
    model=random_forest_pipeline,
    datasets=datasets,
    config_space=parameters_grid_random_forest,
    summary_func=np.mean,
)

In [None]:
# Show the random-forest configuration selected by the search above.
print(optimal_config_random_forest)

In [None]:
# Evaluate a freshly built random-forest pipeline with the selected
# configuration on `datasets` (the training splits from get_datasets()).
# NOTE(review): what evaluate_pipeline_on_datasets computes and reports is
# defined in utills.pipeline — confirm there.
evaluate_pipeline_on_datasets(
    get_random_forest_pipeline(), optimal_config_random_forest, datasets
)

### Bayesian optimization - decision tree

In [None]:
# Search space for the Bayesian optimisation of the decision-tree pipeline.
# (The original cell defined this dict twice with identical content.)
decision_tree_params = {
    "model__ccp_alpha": Real(0.11, 1.21, prior="log-uniform"),
    "model__max_depth": Integer(1, 31, prior="log-uniform"),
    "model__min_samples_split": Integer(2, 61, prior="log-uniform"),
    "model__min_samples_leaf": Integer(2, 61, prior="log-uniform"),
}

# Each entry is (search space, number of evaluations). The same space is
# searched four independent times on the fish-market training split, and the
# run with the best score on the fish-market test split is kept.
out = get_bayes_best_configuration(
    get_decision_tree_pipeline(),
    [(decision_tree_params, 30) for _ in range(4)],
    X_train_fish_market,
    y_train_fish_market,
    X_test_fish_market,
    y_test_fish_market,
)

print(out[0])  # best parameters found
print(out[1])  # number of evaluations configured for the winning entry