### Imports

In [1]:
import itertools
import random
from typing import Any, Dict, List, Tuple

import numpy as np
import openml
import pandas as pd
from pandas import DataFrame
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# from evaluate_pipeline import evaluate_pipeline
# from load_dataset import load_dataset_from_id
# from perform_optimazaion_of_pipeline_with_random_search import (
#     perform_optimazaion_of_pipeline_with_random_search,
# )
# from split_dataset import split_dataset

### Utill functions

In [2]:
def optimize_pipeline_over_params_combinations(
    pipeline: Pipeline,
    parameters_grid: List[dict],
    X: DataFrame,
    y: DataFrame,
    X_val: DataFrame,
    y_val: DataFrame,
) -> Pipeline:
    best_score = float("-inf")
    best_params = None

    for params in parameters_grid:
        # Update the pipeline parameters
        pipeline_params = {f"model__{key}": value for key, value in params.items()}
        pipeline.set_params(**pipeline_params)

        pipeline.fit(X, y)
        score = pipeline.score(X_val, y_val)

        if score > best_score:
            best_score = score
            best_params = pipeline_params

    pipeline.set_params(**best_params)
    return pipeline

In [3]:
def split_dataset(
    data: pd.DataFrame, class_: str
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    X: pd.DataFrame = data.drop(labels=class_, axis=1)
    y: pd.DataFrame = data[class_]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [4]:
def load_dataset_from_id(id: int) -> DataFrame:
    return openml.datasets.get_dataset(dataset_id=id).get_data(
        dataset_format="dataframe"
    )[0]

In [5]:
def calculate_mse(model: Pipeline, X_test: pd.DataFrame, y_test: pd.Series):
    # Ensure X_test and y_test are the correct types
    if not isinstance(X_test, (pd.DataFrame, np.ndarray)):
        raise ValueError("X_test must be a pandas DataFrame or numpy array")
    if not isinstance(y_test, (pd.Series, np.ndarray)):
        raise ValueError("y_test must be a pandas Series or numpy array")

    # Generating predictions and calculating MSE
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")

In [6]:
def evaluate_pipeline(
    pipeline: Pipeline,
    X: pd.DataFrame,
    y: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
):
    if not isinstance(X_val, (pd.DataFrame, np.ndarray)):
        raise ValueError("X_test must be a pandas DataFrame or numpy array")
    if not isinstance(y_val, (pd.Series, np.ndarray)):
        raise ValueError("y_test must be a pandas Series or numpy array")

    test_score = pipeline.score(X_val, y_val)
    train_score = pipeline.score(X, y)
    print("Parameter set: " + str(pipeline.named_steps["model"]))
    print("Test score R^2: " + str(test_score))
    print("Train score R^2: " + str(train_score))
    calculate_mse(pipeline, X_val, y_val)

### Load datasets

In [7]:
fish_market_dataset_id = (
    43308  # https://www.openml.org/search?type=data&id=43308&sort=runs&status=active
)
liver_disorders_dataset_id = (
    8  # https://www.openml.org/search?type=data&id=8&sort=runs&status=active
)
diabetes_dataset_id = (
    44223  # https://www.openml.org/search?type=data&id=44223&sort=runs&status=active
)

lisbona_house_prices_dataset_id = (
    43660  # https://www.openml.org/search?type=data&id=43660&sort=runs&status=active
)


fish_market_dataset: DataFrame = load_dataset_from_id(id=fish_market_dataset_id)
fish_market_regression_class = "Weight"

liver_disorders_dataset: DataFrame = load_dataset_from_id(id=liver_disorders_dataset_id)
liver_disorders_regression_class = "drinks"
diabetes_dataset: DataFrame = load_dataset_from_id(id=diabetes_dataset_id)
diabetes_regression_class = "class"

lisbona_house_prices_dataset: DataFrame = load_dataset_from_id(
    id=lisbona_house_prices_dataset_id
)
lisbona_house_prices_regression_class = "Price"



### Split data into train and test sets

In [8]:
(
    X_train_fish_market,
    X_test_fish_market,
    y_train_fish_market,
    y_test_fish_market,
) = split_dataset(data=fish_market_dataset, class_=fish_market_regression_class)

(
    X_train_liver_disorders,
    X_test_liver_disorders,
    y_train_liver_disorders,
    y_test_liver_disorders,
) = split_dataset(data=liver_disorders_dataset, class_=liver_disorders_regression_class)

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = split_dataset(
    diabetes_dataset, diabetes_regression_class
)

(
    X_train_lisbona_house_prices,
    X_test_lisbona_house_prices,
    y_train_lisbona_house_prices,
    y_test_lisbona_house_prices,
) = split_dataset(lisbona_house_prices_dataset, lisbona_house_prices_regression_class)

### TODO: visualize the data

### Create generic column transformer

In [9]:
num_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), ("scale", MinMaxScaler())]
)
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)


col_trans = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, make_column_selector(dtype_include=np.number)),
        ("cat_pipeline", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder="drop",
    n_jobs=-1,
)

### Create models TODO: create also otehr pipelines

In [10]:
elastic_net = ElasticNet()
knn = KNeighborsRegressor()

### Create pipelines

#### 1. Decision Tree

In [11]:
decision_tree = DecisionTreeRegressor()
decision_tree_pipeline = Pipeline(
    steps=[("column_transformer", col_trans), ("model", decision_tree)]
)

#### Select parameters

In [16]:
# parameters space
ccp_alpha_values = [i * 0.1 for i in range(11)]
max_depth_values = range(1, 31, 1)
min_samples_split_values = range(2, 61, 1)
min_samples_leaf_values = range(1, 61, 1)

all_combinations = list(
    itertools.product(
        ccp_alpha_values,
        max_depth_values,
        min_samples_split_values,
        min_samples_leaf_values,
    )
)
selected_combinations: List[Tuple[float, int, int, int]] = random.sample(
    all_combinations, 100
)
parameter_names = ["ccp_alpha", "max_depth", "min_samples_split", "min_samples_leaf"]

parameters_grid = [
    dict(zip(parameter_names, combination)) for combination in selected_combinations
]

#### Train and evaluate the model on fish market dataset

In [17]:
ecision_tree_pipeline = optimize_pipeline_over_params_combinations(
    pipeline=decision_tree_pipeline,
    parameters_grid=parameters_grid,
    X=X_train_fish_market,
    y=y_train_fish_market,
    X_val=X_test_fish_market,
    y_val=y_test_fish_market,
)

evaluate_pipeline(
    pipeline=decision_tree_pipeline,
    X=X_train_fish_market,
    y=y_train_fish_market,
    X_val=X_test_fish_market,
    y_val=y_test_fish_market,
)

Parameter set: DecisionTreeRegressor(ccp_alpha=1.0, max_depth=29, min_samples_leaf=4)
Test score R^2: 0.8287637344030986
Train score R^2: 0.7815269691092457
Mean Squared Error: 24356.524298952434


#### Train the model on liver disorders dataset

In [18]:
decision_tree_pipeline = optimize_pipeline_over_params_combinations(
    pipeline=decision_tree_pipeline,
    parameters_grid=parameters_grid,  
    X=X_train_lisbona_house_prices,
    y=y_train_lisbona_house_prices,
    X_val=X_test_lisbona_house_prices,
    y_val=y_test_lisbona_house_prices,
)

evaluate_pipeline(
    pipeline=decision_tree_pipeline,
    X=X_train_lisbona_house_prices,
    y=y_train_lisbona_house_prices,
    X_val=X_test_lisbona_house_prices,
    y_val=y_test_lisbona_house_prices,
)

Parameter set: DecisionTreeRegressor(ccp_alpha=0.9, max_depth=22, min_samples_split=20)
Test score R^2: 0.5555960561491127
Train score R^2: 0.5362524536474875
Mean Squared Error: 25006470422.091454
