### Imports

In [1]:
from typing import Any, Dict, Tuple

import numpy as np
import openml
import pandas as pd
from pandas import DataFrame
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# from evaluate_pipeline import evaluate_pipeline
# from load_dataset import load_dataset_from_id
# from perform_optimazaion_of_pipeline_with_random_search import (
#     perform_optimazaion_of_pipeline_with_random_search,
# )
# from split_dataset import split_dataset

### Utill functions

In [2]:
def perform_optimazaion_of_pipeline_with_random_search(
    pipeline: Pipeline,
    param_grid: Dict[str, Any],
    X: DataFrame,
    y: DataFrame,
    n_iter: int = 100,
) -> RandomizedSearchCV:
    search: RandomizedSearchCV = RandomizedSearchCV(
        pipeline,
        param_grid,
        n_iter=n_iter,
        n_jobs=-1,
        cv=5,
    )
    search.fit(X, y)
    return search

In [3]:
def split_dataset(
    data: pd.DataFrame, class_: str
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X: pd.DataFrame = data.drop(labels=class_, axis=1)
    y: pd.DataFrame = data[class_]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [4]:
def load_dataset_from_id(id: int) -> DataFrame:
    return openml.datasets.get_dataset(dataset_id=id).get_data(
        dataset_format="dataframe"
    )[0]

In [20]:
def calculate_mse(model: Pipeline, X_test: pd.DataFrame, y_test: pd.Series):
    # Ensure X_test and y_test are the correct types
    if not isinstance(X_test, (pd.DataFrame, np.ndarray)):
        raise ValueError("X_test must be a pandas DataFrame or numpy array")
    if not isinstance(y_test, (pd.Series, np.ndarray)):
        raise ValueError("y_test must be a pandas Series or numpy array")

    # Generating predictions and calculating MSE
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")

In [21]:
def evaluate_random_search_optimizer(
    model: RandomizedSearchCV, X_test: pd.DataFrame, y_test: pd.Series
):
    # Ensure X_test and y_test are the correct types
    if not isinstance(X_test, (pd.DataFrame, np.ndarray)):
        raise ValueError("X_test must be a pandas DataFrame or numpy array")
    if not isinstance(y_test, (pd.Series, np.ndarray)):
        raise ValueError("y_test must be a pandas Series or numpy array")

    # Evaluating the best model on the test set
    test_score = model.score(X_test, y_test)

    # Printing information about the best model and its performance
    print("Test Score of the best model: " + str(test_score))
    print("Best Score of train set: " + str(model.best_score_))
    print("Best parameter set: " + str(model.best_params_))

    calculate_mse(model, X_test, y_test)

### Load datasets

In [6]:
fish_market_dataset_id = (
    43308  # https://www.openml.org/search?type=data&id=43308&sort=runs&status=active
)
liver_disorders_dataset_id = (
    8  # https://www.openml.org/search?type=data&id=8&sort=runs&status=active
)
diabetes_dataset_id = (
    44223  # https://www.openml.org/search?type=data&id=44223&sort=runs&status=active
)

lisbona_house_prices_dataset_id = (
    43660  # https://www.openml.org/search?type=data&id=43660&sort=runs&status=active
)


fish_market_dataset: DataFrame = load_dataset_from_id(id=fish_market_dataset_id)
fish_market_regression_class = "Weight"

liver_disorders_dataset: DataFrame = load_dataset_from_id(id=liver_disorders_dataset_id)
liver_disorders_regression_class = "drinks"
diabetes_dataset: DataFrame = load_dataset_from_id(id=diabetes_dataset_id)
diabetes_regression_class = "class"

lisbona_house_prices_dataset: DataFrame = load_dataset_from_id(
    id=lisbona_house_prices_dataset_id
)
lisbona_house_prices_regression_class = "Price"



### Split data into train and test sets

In [7]:
(
    X_train_fish_market,
    X_test_fish_market,
    y_train_fish_market,
    y_test_fish_market,
) = split_dataset(data=fish_market_dataset, class_=fish_market_regression_class)

(
    X_train_liver_disorders,
    X_test_liver_disorders,
    y_train_liver_disorders,
    y_test_liver_disorders,
) = split_dataset(data=liver_disorders_dataset, class_=liver_disorders_regression_class)

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = split_dataset(
    diabetes_dataset, diabetes_regression_class
)

(
    X_train_lisbona_house_prices,
    X_test_lisbona_house_prices,
    y_train_lisbona_house_prices,
    y_test_lisbona_house_prices,
) = split_dataset(lisbona_house_prices_dataset, lisbona_house_prices_regression_class)

### TODO: visualize the data

### Create generic column transformer

In [8]:
num_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), ("scale", MinMaxScaler())]
)
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)


col_trans = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, make_column_selector(dtype_include=np.number)),
        ("cat_pipeline", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder="drop",
    n_jobs=-1,
)

### Create models

In [9]:
elastic_net = ElasticNet()
knn = KNeighborsRegressor()

### Create pipelines

#### 1. Decision Tree

In [12]:
decision_tree = DecisionTreeRegressor()

decision_tree_pipeline = Pipeline(
    steps=[("column_transformer", col_trans), ("model", decision_tree)]
)

decision_tree_params = {
    "model__ccp_alpha": [i * 0.1 for i in range(11)],
    "model__max_depth": range(1, 31, 1),
    "model__min_samples_split": range(2, 61, 1),
    "model__min_samples_leaf": range(1, 61, 1),
}

rs_decision_tree = perform_optimazaion_of_pipeline_with_random_search(
    decision_tree_pipeline,
    decision_tree_params,
    X_train_fish_market,
    y_train_fish_market,
)

In [15]:
evaluate_random_search_optimizer(
    rs_decision_tree, X_test_fish_market, y_test_fish_market
)

Test Score of the best model: 0.9509240510031336
Best Score of train set: 0.9338157406641239
Best parameter set: {'model__min_samples_split': 5, 'model__min_samples_leaf': 2, 'model__max_depth': 7, 'model__ccp_alpha': 0.1}
Mean Squared Error: 6980.527986111111
