In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, KBinsDiscretizer, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone
import os
import joblib
import itertools
import logging
import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from src.feature_engineering import *
from src.modeling import *

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration ---
# Directory locations used by the notebook (kept as plain strings with a
# trailing slash).
DATA_PATH = "data/"
OUTPUT_PATH = "output/"
BLUEPRINT_PATH = "output/blueprint/"
MODEL_PATH = "models/"

# One entry per optional preprocessing step. Each step is toggled
# individually through its 'enabled' flag.
PREPROCESSING_LIST = {
    'outlier_removal': dict(enabled=True),
    'robust_scaler': dict(enabled=True),
    'polynomial': dict(enabled=True),
    'binning': dict(enabled=True),
    'pca': dict(enabled=True),
}

In [3]:
# --- Setup logging ---
# Make sure the output directory exists before configuring logging:
# basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when the directory is missing (e.g. on a fresh checkout).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# NOTE: basicConfig does nothing if the root logger already has handlers,
# so re-running this cell in the same kernel keeps the first configuration.
logging.basicConfig(
    filename=os.path.join(OUTPUT_PATH, "08_optuna_optimization.log"),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logging.info("starting optuna optimization...")

In [4]:
logging.info("reading train and test data...")

# Build the file paths with os.path.join (as done for the log and result
# files) so loading works whether or not DATA_PATH ends with a separator.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
# For a quick smoke run, uncomment to work on a random subsample:
# df_train = df_train.sample(n=10000).reset_index(drop=True)

df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

target_col = "BeatsPerMinute"
# Every training column except 'id' and the target is used as a feature.
feature_cols = [f for f in df_train.columns if f not in ('id', target_col)]

# Copies with a fresh 0..n-1 index, so later steps never write back into the
# frames that were read from disk.
X_train = df_train[feature_cols].copy().reset_index(drop=True)
y_train = df_train[target_col].copy().reset_index(drop=True)

logging.info(f"X_train shape : {X_train.shape}")
logging.info(f"y_train shape : {y_train.shape}")

# The test frame is restricted to the same feature columns, in the same order.
X_test = df_test[feature_cols].copy().reset_index(drop=True)

logging.info(f"X_test shape : {X_test.shape}")

In [5]:
logging.info("defining preprocessing combinations...")

# Step name -> zero-argument factory, listed in the order the steps are
# appended. A factory is only called when its step is enabled, so disabled
# transformers are never instantiated.
# NOTE: StandardScaler is not offered here; PREPROCESSING_LIST has no
# "standardization" entry.
step_factories = {
    "outlier_removal": OutlierRemoval,
    "robust_scaler": RobustScaler,
    "polynomial": lambda: PolynomialFeatures(interaction_only=False, include_bias=False, degree=2),
    "binning": lambda: KBinsDiscretizer(encode="ordinal"),
    "pca": lambda: PCA(n_components=0.85),
}

# (name, transformer) pairs for every step switched on in the config.
available_steps = [
    (enabled_name, build_step())
    for enabled_name, build_step in step_factories.items()
    if PREPROCESSING_LIST[enabled_name]["enabled"]
]

def generate_all_combinations(steps):
    """Lazily yield every non-empty combination of ``steps``.

    Combinations come out grouped by size, from 1 up to ``len(steps)``.
    Each one is a tuple whose items keep the relative order they have in
    ``steps``. An empty ``steps`` yields nothing.
    """
    subset_sizes = range(1, len(steps) + 1)
    yield from itertools.chain.from_iterable(
        itertools.combinations(steps, size) for size in subset_sizes
    )

# Materialise the generator once so the combinations can be counted here and
# iterated again later.
preprocessing_combinations = [*generate_all_combinations(available_steps)]

logging.info(f"preprocessing combination count : {len(preprocessing_combinations)}")

BayesianRidge

In [None]:
logging.info("starting BayesianRidge optuna optimization...")

def objective_bay(trial, preprocessing_steps, X, y, n_splits=5):
    """Optuna objective: cross-validated RMSE of a BayesianRidge pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the BayesianRidge hyperparameters.
    preprocessing_steps : sequence of (name, transformer) tuples
        Steps placed in front of the regressor. Any sequence is accepted
        (list or tuple, e.g. a combination produced by
        ``itertools.combinations``); it is copied into a new list and is not
        modified.
    X, y
        Training features and target, passed to ``cross_val_score``.
    n_splits : int, default 5
        Number of folds for the shuffled ``KFold`` split.

    Returns
    -------
    float
        Mean RMSE over the folds (positive; lower is better).
    """
    # Hyperparameter suggestions (sampled in this fixed order).
    regressor_params = {
        "tol": trial.suggest_float("tol", 1e-8, 1e-2, log=True),
        "alpha_1": trial.suggest_float("alpha_1", 1e-9, 1e3, log=True),
        "alpha_2": trial.suggest_float("alpha_2", 1e-9, 1e3, log=True),
        "lambda_1": trial.suggest_float("lambda_1", 1e-9, 1e3, log=True),
        "lambda_2": trial.suggest_float("lambda_2", 1e-9, 1e3, log=True),
        "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
    }

    regressor = BayesianRidge(compute_score=False, verbose=False, **regressor_params)

    # list(...) lets callers pass a tuple of steps; `tuple + list` would
    # otherwise raise TypeError.
    model = Pipeline(list(preprocessing_steps) + [("regressor", regressor)])

    # Fixed random_state: every trial is scored on the same folds.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1)

    # The scorer returns negated RMSE; flip the sign so the study minimises RMSE.
    return -scores.mean()

model_name = "bayesian_ridge"
N_TRIALS = 30        # Optuna trials per preprocessing combination
SAMPLER_SEED = 42    # seed for the TPE sampler so a re-run proposes the same trials

results_path = os.path.join(OUTPUT_PATH, f"08_optuna_optimization_{model_name}_results.csv")

results = []
# Defined up front so `df_results` exists even when there is no combination
# to evaluate or no study produces a completed trial.
df_results = pd.DataFrame(columns=['model', 'steps', 'best_params', 'best_rmse'])

for combo in preprocessing_combinations:
    # Each combo is a tuple of (name, transformer) pairs.
    steps = list(combo)
    steps_name = [name for name, _ in steps]
    steps_label = ','.join(steps_name)

    # One independent, seeded study per preprocessing combination.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
    )
    study.optimize(lambda trial: objective_bay(trial, steps, X_train, y_train), n_trials=N_TRIALS)

    try:
        best_params = study.best_params
        best_rmse = study.best_value
    except ValueError:
        # Optuna raises ValueError when no trial completed (e.g. every trial
        # returned NaN). Skip this combination instead of aborting the sweep.
        logging.warning(f"model_name: {model_name}. steps : {steps_label}. no completed trial, skipping.")
        continue

    logging.info(f"model_name: {model_name}. steps : {steps_label}. best params: {best_params}. best rmse: {best_rmse}")

    results.append({
        'model' : model_name,
        'steps' : steps_label,
        'best_params' : best_params,
        'best_rmse' : best_rmse,
    })

    df_results = pd.DataFrame(results).sort_values('best_rmse', ascending=True)
    # Checkpoint after every combination so partial results survive an
    # interrupted run.
    df_results.to_csv(results_path, index=False)

In [None]:
# Preview only the first ten rows of the results table rather than dumping it all.
df_results.head(n=10)

LGBM