### IMPORTS

In [1]:
# IMPORTS AND PRINTS ALL VERSIONS SO WE CAN REPRODUCE RESULTS EXACTLY LATER              

import sys                                           # STANDARD LIB TO ACCESS PYTHON RUNTIME DETAILS
import platform                                      # STANDARD LIB TO GET OS/PLATFORM INFORMATION
import warnings                                      # STANDARD LIB TO CONTROL WARNING MESSAGES
import os                                            # STANDARD LIB TO QUERY CPU COUNT FOR PARALLELISM
from pathlib import Path                             # STANDARD LIB FOR SAFE, CROSS-PLATFORM PATH HANDLING
from typing import Dict, Any, Tuple                  # TYPE HINTS FOR CLARITY

import numpy as np                                   # NUMERICAL COMPUTING
import pandas as pd                                  # DATAFRAMES AND DATA MANIPULATION

import matplotlib                                    # BASE PLOTTING BACKEND
import matplotlib.pyplot as plt                      # STATEFUL PLOTTING INTERFACE
from matplotlib import rcParams                      # IMPORTS RCPARAMS TO SET GLOBAL STYLES
import seaborn as sns                                # STATISTICAL PLOTTING BUILT ON TOP OF MATPLOTLIB
import plotly                                        # INTERACTIVE PLOTTING (NOT USED HERE BUT KEPT FOR CONSISTENCY)

import sklearn                                       # SCIKIT-LEARN: CLASSIC MACHINE LEARNING
from sklearn.model_selection import train_test_split # TRAIN/TEST SPLITTING
from sklearn.model_selection import KFold, cross_val_score  # K-FOLD SPLITTING AND CV
from sklearn.pipeline import Pipeline                # TO BUILD CLEAN, REUSABLE PREPROCESSING PIPELINES
from sklearn.impute import SimpleImputer             # SIMPLE STRATEGIES TO IMPUTE MISSING VALUES
from sklearn.preprocessing import (                  # SCALERS + TRANSFORMS
    StandardScaler, MinMaxScaler, RobustScaler, 
    PolynomialFeatures, PowerTransformer
)  
from sklearn.compose import ColumnTransformer        # APPLY TRANSFORMS TO COLUMNS (WE USE ALL NUMERIC)
from sklearn.metrics import (                        # METRICS FOR REGRESSION
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    explained_variance_score,
)
from sklearn.multioutput import MultiOutputRegressor    # WRAPS SINGLE-OUTPUT MODELS FOR MULTI-OUTPUT TARGETS
from sklearn.linear_model import (                      # LINEAR FAMILY
    LinearRegression, Ridge, Lasso, ElasticNet
)  
from sklearn.neighbors import KNeighborsRegressor       # KNN REGRESSOR
from sklearn.svm import SVR                             # SUPPORT VECTOR REGRESSION
from sklearn.compose import TransformedTargetRegressor  # FOR SCALING TARGETS

import optuna                                        # HYPERPARAMETER OPTIMIZATION
from optuna.samplers import TPESampler               # ADVANCED SAMPLER
from optuna.pruners import MedianPruner              # EARLY STOPPING PRUNER
import shap                                          # MODEL INTERPRETABILITY 
import joblib                                        # SERIALIZATION (SAVING/LOADING MODELS)

warnings.filterwarnings("ignore")                    # SUPPRESSES NON-CRITICAL WARNINGS FOR CLEANER OUTPUT

print("PYTHON:", sys.version)                        # PRINTS PYTHON VERSION
print("OS:", platform.platform())                    # PRINTS OPERATING SYSTEM DETAILS
print("NUMPY:", np.__version__)                      # PRINTS NUMPY VERSION
print("PANDAS:", pd.__version__)                     # PRINTS PANDAS VERSION
print("SCIKIT-LEARN:", sklearn.__version__)          # PRINTS SCIKIT-LEARN VERSION
print("MATPLOTLIB:", matplotlib.__version__)         # PRINTS MATPLOTLIB VERSION
print("SEABORN:", sns.__version__)                   # PRINTS SEABORN VERSION
print("PLOTLY:", plotly.__version__)                 # PRINTS PLOTLY VERSION
print("OPTUNA:", optuna.__version__)                 # PRINTS OPTUNA VERSION
print("SHAP:", shap.__version__)                     # PRINTS SHAP VERSION

  from .autonotebook import tqdm as notebook_tqdm


PYTHON: 3.12.4 (tags/v3.12.4:8e8a4ba, Jun  6 2024, 19:30:16) [MSC v.1940 64 bit (AMD64)]
OS: Windows-11-10.0.22631-SP0
NUMPY: 2.1.3
PANDAS: 2.3.1
SCIKIT-LEARN: 1.7.1
MATPLOTLIB: 3.10.3
SEABORN: 0.13.2
PLOTLY: 6.2.0
OPTUNA: 4.4.0
SHAP: 0.48.0


### PLOTTING SETUP

In [2]:
# GLOBAL PLOTTING STYLE TO KEEP ALL FIGURES CRISP, BOLD, UPPERCASE, AND HIGH-IMPACT      
FIGSIZE = (8, 6)                                      # DEFAULT FIGURE SIZE
DPI = 500                                             # HIGH RESOLUTION
LINEWIDTH = 2.0                                       # BOLD LINE WIDTH
GRID_LINEWIDTH = 1.5                                  # BOLD GRID LINES
FONTSIZE_TITLE = 20                                   # LARGE TITLE SIZE
FONTSIZE_LABEL = 16                                   # LARGE AXIS LABEL SIZE
FONTSIZE_TICK = 14                                    # LARGE TICK LABEL SIZE
FONTSIZE_LEGEND = 14                                  # LARGE LEGEND FONT SIZE

def init_plot_style() -> None:                        # DEFINES A FUNCTION TO INITIALIZE GLOBAL STYLE
    """SET GLOBAL MATPLOTLIB STYLE FOR BOLD, UPPERCASE, HIGH-RES FIGURES."""  
    rcParams["figure.figsize"] = FIGSIZE              # SETS FIGURE SIZE
    rcParams["figure.dpi"] = DPI                      # SETS DPI
    rcParams["savefig.dpi"] = DPI                     # HIGH-RES SAVED FIGURES
    rcParams["font.weight"] = "bold"                  # MAKES TEXT BOLD
    rcParams["axes.titleweight"] = "bold"             # BOLD TITLES
    rcParams["axes.labelweight"] = "bold"             # BOLD LABELS
    rcParams["axes.titlesize"] = FONTSIZE_TITLE       # TITLE FONT SIZE
    rcParams["axes.labelsize"] = FONTSIZE_LABEL       # LABEL FONT SIZE
    rcParams["xtick.labelsize"] = FONTSIZE_TICK       # X-TICK LABEL SIZE
    rcParams["ytick.labelsize"] = FONTSIZE_TICK       # Y-TICK LABEL SIZE
    rcParams["legend.fontsize"] = FONTSIZE_LEGEND     # LEGEND FONT SIZE
    rcParams["legend.title_fontsize"] = FONTSIZE_LEGEND  # LEGEND TITLE FONT SIZE
    rcParams["lines.linewidth"] = LINEWIDTH           # DEFAULT LINE WIDTH
    rcParams["grid.linewidth"] = GRID_LINEWIDTH       # GRID LINE WIDTH
    rcParams["axes.grid"] = True                      # ENABLE GRID BY DEFAULT
    rcParams["grid.alpha"] = 0.3                      # GRID TRANSPARENCY
    rcParams["axes.spines.top"] = True                # SHOW TOP SPINE
    rcParams["axes.spines.right"] = True              # SHOW RIGHT SPINE

def boldify_axes(ax: plt.Axes,
                 title: str = "",
                 xlabel: str = "",
                 ylabel: str = "",
                 legend: bool = True) -> None:
    """UPPERCASE + BOLD ALL TEXT ELEMENTS ON AN AXES OBJECT."""  
    if title:                                                    # CHECKS IF TITLE IS PROVIDED
        ax.set_title(title.upper(), weight="bold", size=FONTSIZE_TITLE)    # SETS BOLD, UPPERCASE TITLE
    if xlabel:                                                   # CHECKS IF XLABEL IS PROVIDED
        ax.set_xlabel(xlabel.upper(), weight="bold", size=FONTSIZE_LABEL)  # SETS BOLD, UPPERCASE XLABEL
    if ylabel:                                                   # CHECKS IF YLABEL IS PROVIDED
        ax.set_ylabel(ylabel.upper(), weight="bold", size=FONTSIZE_LABEL)  # SETS BOLD, UPPERCASE YLABEL

    for tick in ax.get_xticklabels():                             # LOOPS OVER X TICKS
        tick.set_fontweight("bold")                               # MAKES THEM BOLD
        tick.set_fontsize(FONTSIZE_TICK)                          # SETS FONT SIZE
        tick.set_text(str(tick.get_text()).upper())               # UPPERCASES TEXT

    for tick in ax.get_yticklabels():                             # LOOPS OVER Y TICKS
        tick.set_fontweight("bold")                               # MAKES THEM BOLD
        tick.set_fontsize(FONTSIZE_TICK)                          # SETS FONT SIZE
        tick.set_text(str(tick.get_text()).upper())               # UPPERCASES TEXT

    for spine in ax.spines.values():                              # ITERATES OVER SPINES
        spine.set_linewidth(2.0)                                  # MAKES SPINES THICK

    if legend and ax.get_legend() is not None:                     # IF LEGEND EXISTS, FORMAT IT
        leg = ax.get_legend()                                      # GETS LEGEND HANDLE
        if leg.get_title() is not None:                            # IF LEGEND TITLE EXISTS
            leg.get_title().set_text(leg.get_title().get_text().upper())  # UPPERCASE LEGEND TITLE
            leg.get_title().set_fontweight("bold")                 # BOLD LEGEND TITLE
        for text in leg.get_texts():                               # FOR EACH LEGEND LABEL
            text.set_text(text.get_text().upper())                 # UPPERCASE TEXT
            text.set_fontweight("bold")                            # BOLD TEXT
            text.set_fontsize(FONTSIZE_LEGEND)                     # SET FONT SIZE

def finalize_figure(fig: plt.Figure, suptitle: str = "") -> None:
    """APPLY SUPTITLE (UPPERCASE, BOLD) AND TIGHT LAYOUT."""       
    if suptitle:                                                   # IF SUPTITLE PROVIDED
        fig.suptitle(suptitle.upper(), fontsize=FONTSIZE_TITLE, fontweight="bold")  # SETS BOLD, UPPERCASE SUPTITLE
    fig.tight_layout()                                             # TIGHT LAYOUT TO PREVENT CLIPPING

init_plot_style()                                                  # INITIALIZES THE GLOBAL STYLE

### CONFIG & REPRODUCIBILITY

In [None]:
# CONFIGURATION FOR REPRODUCIBILITY AND PROJECT-SPECIFIC CONSTANTS                       

GLOBAL_SEED = 42                                     # GLOBAL SEED FOR REPRODUCIBILITY
np.random.seed(GLOBAL_SEED)                          # SETS NUMPY SEED

# PATHS (ADAPT LOCALLY): THE USER REQUESTED ..\DATA\DATA[P].csv                          
DATA_CSV_PATH = Path("../DATA/DATA[P].csv")          # RELATIVE PATH AS SPECIFIED BY USER
RESULTS_DIR = Path("../DATA/")                        # DIRECTORY TO SAVE RESULTS
RESULTS_DIR.mkdir(parents=True, exist_ok=True)       # CREATES DIRECTORY IF NOT EXISTS

FEATURE_COLS = ["il", "iw", "pw", "ro"]              # FEATURES FROM USER
TARGET_COLS = ["frequency", "return loss", "gain"]   # MULTI-OUTPUT TARGETS FROM USER

TEST_SIZE = 0.2                                       # 80/20 TRAIN-TEST SPLIT
N_SPLITS = 5                                          # 5-FOLD CV
OPTUNA_TRIALS = 100                                   # OPTUNA TRIAL BUDGET
N_TRIALS = 1000                                       # VERY HIGH-BUDGET SEARCH 
N_JOBS = max(1, (os.cpu_count() or 2) - 1)            # PARALLEL TRIALS USING AVAILABLE CORES MINUS ONE
USE_TARGET_SCALING = True                             # SCALE TARGETS
USE_POWER_TRANSFORM = True                            # ALLOW YEO-JOHNSON AS A TUNABLE OPTION
USE_POLYNOMIALS = True                                # ALLOW POLYNOMIAL FEATURES AS A TUNABLE OPTION
POLY_DEGREE = 2                                       # DEGREE 2 POLYNOMIALS 
MAX_POLY_DEGREE = 5                                   # UPPER BOUND FOR POLYNOMIAL DEGREE

print("GLOBAL SEED:", GLOBAL_SEED)                   # CONFIRMS GLOBAL SEED
print("TRAIN-TEST SPLIT:", 1 - TEST_SIZE, TEST_SIZE) # PRINTS TRAIN/TEST RATIO
print("K-FOLDS:", N_SPLITS)                          # PRINTS NUMBER OF FOLDS
print("N_TRIALS:", N_TRIALS, "| N_JOBS:", N_JOBS)    # PRINTS TRIALS AND PARALLEL JOBS

GLOBAL SEED: 42
TRAIN-TEST SPLIT: 0.8 0.2
K-FOLDS: 5
N_TRIALS: 1000 | N_JOBS: 5


### DATA READING

In [4]:
# SAFE CSV READER TO LOAD THE CLEANED DATA                                               

def safe_read_csv(path: Path) -> pd.DataFrame:       # DEFINES A SAFE CSV READER
    """SAFELY READ A CSV FILE AND RETURN A PANDAS DATAFRAME WITH CLEAR ERRORS."""  
    if not path.exists():                            # CHECKS IF FILE EXISTS
        raise FileNotFoundError(f"FILE NOT FOUND: {path}")  # RAISES ERROR IF NOT FOUND
    df_local = pd.read_csv(path)                     # READS CSV
    if df_local.empty:                               # CHECKS IF EMPTY
        raise ValueError("THE CSV FILE IS EMPTY.")   # RAISES ERROR IF EMPTY
    return df_local                                  # RETURNS DATAFRAME

df = safe_read_csv(DATA_CSV_PATH)                    # LOADS THE CLEANED DATA
print("DATA LOADED:", df.shape)                      # PRINTS SHAPE

DATA LOADED: (1296, 7)


### TRAIN-TEST SPLIT & K-FOLD

In [5]:
# WE SPLIT ONCE INTO TRAIN/TEST (80/20), THEN USE K-FOLD ON THE TRAIN SET FOR CV        

X = df[FEATURE_COLS]                          # EXTRACTS FEATURES AS NUMPY ARRAY
Y = df[TARGET_COLS]                           # EXTRACTS TARGETS AS NUMPY ARRAY

X_train, X_test, y_train, y_test = train_test_split( # PERFORMS TRAIN-TEST SPLIT
    X, Y, test_size=TEST_SIZE, random_state=GLOBAL_SEED, shuffle=True
)                                                    

print("TRAIN SHAPE (X, Y):", X_train.shape, y_train.shape)  # PRINTS TRAIN SHAPES
print("TEST SHAPE  (X, Y):", X_test.shape, y_test.shape)    # PRINTS TEST SHAPES

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=GLOBAL_SEED)  # DEFINES K-FOLD SPLITTER
print("K-FOLD READY.")                                 

TRAIN SHAPE (X, Y): (1036, 4) (1036, 3)
TEST SHAPE  (X, Y): (260, 4) (260, 3)
K-FOLD READY.


### METRICS HELPERS

In [6]:
# UTILITY TO CALCULATE ALL REQUESTED METRICS IN ONE PLACE                                  

def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]: # DEFINES METRICS FUNCTION
    """RETURN A DICTIONARY OF REGRESSION METRICS FOR MULTI-OUTPUT TARGETS."""       
    r2 = r2_score(y_true, y_pred, multioutput="uniform_average")                    # COMPUTES R2
    mae = mean_absolute_error(y_true, y_pred, multioutput="uniform_average")        # COMPUTES MAE
    mse = mean_squared_error(y_true, y_pred, multioutput="uniform_average")         # COMPUTES MSE
    rmse = np.sqrt(mse)                                                             # COMPUTES RMSE
    evs = explained_variance_score(y_true, y_pred, multioutput="uniform_average")   # COMPUTES EXPLAINED VARIANCE
    return {"r2": r2, "mae": mae, "mse": mse, "rmse": rmse, "explained_variance": evs}  # RETURNS ALL METRICS

def print_metrics(name: str, metrics: Dict[str, float]) -> None:      # DEFINES PRINTING FUNCTION
    """PRETTY-PRINT METRICS WITH UPPERCASE KEYS."""                    
    print(f"=== {name.upper()} METRICS ===")                          # PRINTS HEADER
    for k, v in metrics.items():                                      # LOOPS OVER METRICS
        print(f"{k.upper()}: {v:.4f}")                                # PRINTS EACH METRIC

### PREPROCESSING PIPELINES

In [7]:
# NUMERIC PIPELINE FOR FEATURES: IMPUTE MEDIAN + STANDARD SCALER                         

numeric_transformer = Pipeline(steps=[               # BUILDS A PIPELINE
    ("imputer", SimpleImputer(strategy="median")),   # MISSING VALUE IMPUTATION
    ("scaler", StandardScaler()),                    # STANDARD SCALING
])                                                   # CLOSES PIPELINE

preprocessor = ColumnTransformer(                    # WRAPS TRANSFORMER (ALL COLUMNS ARE NUMERIC)
    transformers=[("num", numeric_transformer, FEATURE_COLS)],  # APPLIES TO ALL FEATURE COLS
    remainder="drop"                                 # DROPS ANY OTHER COLUMNS (THERE ARE NONE)
)                                                    

y_scaler = StandardScaler() if USE_TARGET_SCALING else None  # CREATES Y SCALER IF REQUESTED

print("PREPROCESSORS READY. TARGET SCALING:", USE_TARGET_SCALING)  

PREPROCESSORS READY. TARGET SCALING: True


### FEATURE ENGINEERING (POLYNOMIALS)

In [8]:
# WE WILL ADD POLYNOMIAL/INTERACTION TERMS **ONLY** FOR LINEAR AND DNN MODELS            

from sklearn.preprocessing import PolynomialFeatures  # IMPORTS POLYNOMIAL FEATURES

if USE_POLYNOMIALS:                                   # IF POLYNOMIALS ENABLED
    poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)  # DEFINES POLY TRANSFORMER
    print("POLYNOMIAL FEATURES ENABLED. DEGREE:", POLY_DEGREE)         # PRINTS STATUS
else:
    poly = None                                        # OTHERWISE NONE
    print("POLYNOMIAL FEATURES DISABLED.")             # PRINTS STATUS

POLYNOMIAL FEATURES ENABLED. DEGREE: 2


### BASELINE MODELS

In [9]:
# WE DEFINE BUILDERS FOR: LINEAR REGRESSION, KNN, SVR (WITH TARGET SCALING)             

def build_linear_pipeline() -> Pipeline:              # BUILDS LINEAR REGRESSION PIPELINE
    """RETURN A PIPELINE: PREPROCESSOR (+ OPTIONAL POLY) + LINEAR REGRESSION WITH TARGET SCALING."""  
    reg = LinearRegression()                          # BASE REGRESSOR
    if USE_TARGET_SCALING:                            # IF WE SCALE TARGETS
        reg = TransformedTargetRegressor(             # WRAP REGRESSOR
            regressor=reg, transformer=StandardScaler(with_mean=True, with_std=True)
        )                                             # CLOSES TTR
    steps = [("pre", preprocessor)]                   # STARTS STEPS WITH PREPROCESSOR
    if USE_POLYNOMIALS:                               # IF POLY IS ENABLED
        steps.append(("poly", poly))                  # ADD POLY STEP
    steps.append(("reg", reg))                        # ADD REGRESSOR STEP
    return Pipeline(steps=steps)                      # RETURNS FULL PIPELINE

def build_knn_pipeline(n_neighbors: int = 5, weights: str = "uniform", p: int = 2) -> Pipeline:  # BUILDS KNN PIPELINE
    """RETURN A PIPELINE WITH KNN HYPERPARAMETERS AND TARGET SCALING."""                
    base = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p)             # DEFINES KNN
    if USE_TARGET_SCALING:                                                                # IF TARGET SCALING
        base = TransformedTargetRegressor(                                                # WRAP WITH TTR
            regressor=base, transformer=StandardScaler(with_mean=True, with_std=True)
        )                                                                                # CLOSES TTR
    steps = [("pre", preprocessor)]                                                      # STARTS PIPELINE
    steps.append(("reg", base))                                                          # ADDS REGRESSOR
    return Pipeline(steps=steps)                                                         # RETURNS

def build_svr_pipeline(C: float = 1.0, epsilon: float = 0.1, gamma: str = "scale", kernel: str = "rbf") -> Pipeline:  # BUILDS SVR PIPELINE
    """RETURN A PIPELINE WITH SVR (WRAPPED FOR MULTI-OUTPUT) AND TARGET SCALING."""     
    svr = SVR(C=C, epsilon=epsilon, gamma=gamma, kernel=kernel)                          # DEFINES SVR
    mo = MultiOutputRegressor(svr)                                                       # WRAPS INTO MULTI-OUTPUT
    if USE_TARGET_SCALING:                                                               # IF TARGET SCALING
        mo = TransformedTargetRegressor(                                                 # WRAP WITH TTR
            regressor=mo, transformer=StandardScaler(with_mean=True, with_std=True)
        )                                                                                # CLOSES TTR
    steps = [("pre", preprocessor)]                                                      # STARTS PIPELINE
    steps.append(("reg", mo))                                                            # ADDS REGRESSOR
    return Pipeline(steps=steps)                                                         # RETURNS

### CROSS-VALIDATION EVALUATION

In [10]:
# HELPER TO RUN K-FOLD CV AND RETURN MEAN/STD R2, PLUS FIT ON FULL TRAIN AND EVAL TEST     

def evaluate_sklearn_pipeline(name: str, pipe: Pipeline) -> Dict[str, Any]:  # DEFINES EVALUATION FUNCTION
    """FIT/VALIDATE A SKLEARN PIPELINE WITH K-FOLD AND TEST EVALUATION."""   
    # CROSS-VALIDATION R2 SCORES                                                     
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)  # RUNS CV
    pipe.fit(X_train, y_train)                                                    # FITS ON FULL TRAIN
    y_pred_test = pipe.predict(X_test)                                            # PREDICTS ON TEST
    metrics = regression_metrics(y_test, y_pred_test)                             # COMPUTES METRICS
    result = {                                                                    # BUILDS RESULT DICT
        "model": name,
        "cv_r2_mean": np.mean(cv_scores),
        "cv_r2_std": np.std(cv_scores),
        **metrics
    }                                                                              # CLOSES DICT
    print_metrics(name, metrics)                                                   # PRINTS TEST METRICS
    print(f"CV R2 MEAN: {result['cv_r2_mean']:.4f} | CV R2 STD: {result['cv_r2_std']:.4f}")  # PRINTS CV SCORES
    return result                                                                   # RETURNS RESULT

### RUN BASELINES

In [11]:
# EVALUATE LINEAR, KNN, SVR BEFORE HYPERPARAMETER TUNING               

results = []                                          # LIST TO COLLECT RESULTS

linear_pipe = build_linear_pipeline()                 # BUILDS LINEAR PIPELINE
results.append(evaluate_sklearn_pipeline("linear_regression", linear_pipe))  # EVALUATES AND STORES

knn_pipe_default = build_knn_pipeline()               # BUILDS DEFAULT KNN
results.append(evaluate_sklearn_pipeline("knn", knn_pipe_default))          # EVALUATES AND STORES

svr_pipe_default = build_svr_pipeline()               # BUILDS DEFAULT SVR
results.append(evaluate_sklearn_pipeline("svr", svr_pipe_default))          # EVALUATES AND STORES

=== LINEAR_REGRESSION METRICS ===
R2: 0.4494
MAE: 2.4070
MSE: 18.5081
RMSE: 4.3021
EXPLAINED_VARIANCE: 0.4499
CV R2 MEAN: 0.5193 | CV R2 STD: 0.0168
=== KNN METRICS ===
R2: 0.6935
MAE: 1.7046
MSE: 9.7397
RMSE: 3.1208
EXPLAINED_VARIANCE: 0.6949
CV R2 MEAN: 0.6715 | CV R2 STD: 0.0379
=== SVR METRICS ===
R2: 0.5161
MAE: 1.9102
MSE: 17.9677
RMSE: 4.2388
EXPLAINED_VARIANCE: 0.5302
CV R2 MEAN: 0.6058 | CV R2 STD: 0.0373


### HYPERPARAMETER TUNING (OPTUNA)

In [None]:
if False:

    # WE WILL OPTIMIZE KNN AND SVR WITH OPTUNA (100 TRIALS EACH)                              

    def objective_knn(trial: optuna.trial.Trial) -> float:      # DEFINES OPTUNA OBJECTIVE FOR KNN
        """OPTIMIZE KNN HYPERPARAMETERS USING CV R2 SCORE."""  
        n_neighbors = trial.suggest_int("n_neighbors", 1, 50)   # SUGGESTS N_NEIGHBORS
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])           # SUGGESTS WEIGHTS
        p = trial.suggest_int("p", 1, 2)                        # SUGGESTS MINKOWSKI P (1=MANHATTAN, 2=EUCLIDEAN)
        pipe = build_knn_pipeline(n_neighbors=n_neighbors, weights=weights, p=p)          # BUILDS PIPELINE
        scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)  # CV R2
        return scores.mean()                                    # RETURNS MEAN R2 (MAXIMIZE)

    def objective_svr(trial: optuna.trial.Trial) -> float:             # DEFINES OPTUNA OBJECTIVE FOR SVR
        """OPTIMIZE SVR HYPERPARAMETERS USING CV R2 SCORE."""  
        C = trial.suggest_float("C", 1e-3, 1e3, log=True)              # SUGGESTS C ON LOG SCALE
        epsilon = trial.suggest_float("epsilon", 1e-4, 1.0, log=True)  # SUGGESTS EPSILON
        gamma = trial.suggest_categorical("gamma", ["scale", "auto"])  # SUGGESTS GAMMA MODE
        pipe = build_svr_pipeline(C=C, epsilon=epsilon, gamma=gamma, kernel="rbf")        # BUILDS PIPELINE
        scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)  # CV R2
        return scores.mean()                                           # RETURNS MEAN R2

    print("STARTING OPTUNA STUDY: KNN")                                     # PRINTS STATUS
    study_knn = optuna.create_study(direction="maximize")                   # CREATES STUDY FOR KNN
    study_knn.optimize(objective_knn, n_trials=OPTUNA_TRIALS, show_progress_bar=False)  # RUNS OPTIMIZATION
    print("BEST KNN PARAMS:", study_knn.best_params)                        # PRINTS BEST PARAMS
    best_knn_pipe = build_knn_pipeline(**study_knn.best_params)             # BUILDS PIPELINE WITH BEST PARAMS
    results.append(evaluate_sklearn_pipeline("knn_optuna", best_knn_pipe))  # EVALUATES BEST KNN

    print("STARTING OPTUNA STUDY: SVR")                   # PRINTS STATUS
    study_svr = optuna.create_study(direction="maximize") # CREATES STUDY FOR SVR
    study_svr.optimize(objective_svr, n_trials=OPTUNA_TRIALS, show_progress_bar=False)  # RUNS OPTIMIZATION
    print("BEST SVR PARAMS:", study_svr.best_params)      # PRINTS BEST PARAMS
    best_svr_pipe = build_svr_pipeline(**study_svr.best_params)                         # BUILDS PIPELINE WITH BEST PARAMS
    results.append(evaluate_sklearn_pipeline("svr_optuna", best_svr_pipe))              # EVALUATES BEST SVR

### OPTUNA: OBJECTIVE & SEARCH SPACE

In [13]:
# WE DEFINE A SINGLE OBJECTIVE THAT CHOOSES BETWEEN MULTIPLE MODELS AND A VERY WIDE SEARCH SPACE        
# MODELS INCLUDED: RIDGE, LASSO, ELASTICNET, LINEAR (NO REG), KNN, SVR                                  # LISTS MODELS
# WE TUNE: SCALERS, IMPUTERS, POWER TRANSFORM, POLY DEGREE, MODEL-SPECIFIC HYPERPARAMETERS              # LISTS TUNED PARTS

SCALERS = {
    "standard": StandardScaler(),                   # STANDARD SCALER
    "minmax": MinMaxScaler(),                       # MIN-MAX SCALER
    "robust": RobustScaler(),                       # ROBUST SCALER
    "none": "passthrough",                          # NO SCALING
}                                                   

def build_pipeline_from_trial(trial: optuna.trial.Trial) -> Pipeline:  # BUILDS A PIPELINE ACCORDING TO TRIAL
    """CONSTRUCT A SKLEARN PIPELINE (WITH MULTI-OUTPUT WRAPPING & TARGET SCALING) FROM TRIAL PARAMETERS."""  

    # ---------------------- GENERIC PREPROCESSING CHOICES ---------------------- #  
    impute_strategy = trial.suggest_categorical("imputer", ["median", "mean"])   # CHOOSES IMPUTER STRATEGY
    scaler_key = trial.suggest_categorical("scaler", list(SCALERS.keys()))       # CHOOSES SCALER TYPE
    use_power = trial.suggest_categorical("power_transform", [True, False]) if USE_POWER_TRANSFORM else False  # YEO-JOHNSON
    use_poly = trial.suggest_categorical("use_poly", [True, False]) if USE_POLYNOMIALS else False  # POLY FEATURES FLAG
    poly_degree = trial.suggest_int("poly_degree", 2, MAX_POLY_DEGREE) if use_poly else 1          # POLY DEGREE

    # ---------------------- MODEL FAMILY CHOICE ---------------------- #         
    model_name = trial.suggest_categorical(
        "model",
        ["linear", "ridge", "lasso", "elasticnet", "knn", "svr"]
    )                                                                             # CHOOSES MODEL FAMILY

    # ---------------------- MODEL-SPECIFIC PARAMS ---------------------- #       
    if model_name == "linear":                                                    # LINEAR REGRESSION (NO PARAMS)
        base_estimator = LinearRegression()                                       # SETS BASE ESTIMATOR
    elif model_name == "ridge":                                                   # RIDGE
        alpha = trial.suggest_float("ridge_alpha", 1e-6, 1e3, log=True)           # RIDGE ALPHA
        base_estimator = Ridge(alpha=alpha, random_state=GLOBAL_SEED)             # RIDGE REGRESSOR
    elif model_name == "lasso":                                                   # LASSO
        alpha = trial.suggest_float("lasso_alpha", 1e-6, 1e3, log=True)           # LASSO ALPHA
        base_estimator = Lasso(alpha=alpha, random_state=GLOBAL_SEED, max_iter=10000) # LASSO
    elif model_name == "elasticnet":                                              # ELASTICNET
        alpha = trial.suggest_float("elastic_alpha", 1e-6, 1e3, log=True)         # ELASTICNET ALPHA
        l1_ratio = trial.suggest_float("elastic_l1_ratio", 0.0, 1.0)              # L1 RATIO
        base_estimator = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=GLOBAL_SEED, max_iter=10000)  # ELASTICNET
    elif model_name == "knn":                                                     # KNN
        n_neighbors = trial.suggest_int("knn_n_neighbors", 1, 200)                # N NEIGHBORS
        weights = trial.suggest_categorical("knn_weights", ["uniform", "distance"])  # WEIGHTS
        p = trial.suggest_int("knn_p", 1, 2)                                      # P NORM
        leaf_size = trial.suggest_int("knn_leaf_size", 10, 100)                   # LEAF SIZE
        base_estimator = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p, leaf_size=leaf_size)  # KNN
    elif model_name == "svr":                                                     # SVR
        kernel = trial.suggest_categorical("svr_kernel", ["rbf", "poly", "linear"])  # KERNEL
        C = trial.suggest_float("svr_C", 1e-4, 1e4, log=True)                     # C
        epsilon = trial.suggest_float("svr_epsilon", 1e-4, 1.0, log=True)         # EPSILON
        if kernel in ["rbf", "poly"]:                                             # GAMMA ONLY FOR RBF/POLY
            gamma = trial.suggest_categorical("svr_gamma", ["scale", "auto"])     # GAMMA
        else:
            gamma = "scale"                                                       # DUMMY VALUE
        degree = trial.suggest_int("svr_degree", 2, 5) if kernel == "poly" else 3 # DEGREE IF POLY
        base_estimator = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, degree=degree)  # SVR
        base_estimator = MultiOutputRegressor(base_estimator)                      # WRAPS FOR MULTI-OUTPUT
    else:
        raise ValueError("UNKNOWN MODEL")                                          # SAFEGUARD

    # WRAP NON-MULTIOUTPUT LINEARS INTO MULTIOUTPUTREGRESSOR                        
    if model_name in ["linear", "ridge", "lasso", "elasticnet", "knn"]:
        base_estimator = MultiOutputRegressor(base_estimator)                      # WRAPS

    # ---------------------- TARGET SCALING ---------------------- #               
    if USE_TARGET_SCALING:                                                         # IF SCALING TARGETS
        base_estimator = TransformedTargetRegressor(                               # WRAP WITH TTR
            regressor=base_estimator,
            transformer=StandardScaler(with_mean=True, with_std=True)
        )                                                                          # CLOSES TTR

    # ---------------------- BUILD PREPROCESSOR ---------------------- #           
    numeric_steps = []                                                             # INIT STEPS LIST
    numeric_steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))     # ADD IMPUTER
    if scaler_key != "none":                                                       # IF SCALER IS NOT NONE
        numeric_steps.append(("scaler", SCALERS[scaler_key]))                      # ADD SCALER
    if use_power:                                                                  # IF POWER TRANSFORM ENABLED
        numeric_steps.append(("power", PowerTransformer(method="yeo-johnson")))    # ADD YEO-JOHNSON
    pre_num = Pipeline(numeric_steps)                                              # CREATE NUMERIC PIPELINE

    preprocessor = ColumnTransformer(                                              # WRAP IN COLUMNTRANSFORMER
        transformers=[("num", pre_num, FEATURE_COLS)], remainder="drop"            # APPLY TO ALL FEATURE COLS
    )                                                                              # CLOSES COLUMNTRANSFORMER

    steps = [("pre", preprocessor)]                                                # START PIPELINE STEPS
    if use_poly and poly_degree > 1:                                               # IF POLY ENABLED
        steps.append(("poly", PolynomialFeatures(degree=poly_degree, include_bias=False)))  # ADD POLY
    steps.append(("reg", base_estimator))                                          # ADD REGRESSOR

    pipe = Pipeline(steps=steps)                                                   # BUILD FULL PIPELINE
    return pipe                                                                    # RETURN PIPELINE

def objective(trial: optuna.trial.Trial) -> float:                                 # DEFINES OBJECTIVE
    """OPTIMIZE MEAN CV R2 (PRIMARY) WITH MEDIAN PRUNING; STORE EXTRA METRICS AS ATTRS."""  
    pipe = build_pipeline_from_trial(trial)                                        # BUILDS PIPELINE FROM TRIAL
    scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)  # 5-FOLD R2
    trial.set_user_attr("cv_r2_mean", float(np.mean(scores)))                      # STORE MEAN R2
    trial.set_user_attr("cv_r2_std", float(np.std(scores)))                        # STORE STD R2
    return scores.mean()                                                           # RETURN MEAN R2 (TO MAXIMIZE)

### OPTUNA STUDY

In [14]:
# WE USE TPESAMPLER (SMART SAMPLING) + MEDIANPRUNER (EARLY STOPPING) AND PARALLEL EXECUTION 

sampler = TPESampler(seed=GLOBAL_SEED, multivariate=True, group=True)            # TPE SAMPLER WITH MULTIVARIATE SUPPORT
pruner = MedianPruner(n_startup_trials=50, n_warmup_steps=0, interval_steps=1)   # MEDIAN PRUNER FOR EARLY STOPPING

study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)     # CREATES STUDY
print("STARTING OPTUNA STUDY WITH HIGH-BUDGET SEARCH...")                             # STATUS PRINT

study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, show_progress_bar=False)  # RUNS OPTIMIZATION

print("BEST TRIAL NUMBER:", study.best_trial.number)                             # PRINTS BEST TRIAL ID
print("BEST CV R2:", study.best_trial.value)                                     # PRINTS BEST CV R2
print("BEST PARAMS:")                                                            # HEADER
for k, v in study.best_trial.params.items():                                     # ITERATES OVER PARAMS
    print(f"  {k}: {v}")                                                         # PRINTS PARAM

[I 2025-07-27 09:55:56,718] A new study created in memory with name: no-name-6f6cb362-4181-4629-a837-6905e72200b1


STARTING OPTUNA STUDY WITH HIGH-BUDGET SEARCH...


[I 2025-07-27 09:55:56,987] Trial 1 finished with value: 0.7306011621920386 and parameters: {'imputer': 'median', 'scaler': 'none', 'power_transform': True, 'use_poly': True, 'poly_degree': 5, 'model': 'ridge', 'ridge_alpha': 1.1503697514576374e-05}. Best is trial 1 with value: 0.7306011621920386.
[I 2025-07-27 09:55:57,087] Trial 3 finished with value: 0.1843561697884858 and parameters: {'imputer': 'median', 'scaler': 'robust', 'power_transform': False, 'use_poly': True, 'poly_degree': 3, 'model': 'lasso', 'lasso_alpha': 0.315281374839352}. Best is trial 1 with value: 0.7306011621920386.
[I 2025-07-27 09:55:57,150] Trial 4 finished with value: -0.012326268423982094 and parameters: {'imputer': 'mean', 'scaler': 'standard', 'power_transform': False, 'use_poly': False, 'model': 'elasticnet', 'elastic_alpha': 1.5213123753058109, 'elastic_l1_ratio': 0.6798062867785436}. Best is trial 1 with value: 0.7306011621920386.
[I 2025-07-27 09:55:57,185] Trial 0 finished with value: 0.26346109029036

KeyboardInterrupt: 

In [16]:
print("STUDY INTERRUPTED AFTER 502 TRIALS")
print("BEST TRIAL NUMBER:", study.best_trial.number)                             # PRINTS BEST TRIAL ID
print("BEST CV R2:", study.best_trial.value)                                     # PRINTS BEST CV R2
print("BEST PARAMS:")                                                            # HEADER
for k, v in study.best_trial.params.items():                                     # ITERATES OVER PARAMS
    print(f"  {k}: {v}")                                                         # PRINTS PARAM

STUDY INTERRUPTED AFTER 502 TRIALS
BEST TRIAL NUMBER: 459
BEST CV R2: 0.8188295024528929
BEST PARAMS:
  imputer: median
  scaler: robust
  power_transform: False
  use_poly: False
  model: svr
  svr_kernel: rbf
  svr_C: 3474.2154702644193
  svr_epsilon: 0.00011411129759067382
  svr_gamma: scale


In [17]:
trials = study.trials                                                         # GET ALL TRIALS FROM STUDY
models = set(trial.params.get("model") for trial in trials if "model" in trial.params)  # EXTRACT UNIQUE MODEL NAMES

best_trials_per_model = {}                                                    # DICT TO STORE BEST TRIAL PER MODEL

for model_name in models:                                                     # LOOP OVER EACH MODEL NAME
    model_trials = [t for t in trials if t.params.get("model") == model_name and t.state == optuna.trial.TrialState.COMPLETE]  # FILTER COMPLETED TRIALS FOR MODEL
    if not model_trials:                                                      # SKIP IF NO TRIALS FOR MODEL
        continue
    best_trial = max(model_trials, key=lambda t: t.value)                     # FIND TRIAL WITH MAX OBJECTIVE VALUE (BEST CV R2)
    best_trials_per_model[model_name] = best_trial                            # STORE BEST TRIAL

print("\nBEST TRIALS PER MODEL FAMILY:")                                      # HEADER PRINT

for model_name, trial in best_trials_per_model.items():                       # LOOP OVER BEST TRIALS PER MODEL
    print(f"MODEL: {model_name.upper()}")                                     # PRINT MODEL NAME IN UPPERCASE
    print(f"  Trial Number: {trial.number}")                                  # PRINT TRIAL NUMBER
    print(f"  CV R2 Score: {trial.value:.4f}")                                # PRINT CV R2 SCORE WITH 4 DECIMAL PLACES
    print("  Params:")                                                        # PRINT PARAMS HEADER
    for k, v in trial.params.items():                                         # LOOP OVER ALL PARAMETERS IN TRIAL
        print(f"    {k}: {v}")                                                # PRINT PARAMETER NAME AND VALUE
    print("-" * 40)                                                           # PRINT SEPARATOR LINE


BEST TRIALS PER MODEL FAMILY:
MODEL: KNN
  Trial Number: 464
  CV R2 Score: 0.7043
  Params:
    imputer: mean
    scaler: minmax
    power_transform: True
    use_poly: True
    poly_degree: 2
    model: knn
    knn_n_neighbors: 10
    knn_weights: uniform
    knn_p: 1
    knn_leaf_size: 10
----------------------------------------
MODEL: LASSO
  Trial Number: 115
  CV R2 Score: 0.7449
  Params:
    imputer: mean
    scaler: robust
    power_transform: False
    use_poly: True
    poly_degree: 5
    model: lasso
    lasso_alpha: 0.0007944698383043709
----------------------------------------
MODEL: SVR
  Trial Number: 459
  CV R2 Score: 0.8188
  Params:
    imputer: median
    scaler: robust
    power_transform: False
    use_poly: False
    model: svr
    svr_kernel: rbf
    svr_C: 3474.2154702644193
    svr_epsilon: 0.00011411129759067382
    svr_gamma: scale
----------------------------------------
MODEL: LINEAR
  Trial Number: 72
  CV R2 Score: 0.7330
  Params:
    imputer: median
