### IMPORTS

In [None]:
# IMPORTS AND PRINTS ALL VERSIONS SO WE CAN REPRODUCE RESULTS EXACTLY LATER              

import sys                                           # STANDARD LIB TO ACCESS PYTHON RUNTIME DETAILS
import platform                                      # STANDARD LIB TO GET OS/PLATFORM INFORMATION
import warnings                                      # STANDARD LIB TO CONTROL WARNING MESSAGES
import os                                            # STANDARD LIB TO QUERY CPU COUNT FOR PARALLELISM
from pathlib import Path                             # STANDARD LIB FOR SAFE, CROSS-PLATFORM PATH HANDLING
from typing import Dict, Any, Tuple                  # TYPE HINTS FOR CLARITY

import numpy as np                                   # NUMERICAL COMPUTING
import pandas as pd                                  # DATAFRAMES AND DATA MANIPULATION

import matplotlib                                    # BASE PLOTTING BACKEND
import matplotlib.pyplot as plt                      # STATEFUL PLOTTING INTERFACE
from matplotlib import rcParams                      # IMPORTS RCPARAMS TO SET GLOBAL STYLES
import seaborn as sns                                # STATISTICAL PLOTTING BUILT ON TOP OF MATPLOTLIB
import plotly                                        # INTERACTIVE PLOTTING (NOT USED HERE BUT KEPT FOR CONSISTENCY)

import sklearn                                       # SCIKIT-LEARN: CLASSIC MACHINE LEARNING
from sklearn.model_selection import train_test_split # TRAIN/TEST SPLITTING
from sklearn.model_selection import KFold, cross_val_score, cross_validate  # K-FOLD SPLITTING AND CV
from sklearn.pipeline import Pipeline                # TO BUILD CLEAN, REUSABLE PREPROCESSING PIPELINES
from sklearn.impute import SimpleImputer             # SIMPLE STRATEGIES TO IMPUTE MISSING VALUES
from sklearn.preprocessing import (                  # SCALERS + TRANSFORMS
    StandardScaler, MinMaxScaler, RobustScaler, 
    PolynomialFeatures, PowerTransformer
)  
from sklearn.compose import ColumnTransformer        # APPLY TRANSFORMS TO COLUMNS (WE USE ALL NUMERIC)
from sklearn.metrics import (                        # METRICS FOR REGRESSION
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    explained_variance_score,
    make_scorer,
)
from sklearn.multioutput import MultiOutputRegressor    # WRAPS SINGLE-OUTPUT MODELS FOR MULTI-OUTPUT TARGETS
from sklearn.linear_model import (                      # LINEAR FAMILY
    LinearRegression, Ridge, Lasso, ElasticNet
)  
from sklearn.neighbors import KNeighborsRegressor       # KNN REGRESSOR
from sklearn.svm import SVR                             # SUPPORT VECTOR REGRESSION
from sklearn.preprocessing import PolynomialFeatures    # POLYNOMIAL FEATURES
from sklearn.compose import TransformedTargetRegressor  # FOR SCALING TARGETS
from sklearn.ensemble import StackingRegressor          # FOR STACKING REGRESSOR


import optuna                                        # HYPERPARAMETER OPTIMIZATION
from optuna.samplers import TPESampler               # ADVANCED SAMPLER
from optuna.pruners import MedianPruner              # EARLY STOPPING PRUNER
import shap                                          # MODEL INTERPRETABILITY 
import joblib                                        # SERIALIZATION (SAVING/LOADING MODELS)

warnings.filterwarnings("ignore")                    # SILENCES EVERY WARNING CATEGORY FOR THE REST OF THE SESSION

# (LABEL, VERSION) PAIRS LISTED IN THE ORDER THEY ARE REPORTED
for _label, _version in (
    ("PYTHON", sys.version),                         # INTERPRETER BUILD STRING
    ("OS", platform.platform()),                     # OPERATING SYSTEM DESCRIPTION
    ("NUMPY", np.__version__),
    ("PANDAS", pd.__version__),
    ("SCIKIT-LEARN", sklearn.__version__),
    ("MATPLOTLIB", matplotlib.__version__),
    ("SEABORN", sns.__version__),
    ("PLOTLY", plotly.__version__),
    ("OPTUNA", optuna.__version__),
    ("SHAP", shap.__version__),
):
    print(f"{_label}:", _version)                    # SAME "LABEL: VERSION" LINE FORMAT FOR EVERY ENTRY

PYTHON: 3.12.4 (tags/v3.12.4:8e8a4ba, Jun  6 2024, 19:30:16) [MSC v.1940 64 bit (AMD64)]
OS: Windows-11-10.0.22631-SP0
NUMPY: 2.1.3
PANDAS: 2.3.1
SCIKIT-LEARN: 1.7.1
MATPLOTLIB: 3.10.3
SEABORN: 0.13.2
PLOTLY: 6.2.0
OPTUNA: 4.4.0
SHAP: 0.48.0


In [64]:
# UPGRADES SCIKIT-LEARN VIA THE IPYTHON %pip MAGIC (NOTEBOOK-ONLY SYNTAX, NOT PLAIN PYTHON)
%pip install -U scikit-learn
# REPORTS THE VERSION OF THE sklearn MODULE OBJECT CURRENTLY BOUND IN THIS SESSION;
# PIP'S OWN OUTPUT ADVISES RESTARTING THE KERNEL BEFORE UPDATED PACKAGES ARE USED
print("SCIKIT-LEARN:", sklearn.__version__)

Note: you may need to restart the kernel to use updated packages.
SCIKIT-LEARN: 1.7.1


### PLOTTING SETUP

In [2]:
# GLOBAL PLOTTING STYLE TO KEEP ALL FIGURES CRISP, BOLD, UPPERCASE, AND HIGH-IMPACT      
# STYLE CONSTANTS CONSUMED BY init_plot_style(), boldify_axes() AND finalize_figure()
FIGSIZE = (8, 6)                                      # DEFAULT FIGURE SIZE PASSED TO rcParams["figure.figsize"]
DPI = 500                                             # USED FOR BOTH ON-SCREEN AND SAVED FIGURE DPI
LINEWIDTH = 2.0                                       # DEFAULT WIDTH FOR PLOTTED LINES
GRID_LINEWIDTH = 1.5                                  # WIDTH OF GRID LINES
FONTSIZE_TITLE = 20                                   # AXES TITLE AND SUPTITLE FONT SIZE
FONTSIZE_LABEL = 16                                   # X/Y AXIS LABEL FONT SIZE
FONTSIZE_TICK = 14                                    # TICK LABEL FONT SIZE
FONTSIZE_LEGEND = 14                                  # LEGEND ENTRY AND LEGEND TITLE FONT SIZE

def init_plot_style() -> None:
    """APPLY THE NOTEBOOK-WIDE MATPLOTLIB DEFAULTS (SIZE, DPI, BOLD FONTS, GRID, SPINES).

    MUTATES THE GLOBAL matplotlib rcParams IN PLACE AND RETURNS NOTHING.
    """
    rcParams.update({
        # FIGURE GEOMETRY AND RESOLUTION
        "figure.figsize": FIGSIZE,
        "figure.dpi": DPI,
        "savefig.dpi": DPI,
        # BOLD TEXT EVERYWHERE
        "font.weight": "bold",
        "axes.titleweight": "bold",
        "axes.labelweight": "bold",
        # FONT SIZES
        "axes.titlesize": FONTSIZE_TITLE,
        "axes.labelsize": FONTSIZE_LABEL,
        "xtick.labelsize": FONTSIZE_TICK,
        "ytick.labelsize": FONTSIZE_TICK,
        "legend.fontsize": FONTSIZE_LEGEND,
        "legend.title_fontsize": FONTSIZE_LEGEND,
        # LINES AND GRID
        "lines.linewidth": LINEWIDTH,
        "grid.linewidth": GRID_LINEWIDTH,
        "axes.grid": True,
        "grid.alpha": 0.3,
        # KEEP THE FULL BOX AROUND EACH AXES
        "axes.spines.top": True,
        "axes.spines.right": True,
    })

def boldify_axes(ax: plt.Axes,
                 title: str = "",
                 xlabel: str = "",
                 ylabel: str = "",
                 legend: bool = True) -> None:
    """MAKE THE TEXT ON ONE AXES BOLD AND UPPERCASE AND THICKEN ITS SPINES.

    title / xlabel / ylabel ARE ONLY APPLIED WHEN NON-EMPTY. WHEN legend IS TRUE
    AND THE AXES ALREADY HAS A LEGEND, ITS TITLE AND ENTRIES ARE RESTYLED TOO.
    THE AXES IS MODIFIED IN PLACE; NOTHING IS RETURNED.
    """
    # TITLE AND AXIS LABELS (SKIPPED WHEN THE CALLER PASSES AN EMPTY STRING)
    if title:
        ax.set_title(title.upper(), weight="bold", size=FONTSIZE_TITLE)
    if xlabel:
        ax.set_xlabel(xlabel.upper(), weight="bold", size=FONTSIZE_LABEL)
    if ylabel:
        ax.set_ylabel(ylabel.upper(), weight="bold", size=FONTSIZE_LABEL)

    # TICK LABELS: X AXIS FIRST, THEN Y AXIS, SAME TREATMENT FOR BOTH
    # NOTE(REVIEW): TEXT SET HERE MAY BE REGENERATED BY MATPLOTLIB AT DRAW TIME — CONFIRM
    for fetch_labels in (ax.get_xticklabels, ax.get_yticklabels):
        for tick_label in fetch_labels():
            tick_label.set_fontweight("bold")
            tick_label.set_fontsize(FONTSIZE_TICK)
            tick_label.set_text(str(tick_label.get_text()).upper())

    # THICK BORDER ON EVERY SPINE
    for border in ax.spines.values():
        border.set_linewidth(2.0)

    # LEGEND STYLING IS OPTIONAL AND ONLY POSSIBLE IF A LEGEND EXISTS
    if not legend:
        return
    legend_box = ax.get_legend()
    if legend_box is None:
        return

    heading = legend_box.get_title()
    if heading is not None:
        heading.set_text(heading.get_text().upper())
        heading.set_fontweight("bold")
    for entry in legend_box.get_texts():
        entry.set_text(entry.get_text().upper())
        entry.set_fontweight("bold")
        entry.set_fontsize(FONTSIZE_LEGEND)

def finalize_figure(fig: plt.Figure, suptitle: str = "") -> None:
    """ADD AN OPTIONAL BOLD UPPERCASE SUPTITLE, THEN RUN tight_layout ON THE FIGURE."""
    if suptitle:
        heading = suptitle.upper()                                     # SUPTITLE IS ALWAYS SHOWN IN UPPERCASE
        fig.suptitle(heading, fontsize=FONTSIZE_TITLE, fontweight="bold")
    fig.tight_layout()                                                 # ALWAYS RUNS, WITH OR WITHOUT A SUPTITLE

init_plot_style()                                                  # WRITES THE STYLE INTO THE GLOBAL rcParams WHEN THIS CELL RUNS

### CONFIG & REPRODUCIBILITY

In [3]:
# CONFIGURATION FOR REPRODUCIBILITY AND PROJECT-SPECIFIC CONSTANTS                       

# ---- REPRODUCIBILITY ----
GLOBAL_SEED = 42                                     # SINGLE SEED SHARED BY SPLITS AND MODELS
np.random.seed(GLOBAL_SEED)                          # SEEDS NUMPY'S GLOBAL RNG

# ---- FILE LOCATIONS (RELATIVE TO THE WORKING DIRECTORY; ADAPT LOCALLY) ----
DATA_CSV_PATH = Path("../DATA/DATA[P].csv")          # INPUT CSV
RESULTS_DIR = Path("../DATA/")                       # OUTPUT DIRECTORY
RESULTS_DIR.mkdir(parents=True, exist_ok=True)       # CREATED ON THE SPOT IF MISSING

# ---- COLUMNS ----
FEATURE_COLS = ["il", "iw", "pw", "ro"]              # MODEL INPUTS
TARGET_COLS = ["frequency", "return loss", "gain"]   # MODEL OUTPUTS (MULTI-OUTPUT REGRESSION)

# ---- EVALUATION PROTOCOL ----
TEST_SIZE = 0.2                                      # FRACTION HELD OUT AS THE TEST SET
N_SPLITS = 5                                         # NUMBER OF CROSS-VALIDATION FOLDS

# ---- SEARCH BUDGET ----
OPTUNA_TRIALS = 100                                  # OPTUNA TRIAL BUDGET
N_TRIALS = 1000                                      # LARGER TRIAL BUDGET
N_JOBS = max(1, (os.cpu_count() or 2) - 1)           # ALL CORES BUT ONE, NEVER BELOW 1

# ---- MODELLING SWITCHES ----
USE_TARGET_SCALING = True                            # WRAP REGRESSORS SO TARGETS ARE SCALED
USE_POWER_TRANSFORM = True                           # ALLOW YEO-JOHNSON AS A TUNABLE OPTION
USE_POLYNOMIALS = True                               # ALLOW POLYNOMIAL FEATURES AS A TUNABLE OPTION
POLY_DEGREE = 2                                      # DEFAULT POLYNOMIAL DEGREE
MAX_POLY_DEGREE = 5                                  # UPPER BOUND FOR POLYNOMIAL DEGREE

# ---- ECHO THE KEY SETTINGS ----
print(f"GLOBAL SEED: {GLOBAL_SEED}")
print(f"TRAIN-TEST SPLIT: {1 - TEST_SIZE} {TEST_SIZE}")
print(f"K-FOLDS: {N_SPLITS}")
print(f"N_TRIALS: {N_TRIALS} | N_JOBS: {N_JOBS}")

GLOBAL SEED: 42
TRAIN-TEST SPLIT: 0.8 0.2
K-FOLDS: 5
N_TRIALS: 1000 | N_JOBS: 5


### DATA READING

In [4]:
# SAFE CSV READER TO LOAD THE CLEANED DATA                                               

def safe_read_csv(path: "Path | str") -> pd.DataFrame:
    """READ A CSV FILE INTO A DATAFRAME, FAILING WITH CLEAR ERRORS.

    PARAMETERS:
        path: LOCATION OF THE CSV FILE, AS A pathlib.Path OR A PLAIN STRING.

    RETURNS:
        THE PARSED, NON-EMPTY DATAFRAME.

    RAISES:
        FileNotFoundError: IF NOTHING EXISTS AT path.
        ValueError: IF THE FILE HAS NO CONTENT AT ALL OR PARSES TO ZERO ROWS.
    """
    csv_path = Path(path)                            # ACCEPT STRINGS TOO; A Path INPUT IS UNCHANGED
    if not csv_path.exists():
        raise FileNotFoundError(f"FILE NOT FOUND: {csv_path}")
    try:
        df_local = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError as err:
        # A ZERO-BYTE FILE MAKES PANDAS RAISE BEFORE THE .empty CHECK CAN RUN;
        # REPORT IT WITH THE SAME MESSAGE AS THE HEADER-ONLY CASE BELOW
        raise ValueError("THE CSV FILE IS EMPTY.") from err
    if df_local.empty:                               # HEADER PRESENT BUT NO DATA ROWS
        raise ValueError("THE CSV FILE IS EMPTY.")
    return df_local

df = safe_read_csv(DATA_CSV_PATH)                    # FULL DATASET; RAISES IF THE FILE IS MISSING OR EMPTY
print(f"DATA LOADED: {df.shape}")                    # (ROWS, COLUMNS)

DATA LOADED: (1296, 7)


### TRAIN-TEST SPLIT & K-FOLD

In [5]:
# WE SPLIT ONCE INTO TRAIN/TEST (80/20), THEN USE K-FOLD ON THE TRAIN SET FOR CV        

X = df[FEATURE_COLS]                          # FEATURE COLUMNS, SELECTED BY NAME FROM df
Y = df[TARGET_COLS]                           # TARGET COLUMNS, SELECTED BY NAME FROM df

# SINGLE SHUFFLED HOLD-OUT SPLIT, SEEDED SO IT IS REPEATABLE
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=GLOBAL_SEED,
    shuffle=True,
)

print(f"TRAIN SHAPE (X, Y): {X_train.shape} {y_train.shape}")
print(f"TEST SHAPE  (X, Y): {X_test.shape} {y_test.shape}")

# SHUFFLED K-FOLD SPLITTER REUSED BY EVERY CROSS-VALIDATION RUN
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=GLOBAL_SEED)
print("K-FOLD READY.")

TRAIN SHAPE (X, Y): (1036, 4) (1036, 3)
TEST SHAPE  (X, Y): (260, 4) (260, 3)
K-FOLD READY.


### METRICS HELPERS

In [6]:
# UTILITY TO CALCULATE ALL REQUESTED METRICS IN ONE PLACE                                  

def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """COMPUTE R2, MAE, MSE, RMSE AND EXPLAINED VARIANCE FOR A SET OF PREDICTIONS.

    EVERY SKLEARN METRIC IS CALLED WITH multioutput="uniform_average".
    RMSE IS THE SQUARE ROOT OF THAT AVERAGED MSE. RETURNS A DICT KEYED BY
    "r2", "mae", "mse", "rmse", "explained_variance" (IN THAT ORDER).
    """
    averaging = {"multioutput": "uniform_average"}   # SHARED KEYWORD FOR ALL FOUR SKLEARN CALLS
    mean_sq_err = mean_squared_error(y_true, y_pred, **averaging)
    return {
        "r2": r2_score(y_true, y_pred, **averaging),
        "mae": mean_absolute_error(y_true, y_pred, **averaging),
        "mse": mean_sq_err,
        "rmse": np.sqrt(mean_sq_err),
        "explained_variance": explained_variance_score(y_true, y_pred, **averaging),
    }

def print_metrics(name: str, metrics: Dict[str, float]) -> None:
    """PRINT A HEADER FOR name FOLLOWED BY ONE "KEY: VALUE" LINE PER METRIC.

    THE NAME AND THE METRIC KEYS ARE UPPERCASED; VALUES USE FOUR DECIMAL PLACES.
    """
    print("=== " + name.upper() + " METRICS ===")
    for label, value in metrics.items():
        print(label.upper() + ": " + format(value, ".4f"))

### PREPROCESSING PIPELINES

In [7]:
# NUMERIC PIPELINE FOR FEATURES: IMPUTE MEDIAN + STANDARD SCALER                         

# PER-COLUMN NUMERIC TREATMENT: FILL GAPS WITH THE MEDIAN, THEN STANDARDIZE
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# APPLY THAT TREATMENT TO THE FEATURE COLUMNS AND DISCARD ANYTHING ELSE
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop",
)

# STANDALONE TARGET SCALER, ONLY CREATED WHEN TARGET SCALING IS SWITCHED ON
y_scaler = None
if USE_TARGET_SCALING:
    y_scaler = StandardScaler()

print(f"PREPROCESSORS READY. TARGET SCALING: {USE_TARGET_SCALING}")

PREPROCESSORS READY. TARGET SCALING: True


### CROSS-VALIDATION EVALUATION

In [80]:
# HELPER TO RUN K-FOLD CV AND RETURN MEAN/STD R2, PLUS FIT ON FULL TRAIN AND EVAL TEST     

def evaluate_sklearn_pipeline(name: str, pipe: Pipeline) -> Dict[str, Any]:
    """CROSS-VALIDATE pipe ON THE TRAIN SPLIT, REFIT IT, AND SCORE IT ON THE TEST SPLIT.

    READS THE MODULE-LEVEL X_train, y_train, X_test, y_test AND kf.
    pipe IS FITTED IN PLACE ON THE FULL TRAIN SPLIT. THE TEST METRICS AND THE
    CV R2 MEAN/STD ARE PRINTED, AND RETURNED IN ONE DICT WITH THE MODEL NAME.
    """
    # R2 PER FOLD ON THE TRAIN SPLIT, FOLDS RUN IN PARALLEL
    fold_r2 = cross_val_score(pipe, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)
    cv_mean = np.mean(fold_r2)
    cv_std = np.std(fold_r2)

    # FINAL FIT ON ALL TRAINING ROWS, THEN HOLD-OUT METRICS
    pipe.fit(X_train, y_train)
    test_metrics = regression_metrics(y_test, pipe.predict(X_test))

    print_metrics(name, test_metrics)
    print(f"CV R2 MEAN: {cv_mean:.4f} | CV R2 STD: {cv_std:.4f}")

    return {"model": name, "cv_r2_mean": cv_mean, "cv_r2_std": cv_std, **test_metrics}

### SVR MODEL WITH BEST HYPERPARAMETERS

In [None]:
# BASELINE SVR MODEL WITH BEST HYPERPARAMETERS FROM OPTUNA STUDY

# HYPERPARAMETERS PASSED TO build_svr_pipeline() BELOW (MODULE-LEVEL NAMES)
C = 3474.2154702644193                        # BEST C FROM OPTUNA
epsilon = 0.00011411129759067382              # BEST EPSILON
gamma = "scale"                               # BEST GAMMA
kernel = "rbf"                                # BEST KERNEL

# REBUILD SVR PIPELINE WITH BEST PARAMS
def build_svr_pipeline(C: float, epsilon: float, gamma: str, kernel: str) -> Pipeline:
    """ASSEMBLE PREPROCESSING + MULTI-OUTPUT SVR WITH THE GIVEN HYPERPARAMETERS.

    USES THE MODULE-LEVEL preprocessor BOUND AT CALL TIME. WHEN USE_TARGET_SCALING
    IS TRUE THE REGRESSOR IS WRAPPED SO TARGETS ARE STANDARD-SCALED.
    """
    # ONE SVR PER TARGET COLUMN
    regressor = MultiOutputRegressor(
        SVR(C=C, epsilon=epsilon, gamma=gamma, kernel=kernel)
    )
    if USE_TARGET_SCALING:
        regressor = TransformedTargetRegressor(
            regressor=regressor,
            transformer=StandardScaler(with_mean=True, with_std=True),
        )
    return Pipeline(steps=[("pre", preprocessor), ("reg", regressor)])

# BUILD AND EVALUATE FINAL BASELINE SVR PIPELINE
svr_pipe_best = build_svr_pipeline(C=C, epsilon=epsilon, gamma=gamma, kernel=kernel)   # PIPELINE BUILT FROM THE CONSTANTS ABOVE
svr_results = evaluate_sklearn_pipeline("best_svr", svr_pipe_best)                     # RUNS CV + TEST EVALUATION, PRINTS METRICS, KEEPS THE RESULT DICT

=== BEST_SVR METRICS ===
R2: 0.8381
MAE: 0.9642
MSE: 4.4526
RMSE: 2.1101
EXPLAINED_VARIANCE: 0.8388
CV R2 MEAN: 0.8188 | CV R2 STD: 0.0227


In [None]:
# LASSO REGRESSION: RECREATE THE BEST CONFIGURATION (TRIAL 115) AND SWEEP THE POLYNOMIAL DEGREE FROM 2 TO 20

# SWEEPS THE POLYNOMIAL DEGREE FOR A FIXED-ALPHA LASSO AND EVALUATES EACH DEGREE.
# NOTE: THE LOOP REBINDS THE MODULE-LEVEL NAMES USE_POLYNOMIALS, POLY_DEGREE,
# numeric_transformer AND preprocessor. OTHER CELLS READ THOSE NAMES
# (build_svr_pipeline USES preprocessor; THE XGBOOST AND LIGHTGBM CELLS USE
# numeric_transformer), SO THEIR BEHAVIOUR DEPENDS ON WHETHER THIS CELL RAN FIRST.
results = []  # ONE RESULT DICT PER EVALUATED MODEL; LATER MODEL CELLS APPEND TO THIS LIST TOO

for degree in range(2, 21):  # DEGREES 2 THROUGH 20 INCLUSIVE
    USE_POLYNOMIALS = True                             # OVERWRITES THE GLOBAL CONFIG FLAG
    POLY_DEGREE = degree                               # OVERWRITES THE GLOBAL CONFIG DEGREE
    poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)  # POLYNOMIAL EXPANSION WITHOUT THE CONSTANT COLUMN
    print(f"POLYNOMIAL FEATURES ENABLED. DEGREE: {POLY_DEGREE}")       # PROGRESS MESSAGE

    # NUMERIC PIPELINE: MEAN IMPUTATION + ROBUST SCALING (REPLACES THE MEDIAN/STANDARD ONE DEFINED EARLIER)
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),    # MEAN IMPUTATION (AS PER BEST CONFIG)
        ("scaler", RobustScaler()),                     # ROBUST SCALING (AS PER BEST CONFIG)
    ])

    # APPLY TO FEATURE COLUMNS ONLY; ANY OTHER COLUMN IS DROPPED
    preprocessor = ColumnTransformer(
        transformers=[("num", numeric_transformer, FEATURE_COLS)],
        remainder="drop"
    )

    # LASSO WITH A FIXED ALPHA; ONLY THE POLYNOMIAL DEGREE VARIES ACROSS ITERATIONS
    lasso = Lasso(alpha=0.0007944698383043709, max_iter=10000, random_state=GLOBAL_SEED)

    # WRAP SO TARGETS ARE STANDARD-SCALED WHEN TARGET SCALING IS ENABLED
    if USE_TARGET_SCALING:
        lasso = TransformedTargetRegressor(
            regressor=lasso, transformer=StandardScaler(with_mean=True, with_std=True)
        )

    # STEP ORDER: PREPROCESS -> POLYNOMIAL EXPANSION -> REGRESSOR
    steps = [("pre", preprocessor), ("poly", poly), ("reg", lasso)]
    lasso_pipe = Pipeline(steps=steps)

    # EACH DEGREE GETS BOTH A CV SCORE ON THE TRAIN SPLIT AND METRICS ON THE TEST SPLIT
    model_name = f"lasso_poly_deg_{POLY_DEGREE}"
    results.append(evaluate_sklearn_pipeline(model_name, lasso_pipe))  # STORE THE RESULT DICT

POLYNOMIAL FEATURES ENABLED. DEGREE: 2
=== LASSO_POLY_DEG_2 METRICS ===
R2: 0.4494
MAE: 2.4051
MSE: 18.4633
RMSE: 4.2969
EXPLAINED_VARIANCE: 0.4499
CV R2 MEAN: 0.5204 | CV R2 STD: 0.0174
POLYNOMIAL FEATURES ENABLED. DEGREE: 3
=== LASSO_POLY_DEG_3 METRICS ===
R2: 0.5407
MAE: 2.1684
MSE: 16.6481
RMSE: 4.0802
EXPLAINED_VARIANCE: 0.5409
CV R2 MEAN: 0.6196 | CV R2 STD: 0.0231
POLYNOMIAL FEATURES ENABLED. DEGREE: 4
=== LASSO_POLY_DEG_4 METRICS ===
R2: 0.6367
MAE: 1.9413
MSE: 13.3212
RMSE: 3.6498
EXPLAINED_VARIANCE: 0.6372
CV R2 MEAN: 0.6806 | CV R2 STD: 0.0224
POLYNOMIAL FEATURES ENABLED. DEGREE: 5
=== LASSO_POLY_DEG_5 METRICS ===
R2: 0.7147
MAE: 1.6238
MSE: 9.6692
RMSE: 3.1095
EXPLAINED_VARIANCE: 0.7151
CV R2 MEAN: 0.7449 | CV R2 STD: 0.0231
POLYNOMIAL FEATURES ENABLED. DEGREE: 6
=== LASSO_POLY_DEG_6 METRICS ===
R2: 0.7438
MAE: 1.4430
MSE: 7.9425
RMSE: 2.8183
EXPLAINED_VARIANCE: 0.7442
CV R2 MEAN: 0.7694 | CV R2 STD: 0.0260
POLYNOMIAL FEATURES ENABLED. DEGREE: 7
=== LASSO_POLY_DEG_7 METRICS

In [None]:
# SINGLE LASSO RUN AT POLYNOMIAL DEGREE 10, SAME CONFIGURATION AS THE SWEEP ABOVE.
# NOTE: REBINDS THE MODULE-LEVEL POLY_DEGREE, numeric_transformer AND preprocessor.
POLY_DEGREE = 10                                                # FIXED DEGREE FOR THIS RUN
poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)  # POLYNOMIAL EXPANSION WITHOUT THE CONSTANT COLUMN
print(f"POLYNOMIAL FEATURES ENABLED. DEGREE: {POLY_DEGREE}")    # PROGRESS MESSAGE

# NUMERIC PIPELINE: MEAN IMPUTATION + ROBUST SCALING
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),               # MEAN IMPUTATION (AS PER BEST CONFIG)
    ("scaler", RobustScaler()),                                # ROBUST SCALING (AS PER BEST CONFIG)
])

# APPLY TO FEATURE COLUMNS ONLY; ANY OTHER COLUMN IS DROPPED
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop"
)

# LASSO WITH THE SAME FIXED ALPHA AS THE SWEEP
lasso = Lasso(alpha=0.0007944698383043709, max_iter=10000, random_state=GLOBAL_SEED)

# WRAP SO TARGETS ARE STANDARD-SCALED WHEN TARGET SCALING IS ENABLED
if USE_TARGET_SCALING:
    lasso = TransformedTargetRegressor(
        regressor=lasso, transformer=StandardScaler(with_mean=True, with_std=True)
    )

# STEP ORDER: PREPROCESS -> POLYNOMIAL EXPANSION -> REGRESSOR
steps = [("pre", preprocessor), ("poly", poly), ("reg", lasso)]
lasso_pipe = Pipeline(steps=steps)

# THE RESULT IS KEPT IN result AND IS NOT APPENDED TO THE results LIST
model_name = f"lasso_poly_deg_{POLY_DEGREE}"
result = evaluate_sklearn_pipeline(model_name, lasso_pipe)     # RUN CV + TEST EVALUATION

POLYNOMIAL FEATURES ENABLED. DEGREE: 10
=== LASSO_POLY_DEG_10 METRICS ===
R2: 0.7625
MAE: 1.3644
MSE: 7.0823
RMSE: 2.6613
EXPLAINED_VARIANCE: 0.7628
CV R2 MEAN: 0.7902 | CV R2 STD: 0.0290


### DECISION TREE REGRESSOR

In [15]:
from sklearn.tree import DecisionTreeRegressor

# DECISION TREE DOES NOT NEED SCALING
# IMPUTATION ONLY: NO SCALER IS APPLIED TO THE FEATURES FOR THE TREE
preprocessor_tree = ColumnTransformer(
    transformers=[
        ("imputer", SimpleImputer(strategy="mean"), FEATURE_COLS),
    ],
    remainder="drop",
)

# SEEDED SINGLE TREE, OPTIONALLY WRAPPED SO TARGETS ARE STANDARD-SCALED
tree = DecisionTreeRegressor(random_state=GLOBAL_SEED)
if USE_TARGET_SCALING:
    tree = TransformedTargetRegressor(regressor=tree, transformer=StandardScaler())

tree_pipe = Pipeline(steps=[("pre", preprocessor_tree), ("reg", tree)])

# EVALUATE AND ADD THE RESULT DICT TO THE SHARED results LIST
results.append(evaluate_sklearn_pipeline("decision_tree", tree_pipe))

=== DECISION_TREE METRICS ===
R2: 0.8733
MAE: 0.5478
MSE: 2.1435
RMSE: 1.4641
EXPLAINED_VARIANCE: 0.8736
CV R2 MEAN: 0.8939 | CV R2 STD: 0.0235


### RANDOM FOREST REGRESSOR

In [16]:
from sklearn.ensemble import RandomForestRegressor

# IMPUTATION ONLY FOR THE FOREST (NO FEATURE SCALER)
preprocessor_rf = ColumnTransformer(
    transformers=[
        ("imputer", SimpleImputer(strategy="mean"), FEATURE_COLS),
    ],
    remainder="drop",
)

# 100-TREE SEEDED FOREST USING ALL CORES
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
if USE_TARGET_SCALING:
    rf = TransformedTargetRegressor(
        regressor=rf,
        transformer=StandardScaler(),
    )

rf_pipe = Pipeline(steps=[("pre", preprocessor_rf), ("reg", rf)])

# EVALUATE AND ADD THE RESULT DICT TO THE SHARED results LIST
results.append(evaluate_sklearn_pipeline("random_forest", rf_pipe))

=== RANDOM_FOREST METRICS ===
R2: 0.8988
MAE: 0.5503
MSE: 1.7961
RMSE: 1.3402
EXPLAINED_VARIANCE: 0.8988
CV R2 MEAN: 0.9278 | CV R2 STD: 0.0161


### XGBOOST REGRESSOR

In [None]:
from xgboost import XGBRegressor

# REUSES WHATEVER numeric_transformer IS CURRENTLY BOUND AT MODULE LEVEL
# NOTE(REVIEW): SEVERAL CELLS REBIND numeric_transformer — CONFIRM WHICH VERSION IS INTENDED HERE
preprocessor_xgb = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop",
)

# SEEDED, SILENT XGBOOST REGRESSOR USING ALL CORES
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
    verbosity=0,
)
if USE_TARGET_SCALING:
    xgb = TransformedTargetRegressor(
        regressor=xgb,
        transformer=StandardScaler(),
    )

xgb_pipe = Pipeline(steps=[("pre", preprocessor_xgb), ("reg", xgb)])

# EVALUATE AND ADD THE RESULT DICT TO THE SHARED results LIST
results.append(evaluate_sklearn_pipeline("xgboost", xgb_pipe))

=== XGBOOST METRICS ===
R2: 0.8906
MAE: 0.6468
MSE: 1.9189
RMSE: 1.3852
EXPLAINED_VARIANCE: 0.8907
CV R2 MEAN: 0.9102 | CV R2 STD: 0.0172


### HISTGRADIENTBOOSTING REGRESSOR

In [36]:
from sklearn.ensemble import HistGradientBoostingRegressor


# BUILD HISTGRADIENTBOOSTING PIPELINE
def build_hgb_pipeline(use_target_scaling: bool = False) -> Pipeline:
    """BUILD A MULTI-OUTPUT HISTGRADIENTBOOSTING PIPELINE OVER THE FEATURE COLUMNS.

    PARAMETERS:
        use_target_scaling: WHEN TRUE, WRAP THE REGRESSOR SO TARGETS ARE STANDARD-SCALED.

    RETURNS:
        AN UNFITTED PIPELINE: COLUMN SELECTION ("dropper") FOLLOWED BY THE REGRESSOR ("reg").
    """
    # ONE BOOSTED MODEL PER TARGET COLUMN
    regressor = MultiOutputRegressor(
        HistGradientBoostingRegressor(
            max_iter=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=GLOBAL_SEED,
        )
    )

    if use_target_scaling:
        regressor = TransformedTargetRegressor(
            regressor=regressor,
            transformer=StandardScaler(),
        )

    # FEATURE COLUMNS PASS THROUGH UNCHANGED; EVERYTHING ELSE IS DROPPED
    column_filter = ColumnTransformer(
        [("num", "passthrough", FEATURE_COLS)],
        remainder="drop",
    )

    return Pipeline([("dropper", column_filter), ("reg", regressor)])


# BUILD AND EVALUATE HISTGRADIENTBOOSTING PIPELINE
hgb_pipe = build_hgb_pipeline(use_target_scaling=USE_TARGET_SCALING)  # TARGET SCALING FOLLOWS THE GLOBAL CONFIG FLAG
results.append(evaluate_sklearn_pipeline("hist_gradient_boosting", hgb_pipe))  # EVALUATE AND ADD THE RESULT DICT TO results

=== HIST_GRADIENT_BOOSTING METRICS ===
R2: 0.8837
MAE: 0.8123
MSE: 2.9495
RMSE: 1.7174
EXPLAINED_VARIANCE: 0.8837
CV R2 MEAN: 0.8919 | CV R2 STD: 0.0154


### LIGHTGBM

In [23]:
from lightgbm import LGBMRegressor

# SCALING FOR LGBM
# REUSES WHATEVER numeric_transformer IS CURRENTLY BOUND AT MODULE LEVEL
# NOTE(REVIEW): SEVERAL CELLS REBIND numeric_transformer — CONFIRM WHICH VERSION IS INTENDED HERE
preprocessor_lgbm = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop",
)

# SEEDED LIGHTGBM REGRESSOR WITH NO DEPTH LIMIT (max_depth=-1), USING ALL CORES
base_lgbm = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

# ONE LIGHTGBM MODEL PER TARGET COLUMN
mo_lgbm = MultiOutputRegressor(base_lgbm)
if USE_TARGET_SCALING:
    mo_lgbm = TransformedTargetRegressor(
        regressor=mo_lgbm,
        transformer=StandardScaler(),
    )

lgbm_pipe = Pipeline(steps=[("pre", preprocessor_lgbm), ("reg", mo_lgbm)])

# EVALUATE AND ADD THE RESULT DICT TO THE SHARED results LIST
results.append(evaluate_sklearn_pipeline("lightgbm", lgbm_pipe))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 1036, number of used features: 4
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 1036, number of used features: 4
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

### CATBOOST REGRESSOR

In [24]:
from catboost import CatBoostRegressor

# CATBOOST 
# SEEDED, SILENT CATBOOST REGRESSOR
base_cat = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6,
                             random_seed=GLOBAL_SEED, verbose=0)

# ONE CATBOOST MODEL PER TARGET COLUMN
mo_cat = MultiOutputRegressor(base_cat)

# OPTIONAL STANDARD SCALING OF THE TARGETS
if USE_TARGET_SCALING:
    mo_cat = TransformedTargetRegressor(regressor=mo_cat, transformer=StandardScaler())

# FEATURE COLUMNS PASS THROUGH UNCHANGED; EVERYTHING ELSE IS DROPPED
cat_pipe = Pipeline(steps=[
    ("DROPPER", ColumnTransformer([("NUM", "passthrough", FEATURE_COLS)], remainder="drop")),
    ("REG", mo_cat),
])

# EVALUATE AND ADD THE RESULT DICT TO THE SHARED results LIST
results.append(evaluate_sklearn_pipeline("catboost", cat_pipe))

print("CATBOOST PIPELINE BUILT AND EVALUATED.")

=== CATBOOST METRICS ===
R2: 0.8847
MAE: 0.7965
MSE: 2.9470
RMSE: 1.7167
EXPLAINED_VARIANCE: 0.8848
CV R2 MEAN: 0.8995 | CV R2 STD: 0.0156
CATBOOST PIPELINE BUILT AND EVALUATED.


In [69]:
# PREPROCESSING 
# MEAN IMPUTATION + ROBUST SCALING (REBINDS THE MODULE-LEVEL numeric_transformer)
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", RobustScaler()),
    ]
)

# FEATURE COLUMNS ONLY; ANY OTHER COLUMN IS DROPPED (REBINDS THE MODULE-LEVEL preprocessor)
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop",
)

# BASE MODELS 
def build_svr():
    """RETURN ("svr", PIPELINE) WITH THE SHARED PREPROCESSOR IN FRONT OF AN SVR."""
    model = SVR(C=3474.215, epsilon=0.0001141, gamma="scale")
    return "svr", Pipeline(steps=[("pre", preprocessor), ("reg", model)])


def build_rf():
    """RETURN ("rf", PIPELINE) WITH THE SHARED PREPROCESSOR IN FRONT OF A RANDOM FOREST."""
    model = RandomForestRegressor(n_estimators=100, random_state=GLOBAL_SEED, n_jobs=-1)
    return "rf", Pipeline(steps=[("pre", preprocessor), ("reg", model)])


def build_xgb():
    """RETURN ("xgb", PIPELINE) WITH THE SHARED PREPROCESSOR IN FRONT OF AN XGBOOST REGRESSOR."""
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6,
                         random_state=GLOBAL_SEED, n_jobs=-1)
    return "xgb", Pipeline(steps=[("pre", preprocessor), ("reg", model)])


def build_lgbm():
    """RETURN ("lgbm", PIPELINE) WITH THE SHARED PREPROCESSOR IN FRONT OF A LIGHTGBM REGRESSOR."""
    model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=GLOBAL_SEED)
    return "lgbm", Pipeline(steps=[("pre", preprocessor), ("reg", model)])


# (NAME, ESTIMATOR) PAIRS IN THE ORDER SVR, RF, XGB, LGBM
estimators = [make() for make in (build_svr, build_rf, build_xgb, build_lgbm)]

# META LEARNER 
# META LEARNER: DEGREE-10 POLYNOMIAL EXPANSION OF THE BASE PREDICTIONS, THEN LASSO
poly = PolynomialFeatures(degree=10, include_bias=False)
lasso = Lasso(alpha=0.000794, max_iter=10000, random_state=GLOBAL_SEED)
meta_learner = Pipeline(steps=[("poly", poly), ("reg", lasso)])

# STACK: THE META LEARNER SEES ONLY BASE-MODEL PREDICTIONS (passthrough=False)
stacking = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=KFold(n_splits=5, shuffle=True, random_state=GLOBAL_SEED),
    n_jobs=-1,
    passthrough=False,
)

# ONE STACK PER TARGET COLUMN
stacking_multioutput = MultiOutputRegressor(stacking, n_jobs=-1)

# SINGLE-STEP PIPELINE SO THE SHARED EVALUATION HELPER CAN BE USED
stacking_pipeline = Pipeline(steps=[("stack", stacking_multioutput)])

# EVALUATE; THE RESULT IS KEPT IN result
model_name = "STACKING REGRESSOR"
result = evaluate_sklearn_pipeline(model_name, stacking_pipeline)

STACKING REGRESSOR METRICS:
R2: 0.8641
MAE: 0.6315
MSE: 2.9301
RMSE: 1.7118
EXPLAINED_VARIANCE: 0.8646
CV R2 MEAN: 0.8923 | CV R2 STD: 0.0346


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression

# SINGLE PREPROCESSING CHAIN: MEAN IMPUTATION -> ROBUST SCALING -> DEGREE-5 POLYNOMIAL EXPANSION
preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler()),
    ("poly", PolynomialFeatures(include_bias=False, degree=5)),
])

# AFTER THE EXPANSION, KEEP THE 20 FEATURES WITH THE BEST F_REGRESSION SCORE
feature_selector = SelectKBest(f_regression, k=20)

# BASE MODEL BUILDERS—NO INTERNAL PREPROCESSING
def build_svr():
    """RETURN THE ("svr", ESTIMATOR) PAIR HOLDING A BARE SVR WITH NO PREPROCESSING ATTACHED."""
    svr_model = SVR(C=3474.215, epsilon=0.0001141, gamma="scale")
    return "svr", svr_model

def build_rf():
    """RETURN THE ("rf", ESTIMATOR) PAIR HOLDING A BARE RANDOM FOREST REGRESSOR."""
    forest_model = RandomForestRegressor(
        n_estimators=100,
        random_state=GLOBAL_SEED,  # FIXED SEED FOR REPRODUCIBILITY
        n_jobs=-1,
    )
    return "rf", forest_model

def build_xgb():
    """RETURN THE ("xgb", ESTIMATOR) PAIR HOLDING A BARE XGBOOST REGRESSOR."""
    booster_model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=GLOBAL_SEED,  # FIXED SEED FOR REPRODUCIBILITY
        n_jobs=-1,
    )
    return "xgb", booster_model

def build_lgbm():
    """RETURN THE ("lgbm", ESTIMATOR) PAIR HOLDING A BARE LIGHTGBM REGRESSOR."""
    lgbm_model = LGBMRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=GLOBAL_SEED,  # FIXED SEED FOR REPRODUCIBILITY
    )
    return "lgbm", lgbm_model

# NAMED BASE MODELS, ONE (NAME, ESTIMATOR) PAIR PER BUILDER
estimators = [build_svr(), build_rf(), build_xgb(), build_lgbm()]

# META LEARNER (LASSO)
lasso = Lasso(alpha=0.000794, max_iter=10000, random_state=GLOBAL_SEED)

# STACKING REGRESSOR WITH passthrough=True
# (THE META LEARNER RECEIVES THE INPUT FEATURES ALONGSIDE THE BASE-MODEL PREDICTIONS)
# NOTE: A STRAY LEADING "-" BEFORE `stacking` MADE THIS ASSIGNMENT A SYNTAX ERROR; IT IS REMOVED
stacking = StackingRegressor(
    estimators=estimators,
    final_estimator=lasso,
    cv=KFold(n_splits=5, shuffle=True, random_state=GLOBAL_SEED),
    n_jobs=-1,
    passthrough=True
)

# MAIN PIPELINE: PREPROCESSING + SELECTKBEST + STACKING
stacking_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", feature_selector),
    ("stack", stacking)
])

# MULTI-OUTPUT WRAPPER AROUND THE WHOLE PIPELINE (REBINDS stacking_pipeline TO THE WRAPPED ESTIMATOR)
stacking_pipeline = MultiOutputRegressor(stacking_pipeline, n_jobs=-1)

# EVALUATE
model_name = "STACKING REGRESSOR"
result = evaluate_sklearn_pipeline(model_name, stacking_pipeline)

STACKING REGRESSOR METRICS:
R2: 0.8389
MAE: 0.8875
MSE: 3.3806
RMSE: 1.8387
EXPLAINED_VARIANCE: 0.8391
CV R2 MEAN: 0.8659 | CV R2 STD: 0.0190


In [83]:
# CANDIDATE K VALUES FOR SELECTKBEST: 20, 25, 30, 35, 40
k_values = range(20, 45, 5)

# COLLECTS ONE (K, RESULT) PAIR PER RUN
results_list = []

for k in k_values:
    print(f"\nRUNNING PIPELINE WITH SELECTKBEST K={k}...")

    # FRESH SELECTOR KEEPING THE K BEST-SCORING FEATURES
    feature_selector = SelectKBest(f_regression, k=k)

    # SAME PREPROCESSOR AND STACK AS BEFORE; ONLY THE SELECTOR CHANGES BETWEEN RUNS
    stacking_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", feature_selector),
        ("stack", stacking),
    ])

    # MULTI-OUTPUT WRAPPER AROUND THE WHOLE PIPELINE
    multioutput_pipeline = MultiOutputRegressor(stacking_pipeline, n_jobs=-1)

    # SCORE THIS CONFIGURATION WITH THE EXISTING EVALUATION FUNCTION
    model_name = f"STACKING REGRESSOR K={k}"
    result = evaluate_sklearn_pipeline(model_name, multioutput_pipeline)

    # REMEMBER THE METRICS TOGETHER WITH THE K THAT PRODUCED THEM
    results_list.append((k, result))

# SIDE-BY-SIDE COMPARISON TABLE OF ALL RUNS
print("\n=== SUMMARY OF RESULTS ===")
header_cells = [
    "K".rjust(5),
    "R2".rjust(8),
    "MAE".rjust(8),
    "MSE".rjust(8),
    "RMSE".rjust(8),
    "EXPL_VAR".rjust(9),
    "CV R2 MEAN".rjust(10),
    "CV R2 STD".rjust(9),
]
print(" | ".join(header_cells))
print("-" * 80)
for k, res in results_list:
    # ONE ROW PER K, USING THE SAME COLUMN WIDTHS AS THE HEADER
    row_cells = [
        f"{k:5}",
        f"{res['r2']:8.4f}",
        f"{res['mae']:8.4f}",
        f"{res['mse']:8.4f}",
        f"{res['rmse']:8.4f}",
        f"{res['explained_variance']:9.4f}",
        f"{res['cv_r2_mean']:10.4f}",
        f"{res['cv_r2_std']:9.4f}",
    ]
    print(" | ".join(row_cells))


RUNNING PIPELINE WITH SELECTKBEST K=20...
STACKING REGRESSOR K=20 METRICS:
R2: 0.8389
MAE: 0.8875
MSE: 3.3806
RMSE: 1.8387
EXPLAINED_VARIANCE: 0.8391
CV R2 MEAN: 0.8659 | CV R2 STD: 0.0190

RUNNING PIPELINE WITH SELECTKBEST K=25...
STACKING REGRESSOR K=25 METRICS:
R2: 0.8403
MAE: 0.8701
MSE: 3.3433
RMSE: 1.8285
EXPLAINED_VARIANCE: 0.8405
CV R2 MEAN: 0.8677 | CV R2 STD: 0.0180

RUNNING PIPELINE WITH SELECTKBEST K=30...
STACKING REGRESSOR K=30 METRICS:
R2: 0.8524
MAE: 0.7934
MSE: 2.6200
RMSE: 1.6186
EXPLAINED_VARIANCE: 0.8524
CV R2 MEAN: 0.8745 | CV R2 STD: 0.0217

RUNNING PIPELINE WITH SELECTKBEST K=35...
STACKING REGRESSOR K=35 METRICS:
R2: 0.8707
MAE: 0.7354
MSE: 2.5418
RMSE: 1.5943
EXPLAINED_VARIANCE: 0.8707
CV R2 MEAN: 0.8860 | CV R2 STD: 0.0225

RUNNING PIPELINE WITH SELECTKBEST K=40...
STACKING REGRESSOR K=40 METRICS:
R2: 0.8772
MAE: 0.6847
MSE: 2.1062
RMSE: 1.4513
EXPLAINED_VARIANCE: 0.8773
CV R2 MEAN: 0.8915 | CV R2 STD: 0.0226

=== SUMMARY OF RESULTS ===
    K |       R2 |    