In [134]:
!pip install xgboost catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [135]:
from pathlib import Path

import joblib
import os
import numpy as np
import pandas as pd
import sklearn.gaussian_process.kernels as kernels

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import BayesianRidge
from sklearn.svm import NuSVR
from joblib import Memory
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [117]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [118]:
BASE_PATH = Path("/content/gdrive/MyDrive/data")
x_train = pd.read_csv(BASE_PATH / 'X_train.csv', skiprows=1, header=None).values[:, 1:]
x_test = pd.read_csv(BASE_PATH / 'X_test.csv', skiprows=1, header=None).values[:, 1:]
y_train = pd.read_csv(BASE_PATH / 'y_train.csv', skiprows=1, header=None).values[:, 1:].flatten()

In [119]:
def remove_outliers(x_train, y_train, x_test, contamination=0.047, variance_threshold=1e-7, random_state=42):
    """
    Remove low-variance features and outliers from training data.

    Parameters:
    -----------
    x_train : DataFrame or ndarray
    y_train : Series or ndarray
    x_test : DataFrame or ndarray
    contamination : float, proportion of outliers
    variance_threshold : float, threshold for variance filtering
    random_state : int

    Returns:
    --------
    x_train_clean, y_train_clean, x_test_clean
    """

    # Convert to DataFrame if needed
    x_train_df = pd.DataFrame(x_train).copy()
    x_test_df = pd.DataFrame(x_test).copy() if x_test is not None else None
    y_train_series = pd.Series(np.asarray(y_train).flatten(), index=x_train_df.index)

    # ===== Step 1: Remove zero/low-variance features =====
    var_selector = VarianceThreshold(threshold=variance_threshold)

    # Fit on training data (using median-imputed values to handle NaN)
    med = x_train_df.median(axis=0)
    x_train_for_var = x_train_df.fillna(med)

    var_selector.fit(x_train_for_var)

    variance_mask = var_selector.get_support()
    n_removed = (~variance_mask).sum()
    print(f"[VarianceThreshold] Removed {n_removed} low-variance features (threshold={variance_threshold})")

    x_train_df = x_train_df.iloc[:, variance_mask]
    if x_test_df is not None:
        x_test_df = x_test_df.iloc[:, variance_mask]

    med = x_train_df.median(axis=0)
    xtr_imp = x_train_df.fillna(med)

    scaler = RobustScaler()
    xtr_std = scaler.fit_transform(xtr_imp)

    pca = PCA(n_components=2, random_state=random_state)
    x_proj = pca.fit_transform(xtr_std)

    iso = IsolationForest(contamination=contamination, random_state=random_state)
    mask = iso.fit_predict(x_proj) == 1

    outlier_pos = np.where(~mask)[0]
    n_outliers = outlier_pos.size
    print(f"[OutlierRemoval] Removed {n_outliers} outliers ({n_outliers/len(mask)*100:.2f}%)")
    print(f"[Outlier positions] {outlier_pos.tolist()}")

    x_train_clean = x_train_df[mask]
    y_train_clean = y_train_series[mask]

    if isinstance(x_train, np.ndarray):
        return x_train_clean.values, y_train_clean.values, x_test_df.values if x_test_df is not None else None
    else:
        return x_train_clean, y_train_clean, x_test_df

In [120]:
class ScaledKNNImputer(BaseEstimator, TransformerMixin):
    """
    KNN Imputer that internally scales data for distance calculation,
    then returns imputed data in the ORIGINAL scale.

    This avoids having scaling artifacts in your pipeline.
    """

    def __init__(self, n_neighbors=7, weights='distance'):
        self.n_neighbors = n_neighbors
        self.weights = weights

    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)

        self.median_ = X_df.median(axis=0)
        X_filled = X_df.fillna(self.median_)

        self.scaler_ = StandardScaler()
        self.scaler_.fit(X_filled)

        X_scaled = self.scaler_.transform(X_filled)
        self.imputer_ = KNNImputer(n_neighbors=self.n_neighbors, weights=self.weights)
        self.imputer_.fit(X_scaled)

        return self

    def transform(self, X):
        X_df = pd.DataFrame(X)

        X_filled = X_df.fillna(self.median_)

        X_scaled = self.scaler_.transform(X_filled)

        X_imputed_scaled = self.imputer_.transform(X_scaled)

        X_imputed_original = self.scaler_.inverse_transform(X_imputed_scaled)

        return X_imputed_original

In [121]:
class SpearmanRFSelector(BaseEstimator, TransformerMixin):
    """
    Combined feature selector:
    1. Scale data (using provided scaler)
    2. Select top_k features by Spearman correlation
    3. Select top_k features by RandomForest importance

    Works on already-imputed data.
    """

    def __init__(
        self,
        scaler=None,
        feature_tops=(200, 200),  # (spearman_top_k, rf_top_k) as tuple
        rf_n_estimators=1000,
        rf_max_depth=None,
        rf_min_samples_leaf=1,
        rf_max_features='sqrt',
        random_state=42
    ):
        self.scaler = scaler
        self.feature_tops = feature_tops
        self.rf_n_estimators = rf_n_estimators
        self.rf_max_depth = rf_max_depth
        self.rf_min_samples_leaf = rf_min_samples_leaf
        self.rf_max_features = rf_max_features
        self.random_state = random_state

    def fit(self, X, y):
        if y is None:
            raise ValueError("SpearmanRFSelector requires y")

        spearman_top_k, rf_top_k = self.feature_tops

        X_df = pd.DataFrame(X).copy() if not isinstance(X, pd.DataFrame) else X.copy()
        y_series = pd.Series(np.asarray(y).flatten(), index=X_df.index)

        self.n_features_in_ = X_df.shape[1]

        if self.scaler is not None:
            self.scaler_ = clone(self.scaler)
            X_scaled = self.scaler_.fit_transform(X_df)
            X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns, index=X_df.index)
        else:
            X_scaled_df = X_df

        spearman_corr = X_scaled_df.corrwith(y_series, method='spearman').abs()

        n_keep_spearman = min(spearman_top_k, len(spearman_corr))
        spearman_features = spearman_corr.nlargest(n_keep_spearman).index

        print(f"[SpearmanRFSelector] Spearman: Selected {len(spearman_features)} features (from {self.n_features_in_})")

        X_spearman = X_scaled_df.loc[:, spearman_features]

        self.rf_ = RandomForestRegressor(
            n_estimators=self.rf_n_estimators,
            max_depth=self.rf_max_depth,
            min_samples_leaf=self.rf_min_samples_leaf,
            max_features=self.rf_max_features,
            random_state=self.random_state,
            n_jobs=-1,
            verbose=0
        )
        self.rf_.fit(X_spearman, y_series)

        importances = pd.Series(self.rf_.feature_importances_, index=X_spearman.columns)
        importances = importances.sort_values(ascending=False)

        n_keep_rf = min(rf_top_k, len(importances)) if rf_top_k else len(importances)
        rf_features = importances.head(n_keep_rf).index

        print(f"[SpearmanRFSelector] RF: Selected {len(rf_features)} features (from {len(spearman_features)})")

        self.selected_features_ = [X_df.columns.get_loc(col) for col in rf_features]
        self.selected_feature_names_ = rf_features.tolist()

        print(f"[SpearmanRFSelector] FINAL: {len(self.selected_features_)} features")

        return self

    def transform(self, X):
        X_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
        # Return selected features
        return X_df.iloc[:, self.selected_features_].values


In [122]:
def build_imputation_pipeline(random_state=42):
    pipeline = Pipeline(
        [
              ("knn_imputer", ScaledKNNImputer()),
            # ("iterative_imputer", IterativeImputer()),
        ]
    )
    return pipeline

In [123]:
def build_feature_selection_pipeline(random_state=42):

    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            random_state=random_state
        ))
    ])

In [147]:
def build_svr_branch(random_state=42):
    """SVR branch with feature selection"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", StandardScaler()),
        ("model", NuSVR())
    ])


def build_hgb_branch(random_state=42):
    """HistGradientBoosting with feature selection"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", StandardScaler()),
        ("model", HistGradientBoostingRegressor(random_state=random_state))
    ])

def build_xgb_branch(random_state=42):
    """XGBoost"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", RobustScaler()),
        ("model", XGBRegressor(
            random_state=random_state,
            n_jobs=-1,
            verbosity=0  # Suppress output
        ))
    ])


def build_cat_branch(random_state=42):
    """CatBoost"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", RobustScaler()),
        ("model", CatBoostRegressor(
            random_state=random_state,
            verbose=False,
            thread_count=-1
        ))
    ])


def build_etr_branch(random_state=42):
    """ExtraTrees with feature selection"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", StandardScaler()),
        ("model", ExtraTreesRegressor(random_state=random_state, n_jobs=-1))
    ])

def build_gpr_branch(random_state=42):
    """GaussianProcess with feature selection"""
    return Pipeline([
        ("feature_selector", SpearmanRFSelector(
            scaler=StandardScaler(),
            random_state=random_state
        )),
        ("scaler", RobustScaler()),
        ("model", GaussianProcessRegressor(random_state=random_state))
    ])

In [125]:
x_train_clean, y_train_clean, x_test_clean = remove_outliers(
    x_train, y_train, x_test, contamination=0.047, random_state=42
)

[VarianceThreshold] Removed 4 low-variance features (threshold=1e-07)
[OutlierRemoval] Removed 57 outliers (4.70%)
[Outlier positions] [3, 10, 27, 60, 114, 156, 167, 203, 207, 213, 232, 329, 346, 380, 382, 423, 441, 442, 446, 459, 537, 575, 579, 611, 627, 674, 676, 681, 690, 694, 697, 733, 740, 763, 769, 805, 899, 906, 915, 933, 935, 951, 983, 1015, 1032, 1070, 1071, 1087, 1090, 1097, 1103, 1127, 1136, 1137, 1144, 1152, 1196]


In [130]:
imputation_pipeline = build_imputation_pipeline()

svr_branch = build_svr_branch(random_state=42)

svr_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("svr", svr_branch),
])

svr_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],
    "svr__feature_selector__feature_tops": [(200, 175)],
    "svr__feature_selector__rf_n_estimators": [1000],
    "svr__feature_selector__rf_max_depth": [None],
    "svr__feature_selector__rf_min_samples_leaf": [1],
    "svr__feature_selector__rf_max_features": ["sqrt"],
    "svr__model__nu": [0.5],
    "svr__model__kernel": ["rbf"],
    "svr__model__C": [60],
    "svr__model__gamma": ["auto"],
}

svr_grid = GridSearchCV(
    estimator=svr_full_pipeline,
    param_grid=svr_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=2,
)

print("=" * 60)
print("Optimizing SVR Branch")
print("=" * 60)

svr_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest SVR R²: {svr_grid.best_score_:.4f}")
print(f"Best SVR Params: {svr_grid.best_params_}")

#  {
#     "imputation__knn_imputer__n_neighbors": [2],
#     "imputation__knn_imputer__weights": ["distance"],
#     "svr__feature_selector__feature_tops": [
#         (50, 45),
#         (75, 65),
#         (100, 90),
#         (125, 110),
#         (150, 130),
#         (175, 150),
#         (200, 170),
#         (225, 190),
#         (250, 210),
#         (275, 230),
#         (300, 250),
#         (350, 290),
#         (400, 330),
#         (450, 370),
#         (500, 410),
#     ],
#     "svr__feature_selector__rf_n_estimators": [1000],
#     "svr__feature_selector__rf_max_depth": [None],
#     "svr__feature_selector__rf_min_samples_leaf": [1],
#     "svr__feature_selector__rf_max_features": ["sqrt"],
#     "svr__model__nu": [0.5],
#     "svr__model__kernel": ["rbf"],
#     "svr__model__C": [50, 55, 60],
#     "svr__model__gamma": ["auto"],
# }
# Best SVR R²: 0.6731
# Best SVR Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'svr__feature_selector__feature_tops': (200, 170), 'svr__feature_selector__rf_max_depth': None, 'svr__feature_selector__rf_max_features': 'sqrt', 'svr__feature_selector__rf_min_samples_leaf': 1, 'svr__feature_selector__rf_n_estimators': 1000, 'svr__model__C': 60, 'svr__model__gamma': 'auto', 'svr__model__kernel': 'rbf', 'svr__model__nu': 0.5}


# "svr__feature_selector__feature_tops": [
#       (170, 145),
#       (175, 150),
#       (180, 155),
#       (185, 160),
#       (190, 165),
#       (195, 168),
#       (198, 168),
#       (200, 170),
#       (202, 172),
#       (205, 175),
#       (210, 180),
#       (215, 185),
#       (220, 188),
#       (225, 190),
#       (230, 195),
# ]
# Best SVR R²: 0.6731
# Best SVR Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'svr__feature_selector__feature_tops': (200, 170), 'svr__feature_selector__rf_max_depth': None, 'svr__feature_selector__rf_max_features': 'sqrt', 'svr__feature_selector__rf_min_samples_leaf': 1, 'svr__feature_selector__rf_n_estimators': 1000, 'svr__model__C': 60, 'svr__model__gamma': 'auto', 'svr__model__kernel': 'rbf', 'svr__model__nu': 0.5}

# "svr__feature_selector__feature_tops": [
#       (200, 161),
#       (200, 162),
#       (200, 163),
#       (200, 164),
#       (200, 165),
#       (200, 166),
#       (200, 167),
#       (200, 168),
#       (200, 169),
#       (200, 170),
#       (200, 171),
#       (200, 172),
#       (200, 173),
#       (200, 174),
#       (200, 175),
#       (200, 176),
#       (200, 177),
#       (200, 178),
#       (200, 179),
#     ],
# Best SVR R²: 0.6756
# Best SVR Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'svr__feature_selector__feature_tops': (200, 175), 'svr__feature_selector__rf_max_depth': None, 'svr__feature_selector__rf_max_features': 'sqrt', 'svr__feature_selector__rf_min_samples_leaf': 1, 'svr__feature_selector__rf_n_estimators': 1000, 'svr__model__C': 60, 'svr__model__gamma': 'auto', 'svr__model__kernel': 'rbf', 'svr__model__nu': 0.5}

# "svr__model__nu": [0.4, 0.5, 0.6],
# "svr__model__C": [60, 65, 70],
# Best SVR R²: 0.6756
# Best SVR Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'svr__feature_selector__feature_tops': (200, 175), 'svr__feature_selector__rf_max_depth': None, 'svr__feature_selector__rf_max_features': 'sqrt', 'svr__feature_selector__rf_min_samples_leaf': 1, 'svr__feature_selector__rf_n_estimators': 1000, 'svr__model__C': 60, 'svr__model__gamma': 'auto', 'svr__model__kernel': 'rbf', 'svr__model__nu': 0.5}


Optimizing SVR Branch
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features

Best SVR R²: 0.6756
Best SVR Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'svr__feature_selector__feature_tops': (200, 175), 'svr__feature_selector__rf_max_depth': None, 'svr__feature_selector__rf_max_features': 'sqrt', 'svr__feature_selector__rf_min_samples_leaf': 1, 'svr__feature_selector__rf_n_estimators': 1000, 'svr__model__C': 60, 'svr__model__gamma': 'auto', 'svr__model__kernel': 'rbf', 'svr__model__nu': 0.5}


In [133]:
imputation_pipeline = build_imputation_pipeline()

hgb_branch = build_hgb_branch(random_state=42)

hgb_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("hgb", hgb_branch),
])

hgb_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],

    "hgb__feature_selector__feature_tops":[(200, 167)],
    "hgb__feature_selector__rf_n_estimators": [1000],
    "hgb__feature_selector__rf_max_depth": [None],
    "hgb__feature_selector__rf_min_samples_leaf": [1],
    "hgb__feature_selector__rf_max_features": ["sqrt"],

    "hgb__model__learning_rate": [0.1],
    "hgb__model__max_iter": [200],
    "hgb__model__max_depth": [None],
    "hgb__model__min_samples_leaf": [20],
}

hgb_grid = GridSearchCV(
    estimator=hgb_full_pipeline,
    param_grid=hgb_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=2,
)

print("=" * 60)
print("Optimizing HGB Branch")
print("=" * 60)

hgb_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest HGB R²: {hgb_grid.best_score_:.4f}")
print(f"Best HGB Params: {hgb_grid.best_params_}")


# [
#  (50, 45),
#  (125, 110),
#  (200, 170),
#  (275, 230),
#  (350, 290),
#  (425, 350),
#  (500, 410),
# ],
# Best HGB R²: 0.6367
# Best HGB Params: {'hgb__feature_selector__feature_tops': (200, 170), 'hgb__feature_selector__rf_max_depth': None, 'hgb__feature_selector__rf_max_features': 'sqrt', 'hgb__feature_selector__rf_min_samples_leaf': 1, 'hgb__feature_selector__rf_n_estimators': 1000, 'hgb__model__learning_rate': 0.1, 'hgb__model__max_depth': None, 'hgb__model__max_iter': 100, 'hgb__model__min_samples_leaf': 20, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}

# [
#       (200, 161),
#       (200, 162),
#       (200, 163),
#       (200, 164),
#       (200, 165),
#       (200, 166),
#       (200, 167),
#       (200, 168),
#       (200, 169),
#       (200, 170),
#       (200, 171),
#       (200, 172),
#       (200, 173),
#       (200, 174),
#       (200, 175),
#       (200, 176),
#       (200, 177),
#       (200, 178),
#       (200, 179),
# ]
# Best HGB R²: 0.6424
# Best HGB Params: {'hgb__feature_selector__feature_tops': (200, 167), 'hgb__feature_selector__rf_max_depth': None, 'hgb__feature_selector__rf_max_features': 'sqrt', 'hgb__feature_selector__rf_min_samples_leaf': 1, 'hgb__feature_selector__rf_n_estimators': 1000, 'hgb__model__learning_rate': 0.1, 'hgb__model__max_depth': None, 'hgb__model__max_iter': 100, 'hgb__model__min_samples_leaf': 20, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}


# "hgb__model__learning_rate": [0.05, 0.1, 0.15],
# "hgb__model__max_iter": [100, 150, 200],
# Best HGB R²: 0.6449
# Best HGB Params: {'hgb__feature_selector__feature_tops': (200, 167), 'hgb__feature_selector__rf_max_depth': None, 'hgb__feature_selector__rf_max_features': 'sqrt', 'hgb__feature_selector__rf_min_samples_leaf': 1, 'hgb__feature_selector__rf_n_estimators': 1000, 'hgb__model__learning_rate': 0.1, 'hgb__model__max_depth': None, 'hgb__model__max_iter': 200, 'hgb__model__min_samples_leaf': 20, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}

Optimizing HGB Branch
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 167 features (from 200)
[SpearmanRFSelector] FINAL: 167 features

Best HGB R²: 0.6449
Best HGB Params: {'hgb__feature_selector__feature_tops': (200, 167), 'hgb__feature_selector__rf_max_depth': None, 'hgb__feature_selector__rf_max_features': 'sqrt', 'hgb__feature_selector__rf_min_samples_leaf': 1, 'hgb__feature_selector__rf_n_estimators': 1000, 'hgb__model__learning_rate': 0.1, 'hgb__model__max_depth': None, 'hgb__model__max_iter': 200, 'hgb__model__min_samples_leaf': 20, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}


In [146]:
imputation_pipeline = build_imputation_pipeline()

xgb_branch = build_xgb_branch(random_state=42)

xgb_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("xgb", xgb_branch),
])

xgb_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],
    "xgb__feature_selector__feature_tops": [(200, 175)],
    "xgb__feature_selector__rf_n_estimators": [1000],
    "xgb__feature_selector__rf_max_depth": [None],
    "xgb__feature_selector__rf_min_samples_leaf": [1],
    "xgb__feature_selector__rf_max_features": ["sqrt"],
    "xgb__model__n_estimators": [1000],
    "xgb__model__learning_rate": [0.1],
    "xgb__model__max_depth": [3],
    "xgb__model__min_child_weight": [1.2],
    "xgb__model__subsample": [1.0],
    "xgb__model__colsample_bytree": [1.0],
    "xgb__model__gamma": [0],
}

xgb_grid = GridSearchCV(
    estimator=xgb_full_pipeline,
    param_grid=xgb_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=2,
)

print("=" * 60)
print("Optimizing XGBoost Branch")
print("=" * 60)

xgb_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest XGBoost R²: {xgb_grid.best_score_:.4f}")
print(f"Best XGBoost Params: {xgb_grid.best_params_}")

# [
#         (50, 45),
#         (75, 65),
#         (100, 90),
#         (125, 110),
#         (150, 130),
#         (175, 150),
#         (200, 170),
#         (225, 190),
#         (250, 210),
#         (275, 230),
#         (300, 250),
#         (350, 290),
#         (400, 330),
#         (450, 370),
#         (500, 410),
# ]
# Best XGBoost R²: 0.6292
# Best XGBoost Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'xgb__feature_selector__feature_tops': (200, 170), 'xgb__feature_selector__rf_max_depth': None, 'xgb__feature_selector__rf_max_features': 'sqrt', 'xgb__feature_selector__rf_min_samples_leaf': 1, 'xgb__feature_selector__rf_n_estimators': 1000, 'xgb__model__colsample_bytree': 0.8, 'xgb__model__gamma': 0, 'xgb__model__learning_rate': 0.1, 'xgb__model__max_depth': 6, 'xgb__model__min_child_weight': 1, 'xgb__model__n_estimators': 500, 'xgb__model__subsample': 0.8}

#Best XGBoost R²: 0.6339
# (200, 175)

# Best XGBoost R²: 0.6434
# "xgb__model__n_estimators": [500, 1000],
# "xgb__model__learning_rate": [0.1, 0.3],
# "xgb__model__max_depth": [3, 6, 9],
# "xgb__model__min_child_weight": [1, 1.2],
# "xgb__model__subsample": [0.8, 1.0],
# "xgb__model__colsample_bytree": [0.8, 1.0],
# "xgb__model__gamma": [0, 0.2],
# {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'xgb__feature_selector__feature_tops': (200, 175), 'xgb__feature_selector__rf_max_depth': None, 'xgb__feature_selector__rf_max_features': 'sqrt', 'xgb__feature_selector__rf_min_samples_leaf': 1, 'xgb__feature_selector__rf_n_estimators': 1000, 'xgb__model__colsample_bytree': 1.0, 'xgb__model__gamma': 0, 'xgb__model__learning_rate': 0.1, 'xgb__model__max_depth': 3, 'xgb__model__min_child_weight': 1.2, 'xgb__model__n_estimators': 1000, 'xgb__model__subsample': 1.0}

Optimizing XGBoost Branch
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features

Best XGBoost R²: 0.6434
Best XGBoost Params: {'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance', 'xgb__feature_selector__feature_tops': (200, 175), 'xgb__feature_selector__rf_max_depth': None, 'xgb__feature_selector__rf_max_features': 'sqrt', 'xgb__feature_selector__rf_min_samples_leaf': 1, 'xgb__feature_selector__rf_n_estimators': 1000, 'xgb__model__colsample_bytree': 1.0, 'xgb__model__gamma': 0, 'xgb__model__learning_rate': 0.1, 'xgb__model__max_depth': 3, 'xgb__model__min_child_weight': 1.2, 'xgb__model__n_estimators': 1000, 'xgb__model__subsample': 1.0}


In [151]:
imputation_pipeline = build_imputation_pipeline()

cat_branch = build_cat_branch(random_state=42)

cat_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("cat", cat_branch),
])

cat_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],

    "cat__feature_selector__feature_tops": [(200, 179)],
    "cat__feature_selector__rf_n_estimators": [1000],
    "cat__feature_selector__rf_max_depth": [None],
    "cat__feature_selector__rf_min_samples_leaf": [1],
    "cat__feature_selector__rf_max_features": ["sqrt"],


    # Number of trees (like n_estimators)
    "cat__model__iterations": [1000],  # Default 1000

    # Learning rate - Lower = need more trees but often better
    "cat__model__learning_rate": [0.03],  # Default is auto-calculated, 0.03 is common

    # Tree depth - Controls model complexity
    "cat__model__depth": [6],  # Default 6

    # L2 regularization on leaf weights
    "cat__model__l2_leaf_reg": [3],  # Default 3.0

    # Minimum number of training samples per leaf
    "cat__model__min_data_in_leaf": [1],  # Default 1

    # Bayesian bootstrap intensity (like subsample)
    "cat__model__bagging_temperature": [1],  # Default 1.0, higher = more aggressive

    # Amount of randomness for scoring splits
    "cat__model__random_strength": [1],  # Default 1.0
}

cat_grid = GridSearchCV(
    estimator=cat_full_pipeline,
    param_grid=cat_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=2,
)

print("=" * 60)
print("Optimizing CatBoost Branch")
print("=" * 60)

cat_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest CatBoost R²: {cat_grid.best_score_:.4f}")
print(f"Best CatBoost Params: {cat_grid.best_params_}")

# Best CatBoost R²: 0.6490
# [
#   (50, 45),
#   (125, 110),
#   (200, 170),
#   (275, 230),
#   (350, 290),
#   (425, 350),
#   (500, 410),
# ]
# (200, 170)

# Best CatBoost R²: 0.6552
# [
#     (200, 171),
#     (200, 172),
#     (200, 173),
#     (200, 174),
#     (200, 175),
#     (200, 176),
#     (200, 177),
#     (200, 178),
#     (200, 179),
# ],
# (200, 179),

# Best CatBoost R²: 0.6548
# [
#     (200, 180),
#     (200, 181),
#     (200, 182),
# ],

Optimizing CatBoost Branch
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 180 features (from 200)
[SpearmanRFSelector] FINAL: 180 features

Best CatBoost R²: 0.6548
Best CatBoost Params: {'cat__feature_selector__feature_tops': (200, 180), 'cat__feature_selector__rf_max_depth': None, 'cat__feature_selector__rf_max_features': 'sqrt', 'cat__feature_selector__rf_min_samples_leaf': 1, 'cat__feature_selector__rf_n_estimators': 1000, 'cat__model__bagging_temperature': 1, 'cat__model__depth': 6, 'cat__model__iterations': 1000, 'cat__model__l2_leaf_reg': 3, 'cat__model__learning_rate': 0.03, 'cat__model__min_data_in_leaf': 1, 'cat__model__random_strength': 1, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}


In [152]:
imputation_pipeline = build_imputation_pipeline()

etr_branch = build_etr_branch(random_state=42)

etr_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("etr", etr_branch),
])

etr_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],

    "etr__feature_selector__feature_tops": [
        (50, 45),
        (125, 110),
        (200, 170),
        (275, 230),
        (350, 290),
        (425, 350),
        (500, 410),
    ],
    "etr__feature_selector__rf_n_estimators": [1000],
    "etr__feature_selector__rf_max_depth": [None],
    "etr__feature_selector__rf_min_samples_leaf": [1],
    "etr__feature_selector__rf_max_features": ["sqrt"],

    # ===== ExtraTrees Model Params (DEFAULTS) =====
    # MUST-TUNE parameters:

    # Number of trees
    "etr__model__n_estimators": [1000],  # Default 100, but more is often better

    # Tree depth - None means unlimited
    "etr__model__max_depth": [None],  # Default None

    # Minimum samples required to be at a leaf node
    "etr__model__min_samples_leaf": [2],  # Default 1, higher = more regularization

    # Number of features to consider when looking for best split
    "etr__model__max_features": ["sqrt"],  # Default "sqrt" or 1.0

    # Minimum samples required to split an internal node
    "etr__model__min_samples_split": [2],  # Default 2
}

etr_grid = GridSearchCV(
    estimator=etr_full_pipeline,
    param_grid=etr_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=2,
    refit=True,
)

print("=" * 60)
print("Optimizing ExtraTrees Branch")
print("=" * 60)

etr_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest ExtraTrees R²: {etr_grid.best_score_:.4f}")
print(f"Best ExtraTrees Params: {etr_grid.best_params_}")

# Best ExtraTrees R²: 0.5595
# [
#     (50, 45),
#     (125, 110),
#     (200, 170),
#     (275, 230),
#     (350, 290),
#     (425, 350),
#     (500, 410),
# ],

Optimizing ExtraTrees Branch
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[SpearmanRFSelector] Spearman: Selected 125 features (from 828)
[SpearmanRFSelector] RF: Selected 110 features (from 125)
[SpearmanRFSelector] FINAL: 110 features

Best ExtraTrees R²: 0.5595
Best ExtraTrees Params: {'etr__feature_selector__feature_tops': (125, 110), 'etr__feature_selector__rf_max_depth': None, 'etr__feature_selector__rf_max_features': 'sqrt', 'etr__feature_selector__rf_min_samples_leaf': 1, 'etr__feature_selector__rf_n_estimators': 1000, 'etr__model__max_depth': None, 'etr__model__max_features': 'sqrt', 'etr__model__min_samples_leaf': 2, 'etr__model__min_samples_split': 2, 'etr__model__n_estimators': 1000, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}


In [157]:
gpr_kernel = (kernels.ConstantKernel(1.0, (1e-3, 1e3))
              * kernels.RationalQuadratic(length_scale=1.0, alpha=1.0)
              + kernels.WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-6, 1e1)))

# Build GaussianProcess pipeline
imputation_pipeline = build_imputation_pipeline()

gpr_branch = build_gpr_branch(random_state=42)

gpr_full_pipeline = Pipeline([
    ("imputation", imputation_pipeline),
    ("gpr", gpr_branch),
])

gpr_param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],
    "gpr__feature_selector__feature_tops": [
        (200, 175),
    ],
    "gpr__feature_selector__rf_n_estimators": [1000],
    "gpr__feature_selector__rf_max_depth": [None],
    "gpr__feature_selector__rf_min_samples_leaf": [1],
    "gpr__feature_selector__rf_max_features": ["sqrt"],

    # Kernel - use the custom one defined above
    "gpr__model__kernel": [gpr_kernel],

    # Noise level added to diagonal of kernel matrix (regularization)
    "gpr__model__alpha": [1e-6],  # Default 1e-10, higher = more regularization

    # Whether to normalize target values (often helps)
    "gpr__model__normalize_y": [True],  # Default False

    # Number of restarts for optimizer (more = better but slower)
    "gpr__model__n_restarts_optimizer": [2],  # Default 0, higher = more robust
}

gpr_grid = GridSearchCV(
    estimator=gpr_full_pipeline,
    param_grid=gpr_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=1,
    verbose=3,
)

print("=" * 60)
print("Optimizing GaussianProcess Branch (This will be SLOW!)")
print("=" * 60)

gpr_grid.fit(x_train_clean, y_train_clean)

print(f"\nBest GaussianProcess R²: {gpr_grid.best_score_:.4f}")
print(f"Best GaussianProcess Params: {gpr_grid.best_params_}")

Optimizing GaussianProcess Branch (This will be SLOW!)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 173 features (from 200)
[SpearmanRFSelector] FINAL: 173 features




[CV 1/5] END gpr__feature_selector__feature_tops=(200, 173), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.680 total time=  52.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 173 features (from 200)
[SpearmanRFSelector] FINAL: 173 features




[CV 2/5] END gpr__feature_selector__feature_tops=(200, 173), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.631 total time=  39.7s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 173 features (from 200)
[SpearmanRFSelector] FINAL: 173 features




[CV 3/5] END gpr__feature_selector__feature_tops=(200, 173), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.707 total time=  49.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 173 features (from 200)
[SpearmanRFSelector] FINAL: 173 features




[CV 4/5] END gpr__feature_selector__feature_tops=(200, 173), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.691 total time=  51.5s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 173 features (from 200)
[SpearmanRFSelector] FINAL: 173 features




[CV 5/5] END gpr__feature_selector__feature_tops=(200, 173), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.685 total time=  50.7s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 174 features (from 200)
[SpearmanRFSelector] FINAL: 174 features




[CV 1/5] END gpr__feature_selector__feature_tops=(200, 174), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.680 total time=  41.5s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 174 features (from 200)
[SpearmanRFSelector] FINAL: 174 features




[CV 2/5] END gpr__feature_selector__feature_tops=(200, 174), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.630 total time=  54.9s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 174 features (from 200)
[SpearmanRFSelector] FINAL: 174 features




[CV 3/5] END gpr__feature_selector__feature_tops=(200, 174), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.708 total time=  51.8s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 174 features (from 200)
[SpearmanRFSelector] FINAL: 174 features




[CV 4/5] END gpr__feature_selector__feature_tops=(200, 174), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.690 total time=  48.0s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 174 features (from 200)
[SpearmanRFSelector] FINAL: 174 features




[CV 5/5] END gpr__feature_selector__feature_tops=(200, 174), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.685 total time=  51.4s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features




[CV 1/5] END gpr__feature_selector__feature_tops=(200, 175), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.680 total time=  49.7s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features




[CV 2/5] END gpr__feature_selector__feature_tops=(200, 175), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.630 total time=  45.7s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features




[CV 3/5] END gpr__feature_selector__feature_tops=(200, 175), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.709 total time=  51.4s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features




[CV 4/5] END gpr__feature_selector__feature_tops=(200, 175), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.689 total time=  60.0s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 175 features (from 200)
[SpearmanRFSelector] FINAL: 175 features




[CV 5/5] END gpr__feature_selector__feature_tops=(200, 175), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.684 total time=  56.3s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 176 features (from 200)
[SpearmanRFSelector] FINAL: 176 features




[CV 1/5] END gpr__feature_selector__feature_tops=(200, 176), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.679 total time=  47.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 176 features (from 200)
[SpearmanRFSelector] FINAL: 176 features




[CV 2/5] END gpr__feature_selector__feature_tops=(200, 176), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.629 total time= 1.0min
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 176 features (from 200)
[SpearmanRFSelector] FINAL: 176 features




[CV 3/5] END gpr__feature_selector__feature_tops=(200, 176), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.709 total time=  51.3s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 176 features (from 200)
[SpearmanRFSelector] FINAL: 176 features




[CV 4/5] END gpr__feature_selector__feature_tops=(200, 176), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.690 total time=  51.7s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 176 features (from 200)
[SpearmanRFSelector] FINAL: 176 features




[CV 5/5] END gpr__feature_selector__feature_tops=(200, 176), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.685 total time=  50.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 177 features (from 200)
[SpearmanRFSelector] FINAL: 177 features




[CV 1/5] END gpr__feature_selector__feature_tops=(200, 177), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.676 total time=  54.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 177 features (from 200)
[SpearmanRFSelector] FINAL: 177 features




[CV 2/5] END gpr__feature_selector__feature_tops=(200, 177), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.629 total time=  58.1s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 177 features (from 200)
[SpearmanRFSelector] FINAL: 177 features




[CV 3/5] END gpr__feature_selector__feature_tops=(200, 177), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.708 total time=  46.1s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 177 features (from 200)
[SpearmanRFSelector] FINAL: 177 features




[CV 4/5] END gpr__feature_selector__feature_tops=(200, 177), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), gpr__model__n_restarts_optimizer=2, gpr__model__normalize_y=True, imputation__knn_imputer__n_neighbors=2, imputation__knn_imputer__weights=distance;, score=0.689 total time=  49.6s
[SpearmanRFSelector] Spearman: Selected 200 features (from 828)
[SpearmanRFSelector] RF: Selected 177 features (from 200)
[SpearmanRFSelector] FINAL: 177 features
[CV 5/5] END gpr__feature_selector__feature_tops=(200, 177), gpr__feature_selector__rf_max_depth=None, gpr__feature_selector__rf_max_features=sqrt, gpr__feature_selector__rf_min_samples_leaf=1, gpr__feature_selector__rf_n_estimators=1000, gpr__model__alpha=1e-06, gpr__model__kernel=1**2 *




Best GaussianProcess R²: 0.6786
Best GaussianProcess Params: {'gpr__feature_selector__feature_tops': (200, 175), 'gpr__feature_selector__rf_max_depth': None, 'gpr__feature_selector__rf_max_features': 'sqrt', 'gpr__feature_selector__rf_min_samples_leaf': 1, 'gpr__feature_selector__rf_n_estimators': 1000, 'gpr__model__alpha': 1e-06, 'gpr__model__kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=0.001), 'gpr__model__n_restarts_optimizer': 2, 'gpr__model__normalize_y': True, 'imputation__knn_imputer__n_neighbors': 2, 'imputation__knn_imputer__weights': 'distance'}


In [None]:
imputation_pipeline = build_imputation_pipeline()
feature_pipeline = build_feature_selection_pipeline()

model = build_model()

full_pipeline = Pipeline(
    [
        ("imputation", imputation_pipeline),
        ("feature_selector", feature_pipeline),
        ("model", model),
    ]
)

gpr_kernel = (kernels.ConstantKernel(1.0, (1e-3, 1e3))
              * kernels.RationalQuadratic(length_scale=1.0, alpha=1.0)
              + kernels.WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-6, 1e1)))

param_grid = {
    "imputation__knn_imputer__n_neighbors": [2],
    "imputation__knn_imputer__weights": ["distance"],
    # "imputation__iterative_imputer__estimator": [NuSVR(kernel='rbf', C=54, gamma="scale")],
    # "imputation__iterative_imputer__max_iter": [20],
    # "imputation__iterative_imputer__initial_strategy": ["median"],

    "feature_selector__feature_selector__scaler": [StandardScaler()],
    "feature_selector__feature_selector__spearman_top_k": [202],
    "feature_selector__feature_selector__rf_top_k": [173],
    "feature_selector__feature_selector__rf_n_estimators": [1000],
    "feature_selector__feature_selector__rf_max_depth": [None],
    "feature_selector__feature_selector__rf_min_samples_leaf": [1],
    "feature_selector__feature_selector__rf_max_features": ["sqrt"],

    # ===== SVR Branch =====
    "model__svr__model__nu": [0.5],
    "model__svr__model__kernel": ["rbf"],
    "model__svr__model__C": [55],
    "model__svr__model__gamma": ["auto"],

    # ===== HistGradientBoosting Branch =====
    "model__hgb__model__learning_rate": [0.1],
    "model__hgb__model__max_iter": [100],
    "model__hgb__model__max_depth": [None],
    "model__hgb__model__min_samples_leaf": [20],

    # ===== GaussianProcess Branch =====
    # "model__gpr__model__kernel": [gpr_kernel],
    # "model__gpr__model__alpha": [1e-6],
    # "model__gpr__model__normalize_y": [True],
    # "model__gpr__model__n_restarts_optimizer": [2],

    # ===== AdaBoost Branch =====
    "model__abr__model__n_estimators": [600],
    "model__abr__model__learning_rate": [0.03],
    "model__abr__model__loss": ["square"],
    "model__abr__model__estimator__max_depth": [15],
    "model__abr__model__estimator__min_samples_leaf": [5],

    # ===== ExtraTrees Branch =====
    "model__etr__model__n_estimators": [1000],
    "model__etr__model__max_depth": [None],
    "model__etr__model__min_samples_leaf": [2],
    "model__etr__model__max_features": ["sqrt"],
}

grid = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=1,
    verbose=100,
    refit=True,
)

grid.fit(x_train_clean, y_train_clean)

print(f"Best CV R² (mean across folds): {grid.best_score_:.4f}")
print("Best params:", grid.best_params_)

In [None]:
imputation_pipeline_final = build_imputation_pipeline()
feature_pipeline_final = build_feature_selection_pipeline()
model_with_gpr = build_model(include_gpr=True)

full_pipeline_with_gpr = Pipeline([
    ("imputation", imputation_pipeline_final),
    ("feature_selector", feature_pipeline_final),
    ("model", model_with_gpr),
])

best_params_with_gpr = grid.best_params_.copy()

best_params_with_gpr.update({
    "model__gpr__model__kernel": gpr_kernel,
    "model__gpr__model__alpha": 1e-6,
    "model__gpr__model__normalize_y": True,
    "model__gpr__model__n_restarts_optimizer": 2,
})

full_pipeline_with_gpr.set_params(**best_params_with_gpr)
full_pipeline_with_gpr.fit(x_train_clean, y_train_clean)

y_pred = full_pipeline_with_gpr.predict(x_test_clean)

table = pd.DataFrame({'id': np.arange(0, y_pred.shape[0]), 'y': y_pred.flatten()})
table.to_csv('submission.csv', index=False)
print("Submission saved!")