In [11]:
from pathlib import Path

import joblib
import os
import numpy as np
import pandas as pd
import sklearn.gaussian_process.kernels as kernels

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import BayesianRidge
from sklearn.svm import NuSVR
from joblib import Memory
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

In [12]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [13]:
BASE_PATH = Path("/content/gdrive/MyDrive/data")
x_train = pd.read_csv(BASE_PATH / 'X_train.csv', skiprows=1, header=None).values[:, 1:]
x_test = pd.read_csv(BASE_PATH / 'X_test.csv', skiprows=1, header=None).values[:, 1:]
y_train = pd.read_csv(BASE_PATH / 'y_train.csv', skiprows=1, header=None).values[:, 1:].flatten()

In [14]:
def remove_outliers(x_train, y_train, contamination=0.047, random_state=42):

    x_train_df = pd.DataFrame(x_train).copy()
    y_train_series = pd.Series(np.asarray(y_train).flatten(), index=x_train_df.index)

    med = x_train_df.median(axis=0)
    xtr_imp = x_train_df.fillna(med)

    scaler = RobustScaler()
    xtr_std = scaler.fit_transform(xtr_imp)

    pca = PCA(n_components=2, random_state=random_state)
    x_proj = pca.fit_transform(xtr_std)

    iso = IsolationForest(contamination=contamination, random_state=random_state)
    mask = iso.fit_predict(x_proj) == 1

    outlier_pos = np.where(~mask)[0]
    outlier_idx = x_train_df.index[~mask]
    n_outliers = outlier_pos.size
    print(f"[OutlierRemoval] Removed {n_outliers} outliers")
    print(f"[Outlier positions] {outlier_pos.tolist()}")

    if isinstance(x_train, np.ndarray):
        return x_train[mask], y_train[mask]
    else:
        return x_train_df[mask], y_train_series[mask]

In [15]:
class NaNAwareScaler(BaseEstimator, TransformerMixin):
    """
    Scaler that preserves NaN positions.
    1. Remember NaN positions
    2. Fill NaNs with median temporarily
    3. Scale the data
    4. Restore NaNs at original positions
    """

    def fit(self, x, y=None):
        x_df = pd.DataFrame(x)

        self.median_ = x_df.median(axis=0)

        x_filled = x_df.fillna(self.median_)

        self.scaler_ = StandardScaler()
        self.scaler_.fit(x_filled)

        return self

    def transform(self, x):
        x_df = pd.DataFrame(x)

        nan_mask = x_df.isna()

        x_filled = x_df.fillna(self.median_)

        x_scaled = self.scaler_.transform(x_filled)
        x_scaled_df = pd.DataFrame(x_scaled, index=x_df.index, columns=x_df.columns)

        x_scaled_df[nan_mask] = np.nan

        return x_scaled_df.values

In [16]:
class SpearmanSelector(BaseEstimator, TransformerMixin):
    """Select top K features by Spearman correlation with target."""

    def __init__(self, top_k=200):
        self.top_k = top_k

    def fit(self, x, y):
        if y is None:
            raise ValueError("SpearmanSelector requires y")

        x_df = pd.DataFrame(x) if not isinstance(x, pd.DataFrame) else x
        y_series = pd.Series(np.asarray(y).flatten(), index=x_df.index)

        spearman_corr = x_df.corrwith(y_series, method='spearman').abs()

        n_keep = min(self.top_k, len(spearman_corr))
        top_features = spearman_corr.nlargest(n_keep).index

        self.selected_features_ = top_features.tolist()
        self.n_features_in_ = x_df.shape[1]

        print(f"[SpearmanSelector] Selected {len(self.selected_features_)} features (from {self.n_features_in_})")

        return self

    def transform(self, X):
        x_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
        return x_df.iloc[:, self.selected_features_].values


class RFSelector(BaseEstimator, TransformerMixin):
    """Select top K features by RandomForest feature importance."""

    def __init__(self, top_k=200, n_estimators=1000, max_depth=None,
                 min_samples_leaf=1, max_features='sqrt', random_state=42):
        self.top_k = top_k
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, x, y):

        if y is None:
            raise ValueError("RFSelector requires y")

        x_df = pd.DataFrame(x) if not isinstance(x, pd.DataFrame) else x
        y_series = pd.Series(np.asarray(y).flatten())

        self.rf_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            random_state=self.random_state,
            n_jobs=-1,
            verbose=0
        )
        self.rf_.fit(x_df, y_series)

        importances = pd.Series(self.rf_.feature_importances_, index=x_df.columns)
        importances = importances.sort_values(ascending=False)

        n_keep = min(self.top_k, len(importances)) if self.top_k else len(importances)
        self.selected_features_ = importances.head(n_keep).index.tolist()
        self.n_features_in_ = x_df.shape[1]

        print(f"[RFSelector] Selected {len(self.selected_features_)} features (from {self.n_features_in_})")

        return self

    def transform(self, x):
        x_df = pd.DataFrame(x) if not isinstance(x, pd.DataFrame) else x
        return x_df.iloc[:, self.selected_features_].values


In [17]:
def build_feature_pipeline(random_state=42):
    pipeline = Pipeline(
        [
            ("variance", VarianceThreshold()),
            ("scaler", NaNAwareScaler()),
            # ("knn_imputer", KNNImputer()),
            ("iterative_imputer", IterativeImputer()),
            ("spearman", SpearmanSelector()),
            ("rf_selector", RFSelector(random_state=random_state)),
        ]
    )
    return pipeline

In [18]:
def build_model(random_state=42):

    kernel = kernels.ConstantKernel(1.0, (1e-3, 1e3)) \
        * kernels.RationalQuadratic(length_scale=1.0, alpha=1.0) \
        + kernels.WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-6, 1e1))

    gpr = GaussianProcessRegressor(
        kernel=kernel,
        random_state=random_state
    )

    model = StackingRegressor(
        estimators=[
            ("svr", NuSVR()),
            ("hgb", HistGradientBoostingRegressor(random_state=random_state)),
            ("gp", gpr)
        ],
        final_estimator=LinearRegression()
    )

    return model

In [19]:
x_train_clean, y_train_clean = remove_outliers(
    x_train, y_train, contamination=0.047, random_state=42
)

[OutlierRemoval] Removed 57 outliers
[Outlier positions] [3, 10, 27, 60, 114, 156, 167, 203, 207, 213, 232, 329, 346, 380, 382, 423, 441, 442, 446, 459, 537, 575, 579, 611, 627, 674, 676, 681, 690, 694, 697, 733, 740, 763, 769, 805, 899, 906, 915, 933, 935, 951, 983, 1015, 1032, 1070, 1071, 1087, 1090, 1097, 1103, 1127, 1136, 1137, 1144, 1152, 1196]


In [20]:
feature_pipeline = build_feature_pipeline()
model = build_model()

full_pipeline = Pipeline(
    [
        ("features", feature_pipeline),
        ("model", model),
    ]
)

param_grid = {
    # "features__knn_imputer__n_neighbors": [1],
    # "features__knn_imputer__weights": ["distance"],

    "features__iterative_imputer__estimator": [NuSVR(kernel='rbf', C=54, gamma="scale")],
    "features__iterative_imputer__max_iter": [20],
    "features__iterative_imputer__initial_strategy": ["median"],

    "features__spearman__top_k": [202],
    "features__rf_selector__top_k": [173],
    "features__rf_selector__n_estimators": [1000],
    "features__rf_selector__max_depth": [None],
    "features__rf_selector__min_samples_leaf": [1],
    "features__rf_selector__max_features": ["sqrt"],

    # Model params - NuSVR
    "model__svr__C": [55],
    "model__svr__gamma": ["scale"],

    # Model params - HistGradientBoosting
    # "model__hgb__learning_rate": [0.1],
    # "model__hgb__max_iter": [100],
    # "model__hgb__max_depth": [None],
    # "model__hgb__min_samples_leaf": [20],

    # Model params - GaussianProcess
    "model__gp__alpha": [0.0],
    "model__gp__normalize_y": [True],
    "model__gp__n_restarts_optimizer": [8],
}

grid = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=100,
    refit=True,
)

grid.fit(x_train_clean, y_train_clean)

print(f"Best CV R² (mean across folds): {grid.best_score_:.4f}")
print("Best params:", grid.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}
Detailed tracebacks of the workers should have been printed to stderr in the executor process if faulthandler was not disabled.

In [None]:
y_test_pred = grid.predict(x_train_clean)