In [None]:
import warnings
from functools import partial
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import optuna
import polars as pl
import polars.selectors as cs

from catboost import CatBoostRegressor, MultiTargetCustomMetric
from xgboost import XGBRegressor

from numpy.typing import ArrayLike, NDArray
from polars.testing import assert_frame_equal
from sklearn.base import BaseEstimator
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

# Suppress the noisy "Failed to optimize method" warnings for the whole notebook.
warnings.filterwarnings(action="ignore", message="Failed to optimize method")


# Kaggle location of the competition data (kept for reference):
#DATA_DIR = Path("/kaggle/input/child-mind-institute-problematic-internet-use")
DATA_DIR = Path("./data")

# Regression targets: the 20 individual PCIAT items, followed by the PCIAT
# total and the `sii` label. Order matters: downstream code looks columns up
# by position via TARGET_COLS.index(...).
TARGET_COLS = [f"PCIAT-PCIAT_{item:02d}" for item in range(1, 21)]
TARGET_COLS += ["PCIAT-PCIAT_Total", "sii"]

# Hand-picked input features fed to every model in this notebook.
FEATURE_COLS = [
    "Basic_Demos-Age",
    "PreInt_EduHx-computerinternet_hoursday",
    "SDS-SDS_Total_Raw",
    "Physical-Height",
    "FGC-FGC_TL",
    "Physical-Waist_Circumference",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "Fitness_Endurance-Time_Sec",
    "Fitness_Endurance-Max_Stage",
    "FGC-FGC_CU_Zone",
]

In [None]:
# Load data: read both competition CSVs and stack them into one frame.
# A "diagonal" concat is used, so columns missing from one file are null-filled.
train = pl.read_csv(DATA_DIR / "train.csv")
test = pl.read_csv(DATA_DIR / "test.csv")
train_test = pl.concat([train, test], how="diagonal")

IS_TEST = test.height <= 100

# Sanity check: the stacked frame is `train` on top and `test` at the bottom,
# and restricting each part to its own columns reproduces the original frame.
assert_frame_equal(train, train_test.head(train.height).select(train.columns))
assert_frame_equal(test, train_test.tail(test.height).select(test.columns))

In [None]:
# Cast string columns to categorical (on the stacked frame, so train and test
# go through the same cast), then split back into the two parts.
train_test = train_test.with_columns(cs.string().cast(pl.Categorical).fill_null("NAN"))
train = train_test[: train.height]
test = train_test[train.height :]

# Ignore rows with null values in TARGET_COLS.
# Drop from `train` only: dropping from `train_test` excluded the test rows
# merely because their target columns happen to be null after the diagonal
# concat, which would silently leak test rows into training otherwise.
train_without_null = train.drop_nulls(subset=TARGET_COLS)
X = train_without_null.select(FEATURE_COLS)
X_test = test.select(FEATURE_COLS)
y = train_without_null.select(TARGET_COLS)
y_sii = y.get_column("sii").to_numpy()  # ground truth
cat_features = X.select(cs.categorical()).columns

# The list printed below is the categorical subset of FEATURE_COLS.
print("Categorical features selected:")
print(cat_features)  # Should be none

# Tubotubo's Quadratic Weighted Kappa metric & Optuna optimizer

In [None]:
class MultiTargetQWK(MultiTargetCustomMetric):
    """CatBoost multi-target custom metric: Quadratic Weighted Kappa on the last target.

    Only the last target row is scored; its predictions are clipped to
    [0, 3] and rounded to integers before being compared to the true values.
    """

    def get_custom_metric_name(self):
        """Return the name of this metric."""
        return "MultiTargetQWK"

    def is_max_optimal(self):
        """Return True: larger values of this metric are better."""
        return True

    def evaluate(self, approxes, targets, weight):
        """Compute QWK between the last target and its rounded prediction.

        Args:
            approxes: predictions (shape: [number of targets, number of samples]).
            targets: true values (shape: [number of targets, number of samples]).
            weight: per-sample weights (may be None); not used here.

        Returns:
            A ``(qwk, 1)`` tuple: the score and a constant weight of 1.
        """
        last_approx = approxes[-1]
        predicted_labels = np.round(np.clip(last_approx, 0, 3)).astype(int)
        true_labels = targets[-1]

        score = cohen_kappa_score(true_labels, predicted_labels, weights="quadratic")
        return score, 1

    def get_final_error(self, error, weight):
        """Return the summed error; it is intentionally not divided by the weight."""
        total = np.sum(error)
        return total


class OptimizedRounder:
    """
    Learns thresholds that turn continuous predictions into discrete class labels.

    Optuna searches for `n_classes - 1` increasing thresholds that maximize the
    Quadratic Weighted Kappa between the thresholded predictions and the ground
    truth. Predictions are first rescaled to [0, `n_classes - 1`] using the
    minimum and maximum seen during `fit()`; `predict()` reuses that same range
    so that the learned thresholds mean the same thing on new data.

    Args:
        n_classes (int): The number of discrete class labels.
        n_trials (int, optional): The number of trials for the Optuna optimization. Defaults to 100.
        seed (int, optional): Seed for the Optuna sampler. Defaults to None (non-deterministic search).

    Attributes:
        n_classes (int): The number of discrete class labels.
        labels (NDArray[np.int_]): An array of class labels from 0 to `n_classes - 1`.
        n_trials (int): The number of optimization trials.
        seed (int | None): Seed passed to the Optuna sampler.
        metric (Callable): The Quadratic Weighted Kappa score metric used for optimization.
        thresholds (List[float]): The optimized thresholds learned after calling `fit()`.
        y_min_ (float): Minimum of the predictions passed to `fit()`.
        y_max_ (float): Maximum of the predictions passed to `fit()`.

    Methods:
        fit(y_pred, y_true) -> None:
            Fits the rounding thresholds based on continuous predictions and ground truth labels.

        predict(y_pred) -> NDArray[np.int_]:
            Predicts discrete class labels by thresholding continuous predictions.
            `fit()` must be called before `predict()`.

        _normalize(y) -> NDArray[np.float64]:
            Rescales continuous values with the range remembered by `fit()`.

    References:
        - This implementation uses Optuna for threshold optimization.
        - Quadratic Weighted Kappa is used as the evaluation metric.
    """

    def __init__(self, n_classes: int, n_trials: int = 100, seed: "int | None" = None):
        self.n_classes = n_classes
        self.labels = np.arange(n_classes)
        self.n_trials = n_trials
        self.seed = seed
        self.metric = partial(cohen_kappa_score, weights="quadratic")

    def fit(self, y_pred: NDArray[np.float64], y_true: NDArray[np.int_]) -> None:
        """Learn the thresholds from continuous predictions and true labels.

        Args:
            y_pred: Continuous predictions that need to be rounded.
            y_true: Ground truth class labels.
        """
        y_pred = np.asarray(y_pred, dtype=float)
        # Remember the scale of the fitting data; predict() must reuse it.
        self.y_min_ = float(y_pred.min())
        self.y_max_ = float(y_pred.max())
        y_pred = self._normalize(y_pred)

        lowest = float(self.labels.min())
        highest = float(self.labels.max())

        def objective(trial: optuna.Trial) -> float:
            thresholds: list = []
            for i in range(self.n_classes - 1):
                # Each threshold is sampled at or above the previous one, so the
                # list is non-decreasing and its last element is its maximum.
                low = thresholds[-1] if thresholds else lowest
                thresholds.append(trial.suggest_float(f"threshold_{i}", low, highest))
            try:
                y_pred_rounded = np.digitize(y_pred, thresholds)
            except ValueError:
                # Defensive: np.digitize rejects non-monotonic bins.
                return -100
            return self.metric(y_true, y_pred_rounded)

        optuna.logging.disable_default_handler()
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=self.seed),
        )
        study.optimize(
            objective,
            n_trials=self.n_trials,
        )
        self.thresholds = [study.best_params[f"threshold_{i}"] for i in range(self.n_classes - 1)]

    def predict(self, y_pred: NDArray[np.float64]) -> NDArray[np.int_]:
        """Threshold continuous predictions into class labels.

        Args:
            y_pred: Continuous predictions to be rounded.

        Returns:
            Predicted class labels.

        Raises:
            AssertionError: If `fit()` has not been called.
        """
        assert hasattr(self, "thresholds"), "fit() must be called before predict()"
        y_pred = self._normalize(y_pred)
        return np.digitize(y_pred, self.thresholds)

    def _normalize(self, y: NDArray[np.float64]) -> NDArray[np.float64]:
        """Rescale `y` so the fitting range maps onto [0, n_classes - 1].

        Uses the min/max stored by `fit()`. Values outside that range map
        outside [0, n_classes - 1], which `np.digitize` assigns to the first or
        last class. If no range has been stored (thresholds were set by hand),
        the range of `y` itself is used.
        """
        y = np.asarray(y, dtype=float)
        y_min = getattr(self, "y_min_", None)
        y_max = getattr(self, "y_max_", None)
        if y_min is None or y_max is None:
            y_min, y_max = y.min(), y.max()
        span = y_max - y_min
        if span == 0:
            # Constant predictions carry no ordering information; avoid 0/0.
            return np.zeros_like(y)
        return (y - y_min) / span * (self.n_classes - 1)

# Start making models!

In [None]:
from sklearn.feature_selection import RFE

# Updated SILLY MODEL with RFE
class SillyManRFE():
    """CatBoost regressor trained on a feature subset chosen by RFE.

    Recursive feature elimination (one feature removed per step) is run with a
    CatBoost estimator, then a second CatBoost model is fitted on the surviving
    columns only.

    Parameters:
    - n_features_to_select (int): Number of features RFE keeps.
    - estimator_params (dict | None): CatBoost keyword arguments for both the
      RFE estimator and the final model. Defaults to the notebook-level
      `params` dict, which must be defined before instantiation.
    """

    def __init__(self, n_features_to_select=10, estimator_params=None):
        if estimator_params is None:
            # Resolved at call time, so the cell defining `params` must have run.
            estimator_params = params
        self.selector = RFE(
            estimator=CatBoostRegressor(**estimator_params),
            n_features_to_select=n_features_to_select,
            step=1,
        )
        self.cat = CatBoostRegressor(**estimator_params)

    def fit(self, x, y, eval_set, cat_features, verbose=False):
        """Select features with RFE, then fit CatBoost on the selected columns.

        Parameters:
        - x (pd.DataFrame): Training features (column names are required).
        - y: Training target(s).
        - eval_set (tuple): Validation set as (X_val, y_val).
        - cat_features (list | None): Names of categorical columns in `x`.
        - verbose (bool): Verbosity flag forwarded to CatBoost.
        """
        self.cat_features = cat_features
        cat_names = set(cat_features) if cat_features else set()

        # Apply RFE for feature selection
        X_selected = self.selector.fit_transform(x, y)

        # Names of the columns that survived, in the order they appear in X_selected
        selected_features = np.array(x.columns)[self.selector.support_]

        # Categorical positions must index the *reduced* matrix handed to
        # CatBoost, not the original frame.
        new_cat_features = [i for i, col in enumerate(selected_features) if col in cat_names]

        # Prepare evaluation set
        X_val_selected = self.selector.transform(eval_set[0])

        # Fit CatBoost
        self.cat.fit(
            X_selected,
            y,
            eval_set=(X_val_selected, eval_set[1]),
            cat_features=new_cat_features,
            verbose=verbose
        )

    def predict(self, x):
        """Predict with the final model on the RFE-selected columns of `x`."""
        X_selected = self.selector.transform(x)
        return self.cat.predict(X_selected)


In [None]:
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
import numpy as np
import pandas as pd

# CatBoost hyper-parameters shared by the models that read the global `params`.
params = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",  # Ensure eval_metric is appropriate
    "iterations": 7000,  # Adjust as needed
    "learning_rate": 0.05,
    "depth": 5,
    "early_stopping_rounds": 50,
}

# Updated SILLY MODEL
class SillyManPolynomial:
    """CatBoost regressor trained on imputed and optionally expanded numerical features.

    Numerical columns are imputed, then expanded into any combination of
    spline, degree-2 polynomial and raw (imputed) feature sets. Categorical
    columns, if any, are appended unchanged at the end.
    """

    def __init__(
        self,
        include_bias=False,
        imputation_strategy='mean',
        use_spline=True,
        use_poly=False,
        use_raw=False
    ):
        """
        Initializes the SillyManPolynomial model with options to include Spline, Polynomial, and/or raw features.

        Parameters:
        - include_bias (bool): Whether the spline/polynomial expansions include a bias column.
        - imputation_strategy (str): Strategy for imputing missing values ('mean', 'median', 'most_frequent', etc.).
        - use_spline (bool): Whether to include Spline transformed features (5 knots, degree 3).
        - use_poly (bool): Whether to include Polynomial transformed features (degree 2).
        - use_raw (bool): Whether to include raw (imputed) numerical features.
        """
        self.use_spline = use_spline
        self.use_poly = use_poly
        self.use_raw = use_raw

        self.imputer = SimpleImputer(strategy=imputation_strategy)
        if self.use_poly:
            self.poly = PolynomialFeatures(degree=2, include_bias=include_bias)
        if self.use_spline:
            self.spline = SplineTransformer(n_knots=5, degree=3, include_bias=include_bias)
        self.cat = CatBoostRegressor(**params)

    def _split_columns(self, x):
        """Split `x` into (numerical frame, categorical frame or None)."""
        if self.cat_features:
            return x.drop(columns=self.cat_features), x[self.cat_features]
        return x.copy(), None

    def _build_features(self, x, fit):
        """Build the model's input frame from `x`.

        With `fit=True` the imputer and the enabled transformers are fitted on
        `x`; with `fit=False` the already-fitted ones are applied. Column names
        are prefixed per feature set ('spline_', 'poly_', 'raw_') to avoid
        duplicates, and categorical columns are appended last.

        Raises:
            ValueError: If none of use_spline, use_poly, use_raw is enabled.
        """
        numerical, categorical = self._split_columns(x)
        imputed = self.imputer.fit_transform(numerical) if fit else self.imputer.transform(numerical)

        feature_sets = []

        if self.use_spline:
            values = self.spline.fit_transform(imputed) if fit else self.spline.transform(imputed)
            names = self.spline.get_feature_names_out(input_features=numerical.columns)
            feature_sets.append(pd.DataFrame(values, columns=['spline_' + name for name in names]))

        if self.use_poly:
            values = self.poly.fit_transform(imputed) if fit else self.poly.transform(imputed)
            names = self.poly.get_feature_names_out(input_features=numerical.columns)
            feature_sets.append(pd.DataFrame(values, columns=['poly_' + name for name in names]))

        if self.use_raw:
            raw_names = ['raw_' + name for name in numerical.columns]
            feature_sets.append(pd.DataFrame(imputed, columns=raw_names))

        if not feature_sets:
            raise ValueError("At least one of use_spline, use_poly, or use_raw must be True.")

        processed = pd.concat(feature_sets, axis=1).reset_index(drop=True)

        if categorical is not None:
            # Reset the index so rows line up positionally with the transformed frames.
            processed = pd.concat([processed, categorical.reset_index(drop=True)], axis=1)

        return processed

    def fit(self, x, y, eval_set, cat_features, verbose=False):
        """
        Fits the model with selected feature transformations and imputation.

        Parameters:
        - x (pd.DataFrame): Training features.
        - y (pd.Series or pd.DataFrame): Training target.
        - eval_set (tuple): Validation set as (X_val, y_val).
        - cat_features (list): List of categorical feature names.
        - verbose (bool): Verbosity flag.
        """
        self.cat_features = cat_features

        X_train_processed = self._build_features(x, fit=True)
        X_val_processed = self._build_features(eval_set[0], fit=False)

        # Categorical columns were appended last, so their indices are the
        # trailing positions of the processed frame (empty when there are none).
        n_cat = len(self.cat_features) if self.cat_features else 0
        n_cols = X_train_processed.shape[1]
        new_cat_features = list(range(n_cols - n_cat, n_cols))

        # Fit CatBoost
        self.cat.fit(
            X_train_processed,
            y,
            eval_set=(X_val_processed, eval_set[1]),
            cat_features=new_cat_features,
            verbose=verbose
        )

    def predict(self, x):
        """
        Predicts using the fitted model.

        Parameters:
        - x (pd.DataFrame): Features for prediction.

        Returns:
        - np.ndarray: Predictions.
        """
        return self.cat.predict(self._build_features(x, fit=False))



In [None]:
from sklearn.preprocessing import (
    PolynomialFeatures,
    SplineTransformer,
    StandardScaler,
    MinMaxScaler,
    QuantileTransformer
)
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

# Hyper-parameters for the CatBoost estimators built by SillyManCombinational.
catboost_params = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",
    "iterations": 7000,
    "learning_rate": 0.05,
    "depth": 5,
    "early_stopping_rounds": 50,
}

# Hyper-parameters for the XGBoost estimators built by SillyManCombinational.
xgboost_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "n_estimators": 7000,
    "learning_rate": 0.05,
    "max_depth": 5,
    "early_stopping_rounds": 50,
    "tree_method": 'hist',  # or 'auto' depending on your setup
}

class SillyManCombinational:
    """Two-level stacked regressor over several views of the numerical features.

    Numerical columns are imputed once. For every enabled view (spline,
    polynomial, raw, standard-scaled, min-max-scaled, quantile-transformed) and
    every base estimator type, one base model is trained. The base models'
    predictions are concatenated and used as the input of the final
    estimator(s), whose predictions are averaged.
    """

    # Estimator type names accepted in `base_estimators` / `final_estimators`.
    _SUPPORTED_ESTIMATORS = ('catboost', 'xgboost')

    def __init__(
        self,
        include_bias=False,
        imputation_strategy='mean',
        use_spline=True,
        use_poly=False,
        use_raw=False,
        use_standard_scaler=False,
        use_minmax_scaler=False,
        use_quantile_transformer=False,
        base_estimators=('catboost',),  # options: 'catboost', 'xgboost'
        final_estimators=('catboost',),  # options: 'catboost', 'xgboost'
    ):
        """
        Initializes the SillyManCombinational model with options to include various transformers and estimators.

        Parameters:
        - include_bias (bool): Whether the spline/polynomial expansions include a bias column.
        - imputation_strategy (str): Strategy for imputing missing values.
        - use_spline (bool): Whether to include Spline transformed features.
        - use_poly (bool): Whether to include Polynomial transformed features.
        - use_raw (bool): Whether to include raw (imputed) numerical features.
        - use_standard_scaler (bool): Whether to include StandardScaler transformed features.
        - use_minmax_scaler (bool): Whether to include MinMaxScaler transformed features.
        - use_quantile_transformer (bool): Whether to include QuantileTransformer transformed features.
        - base_estimators (sequence of str): Estimator types for the base models ('catboost', 'xgboost').
        - final_estimators (sequence of str): Estimator types for the final model(s) ('catboost', 'xgboost').
        """
        # Transformer toggles
        self.use_spline = use_spline
        self.use_poly = use_poly
        self.use_raw = use_raw
        self.use_standard_scaler = use_standard_scaler
        self.use_minmax_scaler = use_minmax_scaler
        self.use_quantile_transformer = use_quantile_transformer

        # Estimator types (copied so the instance never shares a caller's or a default list)
        self.base_estimators_types = list(base_estimators)
        self.final_estimators_types = list(final_estimators)

        self.imputer = SimpleImputer(strategy=imputation_strategy)
        if self.use_poly:
            self.poly = PolynomialFeatures(degree=2, include_bias=include_bias)
        if self.use_spline:
            self.spline = SplineTransformer(n_knots=5, degree=3, include_bias=include_bias)
        if self.use_standard_scaler:
            self.standard_scaler = StandardScaler()
        if self.use_minmax_scaler:
            self.minmax_scaler = MinMaxScaler()
        if self.use_quantile_transformer:
            self.quantile_transformer = QuantileTransformer(output_distribution='normal')

        # Fitted base estimators, keyed by '<view>_<estimator type>'
        self.base_estimators = {}
        # Fitted final estimator(s), keyed by estimator type
        self.final_estimators = {}

    def _active_transformers(self):
        """Return the enabled views as an ordered {name: transformer} dict (None means raw data)."""
        transformers = {}
        if self.use_spline:
            transformers['spline'] = self.spline
        if self.use_poly:
            transformers['poly'] = self.poly
        if self.use_raw:
            transformers['raw'] = None  # raw data, no transformer
        if self.use_standard_scaler:
            transformers['standard_scaler'] = self.standard_scaler
        if self.use_minmax_scaler:
            transformers['minmax_scaler'] = self.minmax_scaler
        if self.use_quantile_transformer:
            transformers['quantile_transformer'] = self.quantile_transformer
        return transformers

    def _split_columns(self, x):
        """Split `x` into (numerical frame, categorical frame or None)."""
        if self.cat_features:
            return x.drop(columns=self.cat_features), x[self.cat_features]
        return x.copy(), None

    def _transformed_frame(self, name, transformer, imputed, numerical_columns, categorical, fit):
        """Build one view's feature frame, with categorical columns appended last.

        With `fit=True` the transformer is fitted on `imputed`; otherwise the
        already-fitted transformer is applied. Column names are prefixed with
        the view name.
        """
        if transformer is not None:
            values = transformer.fit_transform(imputed) if fit else transformer.transform(imputed)
            feature_names = transformer.get_feature_names_out(input_features=numerical_columns)
            feature_names = [f'{name}_{feat}' for feat in feature_names]
        else:
            # Raw data
            values = imputed
            feature_names = [f'raw_{feat}' for feat in numerical_columns]

        frame = pd.DataFrame(values, columns=feature_names)
        if categorical is not None:
            # Reset indices so the rows line up positionally.
            frame = pd.concat(
                [frame.reset_index(drop=True), categorical.reset_index(drop=True)], axis=1
            )
        return frame

    @staticmethod
    def _prediction_frame(est_name, pred):
        """Wrap an estimator's predictions in a DataFrame with one column per output."""
        if pred.ndim == 1:
            pred = pred.reshape(-1, 1)
        n_outputs = pred.shape[1]
        if n_outputs == 1:
            columns = [est_name]
        else:
            columns = [f'{est_name}_output_{i}' for i in range(n_outputs)]
        return pd.DataFrame(pred, columns=columns)

    def _fit_estimator(self, est_type, X_train, y, X_val, y_val, verbose, cat_features=None):
        """Create and fit one estimator of the given type, using (X_val, y_val) as eval set.

        `cat_features` is forwarded to CatBoost only when given; it is not
        passed to XGBoost.

        Raises:
            ValueError: If `est_type` is not a supported estimator type.
        """
        if est_type == 'catboost':
            estimator = CatBoostRegressor(**catboost_params)
            fit_kwargs = {'eval_set': (X_val, y_val), 'verbose': verbose}
            if cat_features is not None:
                fit_kwargs['cat_features'] = cat_features
            estimator.fit(X_train, y, **fit_kwargs)
        elif est_type == 'xgboost':
            estimator = XGBRegressor(**xgboost_params)
            estimator.fit(X_train, y, eval_set=[(X_val, y_val)], verbose=verbose)
        else:
            raise ValueError(f"Unsupported estimator type: {est_type}")
        return estimator

    def fit(self, x, y, eval_set, cat_features, verbose=False):
        """
        Fits the base estimators on every enabled view, then the final estimator(s) on their predictions.

        Parameters:
        - x (pd.DataFrame): Training features.
        - y (pd.Series or pd.DataFrame): Training target.
        - eval_set (tuple): Validation set as (X_val, y_val).
        - cat_features (list): List of categorical feature names.
        - verbose (bool): Verbosity flag.

        Raises:
        - ValueError: If an estimator type is unsupported, if no base or final
          estimator is configured, or if no transformer is selected.
        """
        self.cat_features = cat_features

        # Validate the configuration before any (expensive) training happens.
        for est_type in [*self.base_estimators_types, *self.final_estimators_types]:
            if est_type not in self._SUPPORTED_ESTIMATORS:
                raise ValueError(f"Unsupported estimator type: {est_type}")
        if not self.base_estimators_types or not self.final_estimators_types:
            raise ValueError("At least one base estimator and one final estimator must be selected.")

        transformers = self._active_transformers()
        if not transformers:
            raise ValueError("At least one transformer must be selected.")

        # Separate numerical and categorical features
        numerical, categorical = self._split_columns(x)
        val_numerical, val_categorical = self._split_columns(eval_set[0])

        # Impute missing values in numerical features
        X_train_imputed = self.imputer.fit_transform(numerical)
        X_val_imputed = self.imputer.transform(val_numerical)

        n_cat = len(self.cat_features) if self.cat_features else 0

        # Base estimator predictions, one frame per (view, estimator type)
        base_train_preds = []
        base_val_preds = []

        for name, transformer in transformers.items():
            X_train_df = self._transformed_frame(
                name, transformer, X_train_imputed, numerical.columns, categorical, fit=True
            )
            X_val_df = self._transformed_frame(
                name, transformer, X_val_imputed, numerical.columns, val_categorical, fit=False
            )

            # Categorical columns were appended last, so their indices are the
            # trailing positions of the frame (empty when there are none).
            n_cols = X_train_df.shape[1]
            new_cat_features = list(range(n_cols - n_cat, n_cols))

            for est_type in self.base_estimators_types:
                est_name = f'{name}_{est_type}'
                estimator = self._fit_estimator(
                    est_type, X_train_df, y, X_val_df, eval_set[1], verbose,
                    cat_features=new_cat_features,
                )
                self.base_estimators[est_name] = estimator

                base_train_preds.append(self._prediction_frame(est_name, estimator.predict(X_train_df)))
                base_val_preds.append(self._prediction_frame(est_name, estimator.predict(X_val_df)))

        # Combine base estimator predictions into the final estimators' inputs.
        # NOTE(review): the training meta-features are in-sample predictions of
        # the base models (and the validation rows drove their early stopping);
        # out-of-fold predictions would give a less optimistic stack - confirm
        # whether that is intended.
        X_train_meta = pd.concat(base_train_preds, axis=1).reset_index(drop=True)
        X_val_meta = pd.concat(base_val_preds, axis=1).reset_index(drop=True)

        # Train final estimator(s)
        for est_type in self.final_estimators_types:
            self.final_estimators[est_type] = self._fit_estimator(
                est_type, X_train_meta, y, X_val_meta, eval_set[1], verbose
            )

    def predict(self, x):
        """
        Predicts using the fitted model.

        Parameters:
        - x (pd.DataFrame): Features for prediction.

        Returns:
        - np.ndarray: Predictions (averaged over the final estimators when there are several).
        """
        # Separate numerical and categorical features, then impute
        numerical, categorical = self._split_columns(x)
        X_imputed = self.imputer.transform(numerical)

        # Rebuild every view exactly as in fit and collect base predictions
        base_preds = []
        for name, transformer in self._active_transformers().items():
            X_df = self._transformed_frame(
                name, transformer, X_imputed, numerical.columns, categorical, fit=False
            )
            for est_type in self.base_estimators_types:
                est_name = f'{name}_{est_type}'
                pred = self.base_estimators[est_name].predict(X_df)
                base_preds.append(self._prediction_frame(est_name, pred))

        # Combine base estimator predictions
        X_meta = pd.concat(base_preds, axis=1).reset_index(drop=True)

        # Collect final predictions
        final_preds = [
            self.final_estimators[est_type].predict(X_meta)
            for est_type in self.final_estimators_types
        ]

        # Average predictions if multiple final estimators
        if len(final_preds) == 1:
            return final_preds[0]
        return np.mean(final_preds, axis=0)


In [None]:
# Cross-validation
# 5-fold split stratified on the `sii` label; every fold trains a fresh model
# and fills its validation rows of `y_pred`, so `y_pred` ends up holding
# out-of-fold predictions for all TARGET_COLS.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=52)
models: list = []  # one fitted model per fold, reused later for test-time averaging
y_pred = np.full((X.height, len(TARGET_COLS)), fill_value=np.nan)

for train_idx, val_idx in skf.split(X, y_sii):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Initialize and train model
    model = SillyManCombinational(
        use_spline=True,
        use_poly=False,
        use_raw=False,
        use_standard_scaler=False,
        base_estimators=['catboost'],
        final_estimators=['catboost']
    )
    # The model classes work on pandas frames, so convert from polars here.
    model.fit(
        X_train.to_pandas(),
        y_train.to_pandas(),
        eval_set=(X_val.to_pandas(), y_val.to_pandas()),
        cat_features=cat_features,
        verbose=False,
    )
    models.append(model)
    
    # Predict
    y_pred[val_idx] = model.predict(X_val.to_pandas())

# Every row must have been predicted by exactly the fold that held it out.
assert np.isnan(y_pred).sum() == 0

# Optimize thresholds
# Only the predicted PCIAT total is thresholded into the 4 `sii` classes.
optimizer = OptimizedRounder(n_classes=4, n_trials=300)
y_pred_total = y_pred[:, TARGET_COLS.index("PCIAT-PCIAT_Total")]
optimizer.fit(y_pred_total, y_sii)
y_pred_rounded = optimizer.predict(y_pred_total)

# Calculate QWK
# NOTE(review): the thresholds are fitted and scored on the same out-of-fold
# predictions, so this score is likely optimistic.
qwk = cohen_kappa_score(y_sii, y_pred_rounded, weights="quadratic")
print(f"Cross-Validated QWK Score: {qwk}")

# Combinational Model V2!
Base                    - Cross-Validated QWK Score: 0.4152442719436863

    model = SillyManCombinational(
        use_spline=True,
        use_poly=True,
        use_raw=True,
        use_standard_scaler=True,
        base_estimators=['catboost', 'xgboost'],
        final_estimators=['catboost', 'xgboost']
    ) 

Base - No Scaler        - Cross-Validated QWK Score: 0.4155735952333438

    model = SillyManCombinational(
        use_spline=True,
        use_poly=True,
        use_raw=True,
        use_standard_scaler=False,
        base_estimators=['catboost', 'xgboost'],
        final_estimators=['catboost', 'xgboost']
    )

CatBoost - No Scaler    - Cross-Validated QWK Score: 0.4563740524739843

    model = SillyManCombinational(
        use_spline=True,
        use_poly=True,
        use_raw=True,
        use_standard_scaler=False,
        base_estimators=['catboost'],
        final_estimators=['catboost']
    )
CatBoost - No Scaler, Spline only - Cross-Validated QWK Score: 0.45752648926425

    model = SillyManCombinational(
        use_spline=True,
        use_poly=False,
        use_raw=False,
        use_standard_scaler=False,
        base_estimators=['catboost'],
        final_estimators=['catboost']
    )

# Combinational Model!
Spline  - Cross-Validated QWK Score: 0.46978673137433913
Poly    - Cross-Validated QWK Score: 0.4603240623617997
Raw     - Cross-Validated QWK Score: 0.4675990891579175

# RESULTS!

Cross-Validated QWK Score: 0.47054350562542613

In [None]:
class AvgModel:
    """Ensemble that averages the predictions of several fitted models.

    Args:
        models: Fitted models; each only needs a `predict(X)` method whose
            outputs all share the same shape.
    """

    def __init__(self, models: list):
        self.models = models

    def predict(self, X: ArrayLike) -> NDArray[np.float64]:
        """Return the element-wise mean of every model's prediction for `X`.

        Raises:
            ValueError: If the ensemble contains no models.
        """
        if len(self.models) == 0:
            # np.mean over an empty list would silently yield NaN.
            raise ValueError("AvgModel needs at least one model to predict.")

        preds = [model.predict(X) for model in self.models]
        return np.mean(preds, axis=0)

In [None]:
# Average the fold models on the test set, keep the predicted PCIAT total,
# threshold it into `sii` classes and write the submission file.
avg_model = AvgModel(models)
test_pred = avg_model.predict(X_test.to_pandas())[:, TARGET_COLS.index("PCIAT-PCIAT_Total")]
test_pred_rounded = optimizer.predict(test_pred)
test.select("id").with_columns(
    pl.Series(name="sii", values=test_pred_rounded),
).write_csv("submission.csv")