In [None]:
import os 
import warnings
from functools import partial
from pathlib import Path
import re

import matplotlib.pyplot as plt
import numpy as np
import optuna
import polars as pl
import polars.selectors as cs

from catboost import CatBoostRegressor, MultiTargetCustomMetric
from xgboost import XGBRegressor

from numpy.typing import ArrayLike, NDArray
from polars.testing import assert_frame_equal
from sklearn.base import BaseEstimator
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from colorama import Fore, Style, init
from tqdm.notebook import tqdm

# Silence warnings whose message starts with "Failed to optimize method".
warnings.filterwarnings("ignore", message="Failed to optimize method")


#DATA_DIR = Path("/kaggle/input/child-mind-institute-problematic-internet-use")
# Root folder for train.csv / test.csv and the series_*.parquet directories.
# Currently the local copy; the commented-out line above is the Kaggle input path.
DATA_DIR = Path("./data")

# Target columns: the 20 numbered PCIAT columns (01..20), then the PCIAT total
# and finally `sii`. The order matters: later cells index into this list.
TARGET_COLS = [f"PCIAT-PCIAT_{item:02d}" for item in range(1, 21)] + [
    "PCIAT-PCIAT_Total",
    "sii",
]

# Hand-picked feature subset.
# NOTE: a later cell reassigns FEATURE_COLS from the columns of `train`,
# so this list only takes effect if that cell is not run.
FEATURE_COLS = [
    "Basic_Demos-Age",
    "PreInt_EduHx-computerinternet_hoursday",
    "SDS-SDS_Total_Raw",
    "Physical-Height",
    "FGC-FGC_TL",
    "Physical-Waist_Circumference",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "Fitness_Endurance-Time_Sec",
    "Fitness_Endurance-Max_Stage",
    "FGC-FGC_CU_Zone",
]

In [None]:
# Credit to Ravi Ramakrishnan for his great Preprocessing class
# https://www.kaggle.com/code/ravi20076/cmi2024-baseline-v2

# Color printing
def PrintColor(text: str, color = Fore.BLUE, style = Style.BRIGHT):
    """Print `text` preceded by the given colorama style and colour codes, then reset the style."""
    # Joining (rather than formatting) keeps the requirement that every piece is a str.
    message = "".join([style, color, text, Style.RESET_ALL])
    print(message)

class Preprocessor:
    """
    Groups the preprocessing steps shared by the train and test tables.

    Attributes:
        cat_imp_val: Value written into null cells of categorical columns
            before they are cast to ``pl.Categorical``.
        ip_path: Root data directory; it is passed to ``os.path.join`` to
            locate the ``series_<label>.parquet`` folders.
    """

    def __init__(
        self, cat_imp_val : str= "missing", ip_path: str = DATA_DIR
    ):
        self.cat_imp_val = cat_imp_val
        self.ip_path     = ip_path

    def make_pqfile_cols(
        self, verbose: bool = False, label: str = "Train"
    )->pl.DataFrame:
        """
        Summarise every per-id parquet file into one row of aggregate columns.

        For each sub-folder of ``series_<label>.parquet`` the sensor columns are
        read from ``part-0.parquet``, described (including percentiles starting
        at 5% in steps of 10%), and reshaped to a single wide row with columns
        named ``<variable>_<statistic>`` plus an ``id`` column taken from the
        folder name with ``id=`` removed.

        Args:
            verbose: When True, print the running shape after each file from
                the second one onwards.
            label: "Train" or "Test"; lower-cased to build the folder name.

        Returns:
            One row per folder, concatenated with ``how="vertical_relaxed"``.

        Raises:
            ValueError: If the series folder contains no entries.
        """

        cols = ["X", "Y", "Z", "enmo", "anglez", "light", "battery_voltage"]

        ip_path   = os.path.join(self.ip_path, f"series_{label.lower()}.parquet")
        all_files = os.listdir(ip_path)
        if not all_files:
            # Previously this surfaced later as an unbound-variable error.
            raise ValueError(f"No per-id parquet folders found in {ip_path}")

        # Collect the one-row frames and concatenate once at the end; growing the
        # result with pl.concat inside the loop re-copies every earlier row.
        frames = []
        for file in tqdm(all_files):
            df = (
                pl.scan_parquet(os.path.join(ip_path, file, "part-0.parquet"))
                .select(pl.col(cols))
                .collect()
                .describe(percentiles = np.arange(0.05, 0.95, 0.10))
                # count / null_count are dropped; every other statistic becomes a feature
                .filter(~pl.col("statistic").is_in(["count", "null_count"]))
                .unpivot(index = "statistic")
                .with_columns(
                    pl.concat_str([pl.col("variable"), pl.col("statistic")], separator = "_").alias("myvar")
                )
                # strip the first "%" from each percentile statistic name
                .with_columns(pl.col("myvar").str.replace(r"\%", ""))
                .select(["myvar", "value"])
                .transpose(column_names = "myvar")
                .select(pl.all().shrink_dtype())
                .with_columns(
                    pl.Series("id", np.array(re.sub("id=", "", file)))
                )
            )
            frames.append(df)

            if verbose and len(frames) > 1:
                print(f"---> Shapes = {(len(frames), frames[0].width)}")

        # "vertical_relaxed" reconciles the per-file dtypes produced by shrink_dtype.
        op_df = pl.concat(frames, how = "vertical_relaxed")

        PrintColor(f"---> {label} - shape = {op_df.shape}", color = Fore.CYAN)
        return op_df

    def pp_data(
        self, df: pl.DataFrame, label: str = "Train", cat_cols: "list | None" = None,
    ):
        """
        Preprocess one table and attach the per-id series aggregates.

        Args:
            df: Raw table containing an ``id`` column.
            label: "Train" or "Test". For "Train" the categorical columns are
                detected as every string column except ``id``; for any other
                label the caller-supplied ``cat_cols`` is used.
            cat_cols: Categorical column names to use when ``label`` is not
                "Train". ``None`` is treated as an empty list.

        Returns:
            Tuple ``(processed_df, cat_cols)`` so the train-time column list can
            be passed to the test-time call.
        """

        PrintColor(f"\n --- Data Processing - {label} --- \n")
        PrintColor(f"---> Shape = {df.shape} - memory usage {df.estimated_size('mb') :.3f} MB",
                   color = Fore.CYAN
                  )

        if label == "Train":
            cat_cols = df.select(cs.string().exclude("id")).columns
        elif cat_cols is None:
            # A fresh list per call instead of a shared mutable default argument.
            cat_cols = []

        df = df.with_columns(pl.col(cat_cols).fill_null(self.cat_imp_val).cast(pl.Categorical))
        # Left join keeps every row of df, including ids without a series folder.
        df = df.join(self.make_pqfile_cols(label = label), how = "left", on = "id")
        df = df.select(pl.all().shrink_dtype())

        PrintColor(f"---> Shape = {df.shape} - memory usage {df.estimated_size('mb') :.3f} MB",
                   color = Fore.CYAN
                  )
        return df, cat_cols
        

In [None]:
# Load data
# "PCIAT-Season" is dropped from both tables; strict=False makes the drop a
# no-op for a table that does not contain the column.
train = pl.read_csv(DATA_DIR / "train.csv").drop("PCIAT-Season", strict=False)
test = pl.read_csv(DATA_DIR / "test.csv").drop("PCIAT-Season", strict=False)

# Ensure 'id' is a string in both DataFrames
train = train.with_columns(pl.col('id').cast(pl.Utf8))
test = test.with_columns(pl.col('id').cast(pl.Utf8))

pp = Preprocessor()

# Use StringCache to handle categoricals consistently
# The categorical column list detected on train is reused for test.
with pl.StringCache():
    train, cat_cols = pp.pp_data(train, "Train")
    test, _ = pp.pp_data(test, "Test", cat_cols)

# Ensure data types match between train and test
for col in train.columns:
    if col in test.columns:
        train_dtype = train.schema[col]
        test_dtype = test.schema[col]
        if train_dtype != test_dtype:
            # Cast test column to train's data type
            test = test.with_columns(pl.col(col).cast(train_dtype))
    else:
        # Column in train but not in test, fill with nulls in test
        test = test.with_columns(pl.lit(None).cast(train.schema[col]).alias(col))

# Reorder test columns to match train
# (selecting train.columns also discards any column that exists only in test)
test = test.select(train.columns)

# Concatenate train and test
train_test = pl.concat([train, test], how="vertical")

# True when the test table has at most 100 rows.
# NOTE(review): presumably distinguishes a small sample test file from a full
# test set -- confirm; the flag is not referenced in the visible cells.
IS_TEST = test.height <= 100

# Sanity check: slicing the concatenated frame at train.height gives back both inputs unchanged.
assert_frame_equal(train, train_test[: train.height].select(train.columns))
assert_frame_equal(test, train_test[train.height :].select(test.columns))

In [None]:
# Every non-target column is a candidate feature, except the row identifier:
# `id` is unique per row and the cast below would otherwise turn it into a
# categorical feature with one level per sample.
FEATURE_COLS = [col for col in train.drop(TARGET_COLS, strict=False).columns if col != "id"]

# Cast string columns to categorical
train_test = train_test.with_columns(cs.string().cast(pl.Categorical).fill_null("NAN"))
train = train_test[: train.height]
test = train_test[train.height :]

# ignore rows with null values in TARGET_COLS
# Taken from `train` (not `train_test`) so the result never depends on the test
# rows happening to have null targets.
train_without_null = train.drop_nulls(subset=TARGET_COLS)
X = train_without_null.select(FEATURE_COLS)
X_test = test.select(FEATURE_COLS)
y = train_without_null.select(TARGET_COLS)
y_sii = y.get_column("sii").to_numpy()  # ground truth
cat_features = X.select(cs.categorical()).columns

# These columns are passed to the models as `cat_features`.
print("Categorical features:")
print(cat_features)

# Tubotubo's Quadratic Weighted Kappa metric & Optuna optimizer

In [None]:
class MultiTargetQWK(MultiTargetCustomMetric):
    """Custom multi-target metric: quadratic weighted kappa computed on the last target row."""

    def get_custom_metric_name(self):
        return "MultiTargetQWK"

    def is_max_optimal(self):
        # True means larger metric values are better.
        return True

    def evaluate(self, approxes, targets, weight):
        # approxes: predictions   (shape: [n_targets, n_samples])
        # targets:  actual values (shape: [n_targets, n_samples])
        # weight:   per-sample weights (None is allowed); not used here
        last_prediction = approxes[-1]
        last_target = targets[-1]

        # Clamp to the label range 0..3 and round to the nearest integer class.
        predicted_labels = np.clip(last_prediction, 0, 3).round().astype(int)
        score = cohen_kappa_score(last_target, predicted_labels, weights="quadratic")

        # The second element is the weight paired with the score.
        return score, 1

    def get_final_error(self, error, weight):
        # Plain sum of the accumulated error; it is deliberately not divided by the weight.
        return np.sum(error)


class OptimizedRounder:
    """
    Learns thresholds that turn continuous predictions into discrete class labels.

    Optuna searches for ``n_classes - 1`` increasing thresholds that maximise the
    Quadratic Weighted Kappa between the rounded predictions and the true labels.

    Predictions are rescaled to ``[0, n_classes - 1]`` using the minimum and
    maximum seen in ``fit()``. The same bounds are reused in ``predict()``, so a
    value maps to the same label whatever batch it is part of (a single-row or
    constant batch included).

    Args:
        n_classes (int): The number of discrete class labels.
        n_trials (int, optional): Number of Optuna trials. Defaults to 100.
        seed (int | None, optional): Seed for the Optuna sampler. ``None``
            (the default) leaves the search unseeded.

    Attributes:
        n_classes (int): The number of discrete class labels.
        labels (NDArray[np.int_]): Class labels from 0 to ``n_classes - 1``.
        n_trials (int): The number of optimization trials.
        seed (int | None): Sampler seed.
        metric (Callable): Quadratic Weighted Kappa scorer used as objective.
        thresholds (List[float]): Thresholds learned by ``fit()``.

    Methods:
        fit(y_pred, y_true) -> None:
            Learns the scaling bounds and the thresholds.
        predict(y_pred) -> NDArray[np.int_]:
            Rounds predictions with the learned thresholds; ``fit()`` must be
            called first.
        _normalize(y) -> NDArray[np.float64]:
            Rescales values with the bounds learned in ``fit()``.
    """

    def __init__(self, n_classes: int, n_trials: int = 100, seed: "int | None" = None):
        self.n_classes = n_classes
        self.labels = np.arange(n_classes)
        self.n_trials = n_trials
        self.seed = seed
        self.metric = partial(cohen_kappa_score, weights="quadratic")

    def fit(self, y_pred: NDArray[np.float64], y_true: NDArray[np.int_]) -> None:
        """
        Learn scaling bounds and thresholds from out-of-fold predictions.

        Args:
            y_pred: Continuous predictions.
            y_true: Ground-truth class labels aligned with ``y_pred``.

        Raises:
            ValueError: If ``y_pred`` is empty.
        """
        y_pred = np.asarray(y_pred, dtype=float)
        if y_pred.size == 0:
            raise ValueError("y_pred must contain at least one prediction")

        # Remember the scale the thresholds are expressed in, for predict().
        self._y_min = float(y_pred.min())
        self._y_max = float(y_pred.max())
        y_scaled = self._normalize(y_pred)

        def objective(trial: optuna.Trial) -> float:
            thresholds = []
            for i in range(self.n_classes - 1):
                # Each threshold is sampled at or above the previous one so the
                # list stays monotonic, as np.digitize requires.
                low = max(thresholds) if i > 0 else min(self.labels)
                high = max(self.labels)
                th = trial.suggest_float(f"threshold_{i}", low, high)
                thresholds.append(th)
            try:
                y_pred_rounded = np.digitize(y_scaled, thresholds)
            except ValueError:
                # Invalid bins: give the trial a score below any real kappa.
                return -100
            return self.metric(y_true, y_pred_rounded)

        optuna.logging.disable_default_handler()
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=self.seed),
        )
        study.optimize(
            objective,
            n_trials=self.n_trials,
        )
        self.thresholds = [study.best_params[f"threshold_{i}"] for i in range(self.n_classes - 1)]

    def predict(self, y_pred: NDArray[np.float64]) -> NDArray[np.int_]:
        """Return class labels for ``y_pred`` using the thresholds learned in ``fit()``."""
        assert hasattr(self, "thresholds"), "fit() must be called before predict()"
        y_pred = self._normalize(y_pred)
        return np.digitize(y_pred, self.thresholds)

    def _normalize(self, y: NDArray[np.float64]) -> NDArray[np.float64]:
        """
        Rescale ``y`` so the fit-time range maps onto ``[0, n_classes - 1]``.

        Values outside the fit-time range land outside that interval, which
        ``np.digitize`` assigns to the lowest or highest class. If no bounds
        were learned yet, the bounds of ``y`` itself are used.
        """
        y = np.asarray(y, dtype=float)
        lower = getattr(self, "_y_min", None)
        upper = getattr(self, "_y_max", None)
        if lower is None or upper is None:
            lower, upper = float(y.min()), float(y.max())

        span = upper - lower
        if span == 0:
            # Degenerate range: avoid 0/0 and map everything to the lowest value.
            return np.zeros_like(y)
        return (y - lower) / span * (self.n_classes - 1)

# Start making models!

In [None]:
from sklearn.feature_selection import RFE

# CatBoost settings used for both the RFE selector's estimator and the final regressor.
params = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",  # evaluation metric matches the training loss
    "iterations": 7000,  # upper bound; early stopping below can end training sooner
    "learning_rate": 0.05,
    "depth": 5,
    "early_stopping_rounds": 50,
}

# Updated SILLY MODEL with RFE
class SillyManRFE():
    """
    CatBoost regressor trained on the columns kept by recursive feature elimination.

    ``self.selector`` is an RFE wrapper around a CatBoost regressor and
    ``self.cat`` is the regressor fitted on the selected columns; both are built
    from the module-level ``params``.
    """

    def __init__(self, n_features_to_select=10):
        self.selector = RFE(estimator=CatBoostRegressor(**params), n_features_to_select=n_features_to_select, step=1)
        self.cat = CatBoostRegressor(**params)

    def fit(self, x, y, eval_set, cat_features, verbose=False):
        """
        Select features with RFE, then fit CatBoost on the selected columns.

        Args:
            x: Training features; must expose ``.columns``.
            y: Training targets.
            eval_set: ``(X_val, y_val)`` used as the CatBoost evaluation set.
            cat_features: Names of the categorical columns in ``x`` (may be
                empty or None).
            verbose: Forwarded to ``CatBoostRegressor.fit``.
        """
        self.cat_features = cat_features

        # Apply RFE for feature selection
        X_selected = self.selector.fit_transform(x, y)

        # Names of the columns that survived selection, in their new order.
        selected_features = list(np.asarray(x.columns)[self.selector.support_])

        # Categorical positions must be relative to the *selected* matrix; the
        # positions in the original frame no longer line up once columns are dropped.
        cat_names = set(cat_features) if cat_features else set()
        new_cat_features = [i for i, col in enumerate(selected_features) if col in cat_names]

        # Prepare evaluation set
        X_val_selected = self.selector.transform(eval_set[0])

        # Fit CatBoost
        self.cat.fit(
            X_selected,
            y,
            eval_set=(X_val_selected, eval_set[1]),
            cat_features=new_cat_features,
            verbose=verbose
        )

    def predict(self, x):
        """Reduce ``x`` to the selected columns and return the CatBoost predictions."""
        X_selected = self.selector.transform(x)
        return self.cat.predict(X_selected)


In [None]:
from sklearn.preprocessing import (
    PolynomialFeatures,
    SplineTransformer,
    StandardScaler,
    MinMaxScaler,
    QuantileTransformer
)
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

class SillyManCombinational:
    """
    Stacked regressor: imputed numerical features are expanded by one or more
    transformers, optionally fed to base estimators, and a final estimator is
    trained on the resulting meta-features.
    """

    def __init__(
        self,
        base_catboost_params=None,
        base_xgboost_params=None,
        spline_catboost_params=None,
        spline_xgboost_params=None,
        include_bias=False,
        imputation_strategy='mean',
        use_spline=True,
        use_poly=False,
        use_raw=False,
        use_standard_scaler=False,
        use_minmax_scaler=False,
        use_quantile_transformer=False,
        base_estimators=['catboost'],  # options: ['catboost', 'xgboost', 'raw']
        final_estimators=['catboost'],  # options: ['catboost', 'xgboost']
    ):
        """
        Initializes the SillyManCombinational model with options to include various transformers and base estimators.

        Parameters:
        - base_catboost_params / base_xgboost_params (dict or None): Keyword arguments for the base estimators.
        - spline_catboost_params / spline_xgboost_params (dict or None): Keyword arguments for the final estimators.
          ``None`` means "use the library defaults".
        - include_bias (bool): Whether to include a bias column (spline / polynomial transformers).
        - imputation_strategy (str): Strategy for imputing missing values.
        - use_spline (bool): Whether to include Spline transformed features.
        - use_poly (bool): Whether to include Polynomial transformed features.
        - use_raw (bool): Whether to include raw numerical features.
        - use_standard_scaler (bool): Whether to include StandardScaler transformed features.
        - use_minmax_scaler (bool): Whether to include MinMaxScaler transformed features.
        - use_quantile_transformer (bool): Whether to include QuantileTransformer transformed features.
        - base_estimators (list): List of estimators to use for base models ['catboost', 'xgboost', 'raw'].
          An empty list, ``None`` or ``['']`` disables the base layer.
        - final_estimators (list): List of estimators to use for final model ['catboost', 'xgboost'].
        """
        self.base_catboost_params = base_catboost_params
        self.base_xgboost_params = base_xgboost_params
        self.spline_catboost_params = spline_catboost_params
        self.spline_xgboost_params = spline_xgboost_params

        # Transformer toggles
        self.use_spline = use_spline
        self.use_poly = use_poly
        self.use_raw_transformer = use_raw  # Renamed to avoid confusion with 'raw' in base_estimators
        self.use_standard_scaler = use_standard_scaler
        self.use_minmax_scaler = use_minmax_scaler
        self.use_quantile_transformer = use_quantile_transformer

        # Estimator types. Copied so instances never share the default list objects.
        self.base_estimators_types = list(base_estimators) if base_estimators else []
        self.final_estimators_types = list(final_estimators) if final_estimators else []

        self.imputer = SimpleImputer(strategy=imputation_strategy)
        if self.use_poly:
            self.poly = PolynomialFeatures(degree=2, include_bias=include_bias)
        if self.use_spline:
            self.spline = SplineTransformer(n_knots=5, degree=3, include_bias=include_bias)
        if self.use_standard_scaler:
            self.standard_scaler = StandardScaler()
        if self.use_minmax_scaler:
            self.minmax_scaler = MinMaxScaler()
        if self.use_quantile_transformer:
            self.quantile_transformer = QuantileTransformer(output_distribution='normal')

        # Fitted base estimators keyed by '<transformer>_<estimator type>'
        self.base_estimators = {}
        # Fitted final estimator(s) keyed by estimator type
        self.final_estimators = {}
        # Positional categorical column indices of the meta-features, per final estimator
        self.final_cat_features = {}

    # ------------------------------------------------------------------ helpers

    def _active_transformers(self):
        """Return the enabled transformers keyed by name; ``None`` marks the raw pass-through."""
        transformers = {}
        if self.use_spline:
            transformers['spline'] = self.spline
        if self.use_poly:
            transformers['poly'] = self.poly
        if self.use_raw_transformer:
            transformers['raw_transformer'] = None  # raw data, no transformer
        if self.use_standard_scaler:
            transformers['standard_scaler'] = self.standard_scaler
        if self.use_minmax_scaler:
            transformers['minmax_scaler'] = self.minmax_scaler
        if self.use_quantile_transformer:
            transformers['quantile_transformer'] = self.quantile_transformer
        return transformers

    def _uses_base_estimators(self):
        """True when a base layer is configured (an empty list or [''] means none)."""
        return bool(self.base_estimators_types) and self.base_estimators_types != ['']

    def _split_features(self, x):
        """Split ``x`` into (numerical frame, categorical frame or None) using ``self.cat_features``."""
        if self.cat_features:
            return x.drop(columns=self.cat_features), x[self.cat_features].reset_index(drop=True)
        return x.copy(), None

    def _transformed_blocks(self, X_imputed, columns, fit):
        """
        Apply every enabled transformer to the imputed numerical matrix.

        Returns a list of ``(transformer name, DataFrame)``; transformers are
        fitted first when ``fit`` is True.
        """
        # NOTE(review): feature names are derived from `columns`, which assumes the
        # imputer keeps every input column -- confirm for columns that are entirely null.
        blocks = []
        for name, transformer in self._active_transformers().items():
            if transformer is None:
                # Raw data (no transformation)
                values = X_imputed
                feature_names = [f'raw_{feat}' for feat in columns]
            else:
                values = transformer.fit_transform(X_imputed) if fit else transformer.transform(X_imputed)
                feature_names = [
                    f'{name}_{feat}'
                    for feat in transformer.get_feature_names_out(input_features=columns)
                ]
            blocks.append((name, pd.DataFrame(values, columns=feature_names)))
        return blocks

    @staticmethod
    def _append_categoricals(X_df, X_cat):
        """Append the categorical frame on the right; return (frame, positional categorical indices)."""
        if X_cat is None:
            return X_df, []
        combined = pd.concat([X_df.reset_index(drop=True), X_cat], axis=1)
        return combined, list(range(X_df.shape[1], combined.shape[1]))

    @staticmethod
    def _prediction_frame(pred, est_name):
        """Wrap base-estimator predictions in a DataFrame with one named column per output."""
        pred = np.asarray(pred)
        if pred.ndim == 1:
            pred = pred.reshape(-1, 1)
        n_outputs = pred.shape[1]
        if n_outputs == 1:
            columns = [est_name]
        else:
            columns = [f'{est_name}_output_{i}' for i in range(n_outputs)]
        return pd.DataFrame(pred, columns=columns)

    def _fit_estimator(self, est_type, catboost_params, xgboost_params,
                       X_train, y, X_val, y_val, cat_idx, verbose, kind=""):
        """Create and fit one estimator of ``est_type``; ``kind`` only customises the error message."""
        if est_type == 'catboost':
            estimator = CatBoostRegressor(**(catboost_params or {}))
            estimator.fit(
                X_train,
                y,
                eval_set=(X_val, y_val),
                cat_features=cat_idx,
                verbose=verbose
            )
        elif est_type == 'xgboost':
            estimator = XGBRegressor(**(xgboost_params or {}))
            estimator.fit(
                X_train,
                y,
                eval_set=[(X_val, y_val)],
                verbose=verbose
            )
        else:
            raise ValueError(f"Unsupported {kind}estimator type: {est_type}")
        return estimator

    def _meta_features(self, blocks, X_cat):
        """
        Build the final estimator's input from the transformed blocks.

        Returns ``(meta frame, positional indices of categorical columns in it)``.
        The indices are computed for the meta frame itself, so they are valid
        whichever mix of base estimators and 'raw' pass-through is configured.
        """
        if not self._uses_base_estimators():
            # If no base estimators, use transformed features directly
            X_meta = pd.concat([frame for _, frame in blocks], axis=1).reset_index(drop=True)
            return self._append_categoricals(X_meta, X_cat)

        parts = []
        cat_idx = []
        offset = 0  # number of meta columns emitted so far
        for name, X_df in blocks:
            X_df, block_cat_idx = self._append_categoricals(X_df, X_cat)
            for est_type in self.base_estimators_types:
                if est_type == 'raw':
                    # Include transformed features directly as meta-features
                    part = X_df.copy()
                    cat_idx.extend(offset + i for i in block_cat_idx)
                else:
                    est_name = f'{name}_{est_type}'
                    if est_name not in self.base_estimators:
                        raise ValueError(f"Base estimator '{est_name}' has not been trained.")
                    part = self._prediction_frame(self.base_estimators[est_name].predict(X_df), est_name)
                parts.append(part)
                offset += part.shape[1]

        if not parts:
            raise ValueError("Base estimators did not produce any predictions.")
        return pd.concat(parts, axis=1).reset_index(drop=True), cat_idx

    # --------------------------------------------------------------- public API

    def fit(self, x, y, eval_set, cat_features, verbose=False):
        """
        Fits the model with selected feature transformations and imputation.

        Parameters:
        - x (pd.DataFrame): Training features.
        - y (pd.Series or pd.DataFrame): Training target.
        - eval_set (tuple): Validation set as (X_val, y_val).
        - cat_features (list): List of categorical feature names.
        - verbose (bool): Verbosity flag.

        Raises:
        - ValueError: If no transformer or no final estimator is selected, or an
          estimator type is not supported.
        """
        self.cat_features = cat_features
        X_val, y_val = eval_set[0], eval_set[1]

        if not self._active_transformers():
            raise ValueError("At least one transformer must be selected.")
        if not self.final_estimators_types:
            raise ValueError("At least one final estimator must be selected.")

        # Start from a clean slate so a refit never mixes in stale estimators.
        self.base_estimators = {}
        self.final_estimators = {}
        self.final_cat_features = {}

        # Separate numerical and categorical features
        num_train, cat_train = self._split_features(x)
        num_val, cat_val = self._split_features(X_val)

        # Impute missing values in numerical features (statistics come from train only)
        X_train_imputed = self.imputer.fit_transform(num_train)
        X_val_imputed = self.imputer.transform(num_val)

        train_blocks = self._transformed_blocks(X_train_imputed, num_train.columns, fit=True)
        val_blocks = self._transformed_blocks(X_val_imputed, num_train.columns, fit=False)

        # Base layer: one estimator per (transformer, estimator type); 'raw' trains nothing.
        if self._uses_base_estimators():
            for (name, X_train_df), (_, X_val_df) in zip(train_blocks, val_blocks):
                X_train_df, block_cat_idx = self._append_categoricals(X_train_df, cat_train)
                X_val_df, _ = self._append_categoricals(X_val_df, cat_val)
                for est_type in self.base_estimators_types:
                    if est_type == 'raw':
                        continue
                    self.base_estimators[f'{name}_{est_type}'] = self._fit_estimator(
                        est_type, self.base_catboost_params, self.base_xgboost_params,
                        X_train_df, y, X_val_df, y_val, block_cat_idx, verbose,
                    )

        # Meta-features and the categorical positions *within the meta frame*.
        X_train_meta, final_cat_idx = self._meta_features(train_blocks, cat_train)
        X_val_meta, _ = self._meta_features(val_blocks, cat_val)

        # Train final estimator(s)
        for est_type in self.final_estimators_types:
            self.final_estimators[est_type] = self._fit_estimator(
                est_type, self.spline_catboost_params, self.spline_xgboost_params,
                X_train_meta, y, X_val_meta, y_val, final_cat_idx, verbose, kind="final ",
            )
            self.final_cat_features[est_type] = list(final_cat_idx)

    def predict(self, x):
        """
        Predicts using the fitted model.

        Parameters:
        - x (pd.DataFrame): Features for prediction.

        Returns:
        - np.ndarray: Predictions (the element-wise mean when several final estimators are configured).

        Raises:
        - ValueError: If a required base or final estimator has not been trained.
        """
        numerical, categorical = self._split_features(x)

        # Impute missing values
        X_imputed = self.imputer.transform(numerical)

        blocks = self._transformed_blocks(X_imputed, numerical.columns, fit=False)
        X_meta, _ = self._meta_features(blocks, categorical)

        # Get final predictions from final estimators
        final_preds = []
        for est_type in self.final_estimators_types:
            if est_type not in self.final_estimators:
                raise ValueError(f"Final estimator '{est_type}' has not been trained.")
            final_preds.append(self.final_estimators[est_type].predict(X_meta))

        # Average predictions if multiple final estimators
        if len(final_preds) == 1:
            return final_preds[0]
        return np.mean(final_preds, axis=0)


In [None]:
# Reset cached models
models: list = []
# Out-of-fold predictions: one row per training sample, one column per target.
y_pred = np.full((X.height, len(TARGET_COLS)), fill_value=np.nan)

# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=52)
rskf = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=52)

# Base estimator parameters (identical for every fold, so defined once)
base_xgboost_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "early_stopping_rounds": 50,
    "tree_method": 'hist',
}

base_catboost_params = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "depth": 5,
    "early_stopping_rounds": 50,
}

# Final estimator parameters
spline_xgboost_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "n_estimators": 7000,
    "learning_rate": 0.05,
    "max_depth": 5,
    "early_stopping_rounds": 50,
    "tree_method": 'hist',
}

spline_catboost_params = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",
    "n_estimators": 7500,
    "learning_rate": 0.05,
    "max_depth": 5,
    "early_stopping_rounds": 50,
    "bagging_temperature": 15,
    "l2_leaf_reg": 5,
}

# Model configuration: spline features only, no base layer, CatBoost as final estimator
SMC_params = {
    "base_catboost_params": base_catboost_params,
    "base_xgboost_params": base_xgboost_params,
    "spline_catboost_params": spline_catboost_params,
    "spline_xgboost_params": spline_xgboost_params,
    "use_spline": True,
    "use_poly": False,
    "use_raw": False,
    "base_estimators": [''],
    "final_estimators": ['catboost'],
}

# Folds are stratified on the sii label.
for train_idx, val_idx in skf.split(X, y_sii):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Initialize and train model
    model = SillyManCombinational(**SMC_params)
    model.fit(
        X_train.to_pandas(),
        y_train.to_pandas(),
        eval_set=(X_val.to_pandas(), y_val.to_pandas()),
        cat_features=cat_features,
        verbose=False,
    )
    models.append(model)

    # Store this fold's predictions in the rows it was validated on
    y_pred[val_idx] = model.predict(X_val.to_pandas())

# Every row must have been part of exactly one validation fold.
assert not np.isnan(y_pred).any()

# Optimize thresholds on the predicted PCIAT total
optimizer = OptimizedRounder(n_classes=4, n_trials=400)
y_pred_total = y_pred[:, TARGET_COLS.index("PCIAT-PCIAT_Total")]
optimizer.fit(y_pred_total, y_sii)
y_pred_rounded = optimizer.predict(y_pred_total)

# Calculate QWK
qwk = cohen_kappa_score(y_sii, y_pred_rounded, weights="quadratic")
print(f"Cross-Validated QWK Score: {qwk}")

BAG 5  - 0.46378869920422117
BAG 10 - 0.46842822123076155
BAG 12 - 0.46749256436350695
BAG 15 - 0.4725783353846581

REG 0 - 0.47012322167140097
REG 1 - 0.47086647810591253
REG 3 - 0.4667491613394743
REG 5 - 0.4779985141274281
REG 8 - 


# Combinational Model V2!
Cross-Validated QWK Score: 0.4700519292387968

    model = SillyManCombinational(
        use_spline=True,
        use_poly=False,
        use_raw=False,
        use_standard_scaler=False,
        base_estimators=[],
        final_estimators=['catboost']
    )


# Combinational Model!
Spline  - Cross-Validated QWK Score: 0.46978673137433913
Poly    - Cross-Validated QWK Score: 0.4603240623617997
Raw     - Cross-Validated QWK Score: 0.4675990891579175

# RESULTS!

Cross-Validated QWK Score: 0.47054350562542613

In [None]:
class AvgModel:
    """Ensemble that averages the predictions of several models."""

    def __init__(self, models: list[BaseEstimator]):
        """Store the models to average.

        Args:
            models: Models exposing ``predict(X)``; they are expected to be
                fitted already, since this class never calls ``fit``.
        """
        self.models = models

    def predict(self, X: ArrayLike) -> NDArray[np.floating]:
        """Return the element-wise mean of every model's prediction for ``X``.

        Args:
            X: Feature data, passed unchanged to each model's ``predict``.

        Returns:
            The mean over the models (``axis=0``), so the result has the shape
            of a single model's prediction. The values are floating point even
            when the individual predictions are integers.

        Raises:
            ValueError: If there are no models to average.
        """
        # Fail loudly: np.mean of an empty list would only warn and return NaN.
        if len(self.models) == 0:
            raise ValueError("AvgModel needs at least one model to predict.")

        preds = [model.predict(X) for model in self.models]
        return np.mean(preds, axis=0)

In [None]:
# Average the predictions of the collected models on the test set and keep
# only the PCIAT-PCIAT_Total column.
avg_model = AvgModel(models)
test_pred = avg_model.predict(X_test.to_pandas())[:, TARGET_COLS.index("PCIAT-PCIAT_Total")]

# Convert the averaged totals to sii classes with the rounder fitted earlier.
test_pred_rounded = optimizer.predict(test_pred)

# Write the submission file: the test ids plus the predicted sii column.
test.select("id").with_columns(
    pl.Series("sii", test_pred_rounded),
).write_csv("submission.csv")

Optuna!

        def objective(trial: optuna.Trial) -> float:
            thresholds = []
            for i in range(self.n_classes - 1):
                low = max(thresholds) if i > 0 else min(self.labels)
                high = max(self.labels)
                th = trial.suggest_float(f"threshold_{i}", low, high)
                thresholds.append(th)
            try:
                y_pred_rounded = np.digitize(y_pred, thresholds)
            except ValueError:
                return -100
            return self.metric(y_true, y_pred_rounded)

        optuna.logging.disable_default_handler()
        study = optuna.create_study(direction="maximize")
        study.optimize(
            objective,
            n_trials=self.n_trials,
        )
        self.thresholds = [study.best_params[f"threshold_{i}"] for i in range(self.n_classes - 1)]

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Score one Optuna trial by the cross-validated QWK of the final CatBoost stage.

    Samples the spline CatBoost hyperparameters from ``trial``, fits one
    ``SillyManCombinational`` per fold of a 5-fold stratified split of the
    notebook-level ``X`` / ``y`` (stratified on ``y_sii``), and collects the
    validation-fold predictions. An ``OptimizedRounder`` is then fitted on the
    predicted ``PCIAT-PCIAT_Total`` column and the rounded result is scored
    against ``y_sii``.

    Reads the notebook globals ``X``, ``y``, ``y_sii``, ``cat_features`` and
    ``TARGET_COLS``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Cohen's kappa with quadratic weights between ``y_sii`` and the rounded
        out-of-fold predictions.
    """
    y_pred = np.full((X.height, len(TARGET_COLS)), fill_value=np.nan)

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=52)

    # Base estimator parameters (fixed, not tuned).
    base_xgboost_params = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        n_estimators=10000,
        learning_rate=0.01,
        max_depth=5,
        early_stopping_rounds=50,
        tree_method='hist',
    )
    base_catboost_params = dict(
        loss_function="MultiRMSE",
        eval_metric="MultiRMSE",
        n_estimators=10000,
        learning_rate=0.01,
        depth=5,
        early_stopping_rounds=50,
    )

    # Final-stage XGBoost parameters (fixed, not tuned).
    spline_xgboost_params = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        n_estimators=7000,
        learning_rate=0.05,
        max_depth=5,
        early_stopping_rounds=50,
        tree_method='hist',
    )

    # Final-stage CatBoost parameters: the only ones tuned by this study.
    # They are sampled once per trial, before the fold loop, so every fold
    # trains with the same values.
    # Fixed values from the non-tuned configuration, kept for reference:
    #   n_estimators=7500, learning_rate=0.05, max_depth=5,
    #   early_stopping_rounds=50, bagging_temperature=15, l2_leaf_reg=5
    spline_catboost_params = dict(
        loss_function="MultiRMSE",
        eval_metric="MultiRMSE",
        n_estimators=trial.suggest_int('n_estimators', 500, 10000),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        max_depth=trial.suggest_int('max_depth', 4, 10, step=1),
        early_stopping_rounds=trial.suggest_int('early_stopping_rounds', 30, 70, step=10),
        bagging_temperature=trial.suggest_int('bagging_temperature', 5, 25, step=1),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-3, 20.0, log=True),
    )

    for train_idx, val_idx in skf.split(X, y_sii):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Initialize and train model. Each fold receives its own copies of the
        # parameter dicts, so it can never see a change that a previous fold's
        # model might have made to them.
        SMC_params = dict(
            base_catboost_params=dict(base_catboost_params),
            base_xgboost_params=dict(base_xgboost_params),
            spline_catboost_params=dict(spline_catboost_params),
            spline_xgboost_params=dict(spline_xgboost_params),
            use_spline=True,
            use_poly=False,
            use_raw=False,
            # NOTE(review): a list holding one empty string, whereas the notes
            # elsewhere in this notebook show `base_estimators=[]` — confirm
            # which one is intended.
            base_estimators=[''],
            final_estimators=['catboost'],
        )
        model = SillyManCombinational(**SMC_params)
        model.fit(
            X_train.to_pandas(),
            y_train.to_pandas(),
            eval_set=(X_val.to_pandas(), y_val.to_pandas()),
            cat_features=cat_features,
            verbose=False,
        )

        # Predict the validation fold; these rows are not in this fold's X_train.
        y_pred[val_idx] = model.predict(X_val.to_pandas())

    # Every row must have received a prediction from exactly one fold.
    assert np.isnan(y_pred).sum() == 0

    # Optimize thresholds
    optimizer = OptimizedRounder(n_classes=4, n_trials=400)
    y_pred_total = y_pred[:, TARGET_COLS.index("PCIAT-PCIAT_Total")]
    optimizer.fit(y_pred_total, y_sii)
    y_pred_rounded = optimizer.predict(y_pred_total)

    # Calculate QWK
    qwk = cohen_kappa_score(y_sii, y_pred_rounded, weights="quadratic")
    print(f"Cross-Validated QWK Score: {qwk}")
    return qwk

# Re-enable Optuna's default logging handler before launching the search.
optuna.logging.enable_default_handler()

# Maximize objective() over 50 trials, with n_jobs=5 parallel jobs.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=5)

In [None]:
# Report the best trial of `study`: its objective value, then each sampled
# hyperparameter on its own line.
print('Best Trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')