In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e9/sample_submission.csv
/kaggle/input/playground-series-s5e9/train.csv
/kaggle/input/playground-series-s5e9/test.csv


In [2]:
import os, sys, gc, warnings, random
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

In [3]:
USE_RAPIDS = False     
N_FOLDS = 5
SEED = 42
N_THREADS = 8 

if USE_RAPIDS:
    
    try:
        !pip -q install --target=/kaggle/working cudf-cu12 cuml-cu12 cupy-cuda12x --extra-index-url=https://pypi.nvidia.com
        sys.path.append("/kaggle/working")
        import cuml
        from cuml.linear_model import LinearRegression as cuLinearRegression
        RAPIDS_OK = True
    except Exception as e:
        print("RAPIDS install failed; falling back to CPU RidgeCV:", e)
        RAPIDS_OK = False
else:
    RAPIDS_OK = False

def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
set_seed(SEED)

In [4]:
DATA_DIR = "/kaggle/input"

train = pd.read_csv("/kaggle/input/playground-series-s5e9/train.csv")
test  = pd.read_csv("/kaggle/input/playground-series-s5e9/test.csv")
sub   = pd.read_csv("/kaggle/input/playground-series-s5e9/sample_submission.csv")

TARGET = "BeatsPerMinute"

# ID column heuristic (common in Kaggle Playgrounds is 'id')
id_cols = [c for c in train.columns if c.lower() in ["id", "row_id", "index"]]
ID_COL = id_cols[0] if id_cols else None

print("Shapes:", train.shape, test.shape)
print("Columns:", list(train.columns))
print("ID column:", ID_COL, " | Target:", TARGET)

Shapes: (524164, 11) (174722, 10)
Columns: ['id', 'RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 'InstrumentalScore', 'LivePerformanceLikelihood', 'MoodScore', 'TrackDurationMs', 'Energy', 'BeatsPerMinute']
ID column: id  | Target: BeatsPerMinute


In [5]:
def drop_constant(df):
    const_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
    return df.drop(columns=const_cols), const_cols
train, dropped_const = drop_constant(train)
test = test.drop(columns=[c for c in dropped_const if c in test.columns], errors="ignore")
if dropped_const:
    print("Dropped constant cols:", dropped_const)


In [6]:
y = train[TARGET].values
X = train.drop(columns=[TARGET] + ([ID_COL] if ID_COL else []), errors="ignore")
X_test = test.drop(columns=[ID_COL] if ID_COL else [], errors="ignore")

cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"Detected {len(num_cols)} numeric and {len(cat_cols)} categorical features.")

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    # Keep a scaler for linear models and overall stability
    ("scaler", StandardScaler(with_mean=False))
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
    n_jobs=N_THREADS
)

X_proc = preprocess.fit_transform(X)
X_test_proc = preprocess.transform(X_test)

Detected 9 numeric and 0 categorical features.


In [7]:
xgb_params = dict(
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=SEED,
    n_jobs=N_THREADS
)

lgbm_params = dict(
    n_estimators=5000,
    learning_rate=0.03,
    num_leaves=255,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="rmse",     # alias to L2; metric set in fit params
    random_state=SEED,
    n_jobs=N_THREADS
)

cat_params = dict(
    loss_function="RMSE",
    iterations=5000,
    depth=8,
    learning_rate=0.03,
    subsample=0.8,
    random_seed=SEED,
    l2_leaf_reg=3.0,
    verbose=False
)

BASE_MODELS = {
    "xgb": xgb.XGBRegressor(**xgb_params),
    "lgbm": lgb.LGBMRegressor(**lgbm_params),
    "cat": CatBoostRegressor(**cat_params),
}


In [8]:
meta_model_cpu = RidgeCV(alphas=np.logspace(-6, 6, 13), fit_intercept=True)

In [9]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_preds = {name: np.zeros(len(X_proc)) for name in BASE_MODELS}
test_preds = {name: np.zeros(len(X_test_proc)) for name in BASE_MODELS}
fold_scores = {name: [] for name in BASE_MODELS}

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_proc, y), start=1):
    print(f"\n========== Fold {fold}/{N_FOLDS} ==========")
    X_tr, X_va = X_proc[trn_idx], X_proc[val_idx]
    y_tr, y_va = y[trn_idx], y[val_idx]

    # XGBoost
    if "xgb" in BASE_MODELS:
        model = clone(BASE_MODELS["xgb"])
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            verbose=False,
            early_stopping_rounds=400
        )
        va_pred = model.predict(X_va)
        te_pred = model.predict(X_test_proc)
        oof_preds["xgb"][val_idx] = va_pred
        test_preds["xgb"] += te_pred / N_FOLDS
        score = rmse(y_va, va_pred)
        fold_scores["xgb"].append(score)
        print(f"XGB   fold RMSE: {score:.5f}")

    # LightGBM
    if "lgbm" in BASE_MODELS:
        model = clone(BASE_MODELS["lgbm"])
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            callbacks=[lgb.early_stopping(stopping_rounds=400, verbose=False)]
        )
        va_pred = model.predict(X_va)
        te_pred = model.predict(X_test_proc)
        oof_preds["lgbm"][val_idx] = va_pred
        test_preds["lgbm"] += te_pred / N_FOLDS
        score = rmse(y_va, va_pred)
        fold_scores["lgbm"].append(score)
        print(f"LGBM fold RMSE: {score:.5f}")

    # CatBoost
    if "cat" in BASE_MODELS:
        model = clone(BASE_MODELS["cat"])
        train_pool = Pool(X_tr, y_tr)
        valid_pool = Pool(X_va, y_va)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False, early_stopping_rounds=400)
        va_pred = model.predict(X_va)
        te_pred = model.predict(X_test_proc)
        oof_preds["cat"][val_idx] = va_pred
        test_preds["cat"] += te_pred / N_FOLDS
        score = rmse(y_va, va_pred)
        fold_scores["cat"].append(score)
        print(f"CAT fold RMSE: {score:.5f}")




XGB   fold RMSE: 26.44032
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.056554
LGBM fold RMSE: 26.43983
CAT fold RMSE: 26.44040

XGB   fold RMSE: 26.48632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.039042
LGBM fold RMSE: 26.48624
CAT fold RMSE: 26.48490

XGB   fold RMSE: 26.52631
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021424 seconds.
You can set `force_col_wise=t

In [10]:
print("\n========== Base Models CV ==========")
for name in BASE_MODELS:
    score_rmse = rmse(y, oof_preds[name])
    score_r2 = r2_score(y, oof_preds[name])
    print(f"{name.upper():<5} OOF RMSE: {score_rmse:.5f} | OOF R2: {score_r2:.5f}")


XGB   OOF RMSE: 26.46182 | OOF R2: 0.00047
LGBM  OOF RMSE: 26.46272 | OOF R2: 0.00040
CAT   OOF RMSE: 26.46043 | OOF R2: 0.00058


In [11]:
stack_cols = [name for name in BASE_MODELS]
X_stack = np.column_stack([oof_preds[name] for name in stack_cols])
X_stack_test = np.column_stack([test_preds[name] for name in stack_cols])

if RAPIDS_OK:
    # GPU meta (cuML Linear Regression) â€“ simple and fast
    meta_gpu = cuLinearRegression(fit_intercept=True)
    meta_gpu.fit(X_stack, y)
    meta_pred_oof = meta_gpu.predict(X_stack).get()  # cuDF -> numpy
    meta_pred_test = meta_gpu.predict(X_stack_test).get()
    meta_name = "cuML-LR"
else:
    meta_cpu = meta_model_cpu
    meta_cpu.fit(X_stack, y)
    meta_pred_oof = meta_cpu.predict(X_stack)
    meta_pred_test = meta_cpu.predict(X_stack_test)
    meta_name = "RidgeCV"

meta_rmse = rmse(y, meta_pred_oof)
meta_r2 = r2_score(y, meta_pred_oof)
print(f"\nMETA ({meta_name}) OOF RMSE: {meta_rmse:.5f} | OOF R2: {meta_r2:.5f}")


META (RidgeCV) OOF RMSE: 26.45981 | OOF R2: 0.00062


In [12]:
weights = np.linalg.lstsq(
    X_stack, y, rcond=None
)[0]
# Normalize weights to sum to 1 (simple heuristic)
weights = np.maximum(0, weights)
if weights.sum() > 0:
    weights = weights / weights.sum()
else:
    weights = np.ones_like(weights) / len(weights)
print(f"Weights (heuristic blend) for {stack_cols}:", np.round(weights, 4))

blend_oof = X_stack @ weights
blend_test = X_stack_test @ weights
blend_rmse = rmse(y, blend_oof)
blend_r2 = r2_score(y, blend_oof)
print(f"BLEND  OOF RMSE: {blend_rmse:.5f} | OOF R2: {blend_r2:.5f}")

Weights (heuristic blend) for ['xgb', 'lgbm', 'cat']: [0.1276 0.2001 0.6723]
BLEND  OOF RMSE: 26.46013 | OOF R2: 0.00060


In [13]:
final_test_pred = meta_pred_test if meta_rmse <= blend_rmse else blend_test
print("Using:", meta_name if meta_rmse <= blend_rmse else "Heuristic Blend")

Using: RidgeCV


In [14]:
sub_out = sub.copy()
target_col = [c for c in sub_out.columns if c != (ID_COL or "id")]
if len(target_col) != 1:
    # Fallback: competition sample usually uses the target column name exactly as required
    target_name = "BeatsPerMinute"
else:
    target_name = target_col[0]

sub_out[target_name] = final_test_pred
save_path = "/kaggle/working/submission.csv"
sub_out.to_csv(save_path, index=False)
print("Saved:", save_path)

Saved: /kaggle/working/submission.csv
