In [1]:
!pip install hillclimbers -q

  Preparing metadata (setup.py) ... done
  Building wheel for hillclimbers (setup.py) ... done


In [2]:
import numpy as np
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib

from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

from hillclimbers import climb_hill, partial

# ==========================================
# 0. CONFIGURATION
# ==========================================
# Fast/debug switch read by the later phases: it selects the hill-climb
# precision and negative-weight flag, the 20% row subsample, the number of
# CV folds and the RidgeCV alpha grid.
test_req = False       # Set to True for fast execution, False for final submission
# When True, Phase 1 is bypassed and the raw per-model OOF columns are fed to
# the Ridge meta-model directly.
skip_hillclimb = False  
# Directory searched recursively for "*_oof.csv" prediction files.
PATH = "/kaggle/input/scoring/" 
# Column name used for the label in train.csv and for the prediction column
# read from every OOF / submission CSV.
TARGET = 'exam_score'

# ==========================================
# RMSE COMPATIBILITY FIX (sklearn < 1.4)
# ==========================================
def root_mean_squared_error(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``.

    Implemented as the square root of ``mean_squared_error`` so the notebook
    does not depend on a library-provided RMSE function.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# ==========================================
# 1. SETUP & DATA LOADING
# ==========================================
print(f"--- Initializing Data Loading (Mode: {'TEST' if test_req else 'FULL'}) ---")

# Discover "<model>_oof.csv" files under PATH; if PATH yields nothing, fall
# back to scanning every attached Kaggle input.
oof_files = sorted(glob.glob(os.path.join(PATH, "**/*_oof.csv"), recursive=True))
if not oof_files:
    oof_files = sorted(glob.glob("/kaggle/input/**/*_oof.csv", recursive=True))
if not oof_files:
    # Fail here with a clear message instead of an opaque np.stack error later.
    raise FileNotFoundError(
        f"No '*_oof.csv' files found under {PATH!r} or /kaggle/input/."
    )

# Swap only the trailing suffix: str.replace would also rewrite a matching
# substring inside a directory name. Every globbed path ends with "_oof.csv".
oof_suffix = "_oof.csv"
sub_files = [f[:-len(oof_suffix)] + "_sub.csv" for f in oof_files]
model_names = [os.path.basename(f)[:-len(oof_suffix)] for f in oof_files]

# Each OOF file must have a sibling submission file; report all gaps at once.
missing_subs = [f for f in sub_files if not os.path.exists(f)]
if missing_subs:
    raise FileNotFoundError(
        f"Missing submission file(s) for {len(missing_subs)} OOF file(s): {missing_subs}"
    )

train_df = pd.read_csv("/kaggle/input/playground-series-s6e1/train.csv")
y_true = train_df[TARGET].values

# --- Phase 0: Deduplication ---
# Two models count as duplicates when their submission predictions are
# byte-identical; the first one (in sorted file order) is kept.
unique_subs = {}          # md5 of submission predictions -> first model name seen
indices_to_keep = []      # positions (into the file lists) of the kept models
kept_sub_preds = []       # prediction arrays of kept models, reused for stacking

for i, (s_file, name) in enumerate(zip(sub_files, model_names)):
    sub_pred = pd.read_csv(s_file)[TARGET].values
    sub_hash = hashlib.md5(sub_pred.tobytes()).hexdigest()
    if sub_hash in unique_subs:
        print(f" Dropping duplicate: {name}")
        continue
    unique_subs[sub_hash] = name
    indices_to_keep.append(i)
    kept_sub_preds.append(sub_pred)

oof_files = [oof_files[i] for i in indices_to_keep]
sub_files = [sub_files[i] for i in indices_to_keep]
model_names = [model_names[i] for i in indices_to_keep]

# Column j of both matrices belongs to model_names[j].
oofs = np.stack([pd.read_csv(f)[TARGET].values for f in oof_files], axis=1)
# Reuse the arrays already loaded for hashing instead of re-reading each CSV.
subs = np.stack(kept_sub_preds, axis=1)
del kept_sub_preds

# OOF rows are matched to train.csv purely by position, so the row counts
# must agree or every downstream score would be computed on misaligned data.
if oofs.shape[0] != len(train_df):
    raise ValueError(
        f"OOF predictions have {oofs.shape[0]} rows but train.csv has {len(train_df)}."
    )

print(f" Data loaded. Models: {len(model_names)}")

# ==========================================
# 2. PHASE 1: HILL CLIMBING
# ==========================================
# Wrap the stacked prediction matrices so every column carries its model name.
df_oof_indexed = pd.DataFrame(oofs, columns=model_names)
df_sub_indexed = pd.DataFrame(subs, columns=model_names)

# Test mode: coarser weight step and no negative weights.
hc_negative = not test_req
hc_precision = 0.001 if hc_negative else 0.01

# Default to the full data; test mode swaps in a seeded 20% row sample.
hc_train_subset, hc_oof_subset = train_df, df_oof_indexed
if test_req:
    np.random.seed(42)
    sample_idx = np.random.choice(
        len(train_df),
        size=int(len(train_df) * 0.2),
        replace=False,
    )
    hc_train_subset = train_df.iloc[sample_idx].reset_index(drop=True)
    hc_oof_subset = df_oof_indexed.iloc[sample_idx].reset_index(drop=True)
    print(f"Test Mode: Downsampled HC to {len(hc_train_subset)} rows.")

if skip_hillclimb:
    # No blending: the meta-model sees one feature per base model.
    print("\n Skipping Hill Climbing.")
    X_train_hc, X_test_hc = oofs, subs
    y_true_stacking = y_true
    selected_model_names = model_names
else:
    print(f"\n Initiating Hill Climbing (Precision: {hc_precision}, Neg Weights: {hc_negative})")

    hc_test, hc_oof = climb_hill(
        train=hc_train_subset,
        oof_pred_df=hc_oof_subset,
        test_pred_df=df_sub_indexed,
        target=TARGET,
        objective='minimize',
        eval_metric=partial(root_mean_squared_error),
        negative_weights=hc_negative,
        precision=hc_precision,
        plot_hill=True,
        plot_hist=False,
        return_oof_preds=True
    )

    # The blended prediction becomes the single input column of the meta-model.
    X_train_hc = hc_oof.reshape(-1, 1)
    X_test_hc = hc_test.reshape(-1, 1)
    if test_req:
        # Labels must match the rows the hill climber actually saw.
        y_true_stacking = hc_train_subset[TARGET].values
    else:
        y_true_stacking = y_true
    selected_model_names = ['HC_Blended_Feature']

# ==========================================
# 3. PHASE 2: RIDGE CV (STACKING)
# ==========================================
print("\n--- Phase 2: Training RidgeCV Meta-Model ---")

# Test mode uses fewer folds and a smaller, narrower alpha grid.
if test_req:
    kf_splits = 3
    alphas = np.logspace(-2, 4, 15)
else:
    kf_splits = 10
    alphas = np.logspace(-2, 7, 50)

kf = KFold(n_splits=kf_splits, shuffle=True, random_state=42)

# Out-of-fold predictions are filled fold by fold; test predictions are the
# running average of the per-fold models.
oof_final_preds = np.zeros(len(y_true_stacking))
sub_final_preds = np.zeros(X_test_hc.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_hc)):
    X_tr = X_train_hc[train_idx]
    y_tr = y_true_stacking[train_idx]
    X_va = X_train_hc[val_idx]
    y_va = y_true_stacking[val_idx]  # not used below

    # RidgeCV picks alpha from the grid internally on the training fold.
    model = RidgeCV(
        alphas=alphas,
        scoring='neg_root_mean_squared_error'
    ).fit(X_tr, y_tr)

    val_pred = model.predict(X_va)
    test_pred = model.predict(X_test_hc)

    oof_final_preds[val_idx] = val_pred
    sub_final_preds += test_pred / kf_splits

    print(f"Fold {fold+1}/{kf_splits} complete. Alpha: {model.alpha_:.4f}")

# ==========================================
# 4. FINAL PERFORMANCE & EXPORT
# ==========================================
# Score the meta-model on its out-of-fold predictions.
final_rmse = root_mean_squared_error(y_true_stacking, oof_final_preds)

banner = "=" * 35
print("\n" + banner)
print(f" FINAL ENSEMBLE RMSE: {final_rmse:.6f}")
print(banner)

# Clamp test predictions to the target range observed in the training data.
target_col = train_df[TARGET]
final_sub_preds = np.clip(sub_final_preds, target_col.min(), target_col.max())

# Fill the sample submission's target column and write it out; the file name
# embeds the OOF RMSE.
sub_template = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")
sub_template[TARGET] = final_sub_preds

sub_file = f"submission_rmse_{final_rmse:.6f}.csv"
sub_template.to_csv(sub_file, index=False)

print(f" Saved to: {sub_file}")

--- Initializing Data Loading (Mode: FULL) ---
 Dropping duplicate: akira_ensemble_v2
 Dropping duplicate: akira_lgb_v2
 Dropping duplicate: akira_xgb_v3
 Dropping duplicate: dinev_xgb
 Dropping duplicate: omid_resnet_v2
 Dropping duplicate: rafi_xgb
 Data loaded. Models: 30

 Initiating Hill Climbing (Precision: 0.001, Neg Weights: True)
[1m[34m   /\  
  /__\  hillclimbers[0m[1m 
 /    \
/______\ 
[0m
[1m[33mModels to be ensembled | (30 total):[0m 

[1m[32msung:             8.59246 (best solo model)[0m
[1msunghur:          8.59246[0m
[1msunghur_ensemble: 8.59362[0m
[1momid_ensemble:    8.59936[0m
[1mutaazu_tabm:      8.60654[0m
[1mknight_ensemble:  8.60715[0m
[1mspiritmilk:       8.60723[0m
[1mhaha750:          8.60902[0m
[1makira_v2:         8.60917[0m
[1makira_xgb:        8.60919[0m
[1mgodara_xgb:       8.60927[0m
[1mrafi_v2:          8.61053[0m
[1makira_ensemble:   8.61072[0m
[1momid_v2:          8.61131[0m
[1mbhargava_xgb:     8.61161[0m
[1m


--- Phase 2: Training RidgeCV Meta-Model ---
Fold 1/10 complete. Alpha: 71.9686
Fold 2/10 complete. Alpha: 71.9686
Fold 3/10 complete. Alpha: 71.9686
Fold 4/10 complete. Alpha: 71.9686
Fold 5/10 complete. Alpha: 71.9686
Fold 6/10 complete. Alpha: 71.9686
Fold 7/10 complete. Alpha: 71.9686
Fold 8/10 complete. Alpha: 71.9686
Fold 9/10 complete. Alpha: 71.9686
Fold 10/10 complete. Alpha: 71.9686

 FINAL ENSEMBLE RMSE: 8.585401
 Saved to: submission_rmse_8.585401.csv
