In [1]:
import pandas as pd
import numpy as np
import os
import hashlib
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as the square root of scikit-learn's ``mean_squared_error``
    for the same two arguments.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# 1. Load Data
# train.csv supplies the stacking target ('exam_score'); the sample submission
# is kept as the frame that the final predictions are written into.
train = pd.read_csv('train.csv')
y_true = train.loc[:, 'exam_score'].to_numpy()
sample_sub = pd.read_csv('sample_submission.csv')

# 2. Identify Unique Models
def select_unique_models(oof_files, directory='.'):
    """Return the prefixes of models that are usable and not duplicated.

    A model is kept when its ``<prefix>_oof.csv`` has a matching
    ``<prefix>_sub.csv`` in ``directory`` and its out-of-fold predictions
    differ from those of every model kept before it.

    Args:
        oof_files: OOF file names, each ending in ``_oof.csv``, in the order
            they should be scanned. The first of a set of duplicates wins.
        directory: Folder that holds the OOF and submission files.

    Returns:
        List of kept prefixes, in scan order.
    """
    suffix = '_oof.csv'
    kept = []
    seen_hashes = set()

    for oof_name in oof_files:
        # Strip only the trailing suffix; str.replace would also rewrite an
        # '_oof.csv' that happens to occur earlier in the name.
        prefix = oof_name[:-len(suffix)]
        sub_path = os.path.join(directory, f"{prefix}_sub.csv")

        if not os.path.exists(sub_path):
            # Report the drop instead of skipping silently.
            print(f"Skipping {prefix}: no matching {prefix}_sub.csv")
            continue

        oof_data = pd.read_csv(os.path.join(directory, oof_name))['exam_score'].values
        # Create a unique fingerprint for the model predictions. Hash a float64
        # copy so the same values parsed as int64 vs float64 still collide.
        h = hashlib.md5(np.asarray(oof_data, dtype=np.float64).tobytes()).hexdigest()

        if h in seen_hashes:
            print(f"Skipping duplicate: {prefix}")
            continue

        seen_hashes.add(h)
        kept.append(prefix)

    return kept


all_files = os.listdir('.')
oof_files = sorted(f for f in all_files if f.endswith('_oof.csv'))

print(f"Scanning {len(oof_files)} potential models...")

unique_prefixes = select_unique_models(oof_files)

# Load the unique OOFs and SUBS
if not unique_prefixes:
    # np.stack on an empty list fails with an opaque message; say what is missing.
    raise ValueError(
        "No usable models found: need at least one '<prefix>_oof.csv' "
        "with a matching '<prefix>_sub.csv'."
    )

# One column per model. Rows are assumed to follow the row order of train.csv
# (OOF) and sample_submission.csv (SUB); only the row counts are verified here.
X_train = np.stack([pd.read_csv(f"{p}_oof.csv")['exam_score'].values for p in unique_prefixes], axis=1)
X_test = np.stack([pd.read_csv(f"{p}_sub.csv")['exam_score'].values for p in unique_prefixes], axis=1)

# Fail early, with a clear message, if the prediction files do not line up
# with the labels or the submission template.
if X_train.shape[0] != len(y_true):
    raise ValueError(
        f"OOF files have {X_train.shape[0]} rows but train.csv has {len(y_true)}."
    )
if X_test.shape[0] != len(sample_sub):
    raise ValueError(
        f"Submission files have {X_test.shape[0]} rows but "
        f"sample_submission.csv has {len(sample_sub)}."
    )

print(f"Final stacking ensemble size: {X_train.shape[1]} unique models")

# 3. Meta-Learning (Ridge Stacking)
# Candidate regularisation strengths: 50 log-spaced values from 1e-1 to 1e5.
alphas = np.logspace(-1, 5, num=50)
# Shuffled, seeded 10-fold split; RidgeCV uses it to score each alpha.
kf = KFold(n_splits=10, shuffle=True, random_state=2026)

ridge = RidgeCV(alphas=alphas, cv=kf, scoring='neg_root_mean_squared_error').fit(X_train, y_true)

stack_preds = ridge.predict(X_test)
# The scorer is a negated RMSE, so flip the sign before reporting it.
cv_rmse = -ridge.best_score_
print(f"Stacking CV Score: {cv_rmse:.6f}")

# 4. The Elite Blend
# Blend the best known submission with the new stack. The best known file gets
# most of the weight; the stack contributes a small correction.
BEST_SUB_FILE = '8.54853.csv'
# Both weights are literals (not 1 - BEST_WEIGHT) so the blend is exactly 0.95 / 0.05.
BEST_WEIGHT = 0.95
STACK_WEIGHT = 0.05

if os.path.exists(BEST_SUB_FILE):
    print("Applying Elite Blend weights...")
    best_ever = pd.read_csv(BEST_SUB_FILE)['exam_score'].values

    # Without this check, a mismatched but broadcastable array (e.g. a one-row
    # file) would be blended silently by NumPy broadcasting.
    if best_ever.shape != stack_preds.shape:
        raise ValueError(
            f"{BEST_SUB_FILE} has shape {best_ever.shape} but the stack "
            f"predictions have shape {stack_preds.shape}; cannot blend."
        )

    final_preds = (best_ever * BEST_WEIGHT) + (stack_preds * STACK_WEIGHT)
else:
    print("Warning: 8.54853.csv not found. Using stack only.")
    final_preds = stack_preds

# 5. Final Constraints & Export
# Bounds for the predictions; the original author describes them as the known
# range of the target variable.
lower_bound, upper_bound = 19.6, 100.0
final_preds = np.clip(final_preds, lower_bound, upper_bound)

# Write the clipped predictions into the sample-submission frame and save it.
out_path = 'submission_elite_improvement.csv'
sample_sub['exam_score'] = final_preds
sample_sub.to_csv(out_path, index=False)
print(f"Created '{out_path}'. Submit this to beat 8.54853.")

Scanning 40 potential models...
Skipping duplicate: akira_ensemble_v2
Skipping duplicate: akira_lgb_v2
Skipping duplicate: akira_xgb_v3
Skipping duplicate: dinev_xgb
Skipping duplicate: omid_resnet_v2
Skipping duplicate: rafi_xgb
Final stacking ensemble size: 34 unique models
Stacking CV Score: 8.585170
Applying Elite Blend weights...
Created 'submission_elite_improvement.csv'. Submit this to beat 8.54853.
