In [1]:
import pandas as pd
import numpy as np
import os
import hashlib
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
print("Loaded")

# 1. SETUP AND DATA LOADING
# Column that holds the value being predicted; the same column name is read
# from every prediction CSV and written to the final submission.
TARGET = 'exam_score'
# Submission files that make up the "elite" group averaged into the final
# blend. NOTE(review): the file names look like leaderboard scores -- confirm.
ELITE_FILES = [
    '8.54853.csv',
    '8.54881.csv',
    '8.54905.csv',
    '8.54914.csv',
]
print("Files")

def get_unique_models(directory='.', target=None):
    """Return the prefixes of base models whose OOF predictions are unique.

    A model is a pair of files ``<prefix>_oof.csv`` and ``<prefix>_sub.csv``
    inside *directory*. Models are visited in sorted file-name order; a model
    is skipped when its ``_sub.csv`` partner is missing or when the bytes of
    its OOF target column match a model that was already accepted.

    Args:
        directory: Folder to scan. Defaults to the current working directory.
        target: Name of the prediction column to fingerprint. Defaults to the
            module-level ``TARGET``.

    Returns:
        list[str]: Bare prefixes (no directory component), in sorted order.

    Raises:
        KeyError: If an OOF file lacks the *target* column.
    """
    if target is None:
        target = TARGET

    suffix = '_oof.csv'
    unique_prefixes = []
    seen_hashes = set()

    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(suffix):
            continue
        # Slice off only the trailing suffix. str.replace() would also remove
        # an earlier '_oof.csv' inside the name and yield the wrong prefix.
        prefix = file_name[:-len(suffix)]
        if not os.path.exists(os.path.join(directory, f"{prefix}_sub.csv")):
            continue

        data = pd.read_csv(os.path.join(directory, file_name))[target].values
        # MD5 serves purely as a cheap content fingerprint for de-duplication,
        # not for anything security related.
        digest = hashlib.md5(data.tobytes()).hexdigest()
        if digest not in seen_hashes:
            seen_hashes.add(digest)
            unique_prefixes.append(prefix)
    return unique_prefixes
print("OS")

# 2. GENERATE A BASE STACK
# Ground-truth targets that the stacking model is fitted against.
train = pd.read_csv('train.csv')
y_true = train[TARGET].values
prefixes = get_unique_models()
print("start")

# np.stack fails with an opaque "need at least one array to stack" when given
# an empty list, so stop here with a message that names the real problem.
if not prefixes:
    raise ValueError(
        "No '<prefix>_oof.csv' / '<prefix>_sub.csv' model pairs were found "
        "in the working directory; there is nothing to stack."
    )

# One column per base model. X_train is fitted against y_true, so every
# *_oof.csv must have its rows in the same order as train.csv.
# NOTE(review): "oof" presumably means out-of-fold predictions -- confirm.
X_train = np.stack([pd.read_csv(f"{p}_oof.csv")[TARGET].values for p in prefixes], axis=1)
X_test = np.stack([pd.read_csv(f"{p}_sub.csv")[TARGET].values for p in prefixes], axis=1)
print("stack")

# Robust Ridge Stacking
# RidgeCV chooses the regularisation strength from 50 log-spaced candidates
# between 1e-1 and 1e5, scored by RMSE over a seeded, shuffled 10-fold split.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
ridge = RidgeCV(alphas=np.logspace(-1, 5, 50), scoring='neg_root_mean_squared_error', cv=kf)
ridge.fit(X_train, y_true)
stack_preds = ridge.predict(X_test)
print("Ridge")

# 3. THE POWER BLEND (The Breakthrough Step)
# The single file that dominates the blend, plus the weight given to each of
# the three components (best file, elite mean, ridge stack).
BEST_FILE = '8.54853.csv'
WEIGHT_BEST, WEIGHT_ELITE, WEIGHT_STACK = 0.85, 0.10, 0.05

# Read every elite file exactly once; the best file's array is reused below
# rather than loading the same CSV a second time.
elite_arrays = [pd.read_csv(f)[TARGET].values for f in ELITE_FILES]
if BEST_FILE in ELITE_FILES:
    best_file = elite_arrays[ELITE_FILES.index(BEST_FILE)]
else:
    best_file = pd.read_csv(BEST_FILE)[TARGET].values

# All components are combined element by element, so they must share a shape.
# Checking explicitly stops a length-1 array from being broadcast silently.
component_shapes = {arr.shape for arr in elite_arrays}
component_shapes.update({best_file.shape, stack_preds.shape})
if len(component_shapes) != 1:
    raise ValueError(
        f"Blend inputs have mismatched shapes: {sorted(component_shapes)}"
    )

elite_mean = np.mean(elite_arrays, axis=0)
print("Blend")

# Power Blending logic:
# The result is dominated by the best file (85%) and nudged by the mean of
# the elite files (10%) and the ridge stack (5%).
final_preds = (best_file * WEIGHT_BEST) + (elite_mean * WEIGHT_ELITE) + (stack_preds * WEIGHT_STACK)

# 4. POST-PROCESSING
# Hard-coded clipping bounds. NOTE(review): the original note says these
# reflect the target range observed in training -- confirm against train.csv.
CLIP_MIN, CLIP_MAX = 19.6, 100.0

# np.clip leaves NaN untouched, so a missing value in any input CSV would be
# written to the submission unnoticed. Fail loudly instead.
if not np.all(np.isfinite(final_preds)):
    raise ValueError(
        "Blended predictions contain NaN or infinite values; "
        "check the input prediction files."
    )
final_preds = np.clip(final_preds, CLIP_MIN, CLIP_MAX)

# 5. CREATE SUBMISSION
# Predictions are assigned by position, so final_preds must be in the same
# row order as sample_submission.csv.
sub = pd.read_csv('sample_submission.csv')
sub[TARGET] = final_preds
sub.to_csv('submission_final_push.csv', index=False)

print("Created 'submission_final_push.csv'.")
print("Blend Components: 85% Best, 10% Elite Mean, 5% Ridge Stack.")

Loaded
Files
OS
start
stack
Ridge
Blend
Created 'submission_final_push.csv'.
Blend Components: 85% Best, 10% Elite Mean, 5% Ridge Stack.
