In [1]:
import pandas as pd
import numpy as np
import os
import hashlib
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
# Progress marker: confirms that the imports above completed.
print("Loaded")

# ==========================================
# CONFIGURATION
# ==========================================
# Name of the prediction column. It is read from train.csv, from every
# OOF / submission CSV, and written to the final output file.
TARGET = 'exam_score'

# Previously produced "elite" submission files to blend.
# Order matters: main() assigns blend weights by position, with the first
# available file weighted most heavily. Files missing on disk are skipped.
# NOTE(review): the file names look like the scores those submissions
# achieved (lower = better) -- confirm.
ELITE_SUBS = [
    '8.54853.csv',  # The Anchor (Best)
    '8.54881.csv',  # The Validator (2nd Best)
    '8.54905.csv'   # The Validator (3rd Best)
]
# Progress marker: confirms that the configuration above was evaluated.
print("Csv")

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def load_and_deduplicate():
    """Find OOF/submission file pairs in the working directory and drop duplicates.

    A model is identified by a prefix ``P`` for which both ``P_oof.csv`` and
    ``P_sub.csv`` exist in the current directory.  Two models are treated as
    duplicates when the raw bytes of their OOF ``TARGET`` columns are
    identical (compared through an MD5 digest), so values that differ in
    dtype or by any rounding are considered distinct.  Of a group of
    duplicates, the first file in sorted name order is kept.

    OOF files that cannot be read (or lack the ``TARGET`` column) are
    reported and skipped rather than aborting the scan.

    Returns:
        list[str]: Prefixes of the unique models, in sorted file-name order.
    """
    oof_suffix = '_oof.csv'
    oof_files = sorted(f for f in os.listdir('.') if f.endswith(oof_suffix))

    unique_prefixes = []
    first_seen = {}  # OOF digest -> prefix of the first model that produced it

    print(f"Scanning {len(oof_files)} community models...")

    for f in oof_files:
        # Strip only the trailing suffix; str.replace would also remove any
        # earlier occurrence of '_oof.csv' inside the name.
        prefix = f[:-len(oof_suffix)]
        sub_file = f"{prefix}_sub.csv"

        if not os.path.exists(sub_file):
            print(f"Skipping {f}: no matching {sub_file}")
            continue

        # Fingerprint the OOF predictions; keep the scan best-effort.
        try:
            oof_data = pd.read_csv(f)[TARGET].values
            file_hash = hashlib.md5(oof_data.tobytes()).hexdigest()
        except Exception as e:
            print(f"Error reading {f}: {e}")
            continue

        if file_hash in first_seen:
            print(f"Skipping {f}: duplicate of {first_seen[file_hash]}{oof_suffix}")
            continue

        first_seen[file_hash] = prefix
        unique_prefixes.append(prefix)

    print(f"Found {len(unique_prefixes)} unique models out of {len(oof_files)}.")
    return unique_prefixes

# Progress marker: confirms that the helper definitions above were executed.
print("dupes")


def _blend_elites(elite_preds):
    """Return the position-weighted average of the elite prediction arrays.

    ``elite_preds`` must be non-empty and ordered from most to least trusted.
    Weights are 100% for one array, 80/20 for two, and 70/20/10 for three;
    any arrays beyond the third are ignored.
    """
    if len(elite_preds) == 1:
        return elite_preds[0]

    weights = (0.70, 0.20, 0.10) if len(elite_preds) > 2 else (0.80, 0.20)

    blended = elite_preds[0] * weights[0]
    # zip() stops at the shorter sequence, so at most three arrays are used.
    for preds, weight in zip(elite_preds[1:], weights[1:]):
        blended = blended + (preds * weight)
    return blended


def main():
    """Build the blended submission and write it to 'submission_titan_blend.csv'.

    Steps:
      1. Read the ground truth (``TARGET`` column of 'train.csv').
      2. Fit a ``RidgeCV`` stack on the deduplicated OOF predictions and
         predict on the matching submission files.
      3. Load whichever ``ELITE_SUBS`` files exist on disk.
      4. Blend: 95% position-weighted elite average + 5% Ridge stack.
      5. Clip the result to [19.6, 100.0].
      6. Write the predictions into the 'sample_submission.csv' frame.

    Prints an error and returns without writing anything when no community
    models or no elite files are available.

    Raises:
        ValueError: If an elite file's prediction array does not have the
            same shape as the stack predictions.
    """
    # 1. Load Ground Truth
    print("Loading Train Data...")
    train = pd.read_csv('train.csv')
    y_true = train[TARGET].values

    # 2. Generate Community Stack (RidgeCV)
    # This provides a 'consensus' baseline from the community models.
    prefixes = load_and_deduplicate()
    if not prefixes:
        # np.stack() on an empty list would fail with an opaque error.
        print("CRITICAL ERROR: No community models found. Cannot build the stack.")
        return

    print("Building Community Stack...")
    X_train = np.stack(
        [pd.read_csv(f"{p}_oof.csv")[TARGET].values for p in prefixes], axis=1
    )
    X_test = np.stack(
        [pd.read_csv(f"{p}_sub.csv")[TARGET].values for p in prefixes], axis=1
    )

    # Ridge with a 10-fold CV search over 100 log-spaced alphas (1e-2 .. 1e5).
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    alphas = np.logspace(-2, 5, 100)

    ridge = RidgeCV(alphas=alphas, scoring='neg_root_mean_squared_error', cv=kf)
    ridge.fit(X_train, y_true)

    stack_preds = ridge.predict(X_test)
    # The scorer is negated RMSE, so flip the sign for display.
    print(f"Community Ridge Score (CV): {-ridge.best_score_:.6f}")

    # 3. Load Elite Submissions
    print("\nLoading Elite Submissions...")
    elite_preds = []

    for f in ELITE_SUBS:
        if not os.path.exists(f):
            print(f"Warning: {f} not found. Skipping.")
            continue

        print(f"Loaded: {f}")
        preds = pd.read_csv(f)[TARGET].values
        # Guard against silent NumPy broadcasting (e.g. a one-row file) and
        # against a confusing shape error later in the blend.
        if preds.shape != stack_preds.shape:
            raise ValueError(
                f"{f} has prediction shape {preds.shape}, "
                f"expected {stack_preds.shape} to match the stack predictions."
            )
        elite_preds.append(preds)

    if not elite_preds:
        print("CRITICAL ERROR: No Elite files found. Cannot perform blending.")
        return

    # 4. The "Titan" Blend Logic
    # Part A: elite average, weighted towards the first available file.
    # NOTE: weights are positional over the files that were actually found,
    # so if the first ELITE_SUBS entry is missing the next one takes its weight.
    elite_ensemble = _blend_elites(elite_preds)

    # Part B: Final Mix with Community Stack
    # 95% elite ensemble, 5% community stack for a small dose of diversity.
    final_preds = (elite_ensemble * 0.95) + (stack_preds * 0.05)

    # 5. Post-Processing (Clipping)
    # Bounds are hard-coded; presumably the plausible range of the target --
    # TODO confirm against the training data.
    final_preds = np.clip(final_preds, 19.6, 100.0)

    # 6. Export
    sub = pd.read_csv('sample_submission.csv')
    sub[TARGET] = final_preds
    output_filename = 'submission_titan_blend.csv'
    sub.to_csv(output_filename, index=False)

    print(f"\nSUCCESS! Created {output_filename}")
    print("This file blends your top scores with a deduplicated Ridge stack.")

# Run the blending pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded
Csv
dupes
Loading Train Data...
Scanning 40 community models...
Found 34 unique models out of 40.
Building Community Stack...
Community Ridge Score (CV): 8.585271

Loading Elite Submissions...
Loaded: 8.54853.csv
Loaded: 8.54881.csv
Loaded: 8.54905.csv

SUCCESS! Created submission_titan_blend.csv
This file blends your top scores with a deduplicated Ridge stack.
