In [1]:
import pandas as pd
import numpy as np
import os

# ==========================================
# 1. PRECISE FILE MAPPING
# ==========================================
# Short model key -> submission CSV holding that model's predictions.
# load_data() walks this mapping in insertion order.
FILES = {
    # --- Core models: blended into the "correction signal" ---
    # pseudo-label LR
    "0": "submission (7).csv",
    # Bayesian
    "1": "submission (6).csv",
    # single XGB
    "2": "submission (4).csv",
    # Ridge stack (may be absent; load_data() has a fallback for this key)
    "3": "submission_rmse_8.583118.csv",

    # --- Target models: the predictions that receive the correction ---
    # hb24g
    "24": "submission (2).csv",
    # hb25g
    "25": "submission (1).csv",
    # hb28g
    "28": "submission (3).csv",
    # hb29g
    "29": "submission (5).csv",
}

# ==========================================
# 2. HELPER FUNCTIONS
# ==========================================
def load_data(files=None):
    """Read every submission CSV into a DataFrame, keyed by model id.

    Args:
        files: Optional mapping of model key -> CSV path. When omitted, the
            module-level ``FILES`` mapping is used.

    Returns:
        A dict of key -> DataFrame in the mapping's iteration order, or
        ``None`` when a required file is missing (a "CRITICAL ERROR" line is
        printed in that case so the caller can simply bail out).

    Only key '3' (the Ridge stack) has a fallback: if its file is missing,
    the file registered under key '28' is loaded in its place. This is a
    risky proxy, so a warning is printed. If that fallback is missing as
    well, the load fails like any other missing file instead of raising
    from ``pd.read_csv``.
    """
    if files is None:
        files = FILES

    dfs = {}
    print(">>> Loading CSVs...")
    for key, path in files.items():
        if not os.path.exists(path):
            # Resolve the substitute from the mapping rather than repeating
            # its file name, and make sure it is really there before use.
            fallback = files.get('28') if key == '3' else None
            if fallback is None or not os.path.exists(fallback):
                print(f"CRITICAL ERROR: {path} missing!")
                return None
            print(f"WARNING: {path} missing. Using {fallback} as fallback.")
            path = fallback
        dfs[key] = pd.read_csv(path)
    return dfs

def _rank_weighted_sum(scores, order, base, sub):
    """Sum each row's scores, weighting a model by its base weight plus the
    weight of the rank it holds in that row (``order`` lists model indices
    per row from rank 0 upwards)."""
    ranked_scores = np.take_along_axis(scores, order, axis=1)
    # base[order] picks each ranked model's own weight; sub broadcasts by rank.
    return (ranked_scores * (base[order] + sub)).sum(axis=1)


def h_blend_precise(dfs_list, base_wts, sub_wts, asc_weight=0.30, desc_weight=0.70):
    """Rank-aware weighted blend of several models' ``exam_score`` columns.

    For every row the models are ranked by their prediction. A model's
    weight in that row is ``base_wts[model] + sub_wts[rank]``. The blend is
    computed twice, once with rank 0 = highest prediction (descending) and
    once with rank 0 = lowest prediction (ascending), and the two results
    are mixed as ``desc * desc_weight + asc * asc_weight``.

    Args:
        dfs_list: Sequence of DataFrames, each with an ``exam_score`` column
            of the same length. Rows are matched by position.
        base_wts: Per-model weights, indexed like ``dfs_list``.
        sub_wts: Per-rank weight adjustments (index 0 = first rank).
        asc_weight: Share of the ascending-rank blend in the result.
        desc_weight: Share of the descending-rank blend in the result.

    Returns:
        1-D ``numpy.ndarray`` with one blended prediction per row (empty when
        ``dfs_list`` is empty).

    Raises:
        IndexError: If ``base_wts`` or ``sub_wts`` has fewer entries than
            there are models. Extra entries are ignored.
    """
    n_models = len(dfs_list)
    if n_models == 0:
        return np.zeros(0)
    if len(base_wts) < n_models or len(sub_wts) < n_models:
        raise IndexError(
            f"need at least {n_models} base and sub weights, "
            f"got {len(base_wts)} and {len(sub_wts)}"
        )

    # One column per model, one row per sample.
    scores = np.column_stack(
        [np.asarray(df['exam_score'], dtype=float) for df in dfs_list]
    )
    base = np.asarray(base_wts, dtype=float)[:n_models]
    sub = np.asarray(sub_wts, dtype=float)[:n_models]

    # argsort sorts ascending, so negate the scores for the descending ranks.
    # Ties need no special care: tied models contribute the same score, so
    # swapping their ranks cannot change the row sum.
    desc_preds = _rank_weighted_sum(scores, np.argsort(-scores, axis=1), base, sub)
    asc_preds = _rank_weighted_sum(scores, np.argsort(scores, axis=1), base, sub)

    return (desc_preds * desc_weight) + (asc_preds * asc_weight)

# ==========================================
# 3. THE 3-STAGE PIPELINE
# ==========================================
# Per-stage settings: (stage number, share kept from each target model,
# share injected from the core blend, core base weights, core rank weights).
# Both shares are stored as literals so the arithmetic matches exactly.
_STAGES = (
    (1, 0.96, 0.04, [0.32, 0.32, 0.18, 0.18], [+0.55, -0.10, -0.20, -0.25]),
    (2, 0.89, 0.11, [0.33, 0.33, 0.17, 0.17], [+0.11, -0.01, -0.03, -0.07]),
    (3, 0.82, 0.18, [0.345, 0.345, 0.155, 0.155], [+0.55, -0.10, -0.20, -0.25]),
)

# Keys (as used by load_data) of the models feeding the core blend and of
# the models that receive the injection.
_CORE_KEYS = ('0', '1', '2', '3')
_TARGET_KEYS = ('24', '25', '28', '29')

# Weights of the final rank blend over the injected target models; the same
# in every stage.
_TARGET_BASE_WTS = [0.21, 0.08, 0.23, 0.48]
_TARGET_SUB_WTS = [-0.25, 0, 0.55, -0.30]


def _run_stage(dfs, keep_share, core_share, core_base_wts, core_sub_wts):
    """Blend the core models, mix that blend into every target model, then
    rank-blend the adjusted targets into one prediction array."""
    core = h_blend_precise([dfs[k] for k in _CORE_KEYS], core_base_wts, core_sub_wts)
    injected = [
        pd.DataFrame({'exam_score': dfs[k]['exam_score'] * keep_share + core * core_share})
        for k in _TARGET_KEYS
    ]
    return h_blend_precise(injected, _TARGET_BASE_WTS, _TARGET_SUB_WTS)


def _apply_trend(es1, es2, es3, ct1, ct2):
    """Scale es3 up by ct1 where es1 < es2 < es3, down by ct2 where
    es1 > es2 > es3, and leave it untouched everywhere else."""
    rising = (es1 < es2) & (es2 < es3)
    falling = (es1 > es2) & (es2 > es3)
    return np.where(rising, es3 * ct1, np.where(falling, es3 / ct2, es3))


def main():
    """Run the three injection stages, apply the trend adjustment and write
    the result to 'submission_trend_854381.csv'.

    Returns early (after load_data has reported the problem) when an input
    file is missing. Rows of all inputs are combined by position, and the
    'id' column of the output is taken from model '0'.
    """
    dfs = load_data()
    if dfs is None:
        return

    print(">>> STARTING 8.54381 REPLICATION...")

    # Each stage injects a larger share of the core blend (4%, 11%, 18%).
    stage_preds = []
    for number, keep_share, core_share, core_base_wts, core_sub_wts in _STAGES:
        print(f"Computing Stage {number} (vs=[{keep_share}, {core_share}])...")
        stage_preds.append(
            _run_stage(dfs, keep_share, core_share, core_base_wts, core_sub_wts)
        )
    es1, es2, es3 = stage_preds

    # Final trend ensemble: the stage-3 value is the baseline; a strictly
    # monotonic move across the three stages nudges it further that way.
    print("Applying Trend Logic (ct=1.00117)...")
    ct1 = 1.00117
    ct2 = 1.00117
    final_preds = _apply_trend(es1, es2, es3, ct1, ct2)

    # Clip to [19.6, 100.0] before saving.
    # NOTE(review): presumably the valid exam_score range - confirm.
    final_preds = np.clip(final_preds, 19.6, 100.0)

    output_file = 'submission_trend_854381.csv'
    submission = pd.DataFrame({'id': dfs['0']['id'], 'exam_score': final_preds})
    submission.to_csv(output_file, index=False)

    print(f"\nSUCCESS! Created '{output_file}'")
    print(f"Mean: {final_preds.mean():.6f}")
    print("Strategy: 4-Core Model, 3-Stage Injection, Trend-Based Post-Processing.")


if __name__ == "__main__":
    main()

>>> Loading CSVs...
>>> STARTING 8.54381 REPLICATION...
Computing Stage 1 (vs=[0.96, 0.04])...
Computing Stage 2 (vs=[0.89, 0.11])...
Computing Stage 3 (vs=[0.82, 0.18])...
Applying Trend Logic (ct=1.00117)...

SUCCESS! Created 'submission_trend_854381.csv'
Mean: 62.549564
Strategy: 4-Core Model, 3-Stage Injection, Trend-Based Post-Processing.
