In [1]:
import pandas as pd
import numpy as np
import os

# ==========================================
# 1. FILE MAPPING
# ==========================================
# We map your files to the IDs used in the notebook logic
# 1, 2, 3 are the "Base Models"
# 24, 25, 28, 29 are the "High Scoring Models"
# Submission CSVs for the three "Base Models" that are blended into the core.
_BASE_MODEL_FILES = {
    '1': 'submission (4).csv',            # Single XGB
    '2': 'submission (6).csv',            # Bayesian
    '3': 'submission_rmse_8.583118.csv',  # Ridge Stack
}

# Submission CSVs for the four "High Scoring Models" used in the final blend.
_HIGH_SCORING_FILES = {
    '24': 'submission (2).csv',  # hb24g
    '25': 'submission (1).csv',  # hb25g
    '28': 'submission (3).csv',  # hb28g (Your Best)
    '29': 'submission (5).csv',  # hb29g
}

# Model ID -> CSV path. Base models come first, then the high scorers, so
# iteration order is 1, 2, 3, 24, 25, 28, 29.
FILES = {**_BASE_MODEL_FILES, **_HIGH_SCORING_FILES}

def load_data():
    """Read every CSV listed in ``FILES`` into a DataFrame.

    Files are visited in ``FILES`` order. As soon as one path does not
    exist, an error line is printed and the function gives up.

    Returns:
        A dict mapping each model ID to its loaded DataFrame, or ``None``
        if any listed file is missing.
    """
    print(">>> Loading Files...")
    frames = {}
    for model_id in FILES:
        csv_path = FILES[model_id]
        if os.path.exists(csv_path):
            frames[model_id] = pd.read_csv(csv_path)
            continue
        # First missing file aborts the whole load; the caller checks for None.
        print(f"CRITICAL ERROR: Missing {csv_path}")
        return None
    return frames

# ==========================================
# 2. THE RANK-BASED BLEND FUNCTION (The Secret)
# ==========================================
def _rank_weighted_sum(scores, order, base_wts, sub_wts):
    """Return per-row weighted sums for one sort order.

    ``order[r, k]`` is the index of the model that holds rank ``k`` in row
    ``r``. That model's score is multiplied by its own base weight plus the
    adjustment for rank ``k``, and the products are summed across the row.
    """
    ranked_scores = np.take_along_axis(scores, order, axis=1)
    # base_wts[order] follows the model at each rank; sub_wts broadcasts
    # across rows because column k always means "rank k".
    ranked_weights = base_wts[order] + sub_wts
    return (ranked_scores * ranked_weights).sum(axis=1)


def h_blend(df_list, base_wts, sub_wts, asc_weight, desc_weight):
    """Blend model predictions with weights that depend on per-row rank.

    For every row the models' ``exam_score`` values are ranked twice: once
    smallest-to-largest and once largest-to-smallest. In each ranking a
    model's score is multiplied by ``base_wts[model] + sub_wts[rank]`` and
    the products are summed. The two sums are then mixed as
    ``desc_sum * desc_weight + asc_sum * asc_weight``.

    Args:
        df_list: DataFrames, one per model, each with an ``exam_score``
            column. Rows are combined by position, so all frames must have
            the same length and row order.
        base_wts: One fixed weight per model, in ``df_list`` order.
        sub_wts: One weight adjustment per rank position (index 0 is the
            first model in the sort order).
        asc_weight: Multiplier for the ascending-order sum.
        desc_weight: Multiplier for the descending-order sum.

    Returns:
        A 1-D float array with one blended score per row.

    Raises:
        ValueError: If ``base_wts`` or ``sub_wts`` does not contain exactly
            one entry per model.
    """
    # Shape: (n_rows, n_models)
    scores = np.stack([df['exam_score'].values for df in df_list], axis=1)
    n_models = scores.shape[1]

    base = np.asarray(base_wts, dtype=float)
    sub = np.asarray(sub_wts, dtype=float)
    if base.shape != (n_models,) or sub.shape != (n_models,):
        raise ValueError(
            f"h_blend needs one base weight and one sub weight per model "
            f"({n_models} models, got {base.size} base weights and "
            f"{sub.size} sub weights)"
        )

    # Ascending: rank 0 is the lowest prediction in the row.
    asc_preds = _rank_weighted_sum(scores, np.argsort(scores, axis=1), base, sub)
    # Descending: rank 0 is the highest prediction in the row.
    desc_preds = _rank_weighted_sum(scores, np.argsort(-scores, axis=1), base, sub)

    return (desc_preds * desc_weight) + (asc_preds * asc_weight)

def main():
    """Run the three-stage blend and write the submission CSV.

    Stage 1 rank-blends models 1, 2 and 3 into a core prediction. Stage 2
    mixes 3% of that core into each of models 24, 25, 28 and 29. Stage 3
    rank-blends the four adjusted models, clips the result and saves it to
    ``submission_notebook_clone.csv`` using the ids from model '1'.

    Returns early without writing anything if ``load_data`` reports a
    missing file.

    Raises:
        ValueError: If a loaded file has an ``id`` column that differs from
            the one in model '1'.
    """
    dfs = load_data()
    if dfs is None:
        return

    # Every stage combines rows by position and the output reuses the ids
    # of model '1', so a file in a different row order would be blended
    # into the wrong rows without any error. Fail loudly instead.
    ref_ids = dfs['1']['id'].to_numpy()
    for key, df in dfs.items():
        if 'id' in df.columns and not np.array_equal(df['id'].to_numpy(), ref_ids):
            raise ValueError(
                f"Row ids of model '{key}' do not match model '1'; "
                "the files cannot be blended by position"
            )

    # ==========================================
    # STEP 1: CREATE 'df123' (The Core Blend)
    # ==========================================
    print("Running Stage 1: Core Blend (df123)...")
    # Notebook Params for Step 1:
    # Models: 1, 2, 3
    # Weights: 0.33, 0.34, 0.33
    # SubWts: +0.55, -0.20, -0.35 (Boosts the Rank 0 model heavily)
    # Sort Mix: 30% Asc, 70% Desc
    input_dfs = [dfs['1'], dfs['2'], dfs['3']]
    base_wts = [0.33, 0.34, 0.33]
    sub_wts = [0.55, -0.20, -0.35]

    df123_scores = h_blend(input_dfs, base_wts, sub_wts, 0.30, 0.70)

    # ==========================================
    # STEP 2: INJECTION (Linear Blending)
    # ==========================================
    print("Running Stage 2: Injecting Core into Main Models...")
    # Notebook uses 'b2' function: 97% Main, 3% Core
    ratio_main = 0.97
    ratio_core = 0.03

    # Same linear mix for each high-scoring model, kept in the order the
    # final blend's weights expect: 24, 25, 28, 29.
    final_inputs = []
    for key in ('24', '25', '28', '29'):
        mixed = dfs[key]['exam_score'] * ratio_main + df123_scores * ratio_core
        final_inputs.append(pd.DataFrame({'exam_score': mixed}))

    # ==========================================
    # STEP 3: FINAL ENSEMBLE (df1)
    # ==========================================
    print("Running Stage 3: Final Rank Blend...")
    # Notebook Params for Step 3:
    # Models: 4(h4), 5(h5), 8(h8), 9(h9)
    # Base Weights: 0.21, 0.08, 0.23, 0.48
    # SubWts: [-0.25, 0, +0.55, -0.30]
    # Sort Mix: 30% Asc, 70% Desc
    final_base_wts = [0.21, 0.08, 0.23, 0.48]
    final_sub_wts = [-0.25, 0.00, 0.55, -0.30]

    final_scores = h_blend(final_inputs, final_base_wts, final_sub_wts, 0.30, 0.70)

    # ==========================================
    # SAVE
    # ==========================================
    # NOTE(review): clip bounds are taken as-is from the original script;
    # presumably the valid range of the target - confirm.
    final_scores = np.clip(final_scores, 19.6, 100.0)

    output_file = 'submission_notebook_clone.csv'
    submission = pd.DataFrame({'id': dfs['1']['id'], 'exam_score': final_scores})
    submission.to_csv(output_file, index=False)

    print(f"\nSUCCESS! Created '{output_file}'")
    print(f"Mean: {final_scores.mean():.5f}")
    print("NOTE: This script performs the exact Rank-Based Weighting from the notebook.")

# Run the full blend only when this file is executed as a script.
if __name__ == "__main__":
    main()

>>> Loading Files...
Running Stage 1: Core Blend (df123)...
Running Stage 2: Injecting Core into Main Models...
Running Stage 3: Final Rank Blend...

SUCCESS! Created 'submission_notebook_clone.csv'
Mean: 62.51729
NOTE: This script performs the exact Rank-Based Weighting from the notebook.
