#### Filtering out the NaN seeds of Gaussian Scoring on WEC

In [None]:
import os
import time
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from utils import seed_everything, DEVICE
from data_utils import *  
from methods import * 


# Runtime setup.
# Ignore RuntimeWarning instances raised from modules matching "threadpoolctl".
warnings.filterwarnings(
    action="ignore",
    category=RuntimeWarning,
    module="threadpoolctl",
)
# NOTE(review): presumably lets duplicate OpenMP runtimes coexist instead of
# aborting the process -- confirm this is still needed on the target machine.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

# Data split
def rcp_protocol_split(X, Y, cal_size=0.2, seed=42):
    """Partition ``(X, Y)`` into training, calibration and test subsets.

    The calibration subset receives ``int(len(X) * cal_size)`` samples. The
    remaining samples are then divided with ``test_size=0.25``, i.e. three
    quarters for training and one quarter for testing. Both splits use
    ``random_state=seed``.

    Args:
        X: Features, indexable by ``train_test_split`` and supporting ``len``.
        Y: Targets aligned with ``X``.
        cal_size: Fraction of all samples reserved for calibration.
        seed: Random state forwarded to both ``train_test_split`` calls.

    Returns:
        The tuple ``(X_tr, Y_tr, X_cal, Y_cal, X_te, Y_te)``.
    """
    # An integer test_size makes train_test_split take an absolute count.
    calibration_count = int(cal_size * len(X))

    first_cut = train_test_split(
        X, Y, test_size=calibration_count, random_state=seed
    )
    X_rest, X_cal, Y_rest, Y_cal = first_cut

    second_cut = train_test_split(
        X_rest, Y_rest, test_size=0.25, random_state=seed
    )
    X_tr, X_te, Y_tr, Y_te = second_cut

    return X_tr, Y_tr, X_cal, Y_cal, X_te, Y_te


# Main function
def run_benchmark_suite():
    """Run every registered method on every registered dataset and summarize.

    For each dataset the data is re-split for ``n_seeds`` seeds using
    ``rcp_protocol_split`` (split seed ``42 + seed``). Every method in the
    registry is run on each split and its returned metric dict is collected.
    The raw per-seed results are saved with ``torch.save`` to
    ``results_gs_wec_supp_norobust.pkl``; a "mean ± std" table is printed and
    written to ``./results/{ds_name}_results_supp.csv``.

    Errors raised while loading a dataset or running a method are printed and
    skipped, so one failure does not abort the remaining runs.
    """
    seed_everything(42)
    print(f"Using Device: {DEVICE}")

    # Dataset registry
    dataset_loaders = {
        "WEC": load_wec,
    }

    alpha = 0.1
    n_seeds = 20
    # Metrics expected in each per-seed result dict.
    metric_keys = ['Cov', 'Size', 'WSC', 'MSCE_10', 'MSCE_30', "L1-ERT", "L2-ERT"]

    # Method registry (uncomment entries to include them in the run)
    methods = [
        # ('Split', run_split),
        # ('PLCP-Pin-G20', lambda *a: run_plcp(*a, n_groups=20, score_type='pinball')),
        # ('PLCP-Pin-G50', lambda *a: run_plcp(*a, n_groups=50, score_type='pinball')),
        ('Gaussian-Scoring', run_gaussian_scoring),
        # ('CQR-Pinball', lambda *a: run_cqr(*a, 'pinball')),
        # ('CQR-ALD', lambda *a: run_cqr(*a, 'ald')),
        # ('RCP-Pinball', run_rcp),
        # ('RCP-ALD', lambda *a: run_rcp(*a, 'ald')),
        # ('RCP-MultiHead', run_rcp_multi_head),

        # # Colorful Pinball Variants (CPCP)
        # ('CPCP-Split-0.02', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.02, mode='vanilla', **k)),
        # ('CPCP-Clip-0.02', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.02, mode='clip', clip_max=5.0, **k)),
        # ('CPCP-Mix-0.02', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.02, mode='mix', mix_ratio=0.5, **k)),
        # ('CPCP-Clip+Mix-0.02', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.02, mode='clip', clip_max=5.0, mix_ratio=0.5, **k)),
        # ('CPCP-Split-0.01', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.01, mode='vanilla', **k)),
        # ('CPCP-Clip+Mix-0.01', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.01, mode='clip', clip_max=5.0, mix_ratio=0.5, **k)),
        # ('CPCP-Split-0.05', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.05, mode='vanilla', **k)),
        # ('CPCP-Clip+Mix-0.05', lambda *a, **k: run_rcp_density_improved(*a, epsilon=0.05, mode='clip', clip_max=5.0, mix_ratio=0.5, **k)),
    ]

    for ds_name, loader in dataset_loaders.items():
        print(f"\n>>>>>> Running {ds_name} <<<<<<")
        try:
            X, Y = loader()
            print(f"Data Shape: {X.shape}, {Y.shape}")
        except Exception as e:
            # Best effort: report the failure and move on to the next dataset.
            print(f"Error loading {ds_name}: {e}")
            continue

        # One list of per-seed metric dicts per method name.
        results = {name: [] for name, _ in methods}

        total_start_time = time.time()

        for seed in range(n_seeds):
            seed_start_time = time.time()
            print(f"Seed {seed}...", end="", flush=True)
            X_tr, Y_tr, X_cal, Y_cal, X_te, Y_te = rcp_protocol_split(X, Y, seed=42+seed)

            for name, func in methods:
                try:
                    # CPCP / RCP-Density methods additionally receive the
                    # dataset name and the raw seed index.
                    if 'CPCP' in name or 'RCP-Density' in name:
                        res = func(X_tr, Y_tr, X_cal, Y_cal, X_te, Y_te, alpha, dataset_name=ds_name, seed=seed)
                    else:
                        res = func(X_tr, Y_tr, X_cal, Y_cal, X_te, Y_te, alpha)
                    results[name].append(res)
                except Exception as e:
                    # Best effort: a failing method loses this seed only.
                    print(f" Err({name}:{e})", end="")

            seed_duration = time.time() - seed_start_time
            total_elapsed = time.time() - total_start_time
            print(f" Done (Seed Time: {seed_duration/60:.2f}m | Total Time: {total_elapsed/60:.2f}m)")

        # Summary & Save
        # NOTE(review): `torch` is not imported explicitly in this file; it is
        # presumably re-exported by one of the star imports -- confirm.
        # NOTE(review): the file name is fixed, so registering a second dataset
        # would overwrite this pickle on each outer-loop iteration.
        torch.save(results, "results_gs_wec_supp_norobust.pkl")
        summary_rows = []
        for name, mets in results.items():
            if not mets:
                continue
            row = {'Method': name}
            for k in metric_keys:
                # Skip results lacking this metric so one incomplete result
                # dict cannot crash the summary after a long run.
                vals = [m[k] for m in mets if k in m]
                if vals:
                    row[k] = f"{np.mean(vals):.4f} ± {np.std(vals):.4f}"
                else:
                    row[k] = "N/A"

            summary_rows.append(row)

        print("\nSummary:")
        df_res = pd.DataFrame(summary_rows)
        print(df_res)

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs("./results", exist_ok=True)
        df_res.to_csv(f"./results/{ds_name}_results_supp.csv")





In [None]:
# Run the benchmark; this saves results_gs_wec_supp_norobust.pkl, which the
# next cell reloads.
run_benchmark_suite()

In [None]:
# Reload the per-seed results written by run_benchmark_suite().
# NOTE(review): weights_only=False allows arbitrary unpickling; only load
# files produced by this notebook.
gs_results_dict = torch.load("results_gs_wec_supp_norobust.pkl", weights_only=False)
summary_rows = []

# Column layout of the summary table.
cols_order = ['Method', 'Cov', 'Size', 'WSC', 'MSCE_10', 'MSCE_30', 'L1-ERT', 'L2-ERT']

# Iterate through each method in the dictionary
for name, mets in gs_results_dict.items():
    if not mets:
        continue

    # Keep only seeds whose 'Size' exists and is not NaN; NaN sizes are
    # treated as anomalous runs.
    valid_seeds = [m for m in mets if 'Size' in m and not np.isnan(m['Size'])]

    # Report how many seeds were removed
    print(f"Method: {name} | Total: {len(mets)} | Valid: {len(valid_seeds)} | Removed: {len(mets) - len(valid_seeds)}")

    if not valid_seeds:
        print(f"Warning: No valid seeds found for {name}")
        continue

    # Prepare a row for the summary dataframe
    row = {'Method': name}

    # List of metrics to process
    metrics_list = ['Cov', 'Size', 'WSC', 'MSCE_10', 'MSCE_30', 'L1-ERT', 'L2-ERT']

    for k in metrics_list:
        # Extract values for the current metric from valid seeds
        vals = [m[k] for m in valid_seeds if k in m]

        if vals:
            # Format as "mean ± std" (4 decimal places)
            mean_val = np.mean(vals)
            std_val = np.std(vals)
            row[k] = f"{mean_val:.4f} ± {std_val:.4f}"
        else:
            row[k] = "N/A"

    summary_rows.append(row)

print("\nSummary Table (Filtered):")
# Passing columns= fixes the column order and, unlike indexing with
# df_res[cols_order] afterwards, also works when summary_rows is empty
# (no method had a valid seed).
df_res = pd.DataFrame(summary_rows, columns=cols_order)

# Display the dataframe
display(df_res)

Method: Gaussian-Scoring | Total: 20 | Valid: 19 | Removed: 1

Summary Table (Filtered):


Unnamed: 0,Method,Cov,Size,WSC,MSCE_10,MSCE_30,L1-ERT,L2-ERT
0,Gaussian-Scoring,0.9006 ± 0.0032,1.6395 ± 0.2775,0.8259 ± 0.0347,0.0057 ± 0.0026,0.0082 ± 0.0033,0.0691 ± 0.0120,0.0079 ± 0.0035


In [2]:
# Persist the filtered summary. Create the output directory first so this
# cell also works when only the reload cell was run and "results" does not
# exist yet.
os.makedirs("results", exist_ok=True)
df_res.to_csv("results/WEC_gs_supp.csv")