In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re

In [2]:
def normalize_setup_name(setup):
    """Return ``setup`` with every ``_sd<digits>`` seed tag stripped out.

    For example ``'model_sd2021_ftS'`` becomes ``'model_ftS'``. The leading
    underscore of the tag is removed together with the digits, so runs that
    differ only by seed collapse onto the same name.
    """
    seed_tag = re.compile(r'_sd\d+')
    return seed_tag.sub('', setup)

def parse_result_file(filepath):
    """Parse one result file into ``{setup: [{'mse': float, 'mae': float}, ...]}``.

    Blank lines are ignored. The remaining lines are consumed in pairs: a
    setup-name line followed by a metrics line. The metrics line is split on
    commas, and the value after the first ``:`` of the first field is read as
    MSE and that of the second field as MAE. Setup names are passed through
    ``normalize_setup_name`` so runs differing only by seed share one key.

    Args:
        filepath: Path of the text file to read.

    Returns:
        dict mapping each normalized setup name to the list of metric dicts
        found for it, in file order.

    Raises:
        ValueError: If the file ends with a setup line that has no metrics
            line, or if a metrics line cannot be parsed. The message names
            the file and the offending line.
    """
    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    # A dangling final line means the last record was only partially written;
    # fail with a message that points at the file instead of a bare IndexError.
    if len(lines) % 2 != 0:
        raise ValueError(
            f"{filepath}: setup line {lines[-1]!r} has no metrics line "
            f"(file appears truncated)"
        )

    results = {}
    for setup_raw, metrics_line in zip(lines[0::2], lines[1::2]):
        setup = normalize_setup_name(setup_raw)

        parts = metrics_line.split(',')
        try:
            mse = float(parts[0].split(':')[1].strip())
            mae = float(parts[1].split(':')[1].strip())
        except (IndexError, ValueError) as exc:
            raise ValueError(
                f"{filepath}: cannot parse metrics line {metrics_line!r} "
                f"for setup {setup_raw!r}"
            ) from exc

        results.setdefault(setup, []).append({'mse': mse, 'mae': mae})

    return results

# def collect_all_results(folder_path):
#     all_results = {}  # {setup: {seed1: {...}, seed2: {...}, ...}}

#     for filename in os.listdir(folder_path):
#         if filename.endswith('.txt'):
#             seed_name = os.path.splitext(filename)[0]
#             seed = seed_name[-4:]
#             print(seed_name)
#             filepath = os.path.join(folder_path, filename)
#             results = parse_result_file(filepath)
#             print(results)
            
#             for setup, metrics in results.items():
#                 if setup not in all_results:
#                     all_results[setup] = {}
#                 all_results[setup][seed] = metrics
    
#     return all_results

def collect_all_results(folder_path):
    """Merge the parsed results of every ``.txt`` file directly inside a folder.

    Each matching file is read with ``parse_result_file`` and its per-setup
    metric lists are concatenated, so a setup that appears in several files
    ends up with one combined list.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping setup name to the list of metric dicts gathered from all
        files. Files are visited in sorted name order.
    """
    all_results = {}

    # os.listdir() yields entries in arbitrary, filesystem-dependent order.
    # Sorting makes both the order of runs inside each list and the order in
    # which setups are first inserted reproducible; the latter decides how
    # ties are ordered by any later stable sort over this dict.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(folder_path, filename)
        # Skip sub-directories (or other non-files) that merely end in ".txt".
        if not os.path.isfile(filepath):
            continue

        file_results = parse_result_file(filepath)
        for setup, metrics_list in file_results.items():
            all_results.setdefault(setup, []).extend(metrics_list)

    return all_results

In [3]:
# Name of the experiment whose result files are aggregated below.
exp = 'env_48_s'

# Result files for an experiment live in ../res/<experiment name>.
folder = f'../res/{exp}'
results_dict = collect_all_results(folder)

In [4]:
# Echo the collected per-setup metric lists as this cell's output.
results_dict

{'custom_pl48_unFalse_udFalse_futemporal_ebtoken_ffmlp_ftS': [{'mse': 0.34428736567497253,
   'mae': 0.44022008776664734},
  {'mse': 0.35615894198417664, 'mae': 0.44934242963790894},
  {'mse': 0.34480321407318115, 'mae': 0.4419197738170624},
  {'mse': 0.35975828766822815, 'mae': 0.46986791491508484}],
 'custom_pl48_unFalse_udFalse_futemporal_ebtoken_ffrnn_ftS': [{'mse': 0.3064539134502411,
   'mae': 0.40500861406326294},
  {'mse': 0.34036698937416077, 'mae': 0.4493477940559387},
  {'mse': 0.3214545249938965, 'mae': 0.4383031129837036},
  {'mse': 0.32456129789352417, 'mae': 0.4250189960002899}],
 'custom_pl48_unFalse_udFalse_futemporal_ebtoken_fftrans_ftS': [{'mse': 0.3281436264514923,
   'mae': 0.43946707248687744},
  {'mse': 0.3585853576660156, 'mae': 0.455380380153656},
  {'mse': 0.31747013330459595, 'mae': 0.42891162633895874},
  {'mse': 0.35860690474510193, 'mae': 0.4568808376789093}],
 'custom_pl48_unFalse_udFalse_futemporal_ebpatch_ffmlp_ftS': [{'mse': 0.33119288086891174,
   'ma

In [5]:
def summarize_results(all_results):
    """Reduce each setup's list of runs to mean / standard-deviation statistics.

    Args:
        all_results: dict mapping setup name to a list of dicts that each
            carry an ``'mse'`` and an ``'mae'`` entry.

    Returns:
        dict mapping setup name to a dict with ``mse_mean``, ``mse_std``,
        ``mae_mean``, ``mae_std`` (plain floats) and ``num_seeds`` (the number
        of runs). The standard deviation is the sample estimate (``ddof=1``)
        and is reported as 0.0 when there are not at least two runs.
    """
    def _mean_and_std(values):
        centre = np.mean(values)
        # With ddof=1 a single value has no defined spread, so fall back to 0.
        if len(values) > 1:
            spread = np.std(values, ddof=1)
        else:
            spread = 0.0
        return centre, spread

    summary = {}
    for setup, runs in all_results.items():
        mse_series = [run['mse'] for run in runs]
        mae_series = [run['mae'] for run in runs]

        mse_centre, mse_spread = _mean_and_std(mse_series)
        mae_centre, mae_spread = _mean_and_std(mae_series)

        summary[setup] = {
            'mse_mean': float(mse_centre),
            'mse_std': float(mse_spread),
            'mae_mean': float(mae_centre),
            'mae_std': float(mae_spread),
            'num_seeds': len(mse_series),
        }

    return summary

In [6]:
# Collapse each setup's list of runs into mean / std statistics.
summary = summarize_results(results_dict)

In [7]:
# Echo the per-setup statistics as this cell's output.
summary

{'custom_pl48_unFalse_udFalse_futemporal_ebtoken_ffmlp_ftS': {'mse_mean': 0.3512519523501396,
  'mse_std': 0.007885175557820114,
  'mae_mean': 0.4503375515341759,
  'mae_std': 0.013609401360343578,
  'num_seeds': 4},
 'custom_pl48_unFalse_udFalse_futemporal_ebtoken_ffrnn_ftS': {'mse_mean': 0.3232091814279556,
  'mse_std': 0.013904872247548496,
  'mae_mean': 0.4294196292757988,
  'mae_std': 0.01907276654661643,
  'num_seeds': 4},
 'custom_pl48_unFalse_udFalse_futemporal_ebtoken_fftrans_ftS': {'mse_mean': 0.34070150554180145,
  'mse_std': 0.021117389916849253,
  'mae_mean': 0.4451599791646004,
  'mae_std': 0.013394691444465805,
  'num_seeds': 4},
 'custom_pl48_unFalse_udFalse_futemporal_ebpatch_ffmlp_ftS': {'mse_mean': 0.3276633024215698,
  'mse_std': 0.015548595864986216,
  'mae_mean': 0.4474526420235634,
  'mae_std': 0.021302126341410526,
  'num_seeds': 4},
 'custom_pl48_unFalse_udFalse_futemporal_ebpatch_ffrnn_ftS': {'mse_mean': 0.34367785602808,
  'mse_std': 0.011008265539328212,
  '

In [8]:
def rank_setups(summary):
    """Rank setups by mean MSE and mean MAE and order them by average rank.

    Ranks are ordinal and start at 1 for the lowest (best) mean. Setups with
    equal means still get distinct ranks, ordered by setup name.

    Args:
        summary: dict mapping setup name to a dict containing at least
            ``'mse_mean'`` and ``'mae_mean'``.

    Returns:
        A new dict with the same setups, each value being the original entry
        extended with ``mse_rank``, ``mae_rank`` and ``avg_rank`` (the mean
        of the two ranks). Entries are ordered by ascending ``avg_rank``;
        setups sharing an average rank keep their order from ``summary``.
    """
    names = list(summary)

    def _ordinal_ranks(metric_key):
        # Sort on (mean, name) so the name acts as the tie-breaker.
        by_value = sorted(names, key=lambda name: (summary[name][metric_key], name))
        return {name: position for position, name in enumerate(by_value, start=1)}

    mse_ranks = _ordinal_ranks('mse_mean')
    mae_ranks = _ordinal_ranks('mae_mean')

    enriched = {}
    for name in names:
        entry = dict(summary[name])
        entry['mse_rank'] = mse_ranks[name]
        entry['mae_rank'] = mae_ranks[name]
        entry['avg_rank'] = (mse_ranks[name] + mae_ranks[name]) / 2
        enriched[name] = entry

    # sorted() is stable, so equal average ranks preserve insertion order.
    final_order = sorted(enriched, key=lambda name: enriched[name]['avg_rank'])
    return {name: enriched[name] for name in final_order}

In [9]:
# Attach MSE / MAE ranks to every setup and order them by average rank.
ranked = rank_setups(summary)

print("Setup Ranking (Lower is Better):")
for i, (setup, stats) in enumerate(ranked.items(), start=1):
    # Each metric is shown as: mean (#rank) ± std.
    mse_part = f"MSE: {stats['mse_mean']:.4f} (#{stats['mse_rank']}) ± {stats['mse_std']:.4f}"
    mae_part = f"MAE: {stats['mae_mean']:.4f} (#{stats['mae_rank']}) ± {stats['mae_std']:.4f}"
    print(f"{i:2d}. {setup} | Avg Rank: {stats['avg_rank']:.2f} | {mse_part}, {mae_part}")


Setup Ranking (Lower is Better):
 1. custom_pl48_unTrue_udTrue_futemporal_ebpatch_ffrnn_ftS | Avg Rank: 2.00 | MSE: 0.2912 (#1) ± 0.0025, MAE: 0.3786 (#3) ± 0.0030
 2. custom_pl48_unTrue_udTrue_fufeature_ebtoken_ffrnn_ftS | Avg Rank: 2.50 | MSE: 0.2920 (#4) ± 0.0013, MAE: 0.3781 (#1) ± 0.0013
 3. custom_pl48_unTrue_udFalse_fufeature_ebinvert_ffrnn_ftS | Avg Rank: 3.50 | MSE: 0.2919 (#3) ± 0.0012, MAE: 0.3789 (#4) ± 0.0014
 4. custom_pl48_unTrue_udTrue_fufeature_ebinvert_ffrnn_ftS | Avg Rank: 3.50 | MSE: 0.2913 (#2) ± 0.0036, MAE: 0.3795 (#5) ± 0.0036
 5. custom_pl48_unTrue_udTrue_fufeature_ebnone_ffrnn_ftS | Avg Rank: 6.00 | MSE: 0.2921 (#5) ± 0.0023, MAE: 0.3800 (#7) ± 0.0026
 6. custom_pl48_unTrue_udFalse_fufeature_ebnone_ffrnn_ftS | Avg Rank: 6.50 | MSE: 0.2923 (#7) ± 0.0010, MAE: 0.3796 (#6) ± 0.0016
 7. custom_pl48_unTrue_udTrue_fufeature_ebpatch_ffmlp_ftS | Avg Rank: 7.50 | MSE: 0.2943 (#13) ± 0.0036, MAE: 0.3785 (#2) ± 0.0038
 8. custom_pl48_unTrue_udTrue_futemporal_ebinvert_ffr

In [10]:
# The ranking for an experiment is saved as ./rank_book/<experiment name>.txt.
output_path = './rank_book/' + exp + '.txt'

# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Mode "w" truncates an existing file, so no explicit removal is needed.
# The encoding is pinned because every line contains the non-ASCII '±' sign,
# which the platform default encoding is not guaranteed to handle.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("Setup Ranking (Lower is Better):\n")
    for i, (setup, stats) in enumerate(ranked.items(), 1):
        line = (f"{i:2d}. {setup} | Avg Rank: {stats['avg_rank']:.2f} | "
                f"MSE: {stats['mse_mean']:.4f} (#{stats['mse_rank']}) ± {stats['mse_std']:.4f}, "
                f"MAE: {stats['mae_mean']:.4f} (#{stats['mae_rank']}) ± {stats['mae_std']:.4f}\n")
        f.write(line)