In [16]:
# -*- coding: utf-8 -*-
import json
import csv
import os
import re
from pathlib import Path
from glob import glob

from config.constants import MODEL_NAME_REPLACEMENTS

# ──────────────────────────────────────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────────────────────────────────────
# Anchor every path on this file's directory when run as a script; in a
# notebook there is no __file__, so the current working directory is used.
if "__file__" in globals():
    HERE = Path(__file__).resolve().parent
else:
    HERE = Path(".").resolve()

RUNS_DIR = (HERE / "v3_results_x96").resolve()  # input: run JSON files
RESULTS_DIR = HERE / "results"                  # output root
OUTPUT_CSV = RESULTS_DIR / "scores.csv"         # aggregated score table
OUTPUT_JSON_DIR = RESULTS_DIR / "stats"         # one JSON file per judge

# Created up front (parents included) so the per-judge JSON writes below
# always have a directory to land in.
OUTPUT_JSON_DIR.mkdir(parents=True, exist_ok=True)

# ──────────────────────────────────────────────────────────────────────────────
# Costs keyed by *exact* judge model string used in the run, where lists are
# joined by "," (e.g., "a,b,c"). Unknowns = None.
# ──────────────────────────────────────────────────────────────────────────────
# Fallback cost table (USD). It is consulted only when a run record carries no
# billing figure of its own. Keys must match the judge model string exactly;
# ensemble runs use the comma-joined list of their judge models.
run_cost_usd = {
    "google/gemini-2.5-flash-lite-preview-06-17": 0.53,
    "google/gemini-2.5-flash": 1.87,
    "moonshotai/kimi-k2": 3.70,
    "openai/gpt-4.1-mini": 2.09,
    "mistralai/mistral-medium-3": 2.31,
    "qwen/qwen3-235b-a22b": 1.00,
    # o3 was also used in run 23 (a different prompt variant) at 13.00 for
    # that run. The model-level figure from the provided list is kept here;
    # per-run specificity would need a per-file mapping instead.
    "o3": 16.16,
    "o4-mini": 11.81,
    "meta-llama/llama-3.1-8b-instruct": 0.11,
    "openai/gpt-4.1": 10.43,
    "anthropic/claude-3.7-sonnet": None,
    "anthropic/claude-sonnet-4": 18.73,
    "google/gemini-2.5-pro": 52.90,  # 40.57 - (-12.33)
    # Disabled entry: 5x "openrouter/horizon-alpha" ensemble -> None
    "openrouter/horizon-alpha": None,
    "mistralai/mistral-small-3.2-24b-instruct": 0.49,
    "qwen/qwen3-235b-a22b-2507": 0.94,
    "z-ai/glm-4.5": 4.40,
    # Disabled entries:
    #   5x "mistralai/mistral-small-3.2-24b-instruct" ensemble -> 2.37
    #   "moonshotai/kimi-k2,openrouter/horizon-beta"            -> None
    #   3x "openrouter/horizon-beta" ensemble                   -> None
    "openrouter/horizon-beta": None,
    "gpt-5-mini-2025-08-07": 1.49,
    "gpt-5-nano-2025-08-07": 0.29,
    # Five-judge ensemble of the same model; the key is the joined list.
    ",".join(["gpt-5-nano-2025-08-07"] * 5): 1.45,
    "openai/gpt-oss-120b": 1.14,
    "openai/gpt-oss-20b": 0.45,
    "gpt-5-2025-08-07": 7.43,
    "qwen/qwen3-30b-a3b-instruct-2507": 0.98,
    "google/gemini-2.5-flash-lite": 0.5,
}

# ──────────────────────────────────────────────────────────────────────────────
# Optional ignores
# ──────────────────────────────────────────────────────────────────────────────
# Run ids placed in this set are skipped by the aggregation loop; empty by
# default, so every run is processed.
runids_to_ignore = set()

# ──────────────────────────────────────────────────────────────────────────────
# CSV header
# ──────────────────────────────────────────────────────────────────────────────
# Column order here must match the row layout produced by compute_row().
csv_header = [
    "model", "judgemark_score",
    "stability", "separability", "human_corr",
    "cost",
]

def normalize_model_name(name: str) -> str:
    """Map *name* through MODEL_NAME_REPLACEMENTS, leaving unknown names as-is.

    Falsy inputs (``""``, ``None``) are returned unchanged without a lookup.
    """
    return MODEL_NAME_REPLACEMENTS.get(name, name) if name else name

def fmt_cost(val) -> str:
    """Render a cost as ``$X.XX``.

    Returns an empty string when *val* is ``None`` or cannot be converted to
    a float, so an unknown cost becomes a blank CSV cell.
    """
    if val is None:
        return ""
    try:
        amount = float(val)
    except Exception:
        return ""
    return "${:.2f}".format(amount)

def safe_float(x, default=None):
    """Convert *x* to ``float``; return *default* if the conversion fails."""
    try:
        converted = float(x)
    except Exception:
        converted = default
    return converted

def extract_model_fields(judge_data: dict):
    """
    Derive the display name and the cost-lookup key for one judge run.

    Reads ``judge_models`` (a list, used by ensemble runs) and ``judge_model``
    (a string). A non-empty ``judge_models`` list takes precedence. An empty
    list is treated as absent, so both return values fall back to
    ``judge_model`` together instead of disagreeing with each other.

    Return:
      - model_for_csv: normalized name for CSV display. For a list, the
        comma-joined names are normalized as one string (so an ensemble can
        be mapped to a single alias); for a string, only the part before the
        first comma is normalized.
      - cost_lookup_key: exact string for cost lookup (list joined with ',').
      Both are "" when neither field is usable.
    """
    field_models = judge_data.get("judge_models")
    field_model = judge_data.get("judge_model")

    # Ensemble / list form: one joined string serves both purposes.
    if isinstance(field_models, list) and field_models:
        joined = ",".join(field_models)
        return normalize_model_name(joined), joined

    # Single-model form: the full string is the cost key, the first
    # comma-separated entry is what gets displayed.
    if isinstance(field_model, str):
        first_entry = field_model.split(",")[0].strip()
        return normalize_model_name(first_entry), field_model

    return "", ""

def compute_row(judge_data: dict, fallback_cost: float | None):
    """
    Build a CSV row + return a sanitized copy of judge_data for JSON dump.

    Args:
        judge_data: one run record. Reads ``final_judgemark_score_raw``,
            ``final_judgemark_score_elements_raw`` and the optional cost
            fields ``billing.total_usd``, ``billing_total_usd``, ``cost_usd``.
        fallback_cost: cost used when the record carries no cost of its own.

    Returns:
        ``(row, jd_copy)`` where ``row`` is
        ``[model, judgemark_score, stability, separability, human_corr, cost]``
        and ``jd_copy`` is a shallow copy of ``judge_data`` without the
        ``"results"`` key. Non-numeric score, stability and human_corr values
        become ``""``.
    """
    judge_model_name, _ = extract_model_fields(judge_data)

    judgemark_score_raw = judge_data.get("final_judgemark_score_raw")

    # Component statistics (raw version); a missing or null entry means "none".
    norm_stats = judge_data.get("final_judgemark_score_elements_raw", {}) or {}
    stability = norm_stats.get("norm_stability_between_iterations")
    human_corr = norm_stats.get("norm_correlation_with_lmsys_arena")

    # Separability = avg(norm_kruskall_wallis, norm_ci99_adjacent_overlap).
    # safe_float(..., 0.0) always yields a float, so a missing or unparsable
    # component contributes 0.0 and the average is always defined.
    norm_kw = safe_float(norm_stats.get("norm_kruskall_wallis", 0.0), 0.0)
    norm_ci = safe_float(norm_stats.get("norm_ci99_adjacent_overlap", 0.0), 0.0)
    separability = (norm_kw + norm_ci) / 2.0

    # Prefer directly stored billing if present; else fall back to our lookup.
    # "billing" may be present but null or not a mapping, so guard before
    # reaching into it.
    billing = judge_data.get("billing")
    billing_total = billing.get("total_usd") if isinstance(billing, dict) else None
    cost_candidates = [
        billing_total,
        judge_data.get("billing_total_usd"),
        judge_data.get("cost_usd"),
    ]
    cost_val = next(
        (
            c for c in cost_candidates
            if isinstance(c, (int, float, str)) and str(c).strip() != ""
        ),
        None,
    )
    if cost_val is None:
        cost_val = fallback_cost

    def _rounded(value):
        # Three-decimal rendering for component stats; blank when not numeric.
        return round(float(value), 3) if isinstance(value, (int, float)) else ""

    row = [
        judge_model_name,
        round(100 * judgemark_score_raw, 2) if isinstance(judgemark_score_raw, (int, float)) else "",
        _rounded(stability),
        _rounded(separability),
        _rounded(human_corr),
        fmt_cost(cost_val),
    ]

    # Sanitize JSON copy (drop heavy 'results' if present).
    jd_copy = {key: value for key, value in judge_data.items() if key != "results"}

    return row, jd_copy

def safe_filename_from_model(model_name: str) -> str:
    """Turn a model name into a filename stem.

    ``/`` becomes ``__``; every remaining character that is neither a word
    character nor ``-`` becomes ``-``. A falsy name yields ``""``.
    """
    stem = model_name if model_name else ""
    stem = stem.replace("/", "__")
    return re.sub(r"[^\w\-]", "-", stem)

# ──────────────────────────────────────────────────────────────────────────────
# Walk all run files and aggregate
# ──────────────────────────────────────────────────────────────────────────────
csv_rows = []      # one CSV row per processed run record
json_written = 0   # number of per-judge JSON files written

# Every *.json directly under RUNS_DIR, in sorted path order.
run_files = sorted(Path(p) for p in glob(str(RUNS_DIR / "*.json")))

for path in run_files:
    with path.open("r", encoding="utf-8") as fh:
        payload = json.load(fh)

    # Expected shape is {run_id: judge_data, ...}; a top-level list is also
    # accepted (positions serve as run ids). Anything else is skipped.
    if isinstance(payload, dict):
        items = payload.items()
    elif isinstance(payload, list):
        items = enumerate(payload)
    else:
        continue

    for run_id, judge_data in items:
        if run_id in runids_to_ignore or not isinstance(judge_data, dict):
            continue

        # The joined judge-model string selects the fallback cost.
        _, cost_key = extract_model_fields(judge_data)
        fallback_cost = run_cost_usd.get(cost_key, None)

        row, jd_copy = compute_row(judge_data, fallback_cost)
        csv_rows.append(row)

        # Per-judge JSON, named after the normalized model in the row's first
        # column. Records that share a model name write to the same file.
        model_name = row[0]
        safe_stem = safe_filename_from_model(model_name)
        json_output_path = OUTPUT_JSON_DIR / (safe_stem + ".json")

        with json_output_path.open("w", encoding="utf-8") as outfh:
            json.dump(jd_copy, outfh, indent=4)
            json_written += 1

# ──────────────────────────────────────────────────────────────────────────────
# Write CSV
# ──────────────────────────────────────────────────────────────────────────────
# Make sure the output root exists, then emit the header followed by all rows.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
with OUTPUT_CSV.open("w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerows([csv_header, *csv_rows])

# ──────────────────────────────────────────────────────────────────────────────
# Echo CSV rows as lines for quick eyeballing
# ──────────────────────────────────────────────────────────────────────────────
# Print each row as a plain comma-joined line and collect the same lines in
# `raw`, each one preceded by a newline.
echo_lines = []
for row in csv_rows:
    row_str = ",".join(str(cell) for cell in row)
    echo_lines.append(row_str)
    print(row_str)
raw = "".join("\n" + text for text in echo_lines)


gpt-4.1,76.31,0.727,0.819,0.576,$10.43
claude-sonnet-4,81.99,0.726,0.844,0.818,$18.73
gemini-2.5-pro,72.06,0.675,0.747,0.662,$52.90
openrouter/horizon-alpha,82.25,0.804,0.85,0.733,
mistralai/Mistral-Small-3.2-24B-Instruct-2506,50.21,0.396,0.578,0.306,$0.49
qwen/qwen3-235b-a22b-2507,59.69,0.437,0.61,0.704,$0.94
z-ai/glm-4.5,63.41,0.53,0.664,0.619,$4.40
gemini-2.5-flash,62.97,0.532,0.66,0.605,$1.87
openrouter/horizon-beta,82.38,0.819,0.844,0.747,
o3,75.1,0.699,0.779,0.69,$16.16
gpt-5-mini-2025-08-07:minimal-reasoning,74.49,0.694,0.778,0.662,$1.49
gpt-5-nano-2025-08-07:minimal-reasoning,52.2,0.411,0.536,0.576,$0.29
openai/gpt-oss-120b,54.78,0.332,0.57,0.676,$1.14
openai/gpt-oss-20b,42.8,0.331,0.476,0.334,$0.45
gpt-5-2025-08-07:minimal-reasoning,81.2,0.8,0.838,0.719,$7.43
gpt-5-nano-2025-08-07__5x-ensemble,61.08,0.579,0.635,0.548,$1.45
moonshotai/Kimi-K2-Instruct,73.09,0.66,0.752,0.719,$3.70
qwen/qwen3-30b-a3b-instruct-2507,34.58,0.153,0.343,0.548,$0.98
gemini-2.5-flash-lite,55.99,0.483,0.

In [17]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import io # To read the string data

# 1. Parse Data
# `raw` holds newline-separated, comma-joined rows: model first, score second.
# NOTE(review): the rows are split on every ',', so a model name that itself
# contains a comma would shift the columns — confirm names are comma-free.
rows = []
for line in raw.splitlines():
    line = line.lstrip('*')  # remove leading *
    parts = line.split(',')
    if len(parts) < 2:
        continue  # Skip empty or malformed lines

    model = parts[0].strip()
    try:
        score = float(parts[1])
    except ValueError:
        score = np.nan  # Handle cases where score might be missing/invalid

    cost = np.nan
    # An empty last column means "no cost recorded" for this row.
    if parts[-1].strip():
        # Scan right-to-left for the first numeric value, with or without a
        # leading '$'. Only the columns after the score are candidates, so
        # neither the score nor a numeric-looking model name can be taken
        # for the cost.
        for part in reversed(parts[2:]):
            part = part.strip()
            if not part:
                continue
            try:
                cost = float(part[1:] if part.startswith('$') else part)
            except ValueError:
                continue
            break

    rows.append({'model': model, 'score': score, 'cost': cost})

# Columns are pinned so an empty `rows` still yields the expected schema and
# the dropna below cannot fail on missing column labels.
df = pd.DataFrame(rows, columns=['model', 'score', 'cost'])

# Drop rows where score or cost couldn't be parsed
df.dropna(subset=['score', 'cost'], inplace=True)

# 2. Filter Data: keep only finite, positive costs (log axis needs >0)
df_filtered = df[np.isfinite(df['cost']) & (df['cost'] > 0)].copy()


# 3. Compute Pareto frontier on filtered data.
# A point is dominated when some other point scores at least as high and
# costs at most as much, and is strictly better on at least one of the two.
frontier_indices = []
is_frontier = pd.Series([False] * len(df_filtered), index=df_filtered.index)

for idx, row in df_filtered.iterrows():
    rivals = df_filtered.drop(index=idx)
    at_least_as_good = (rivals['score'] >= row['score']) & (rivals['cost'] <= row['cost'])
    strictly_better = (rivals['score'] > row['score']) | (rivals['cost'] < row['cost'])
    dominated = (at_least_as_good & strictly_better).any()
    if dominated:
        continue

    frontier_indices.append(idx)
    is_frontier[idx] = True

# Sorted by cost so the frontier can be drawn left to right.
df_frontier = df_filtered.loc[frontier_indices].sort_values('cost')

# 4. Prepare for Plotting & Define Adjustments
# Per-model numeric adjustments keyed by display name.
# NOTE(review): neither `adjustments` nor `base_pixel_shift` is referenced by
# the plotting code visible in this cell — confirm whether they are still used.
adjustments = {
    "gpt-4.1-mini": 0.0,
    "gpt-4o-mini": 0.0,
    "qwen/qwen3-32b": 0.0,
    "qwen/qwen3-14b": 0.0,
    "qwen/qwen3-235b-a22b": 0.0,
    # Disabled entry: "qwen/qwen3-235b-a22b:thinking" -> -1.1
    "qwen/qwen3-30b-a3b": 0.0,
    "mistral-medium-3": 0.0,
    "o3:low-reasoning": 0.0,
    "mistralai/Mistral-Small-3.2-24B-Instruct-2506": 0,
    "gemini-2.5-flash": -0.4,
    "gpt-5-2025-08-07:minimal-reasoning": -0.3,
    "openai/gpt-oss-120b": -0.5,
    "gemini-2.5-flash-lite": -0.6,
}
base_pixel_shift = 10  # Base shift in pixels

# 5. Create Plot with Plotly
fig = go.Figure()

# Palette picked to read well on the dark template.
non_frontier_color = 'rgba(135, 206, 250, 0.8)'    # Light Sky Blue
frontier_marker_color = 'rgba(255, 105, 97, 1.0)'  # Pastel Red / Coral
frontier_line_color = 'rgba(255, 105, 97, 0.8)'
frontier_marker_border_color = 'lightgrey'         # border of the diamond
annotation_font_color = 'white'                    # label text

# Trace 1: every point that is not on the frontier.
df_non_frontier = df_filtered[~is_frontier]
other_models_trace = go.Scatter(
    name='Other Models',
    mode='markers',
    x=df_non_frontier['cost'],
    y=df_non_frontier['score'],
    marker={'color': non_frontier_color, 'size': 8},
    text=df_non_frontier['model'],  # shown on hover
    hoverinfo='text+x+y',
)
fig.add_trace(other_models_trace)

# Trace 2: frontier points, drawn as larger bordered diamonds.
frontier_points_trace = go.Scatter(
    name='Pareto Frontier',
    mode='markers',
    x=df_frontier['cost'],
    y=df_frontier['score'],
    marker={
        'symbol': 'diamond',
        'size': 12,
        'color': frontier_marker_color,
        'line': {'width': 1, 'color': frontier_marker_border_color},
    },
    text=df_frontier['model'],  # shown on hover
    hoverinfo='text+x+y',
)
fig.add_trace(frontier_points_trace)

# Trace 3: dashed line through the frontier points (no hover of its own).
frontier_line_trace = go.Scatter(
    name='Frontier Line',
    mode='lines',
    x=df_frontier['cost'],
    y=df_frontier['score'],
    line={'color': frontier_line_color, 'width': 2, 'dash': 'dash'},
    hoverinfo='skip',
)
fig.add_trace(frontier_line_trace)

# Trace 4: labels rendered as a text trace so they stick to data coords on
# log x. Two models get a hand-picked text position; the rest sit to the right.
special_label_positions = {
    "gemini-2.5-pro": "middle left",
    "gemini-2.5-flash-lite": "top right",
}
label_texts = [
    f"{m} (${c:.2f})"
    for m, c in zip(df_filtered['model'], df_filtered['cost'])
]
label_positions = [
    special_label_positions.get(m.strip().lower(), "middle right")
    for m in df_filtered['model']
]

labels_trace = go.Scatter(
    mode='text',
    x=df_filtered['cost'],
    y=df_filtered['score'],
    text=label_texts,
    textposition=label_positions,  # array for per-point control
    textfont={'size': 9, 'color': annotation_font_color},
    showlegend=False,
    hoverinfo='skip',
)
fig.add_trace(labels_trace)



# Figure-level layout: dark template, horizontal legend above the plot area,
# fixed canvas size, and a logarithmic cost axis.
fig.update_layout(
    template='plotly_dark',
    title='Judgemark Score vs Cost',
    xaxis_title='Cost ($) to complete benchmark',
    yaxis_title='Judgemark Score (Performance as LLM Judge)',
    width=1000,
    height=700,
    margin={'l': 40, 'r': 40, 't': 80, 'b': 40},
    hovermode='closest',
    legend={
        'orientation': "h",
        'xanchor': "right",
        'x': 1,
        'yanchor': "bottom",
        'y': 1.02,
    },
    xaxis={
        'type': 'log',  # logarithmic x-axis
        'gridcolor': 'rgba(80,80,80,0.5)',
    },
    yaxis={
        'gridcolor': 'rgba(80,80,80,0.5)',
    },
)


# Build "1–2–5" log ticks around [xmin, xmax], label as normal currency.
xmin = df_filtered['cost'].min()
xmax = df_filtered['cost'].max()
logpad = 0.15  # padding, in decades, applied on both sides of the data range

# Candidate decades: every power of ten from the one at or below xmin up to
# the one at or above xmax.
decades = np.arange(np.floor(np.log10(xmin)), np.ceil(np.log10(xmax)) + 1)
bases = np.array([1.0, 2.0, 5.0])

# Keep only the candidates that fall inside the padded range.
tick_floor = xmin / (10**logpad)
tick_ceiling = xmax * (10**logpad)
tickvals = [
    v
    for d in decades
    for v in list(bases * (10.0 ** d))
    if (v >= tick_floor) and (v <= tick_ceiling)
]

def fmt_money(v: float) -> str:
    """Format a tick value as plain currency without SI prefixes.

    Values below 1 are rendered with three decimals, everything else with
    two; trailing zeros and a dangling decimal point are then stripped.
    """
    decimals = 3 if v < 1 else 2
    return f"${v:.{decimals}f}".rstrip('0').rstrip('.')

# One currency label per tick value.
ticktext = list(map(fmt_money, tickvals))

# On a log axis the range is given in log10 units, so the padding is added
# to the exponents rather than to the raw costs.
log_lo = np.log10(xmin) - logpad
log_hi = np.log10(xmax) + logpad

fig.update_xaxes(
    type='log',
    range=[log_lo, log_hi],
    tickmode='array',
    tickvals=tickvals,
    ticktext=ticktext,
    showexponent='none',  # don’t show 10^n anywhere
    gridcolor='rgba(80,80,80,0.5)',
)

fig.update_yaxes(gridcolor='rgba(80,80,80,0.5)')


# Write the figure to a standalone HTML file, then print the plotted data and
# the frontier members.
html_file = 'model_pareto_plot_dark.html'
fig.write_html(html_file)

print(f"Dark mode plot saved to {html_file}")
print("\nFiltered DataFrame (Cost > 0):", df_filtered.to_string(), sep="\n")
print(
    "\nModels on the Pareto Frontier:",
    df_frontier[['model', 'score', 'cost']].to_string(index=False),
    sep="\n",
)

Dark mode plot saved to model_pareto_plot_dark.html

Filtered DataFrame (Cost > 0):
                                            model  score   cost
0                                         gpt-4.1  76.31  10.43
1                                 claude-sonnet-4  81.99  18.73
2                                  gemini-2.5-pro  72.06  52.90
4   mistralai/Mistral-Small-3.2-24B-Instruct-2506  50.21   0.49
5                       qwen/qwen3-235b-a22b-2507  59.69   0.94
6                                    z-ai/glm-4.5  63.41   4.40
7                                gemini-2.5-flash  62.97   1.87
9                                              o3  75.10  16.16
10        gpt-5-mini-2025-08-07:minimal-reasoning  74.49   1.49
11        gpt-5-nano-2025-08-07:minimal-reasoning  52.20   0.29
12                            openai/gpt-oss-120b  54.78   1.14
13                             openai/gpt-oss-20b  42.80   0.45
14             gpt-5-2025-08-07:minimal-reasoning  81.20   7.43
15             gpt-5