# Load a summary of results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import pandas as pd

from notebooks.results import load_run_result, find_run_paths

# Dataset names to load results for, in the order they are processed
# (and therefore the order of the rows in the summary table).
datasets = [
    "boolq", "cosmos_qa", "dream",
    "ethics_justice", "ethics_deontology", "ethics_virtue",
    "ethics_utilitarianism",
    "anli-r2", "cola", "sst2", "hellaswag", "mc_taco",
    "openbookqa", "paws", "quail", "piqa", "quartz", "sciq",
    "social_i_qa", "multirc", "wic", "twitter_sentiment",
]

# Where the runs live and which strong model's runs to pick up.
root = "/mnt/ssd-1/alexm/weak-to-strong/results/logconf/"
sms = "meta-llama/Meta-Llama-3-8B"

all_rs = {}  # dataset name -> loaded run result
df = []      # one summary row (dict) per loaded dataset; turned into a DataFrame below

print(f"Strong model: {sms}")
for ds_name in datasets:
    print(f"Loading {ds_name}")
    try:
        # Locate the run paths for this dataset / strong model, then load them.
        paths = find_run_paths(root, ds_name, strong_model=sms)
        r_all = load_run_result(**paths)

        print(f"Weak floor: {r_all.weak_acc:.2f}")
        print(f"Strong ceiling: {r_all.strong_acc:.2f}")
        print(f"W2S: {r_all.w2s_acc:.2f} (PGR: {r_all.pgr:.2f})")

        all_rs[ds_name] = r_all
        # Headline metrics first; the run config is unpacked last, so any
        # config key with the same name overrides the value set here.
        df.append(
            {
                "dataset": ds_name,
                "weak_floor": r_all.weak_acc,
                "strong_ceiling": r_all.strong_acc,
                "w2s": r_all.w2s_acc,
                "pgr": r_all.pgr,
                **r_all.cfg,
            }
        )
    except (AssertionError, FileNotFoundError) as e:
        # Best effort: report the dataset that could not be loaded and move on.
        print(f"Failed to load {ds_name}: {e}")

df = pd.DataFrame(df)

In [None]:
# Bar chart of PGR per dataset for the "logconf" runs.
# Use the "dataset" column, which is set explicitly on every summary row,
# instead of "ds_name", which exists only if the run config happens to supply it.
sns.barplot(data=df, x="dataset", y="pgr")
# Many long dataset names: rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.title("logconf")