In [6]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict

### Generate Alignment Table

In [11]:
baselines = ["vanilla", "cot", "socratic", "subq"]
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
models = ["gpt-4o"]

# Map for pretty LaTeX-style formatting of baselines
baseline_names = {
    "vanilla": "\\textbf{Vanilla}",
    "cot": "\\textbf{Chain-of-Thought}",
    "socratic": "\\textbf{Socratic Prompting}",
    "subq": "\\textbf{SubQ Decomposition}"
}

# Collect scores
results = defaultdict(list)

for model in models:
    for baseline in baselines:
        for dataset in datasets:
            path = os.path.join(baseline, f"{dataset}_{model}.json")
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue
            with open(path, 'r') as f:
                try:
                    data = json.load(f)
                    scores = [entry["final_alignment_score"] for entry in data if "final_alignment_score" in entry and entry["final_alignment_score"] is not None]
                    if scores:
                        avg = sum(scores) / len(scores)
                        results[(baseline, dataset)] = avg
                    #print length of scores
                    print(f"Scores for {baseline} on {dataset}: {len(scores)}")
                except Exception as e:
                    print(scores)
                    print(f"Error processing {path}: {e}")

    # Generate LaTeX rows
    print("Generating LaTeX table rows:\n")
    for baseline in baselines:
        row = [baseline_names[baseline]]
        for dataset in datasets:
            avg_score = results.get((baseline, dataset), "")
            if isinstance(avg_score, float):
                row.append(f"{avg_score:.2f}")
            else:
                row.append("")  # Leave blank if not available
                print(f"Warning: No score found for {baseline} on {dataset}")
        print(" & ".join(row) + " \\\\")

Scores for vanilla on massmaps: 100
Scores for vanilla on supernova: 107
Scores for vanilla on politeness: 119
Scores for vanilla on emotion: 112
Scores for vanilla on cholec: 150
Scores for vanilla on cardiac: 104
Scores for vanilla on sepsis: 108
Scores for cot on massmaps: 100
Scores for cot on supernova: 107
Scores for cot on politeness: 119
Scores for cot on emotion: 112
Scores for cot on cholec: 150
Scores for cot on cardiac: 124
Scores for cot on sepsis: 105
Scores for socratic on massmaps: 100
Scores for socratic on supernova: 110
Scores for socratic on politeness: 120
Scores for socratic on emotion: 112
Scores for socratic on cholec: 150
Scores for socratic on cardiac: 3
Scores for socratic on sepsis: 108
Scores for subq on massmaps: 100
Scores for subq on supernova: 110
Scores for subq on politeness: 120
Scores for subq on emotion: 2
Scores for subq on cholec: 150
Scores for subq on cardiac: 2
Scores for subq on sepsis: 108
Generating LaTeX table rows:

\textbf{Vanilla} & 0.4

In [29]:
baselines = ["vanilla", "cot", "socratic", "subq"]
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
models = ["gpt-4o"]

for model in models:
    for baseline in baselines:
        for dataset in datasets:
            path = os.path.join(baseline, f"{dataset}_{model}.json")
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue
            with open(path, 'r') as f:
                try:
                    accuracy = []
                    data = json.load(f)
                    if "accuracy" in data[0]:
                        accuracy = [entry["accuracy"] for entry in data if entry["accuracy"] is not None]   
                    elif "mse" in data[0]:
                        accuracy = [entry["mse"] for entry in data if entry["mse"] is not None]   
                    elif "mse_loss" in data[0]:
                        accuracy = [(entry["mse_loss"]["Omega_m"] + entry["mse_loss"]["sigma_8"])/2 for entry in data if entry["mse_loss"] is not None]
                    elif "safe_iou" in data[0]:
                        accuracy = [(entry["safe_iou"] + entry["unsafe_iou"])/2 for entry in data if entry["safe_iou"] is not None]
                    else:
                        print(f"Neither accuracy nor mse found in {path}")           
                    if accuracy:
                        avg = sum(accuracy) / len(accuracy)
                        results[(baseline, dataset)] = avg
                except Exception as e:
                    print(f"Error processing {path}: {e}")

# Map for pretty LaTeX-style formatting of baselines
baseline_names = {
    "vanilla": "\\textbf{Vanilla}",
    "cot": "\\textbf{Chain-of-Thought}",
    "socratic": "\\textbf{Socratic Prompting}",
    "subq": "\\textbf{SubQ Decomposition}"
}

# Generate LaTeX rows
print("Generating LaTeX table rows:\n")
for baseline in baselines:
    row = [baseline_names[baseline]]
    for dataset in datasets:
        avg_score = results.get((baseline, dataset), "")
        if isinstance(avg_score, float):
            row.append(f"{avg_score:.2f}")
        else:
            row.append("")  # Leave blank if not available
            print(f"Warning: No score found for {baseline} on {dataset}")
    print(" & ".join(row) + " \\\\")


Neither accuracy nor mse found in vanilla/supernova_gpt-4o.json
Neither accuracy nor mse found in subq/cardiac_gpt-4o.json
Generating LaTeX table rows:

\textbf{Vanilla} & 0.04 & 0.04 & 0.92 & 0.26 & 0.07 & 0.57 & 0.66 \\
\textbf{Chain-of-Thought} & 0.04 & 0.09 & 0.82 & 0.27 & 0.10 & 0.46 & 0.71 \\
\textbf{Socratic Prompting} & 0.04 & 0.13 & 0.83 & 0.29 & 0.11 & 0.00 & 0.66 \\
\textbf{SubQ Decomposition} & 0.05 & 0.12 & 0.84 & 0.00 & 0.12 & 0.12 & 0.66 \\
