In [35]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict
from fuzzywuzzy import fuzz

import matplotlib.pyplot as plt



### Generate Alignment Table

In [17]:
baselines = ["vanilla", "cot", "socratic", "subq"]
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
models = ["gpt-4o"]

# Map for pretty LaTeX-style formatting of baselines
baseline_names = {
    "vanilla": "\\textbf{Vanilla}",
    "cot": "\\textbf{Chain-of-Thought}",
    "socratic": "\\textbf{Socratic Prompting}",
    "subq": "\\textbf{SubQ Decomposition}"
}

# Collect scores
results = defaultdict(list)

for model in models:
    for baseline in baselines:
        for dataset in datasets:
            path = os.path.join(baseline, f"{dataset}_{model}.json")
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue
            with open(path, 'r') as f:
                try:
                    data = json.load(f)
                    scores = [entry["final_alignment_score"] for entry in data if "final_alignment_score" in entry and entry["final_alignment_score"] is not None]
                    if scores:
                        avg = sum(scores) / len(scores)
                        results[(baseline, dataset)] = avg
                    #print length of scores
                    if(len(scores) < 100):
                        print(f"Warning: Less than 100 scores for {baseline} on {dataset}: {len(scores)}")
                except Exception as e:
                    print(scores)
                    print(f"Error processing {path}: {e}")

    # Generate LaTeX rows
    print("-----------------------")
    for baseline in baselines:
        row = [baseline_names[baseline]]
        for dataset in datasets:
            avg_score = results.get((baseline, dataset), "")
            if isinstance(avg_score, float):
                row.append(f"{avg_score:.2f}")
            else:
                row.append("")  # Leave blank if not available
                print(f"Warning: No score found for {baseline} on {dataset}")
        print(" & ".join(row) + " \\\\")

-----------------------
\textbf{Vanilla} & 0.42 & 0.83 & 0.63 & 0.64 & 0.30 & 0.52 & 0.54 \\
\textbf{Chain-of-Thought} & 0.39 & 0.81 & 0.62 & 0.61 & 0.34 & 0.56 & 0.53 \\
\textbf{Socratic Prompting} & 0.41 & 0.80 & 0.60 & 0.62 & 0.37 & 0.50 & 0.54 \\
\textbf{SubQ Decomposition} & 0.35 & 0.82 & 0.60 & 0.58 & 0.36 & 0.39 & 0.56 \\


### Generate Accuracy Table

In [24]:
baselines = ["vanilla", "cot", "socratic", "subq"]
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
models = ["gpt-4o"]

# Collect scores
results = defaultdict(list)

for model in models:
    for baseline in baselines:
        for dataset in datasets:
            path = os.path.join(baseline, f"{dataset}_{model}.json")
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue
            with open(path, 'r') as f:
                try:
                    accuracy = []
                    data = json.load(f)
                    if "accuracy" in data[0]:
                        accuracy = [entry["accuracy"] for entry in data if entry["accuracy"] is not None]   
                    elif "mse" in data[0]:
                        accuracy = [entry["mse"] for entry in data if entry["mse"] is not None]   
                    elif "mse_loss" in data[0]:
                        accuracy = [(entry["mse_loss"]["Omega_m"] + entry["mse_loss"]["sigma_8"])/2 for entry in data if entry["mse_loss"] is not None]
                    elif "safe_iou" in data[0]:
                        accuracy = [(entry["safe_iou"] + entry["unsafe_iou"])/2 for entry in data if entry["safe_iou"] is not None]
                    else:
                        print(f"Neither accuracy nor mse found in {path}")           
                    if accuracy:
                        avg = sum(accuracy) / len(accuracy)
                        results[(baseline, dataset)] = avg
                    if(len(accuracy) < 100):
                        print(f"Warning: Less than 100 scores for {baseline} on {dataset}: {len(accuracy)}")
                except Exception as e:
                    print(f"Error processing {path}: {e}")

# Map for pretty LaTeX-style formatting of baselines
baseline_names = {
    "vanilla": "\\textbf{Vanilla}",
    "cot": "\\textbf{Chain-of-Thought}",
    "socratic": "\\textbf{Socratic Prompting}",
    "subq": "\\textbf{SubQ Decomposition}"
}

# Generate LaTeX rows
print("Generating LaTeX table rows:\n")
for baseline in baselines:
    row = [baseline_names[baseline]]
    for dataset in datasets:
        avg_score = results.get((baseline, dataset), "")
        if isinstance(avg_score, float):
            row.append(f"{avg_score:.2f}")
        else:
            row.append("")  # Leave blank if not available
            print(f"Warning: No score found for {baseline} on {dataset}")
    print(" & ".join(row) + " \\\\")


Neither accuracy nor mse found in subq/cardiac_gpt-4o.json
Generating LaTeX table rows:

\textbf{Vanilla} & 0.04 & 0.10 & 0.92 & 0.26 & 0.07 & 0.57 & 0.66 \\
\textbf{Chain-of-Thought} & 0.04 & 0.09 & 0.82 & 0.27 & 0.10 & 0.46 & 0.71 \\
\textbf{Socratic Prompting} & 0.04 & 0.13 & 0.83 & 0.29 & 0.11 & 0.00 & 0.66 \\
\textbf{SubQ Decomposition} & 0.05 & 0.12 & 0.84 & 0.29 & 0.12 &  & 0.66 \\


### Criteria Coverage Analysis

In [40]:
baselines = ["vanilla", "cot", "socratic", "subq"]
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
# datasets = ["emotion", "sepsis"]
models = ["gpt-4o"]

for dataset in datasets:
    categories_all = []
    for baseline in baselines:
        for model in models: 
            path = os.path.join(baseline, f"{dataset}_{model}.json")
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue
            with open(path, 'r') as f:
                try:
                    categories = []
                    data = json.load(f)
                    if "alignment_categories" in data[0]:
                        categories = [entry["alignment_categories"] for entry in data if entry["alignment_categories"] is not None] 
                    elif "alignment_category" in data[0]:
                        categories = [entry["alignment_category"] for entry in data if entry["alignment_category"] is not None]
                    elif "aligned_category_ids" in data[0]:
                        categories = [entry["aligned_category_ids"] for entry in data if entry["aligned_category_ids"] is not None]
                    else:
                        raise ValueError(f"alignment_categories not found in {path}") 
                    for category_list in categories:
                        for category_item in category_list:
                            categories_all.append(category_item)
                except Exception as e:
                    print(f"Error processing {path}: {e}")           
    # Count occurrences of each category
    names, counts = np.unique(categories_all, return_counts=True)

    print(f"\nCounts for {baseline} on {dataset}:")
    for name, count in zip(names, counts):
        print(f"{name}: {count}")
            #matplotlib bar plot
            # plt.figure(figsize=(10, 6))
            # plt.bar(names, counts)
            # plt.xlabel('Categories')
            # plt.ylabel('Counts')
            # plt.title(f'Category Counts for {baseline} on {dataset}')
            # plt.xticks(rotation=45)
            # plt.tight_layout()
            
            


Counts for subq on massmaps:
Connectivity of the Cosmic Web: 123
Density Contrast Extremes: 931
Filament Thickness and Sharpness: 17
Fine-Scale Clumpiness: 316
Lensing Peak (Cluster) Abundance: 361
Void Size and Frequency: 192

Counts for subq on supernova:
Characteristic rise-and-decline rates—such as the fast-rise/slow-fade morphology of many supernovae—encode energy-release physics and serve as strong class discriminators.: 2
Characteristic rise-and-decline rates—such as the fast‑rise/slow‑fade morphology of many supernovae—encode energy‑release physics and serve as strong class discriminators.: 1
Characteristic rise‑and‑decline rates—such as the fast‑rise/slow‑fade morphology of many supernovae—encode energy‑release physics and serve as strong class discriminators.: 295
Contiguous non-zero flux segments confirm genuine astrophysical activity and define the time windows from which transient features should be extracted.: 7
Contiguous non‑zero flux segments confirm genuine astrophys