In [14]:
import os
import json

folder = "../data/evaluation_results/"

for filename in os.listdir(folder):
    filepath = os.path.join(folder, filename)
    
    with open(filepath, "r") as f:
        data = json.load(f)
    
    average_score = sum(data.values()) / len(data)
    
    # Compute ratio of values equal to 1
    count_ones = sum(1 for v in data.values() if v == 1)
    ratio_ones = count_ones / len(data)
    
    print(f"{filename}: avg={average_score:.2f}, ratio_of_ones={ratio_ones:.2%}")


subset_superset_results_gpt-4.1-nano.json: avg=0.23, ratio_of_ones=23.08%
eq_jaccard_similarity_scores_gpt-4.1-mini-2025-04-14.json: avg=0.57, ratio_of_ones=37.16%
eq_percentage_scores_gpt-4.1-nano.json: avg=45.52, ratio_of_ones=0.00%
eq_percentage_scores_gpt-4.1-mini-2025-04-14.json: avg=57.06, ratio_of_ones=0.00%
eq_jaccard_similarity_scores_gpt-4.1-nano.json: avg=0.46, ratio_of_ones=30.57%
subset_superset_results_gpt-4.1-mini-2025-04-14.json: avg=0.28, ratio_of_ones=28.21%


In [20]:
import os
import json
import re
import pandas as pd

def summarize_evaluation_results(folder):
    rows = []

    for filename in os.listdir(folder):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(folder, filename)

        with open(filepath, "r") as f:
            data = json.load(f)

        # Compute metrics
        average_score = sum(data.values()) / len(data)
        count_ones = sum(1 for v in data.values() if v == 1)
        ratio_ones = count_ones / len(data)

        # Detect task type
        if "subset_superset" in filename:
            task = "containment"
        elif "eq_" in filename:
            task = "equivalence"
        else:
            task = "unknown"

        # Extract model name (e.g., gpt-4.1-nano, gpt-4.1-mini-2025-04-14)
        match = re.search(r"gpt-[\w\.-]+", filename)
        model = match.group(0) if match else "unknown"

        rows.append({
            "Task": task,
            "Model": model.strip(".json"),
            "Average": round(average_score, 4),
            "Binary_Count (Ratio of 1s)": round(ratio_ones, 4)
        })

    return pd.DataFrame(rows)

# Example usage
df = summarize_evaluation_results("../data/evaluation_results/")
df


Unnamed: 0,Task,Model,Average,Binary_Count (Ratio of 1s)
0,containment,gpt-4.1-na,0.2308,0.2308
1,equivalence,gpt-4.1-mini-2025-04-14,0.5706,0.3716
2,equivalence,gpt-4.1-na,45.5235,0.0
3,equivalence,gpt-4.1-mini-2025-04-14,57.0559,0.0
4,equivalence,gpt-4.1-na,0.4552,0.3057
5,containment,gpt-4.1-mini-2025-04-14,0.2821,0.2821
