In [1]:
import os
import json
import pandas as pd

def summarize_json_scores(folder):
    """
    Reads all JSON files in the given folder and computes the average score for each file.
    Assumes each JSON file is a dictionary of numeric values.
    
    Returns:
        A pandas DataFrame with columns: Filename, Average Score, Count of 1s, Ratio of 1s
    """
    rows = []

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(folder, filename)

        with open(filepath, "r") as f:
            data = json.load(f)

        values = list(data.values())
        if not values:
            continue

        avg_score = sum(values) / len(values)
        count_ones = sum(1 for v in values if v == 1)
        ratio_ones = count_ones / len(values)

        rows.append({
            "Filename": filename,
            "Average Score": round(avg_score, 4),
            "Count of 1s": count_ones,
            "Size": len(values)
        })

    return pd.DataFrame(rows)

# Example usage
# df = summarize_json_scores("../data/evaluation_results/")
# print(df)



In [2]:
import os
import json

datasets = ['spinach']

for dataset in datasets:
    folder = f'../data/answers/zero-shot/{dataset}/relation-classification/'
    df = summarize_json_scores(folder)
    print(df)



                                   Filename  Average Score  Count of 1s  Size
0  Containment_gpt-4.1-mini-2025-04-14.json         0.9800          147   150
1  Containment_gpt-4.1-nano-2025-04-14.json         0.4000           60   150
2                  Containment_gpt-4.1.json         0.9933          149   150
3       Equivalence_gpt-4.1-2025-04-14.json         0.9667          145   150
4  Equivalence_gpt-4.1-mini-2025-04-14.json         0.9600          144   150
5  Equivalence_gpt-4.1-nano-2025-04-14.json         0.9467          142   150
6             Minus_gpt-4.1-2025-04-14.json         0.4533           68   150
7        Minus_gpt-4.1-mini-2025-04-14.json         0.4800           72   150
8        Minus_gpt-4.1-nano-2025-04-14.json         0.0133            2   150


### Evaluation Pipeline 

In [3]:
def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity between two sets."""
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    if union == 0:
        return 0.0
    return intersection / union

# For the subset-superset check
def is_subset(set1, set2):
    """Check if set1 is a subset of set2."""
    return set1.issubset(set2)

In [None]:
import os
import json
import pandas as pd

def compute_similarity(
    model: str,
    task: str,
    action: str,
    base_dir: str = '../data/answers',
    output_dir: str = '.',
    star = False
):
    if star:
        prefix = '*'
    else:
        prefix = ''
    # Build file paths
    action_word = action+ ""
    #if action == "wikidata":
    #     action_word = "wikidata_"
    
    if task == "equal":
        ql1_path = os.path.join(base_dir, task, f'{prefix}Q1_{task}_answers_{action_word}{model}.json')
        ql2_path = os.path.join(base_dir, task, f'{prefix}Q2_{task}_answers_{action_word}{model}.json')
    if task == "minus":
        ql3_path = os.path.join(base_dir, task, f'{prefix}Q4_{task}_answers_{action_word}{model}.json')
        ql1_path = os.path.join(base_dir, "equal", f'{prefix}Q1_equal_answers_{action_word}{model}.json')
        ql2_path = os.path.join(base_dir, "sup-sub", f'{prefix}Q3_sup-sub_answers_{action_word}{model}.json')
    if task == "sup-sub":
        ql1_path = os.path.join(base_dir, 'equal', f'{prefix}Q1_equal_answers_{action_word}{model}.json')
        ql2_path = os.path.join(base_dir, 'sup-sub', f'{prefix}Q3_sup-sub_answers_{action_word}{model}.json')
        ql3_path = os.path.join(base_dir, 'minus', f'{prefix}Q4_minus_answers_{action_word}{model}.json')

    # Load data
    try: 
        with open(ql1_path, 'r', encoding='utf-8') as f:
            ql1_answers = json.load(f)
        with open(ql2_path, 'r', encoding='utf-8') as f:
            ql2_answers = json.load(f)
        if task != "equal":
            with open(ql3_path, 'r', encoding='utf-8') as f:
                ql3_answers = json.load(f)
        else:
            ql3_answers = None
    except:
        print(f"Error loading files: {ql1_path}, {ql2_path}, {ql3_path}")   
        return None
    # Compute metrics per question
    similarity_scores = {}
    if task == "minus":
        # For minus task, we need to compare ql1 and ql2 with ql3
        for qid, ans3 in ql3_answers.items():
            set3 = set(ans3)
            #qid = str(int(qid) + 1)
            set_a = set(ql1_answers.get(qid, []))
            set_b = set(ql2_answers.get(qid, [])) 
            set_c =  set_a - set_b 
            sim = jaccard_similarity(set3, set_c)
            is_empty = int(len(set3) == 0 and len(set_c) == 0)
            binary_eq = int(set_c == set3)
            similarity_scores[qid] = (sim, is_empty, binary_eq)
    else:
        for qid, ans1 in ql1_answers.items():
            set1 = set(ans1)
            set2 = set(ql2_answers.get(qid, []))
            is_empty = int(len(set1) == 0 and len(set2) == 0)
            if task == "equal":
                sim = jaccard_similarity(set1, set2)
                binary_eq = int(set1 == set2)
            else:
                set_c = set(ql3_answers.get(qid, []))
                union = set2.union(set_c) 
                sim = jaccard_similarity(set1, union)
                binary_eq = int(is_subset(set2, set1))
            similarity_scores[qid] = (sim, is_empty, binary_eq)

    # Create DataFrame
    sim_df = pd.DataFrame.from_dict(
        similarity_scores,
        orient='index',
        columns=['JaccardSimilarity', 'IsEmptySet', 'BinaryEqual']
    )

    # Ensure index is a column
    sim_df = sim_df.reset_index().rename(columns={'index': 'QuestionID'})

    # Save to TSV
    output_filename = f'{prefix}{task}_{model}_{action}.tsv'
    output_path = os.path.join(output_dir, output_filename)
    sim_df.to_csv(output_path, sep='\t', index=False)
    print(f"Saved similarity results to {output_path}")


In [23]:
# Example usage:
model = ['gpt-4.1-nano-2025-04-14',"gpt-4.1-mini-2025-04-14","gpt-4.1-2025-04-14"]
task = ['equal', 'sup-sub', "minus"]
action = [""]
operations = ['zero-shot',"follow_up_fixing","rel_classification_and_questions"]
datasets = ['spinach']
for dataset in datasets:
    for m in model:
        for t in task:
            for a in action:
                compute_similarity(
                    model=m,
                    task=t,
                    action=a,
                    base_dir=f'../data/answers/{operations[0]}/{dataset}',
                    output_dir=f'../data/evaluation_results/{dataset}',
                )
                compute_similarity(
                    model=m,
                    task=t,
                    action=a,
                    base_dir=f'../data/answers/{operations[0]}/{dataset}',
                    output_dir=f'../data/evaluation_results/{dataset}',
                    star=False
                )


Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-nano-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-mini-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-mini-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-mini-2025-04-14_.tsv
Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-mini-2025-04-14_.tsv
Saved simila

In [24]:
import os
import json
import re
import pandas as pd

def summarize_evaluation_results(folder):
    """
    Summarize per-question TSV evaluation files in a folder.
    Expects filenames in the format: task-model-action.tsv
    and each TSV contains columns: QuestionID, JaccardSimilarity, IsEmptySet, BinaryEqual
    """
    rows = []

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".tsv"):
            continue

        # Extract metadata from filename
        base = filename[:-4]  # remove .tsv
        parts = base.split('_')
        task = parts[0]
        action = parts[-1]
        model = parts[1]

        # Read TSV
        df = pd.read_csv(os.path.join(folder, filename), sep='\t')

        # Ensure JaccardSimilarity column exists
        if 'JaccardSimilarity' not in df.columns:
            raise ValueError(f"Missing 'JaccardSimilarity' in {filename}")

        # Compute overall average
        avg_all = df['JaccardSimilarity'].mean()

        # Identify empty (zero) Jaccard entries
        is_empty = df['JaccardSimilarity'] == 0
        ratio_empty = is_empty.mean()

        # Average excluding empty entries
        non_empty = df.loc[~is_empty, 'JaccardSimilarity']
        avg_non_empty = non_empty.mean() if not non_empty.empty else float('nan')

        # Binary count: proportion where BinaryEqual == 1
        if 'BinaryEqual' in df.columns:
            binary_count = (df['BinaryEqual'] == 1).mean()
        else:
            binary_count = float('nan')

        rows.append({
            "Task": task,
            "Model": model,
            "Action": action,
            "Average_All": round(avg_all, 4),
            "Average_NoEmpty": round(avg_non_empty, 4),
            "Ratio_empty": round(ratio_empty, 4),
            "Binary_count": round(binary_count, 4),
        })

    return pd.DataFrame(rows)

# Example usage:
df = summarize_evaluation_results("../data/evaluation_results/spinach/")
df




Unnamed: 0,Task,Model,Action,Average_All,Average_NoEmpty,Ratio_empty,Binary_count
0,equal,gpt-4.1-2025-04-14,,0.7438,0.7748,0.04,0.4733
1,equal,gpt-4.1-2025-04-14,wikidata,0.798,0.8199,0.0267,0.5
2,equal,gpt-4.1-2025-04-14,classAndAnswer,0.9249,0.9751,0.0515,0.9485
3,equal,gpt-4.1-2025-04-14,fixing,0.9275,0.973,0.0467,0.9067
4,equal,gpt-4.1-mini-2025-04-14,,0.7116,0.7907,0.1,0.5067
5,equal,gpt-4.1-mini-2025-04-14,wikidata,0.7297,0.7549,0.0333,0.44
6,equal,gpt-4.1-mini-2025-04-14,classAndAnswer,0.8755,0.98,0.1067,0.94
7,equal,gpt-4.1-mini-2025-04-14,fixing,0.9138,0.9453,0.0333,0.8533
8,equal,gpt-4.1-nano-2025-04-14,,0.6025,0.7172,0.16,0.3467
9,equal,gpt-4.1-nano-2025-04-14,wikidata,0.6103,0.7042,0.1333,0.3333
