In [78]:
import os
import json
import pandas as pd

def summarize_json_scores(folder):
    """
    Summarize every ``*.json`` score file found directly inside ``folder``.

    Each file is expected to hold a single JSON object whose values are
    numeric scores. Files are visited in sorted filename order, and a file
    whose object is empty is skipped (it would otherwise divide by zero).

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A pandas DataFrame with one row per non-empty file and the columns
        Filename, Average Score (rounded to 4 decimals), Count of 1s
        (number of values equal to 1) and Size (number of values).
        The columns are present even when no file qualifies.
    """
    columns = ["Filename", "Average Score", "Count of 1s", "Size"]
    rows = []

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(folder, filename)

        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        values = list(data.values())
        if not values:
            continue

        rows.append({
            "Filename": filename,
            "Average Score": round(sum(values) / len(values), 4),
            "Count of 1s": sum(1 for v in values if v == 1),
            "Size": len(values),
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Example usage
# df = summarize_json_scores("../data/evaluation_results/")
# print(df)



In [79]:
import os
import json

# Summarize the per-file scores of the relation-classification answer files.
folder = '../data/answers/relation-classification/'
df = summarize_json_scores(folder)
df  # bare last expression so the notebook renders the summary table



Unnamed: 0,Filename,Average Score,Count of 1s,Size
0,*Containment_gpt-4.1-mini-2025-04-14.json,0.9211,35,38
1,*Containment_gpt-4.1-nano-2025-04-14.json,0.4211,16,38
2,*Equivalence_gpt-4.1-mini-2025-04-14.json,0.94,47,50
3,*Equivalence_gpt-4.1-nano-2025-04-14.json,1.0,50,50
4,*Minus_gpt-4.1-mini-2025-04-14.json,0.6053,23,38
5,*Minus_gpt-4.1-nano-2025-04-14.json,0.1316,5,38
6,Containment_gpt-4.1-mini-2025-04-14.json,0.9487,37,39
7,Containment_gpt-4.1-nano-2025-04-14.json,0.8462,33,39
8,Equivalence_gpt-4.1-mini-2025-04-14.json,0.9205,810,880
9,Equivalence_gpt-4.1-nano-2025-04-14.json,0.9409,828,880


### Evaluation Pipeline 

In [8]:
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|; 0.0 when the union is empty."""
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    # Guard against 0/0 when both inputs are empty.
    if not len(combined):
        return 0.0
    return len(shared) / len(combined)

# For the subset-superset check
def is_subset(set1, set2):
    """Return True when every element of set1 also occurs in set2."""
    contained = set1.issubset(set2)
    return contained

In [70]:
import os
import json
import pandas as pd

def compute_similarity(
    model: str,
    task: str,
    action: str,
    base_dir: str = '../data/answers',
    output_dir: str = '.',
    star: bool = False
):
    """
    Compare per-question answer sets and write the scores to a TSV file.

    For every task except "minus", the answers of ``ql1`` and ``ql2`` (read
    from ``base_dir/<task>/``) are compared question by question. For the
    "minus" task, ``ql1``/``ql2`` are read from ``base_dir/sup-sub/`` and the
    set difference ``ql2 - ql1`` is compared with the ``ql3`` answers read
    from ``base_dir/minus/``.

    Each question produces three values: the Jaccard similarity, a flag that
    is 1 when both compared sets are empty, and a flag that is 1 when the two
    sets are exactly equal. A missing or null answer list counts as empty.

    Args:
        model: Model identifier embedded in the answer file names.
        task: Task name; doubles as the sub-folder of ``base_dir``.
        action: "wikidata" selects the ``wikidata_`` answer files; any other
            value selects the plain ones. Also used in the output file name.
        base_dir: Root folder of the answer files.
        output_dir: Folder for the TSV; created if it does not exist.
        star: When True, read and write the ``*``-prefixed file variants.

    Returns:
        None. On success ``<prefix><task>_<model>_<action>.tsv`` is written
        to ``output_dir``; if an input file is missing, unreadable or not
        valid JSON, a message is printed and nothing is written.
    """
    prefix = '*' if star else ''
    action_word = "wikidata_" if action == "wikidata" else ""

    def answer_path(query: str, folder: str) -> str:
        # <base_dir>/<folder>/<prefix><query>_<folder>_answers_<action_word><model>.json
        return os.path.join(
            base_dir, folder,
            f'{prefix}{query}_{folder}_answers_{action_word}{model}.json'
        )

    def load_json(path: str):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    if task == "minus":
        # The minus task reuses the sup-sub answers as its two operands.
        ql1_path = answer_path("ql1", "sup-sub")
        ql2_path = answer_path("ql2", "sup-sub")
        ql3_path = answer_path("ql3", task)
    else:
        ql1_path = answer_path("ql1", task)
        ql2_path = answer_path("ql2", task)
        ql3_path = None

    # Load data. Only I/O and decoding failures are treated as "inputs not
    # available"; anything else (e.g. KeyboardInterrupt) must propagate.
    try:
        ql1_answers = load_json(ql1_path)
        ql2_answers = load_json(ql2_path)
        ql3_answers = load_json(ql3_path) if ql3_path is not None else None
    except (OSError, ValueError) as exc:
        print(f"Error loading files: {ql1_path}, {ql2_path}, {ql3_path} ({exc})")
        return None

    # Compute metrics per question
    similarity_scores = {}
    if task == "minus":
        for qid, ans3 in ql3_answers.items():
            set3 = set(ans3 or [])
            # NOTE(review): ql3 ids are shifted by one to look up the sup-sub
            # answers, presumably 0-based vs 1-based numbering -- confirm.
            qid = str(int(qid) + 1)
            set_a = set(ql1_answers.get(qid) or [])
            set_b = set(ql2_answers.get(qid) or [])
            set_c = set_b - set_a
            sim = jaccard_similarity(set3, set_c)
            is_empty = int(len(set3) == 0 and len(set_c) == 0)
            binary_eq = int(set_c == set3)
            similarity_scores[qid] = (sim, is_empty, binary_eq)
    else:
        for qid, ans1 in ql1_answers.items():
            set1 = set(ans1 or [])
            set2 = set(ql2_answers.get(qid) or [])
            sim = jaccard_similarity(set1, set2)
            is_empty = int(len(set1) == 0 and len(set2) == 0)
            binary_eq = int(set1 == set2)
            similarity_scores[qid] = (sim, is_empty, binary_eq)

    # Create DataFrame, with the question id as a regular column
    sim_df = pd.DataFrame.from_dict(
        similarity_scores,
        orient='index',
        columns=['JaccardSimilarity', 'IsEmptySet', 'BinaryEqual']
    )
    sim_df = sim_df.reset_index().rename(columns={'index': 'QuestionID'})

    # Save to TSV; make sure the target folder exists first.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_filename = f'{prefix}{task}_{model}_{action}.tsv'
    output_path = os.path.join(output_dir, output_filename)
    sim_df.to_csv(output_path, sep='\t', index=False)
    print(f"Saved similarity results to {output_path}")



In [71]:
# Run the evaluation for every model / task / action combination,
# first on the plain answer files and then on the '*'-prefixed ones.
model = ['gpt-4.1-nano-2025-04-14',"gpt-4.1-mini-2025-04-14"]
task = ['equal', 'sup-sub', "minus"]
action = ["original","wikidata"]

for m in model:
    for t in task:
        for a in action:
            for use_star in (False, True):
                compute_similarity(
                    model=m,
                    task=t,
                    action=a,
                    base_dir='../data/answers',
                    output_dir='../data/evaluation_results',
                    star=use_star,
                )


Saved similarity results to ../data/evaluation_results/equal_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/*equal_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/equal_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/*equal_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/sup-sub_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/*sup-sub_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/sup-sub_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/*sup-sub_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/minus_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/*minus_gpt-4.1-nano-2025-04-14_original.tsv
Saved s

In [72]:
import os
import json
import re
import pandas as pd

def summarize_evaluation_results(folder):
    """
    Summarize the per-question TSV evaluation files found in ``folder``.

    File names must look like ``<task>_<model>_<action>.tsv``: the first
    underscore-separated field is the task, the last is the action, and
    everything in between is the model (so model names may themselves
    contain underscores). Each TSV must have a JaccardSimilarity column and
    may have a BinaryEqual column.

    Args:
        folder: Directory holding the TSV files (not scanned recursively).

    Returns:
        A pandas DataFrame with one row per TSV file, in sorted filename
        order, and the columns:
          Task, Model, Action -- parsed from the file name
          Average_All         -- mean JaccardSimilarity over all rows
          Average_NoEmpty     -- mean JaccardSimilarity over rows where it is non-zero
          Ratio_empty         -- share of rows whose JaccardSimilarity is 0
                                 (this is based on the score, not on the
                                 IsEmptySet column)
          Binary_count        -- share of rows with BinaryEqual == 1
                                 (NaN if the column is absent)
        All numeric values are rounded to 4 decimals. The columns are
        present even when the folder contains no TSV file.

    Raises:
        ValueError: if a TSV file name does not have the three expected
            fields, or if a TSV lacks the JaccardSimilarity column.
    """
    columns = [
        "Task", "Model", "Action",
        "Average_All", "Average_NoEmpty", "Ratio_empty", "Binary_count",
    ]
    rows = []

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".tsv"):
            continue

        # Extract metadata from filename
        base = filename[:-4]  # remove .tsv
        parts = base.split('_')
        if len(parts) < 3:
            raise ValueError(
                f"Cannot parse task/model/action from {filename}; "
                "expected <task>_<model>_<action>.tsv"
            )
        task = parts[0]
        action = parts[-1]
        # Re-join the middle fields so an underscore in the model name survives.
        model = '_'.join(parts[1:-1])

        # Read TSV
        df = pd.read_csv(os.path.join(folder, filename), sep='\t')

        if 'JaccardSimilarity' not in df.columns:
            raise ValueError(f"Missing 'JaccardSimilarity' in {filename}")

        # Compute overall average
        avg_all = df['JaccardSimilarity'].mean()

        # Rows with a zero Jaccard score
        is_empty = df['JaccardSimilarity'] == 0
        ratio_empty = is_empty.mean()

        # Average excluding the zero-score rows
        non_empty = df.loc[~is_empty, 'JaccardSimilarity']
        avg_non_empty = non_empty.mean() if not non_empty.empty else float('nan')

        # Proportion of rows where BinaryEqual == 1
        if 'BinaryEqual' in df.columns:
            binary_count = (df['BinaryEqual'] == 1).mean()
        else:
            binary_count = float('nan')

        rows.append({
            "Task": task,
            "Model": model,
            "Action": action,
            "Average_All": round(avg_all, 4),
            "Average_NoEmpty": round(avg_non_empty, 4),
            "Ratio_empty": round(ratio_empty, 4),
            "Binary_count": round(binary_count, 4),
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Build the summary table from every TSV in the evaluation results folder.
# NOTE: this rebinds `df`, which the relation-classification cell also assigns.
df = summarize_evaluation_results("../data/evaluation_results/")
df



Unnamed: 0,Task,Model,Action,Average_All,Average_NoEmpty,Ratio_empty,Binary_count
0,*equal,gpt-4.1-mini-2025-04-14,original,0.5619,0.7593,0.26,0.48
1,*equal,gpt-4.1-mini-2025-04-14,wikidata,0.5143,0.7564,0.32,0.44
2,*equal,gpt-4.1-nano-2025-04-14,original,0.5319,0.7822,0.32,0.46
3,*equal,gpt-4.1-nano-2025-04-14,wikidata,0.3355,0.6989,0.52,0.42
4,*minus,gpt-4.1-mini-2025-04-14,original,0.0087,0.1652,0.9474,0.0526
5,*minus,gpt-4.1-mini-2025-04-14,wikidata,0.0197,0.375,0.9474,0.1316
6,*minus,gpt-4.1-nano-2025-04-14,original,0.0323,0.3067,0.8947,0.1053
7,*minus,gpt-4.1-nano-2025-04-14,wikidata,0.0627,0.5958,0.8947,0.2632
8,*sup-sub,gpt-4.1-mini-2025-04-14,original,0.3081,0.509,0.3947,0.1842
9,*sup-sub,gpt-4.1-mini-2025-04-14,wikidata,0.2268,0.507,0.5526,0.1579
