### Evaluation Pipeline 

In [8]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    When both sets are empty the union has no elements, and 0.0 is
    returned instead of dividing by zero.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    # An empty union means both inputs were empty.
    return len(shared) / len(combined) if combined else 0.0

# For the subset-superset check
def is_subset(set1, set2):
    """Return True when every element of set1 also appears in set2."""
    contained = set1.issubset(set2)
    return contained

In [44]:
import os
import json
import pandas as pd

def compute_similarity(
    model: str,
    task: str,
    action: str,
    base_dir: str = '../data/answers',
    output_dir: str = '.',
):
    """Score per-question answer-set agreement and save the scores as a TSV.

    For every task except "minus", the ql1 and ql2 answer sets of each
    question are compared directly. For "minus", the ql3 answers are compared
    against the set difference (ql1 - ql2) of the "sup-sub" answers.

    Args:
        model: Model identifier used as the suffix of the answer file names.
        task: Task name; selects the sub-folder and file-name infix.
            "minus" additionally reads the "sup-sub" ql1/ql2 files.
        action: "wikidata" selects the ``answers_wikidata_`` files; any other
            value selects the plain ``answers_`` files.
        base_dir: Root folder containing one sub-folder per task.
        output_dir: Folder that receives ``{task}_{model}_{action}.tsv``.
            It is created if it does not exist.

    Returns:
        None. On success a TSV with the columns QuestionID,
        JaccardSimilarity, IsEmptySet and BinaryEqual is written. If an
        answer file is missing, unreadable or not valid JSON, a message is
        printed and nothing is written.
    """

    def _load_json(path):
        # Read one answer file; expected to map question id -> list of answers.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _score_pair(expected, actual):
        # (Jaccard similarity, both-sets-empty flag, exact-equality flag).
        return (
            jaccard_similarity(expected, actual),
            int(len(expected) == 0 and len(actual) == 0),
            int(actual == expected),
        )

    # Build file paths
    action_word = "wikidata_" if action == "wikidata" else ""

    if task == "minus":
        # The minus task reuses the sup-sub ql1/ql2 answers and adds its own ql3 file.
        ql1_path = os.path.join(base_dir, "sup-sub", f'ql1_sup-sub_answers_{action_word}{model}.json')
        ql2_path = os.path.join(base_dir, "sup-sub", f'ql2_sup-sub_answers_{action_word}{model}.json')
        ql3_path = os.path.join(base_dir, task, f'ql3_{task}_answers_{action_word}{model}.json')
    else:
        ql1_path = os.path.join(base_dir, task, f'ql1_{task}_answers_{action_word}{model}.json')
        ql2_path = os.path.join(base_dir, task, f'ql2_{task}_answers_{action_word}{model}.json')
        ql3_path = None

    # Load data. Only I/O and decoding failures are treated as "file not
    # available"; anything else (including KeyboardInterrupt) propagates.
    try:
        ql1_answers = _load_json(ql1_path)
        ql2_answers = _load_json(ql2_path)
        ql3_answers = _load_json(ql3_path) if ql3_path is not None else None
    except (OSError, ValueError) as exc:  # ValueError covers json.JSONDecodeError
        print(
            f"Error loading files: {ql1_path}, {ql2_path}, {ql3_path} "
            f"({type(exc).__name__}: {exc})"
        )
        return None

    # Compute metrics per question
    similarity_scores = {}
    if task == "minus":
        # For minus task, compare ql3 with (ql1 - ql2)
        for qid, ans3 in ql3_answers.items():
            set3 = set(ans3)
            # NOTE(review): ql3 ids are shifted by +1 before looking up the
            # sup-sub answers; confirm this offset against the data files.
            qid = str(int(qid) + 1)
            # A question missing from either file counts as an empty answer set.
            set_a = set(ql1_answers.get(qid, []))
            set_b = set(ql2_answers.get(qid, []))
            similarity_scores[qid] = _score_pair(set3, set_a - set_b)
    else:
        for qid, ans1 in ql1_answers.items():
            set1 = set(ans1)
            set2 = set(ql2_answers.get(qid, []))
            similarity_scores[qid] = _score_pair(set1, set2)

    # Create DataFrame, with the question id as a regular column
    sim_df = pd.DataFrame.from_dict(
        similarity_scores,
        orient='index',
        columns=['JaccardSimilarity', 'IsEmptySet', 'BinaryEqual']
    )
    sim_df = sim_df.reset_index().rename(columns={'index': 'QuestionID'})

    # Save to TSV; create the target folder so a fresh checkout does not fail.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_filename = f'{task}_{model}_{action}.tsv'
    output_path = os.path.join(output_dir, output_filename)
    sim_df.to_csv(output_path, sep='\t', index=False)
    print(f"Saved similarity results to {output_path}")



In [45]:
# Example usage: score every model / task / action combination.
model = ['gpt-4.1-nano-2025-04-14', 'gpt-4.1-mini-2025-04-14']
task = ['equal', 'sup-sub', 'minus']
action = ['original', 'wikidata']

for m in model:
    for t in task:
        for a in action:
            # Combinations whose answer files cannot be loaded print an
            # error line and produce no TSV.
            compute_similarity(
                m, t, a,
                base_dir='../data/answers',
                output_dir='../data/evaluation_results',
            )


Saved similarity results to ../data/evaluation_results/equal_gpt-4.1-nano-2025-04-14_original.tsv
Error loading files: ../data/answers/equal/ql1_equal_answers_wikidata_gpt-4.1-nano-2025-04-14.json, ../data/answers/equal/ql2_equal_answers_wikidata_gpt-4.1-nano-2025-04-14.json, None
Saved similarity results to ../data/evaluation_results/sup-sub_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/sup-sub_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/minus_gpt-4.1-nano-2025-04-14_original.tsv
Saved similarity results to ../data/evaluation_results/minus_gpt-4.1-nano-2025-04-14_wikidata.tsv
Saved similarity results to ../data/evaluation_results/equal_gpt-4.1-mini-2025-04-14_original.tsv
Error loading files: ../data/answers/equal/ql1_equal_answers_wikidata_gpt-4.1-mini-2025-04-14.json, ../data/answers/equal/ql2_equal_answers_wikidata_gpt-4.1-mini-2025-04-14.json, None
Saved similarity results to ../data/eval

In [43]:
import os
import json
import re
import pandas as pd

def summarize_evaluation_results(folder):
    """Summarize the per-question TSV evaluation files found in a folder.

    File names are expected to look like ``{task}_{model}_{action}.tsv``:
    the first ``_``-separated part is the task, the last is the action, and
    everything in between is the model (so model names may contain ``_``).
    Files that do not end in ``.tsv`` are ignored.

    Each TSV must contain a ``JaccardSimilarity`` column; ``BinaryEqual`` is
    optional.

    Args:
        folder: Directory containing the TSV files.

    Returns:
        A DataFrame with one row per TSV file (sorted by file name) and the
        columns Task, Model, Action, Average_All, Average_NoEmpty,
        Ratio_empty and Binary_count, all metrics rounded to 4 decimals:

        * Average_All: mean Jaccard similarity over all questions.
        * Ratio_empty: fraction of questions whose Jaccard similarity is
          exactly 0. This includes non-empty but disjoint answer sets; the
          ``IsEmptySet`` column of the TSV is not consulted.
        * Average_NoEmpty: mean Jaccard similarity over the remaining
          (non-zero) questions, NaN if there are none.
        * Binary_count: fraction (not a count) of questions with
          ``BinaryEqual == 1``, NaN if the column is absent.

        The columns are present even when the folder holds no TSV files.

    Raises:
        ValueError: If a TSV file name has fewer than three ``_``-separated
            parts, or a TSV lacks the ``JaccardSimilarity`` column.
    """
    summary_columns = [
        "Task", "Model", "Action",
        "Average_All", "Average_NoEmpty", "Ratio_empty", "Binary_count",
    ]
    rows = []

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".tsv"):
            continue

        # Extract metadata from filename
        base = filename[:-4]  # remove .tsv
        parts = base.split('_')
        if len(parts) < 3:
            raise ValueError(
                f"Unexpected file name {filename!r}; expected task_model_action.tsv"
            )
        task = parts[0]
        action = parts[-1]
        # Re-join the middle parts so underscores inside the model name survive.
        model = '_'.join(parts[1:-1])

        # Read TSV
        df = pd.read_csv(os.path.join(folder, filename), sep='\t')

        # Ensure JaccardSimilarity column exists
        if 'JaccardSimilarity' not in df.columns:
            raise ValueError(f"Missing 'JaccardSimilarity' in {filename}")

        # Compute overall average
        avg_all = df['JaccardSimilarity'].mean()

        # "Empty" here means a Jaccard similarity of exactly zero.
        is_empty = df['JaccardSimilarity'] == 0
        ratio_empty = is_empty.mean()

        # Average excluding the zero-similarity entries
        non_empty = df.loc[~is_empty, 'JaccardSimilarity']
        avg_non_empty = non_empty.mean() if not non_empty.empty else float('nan')

        # Proportion of questions where BinaryEqual == 1
        if 'BinaryEqual' in df.columns:
            binary_count = (df['BinaryEqual'] == 1).mean()
        else:
            binary_count = float('nan')

        rows.append({
            "Task": task,
            "Model": model,
            "Action": action,
            "Average_All": round(avg_all, 4),
            "Average_NoEmpty": round(avg_non_empty, 4),
            "Ratio_empty": round(ratio_empty, 4),
            "Binary_count": round(binary_count, 4),
        })

    # Passing the column list keeps the schema stable for an empty folder.
    return pd.DataFrame(rows, columns=summary_columns)

# Example usage: build the summary table for all saved results and display it.
df = summarize_evaluation_results(folder="../data/evaluation_results/")
df



Unnamed: 0,Task,Model,Action,Average_All,Average_NoEmpty,Ratio_empty,Binary_count
0,equal,gpt-4.1-mini-2025-04-14,original,0.5706,0.7022,0.1875,0.3716
1,equal,gpt-4.1-nano-2025-04-14,original,0.4552,0.6503,0.3,0.3057
2,minus,gpt-4.1-mini-2025-04-14,original,0.0105,0.1324,0.9211,0.0789
3,minus,gpt-4.1-nano-2025-04-14,original,0.0033,0.125,0.9737,0.1053
4,minus,gpt-4.1-nano-2025-04-14,wikidata,0.0,,1.0,0.5789
5,sup-sub,gpt-4.1-mini-2025-04-14,original,0.2654,0.3833,0.3077,0.0513
6,sup-sub,gpt-4.1-nano-2025-04-14,original,0.2119,0.459,0.5385,0.0769
7,sup-sub,gpt-4.1-nano-2025-04-14,wikidata,0.0427,0.8333,0.9487,0.3333
