### Load Questions

In [81]:
import os
import pandas as pd

# The question set lives at <folder>/<language>/<file>.
folder, language, file = "../data/Dataset/", "en", "spinach.tsv"
file_path = os.path.join(folder, language, file)

# Tab-separated, UTF-8 encoded table of questions.
df_questions = pd.read_csv(file_path, sep="\t", encoding="utf-8")

# Store "type" as text; a missing value becomes the string "0".
df_questions["type"] = df_questions["type"].map(
    lambda value: "0" if pd.isna(value) else str(value)
)
df_questions

Unnamed: 0,usparql,Q1,Q2,Q3,Q4,type
0,no_usparql_13,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,0
1,no_usparql_17,Give me all current communist countries.,What countries have a communist government?,Give me all communist countries in Asia.,Give me all communist countries outside of Asia.,0
2,no_usparql_26,Which politicians were married to a German per...,Which politicians had a spouse who was German?,Which politicians were married to a German woman?,Which politicians were married to a German man...,0
3,no_usparql_29,Give me all soccer clubs in Spain that play in...,Can you list every soccer club located in Spai...,Please tell me all the Spanish football clubs ...,Please tell me all the Spanish football clubs ...,0
4,no_usparql_32,Which telecommunications organizations are loc...,Which telecommunications organizations are bas...,Which telecommunications organizations are loc...,Which telecommunications organizations are loc...,0
...,...,...,...,...,...,...
145,no_usparql_3730,Which monarchs of the United Kingdom were marr...,Which British monarchs had spouses of German o...,Which monarchs of the United Kingdom were marr...,Which monarchs of the United Kingdom were marr...,0
146,no_usparql_3743,Give me all writers that won the Nobel Prize i...,List all authors who have received the Nobel P...,Give me all Asian writers that won the Nobel P...,Give me all writers from outside Asia that wo...,0
147,no_usparql_3744,Give me English actors starring in Lovesick.,Which English actors appear in Lovesick?,Give me English male actors starring in Lovesick.,Give me English female or non-binary actors st...,0
148,no_usparql_3802,Which rivers flow into the North Sea?,Which rivers have their mouth on the North Sea?,Which rivers flow through Britain into the Nor...,Which rivers flow into the North Sea but not t...,0


### Load Answers

In [208]:
import os

folder = "../data/answers/"
# Vocabulary of tokens that may appear in an answer file's path.
actions = ["fixing", "classification","wikidata"]
tasks = ['equal', 'sup-sub', "minus"]
datasets = ['spinach']
llms = ['gpt-4.1-2025-04-14', 'gpt-4.1-mini-2025-04-14', 'gpt-4.1-nano-2025-04-14', 'gpt-4o',"o3"]
questions = ["Q1", "Q2", "Q3", "Q4"]
# List to store the full paths of JSON files

json_files = []
# Recursively walk through the directory
for root, dirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".json"):
            full_path = os.path.join(root, file)
            json_files.append(full_path)

# os.walk reports directory entries in arbitrary (file-system dependent) order.
# The rows built from these files are later numbered by order of appearance
# (the "Answer_serie" column), so fix the order to make every run reproducible.
json_files.sort()

print(f"JSON files found:{len(json_files)}")


JSON files found:112


In [237]:
import json
# Column layout of the answers table; "Question" is appended further down.
answer_columns = ["Q_ID", "Q_serie", "action", "task", "dataset", "llm", "Answer"]


def _first_match(candidates, tokens, default=None):
    """Return the first entry of `candidates` that is one of `tokens`, else `default`."""
    return next((candidate for candidate in candidates if candidate in tokens), default)


# Collect plain records and build the frame once, instead of concatenating a
# growing DataFrame inside the loop.
records = []
for file in json_files:
    # Split the path into tokens: every directory level and every
    # underscore-separated part of the file name becomes one token.
    # os.sep is normalised first so the split also works where it is not "/".
    elements = (
        file.replace(os.sep, "/")
        .replace("_", "/")
        .replace(".json", "")
        .split("/")
    )
    question = _first_match(questions, elements)
    # A path that names none of the known actions is treated as "zero-shot".
    action = _first_match(actions, elements, default="zero-shot")
    task = _first_match(tasks, elements)
    dataset = _first_match(datasets, elements)
    llm = _first_match(llms, elements)

    # Ignore files whose path does not identify every dimension.
    if not (question and action and task and dataset and llm):
        continue

    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each file maps a question id to the answer given for it.
    records.extend(
        {
            "Q_ID": key,
            "Q_serie": question,
            "action": action,
            "task": task,
            "dataset": dataset,
            "llm": llm,
            "Answer": value,
        }
        for key, value in data.items()
    )

df_answers = pd.DataFrame(records, columns=answer_columns)


def _question_text(q_id, q_serie):
    """Return the text in column `q_serie` of df_questions at row label int(q_id), or None if that label is absent."""
    row_label = int(q_id)
    if row_label in df_questions.index:
        return df_questions.at[row_label, q_serie]
    return None


# Built as a plain list so the assignment also works when no answers were loaded.
df_answers["Question"] = [
    _question_text(q_id, q_serie)
    for q_id, q_serie in zip(df_answers["Q_ID"], df_answers["Q_serie"])
]

df_answers

Unnamed: 0,Q_ID,Q_serie,action,task,dataset,llm,Answer,Question
0,45,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all spacecraft that flew to Mars.
1,46,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all taikonauts.
2,47,Q1,classification,sup-sub,spinach,gpt-4o,[],Which countries have more than ten volcanoes i...
3,48,Q1,classification,sup-sub,spinach,gpt-4o,"[Neil Armstrong, Buzz Aldrin, Michael Collins]",Who were the crew members on the Apollo 11 mis...
4,49,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all B-sides of the Ramones.
...,...,...,...,...,...,...,...,...
14343,145,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[George the Third, Queen Victoria]",Which British monarchs had spouses of German o...
14344,146,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[Gabriel Garcia Marquez, Toni Morrison, Albert...",List all authors who have received the Nobel P...
14345,147,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,[Johnny Flynn],Which English actors appear in Lovesick?
14346,148,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[River Thames, River Rhine, River Meuse, River...",Which rivers have their mouth on the North Sea?


In [238]:
# Key used for numbering repeated answers; note that "task" is not part of it.
group_keys = ["Q_ID", "Q_serie", "action", "dataset", "llm"]

df_answers = (
    df_answers
    # 1-based position of each row among the rows that share the same key.
    .assign(Answer_serie=lambda frame: frame.groupby(group_keys).cumcount() + 1)
    # Keep only the first row for each key that also includes "task".
    .drop_duplicates(subset=["Q_ID", "Q_serie", "action", "task", "dataset", "llm"])
    .reset_index(drop=True)
)
df_answers

Unnamed: 0,Q_ID,Q_serie,action,task,dataset,llm,Answer,Question,Answer_serie
0,45,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all spacecraft that flew to Mars.,1
1,46,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all taikonauts.,1
2,47,Q1,classification,sup-sub,spinach,gpt-4o,[],Which countries have more than ten volcanoes i...,1
3,48,Q1,classification,sup-sub,spinach,gpt-4o,"[Neil Armstrong, Buzz Aldrin, Michael Collins]",Who were the crew members on the Apollo 11 mis...,1
4,49,Q1,classification,sup-sub,spinach,gpt-4o,[],Give me all B-sides of the Ramones.,1
...,...,...,...,...,...,...,...,...,...
14343,145,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[George the Third, Queen Victoria]",Which British monarchs had spouses of German o...,1
14344,146,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[Gabriel Garcia Marquez, Toni Morrison, Albert...",List all authors who have received the Nobel P...,1
14345,147,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,[Johnny Flynn],Which English actors appear in Lovesick?,1
14346,148,Q2,zero-shot,equal,spinach,gpt-4.1-nano-2025-04-14,"[River Thames, River Rhine, River Meuse, River...",Which rivers have their mouth on the North Sea?,1


In [282]:
# Number of distinct question ids present in the answers table.
df_answers["Q_ID"].drop_duplicates().size

150

### Analysis

In [None]:
from utils import jaccard_similarity

def mark_logical_errors(df):
    """
    Flag, row by row, whether the answers to the four variants of a question agree.

    Only rows with ``Answer_serie == 1`` are evaluated. They are grouped by
    (Q_ID, action, dataset, llm), and a group is scored only when it holds an
    answer for each of Q1, Q2, Q3 and Q4. With A1..A4 the answer sets of
    Q1..Q4, every row of a scored group receives:

    * ``consistency_1`` = 1 if A1 == A2
    * ``consistency_2`` = 1 if A1 == A3 ∪ A4
    * ``consistency_3`` = 1 if A3 ⊆ A1
    * ``consistency_4`` = 1 if A4 ⊆ A1
    * ``consistency_5`` = 1 if A3 and A4 are disjoint
    * ``jaccard_similarity(A1-A2)`` and ``jaccard_similarity(A1-A34)``,
      rounded to 3 decimals

    Rows outside a scored group keep 0 in the consistency columns and -1.0 in
    the similarity columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns Q_ID, Q_serie, action, dataset, llm, Answer
        and Answer_serie. Each evaluated Answer is passed to ``set()``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``isEmpty`` (1 when Answer is an empty list) and
        the columns listed above added. The input frame is not modified.
    """
    df = df.copy()
    df["isEmpty"] = df["Answer"].apply(lambda x: 1 if isinstance(x, list) and len(x) == 0 else 0)

    # Initialize error flag columns
    for i in range(1, 6):
        df[f"consistency_{i}"] = 0

    # Float sentinel: these columns receive float scores below, so they must
    # start out as float instead of being upcast from int on first assignment.
    df["jaccard_similarity(A1-A2)"] = -1.0
    df["jaccard_similarity(A1-A34)"] = -1.0

    variants = ("Q1", "Q2", "Q3", "Q4")
    group_keys = ["Q_ID", "action", "dataset", "llm"]
    # Group the frame that was passed in (not a notebook-level variable), so the
    # row labels of every group are row labels of `df`.
    grouped = df[df["Answer_serie"] == 1].groupby(group_keys)

    for _, group in grouped:
        # First answer seen for each question variant in this group.
        first_answer = {}
        for serie, answer in zip(group["Q_serie"], group["Answer"]):
            first_answer.setdefault(serie, answer)

        # Ensure we have all four Q-series
        if not set(variants) <= first_answer.keys():
            continue

        A1, A2, A3, A4 = (set(first_answer[v]) for v in variants)
        A34 = A3 | A4

        scores = {
            "consistency_1": int(A1 == A2),
            "consistency_2": int(A1 == A34),
            "consistency_3": int(A3.issubset(A1)),
            "consistency_4": int(A4.issubset(A1)),
            "consistency_5": int(A3.isdisjoint(A4)),
            "jaccard_similarity(A1-A2)": round(float(jaccard_similarity(A1, A2)), 3),
            "jaccard_similarity(A1-A34)": round(float(jaccard_similarity(A1, A34)), 3),
        }

        # Every row of the group carries the group's result.
        for column, value in scores.items():
            df.loc[group.index, column] = value

    return df


In [310]:
from utils import jaccard_similarity
import pandas as pd

def analysis(df):
    """
    Build one summary row per (Q_ID, action, dataset, llm) combination.

    The rows with ``Answer_serie == 1`` are grouped by those four columns. A
    group is reported only when it holds an answer for each of Q1, Q2, Q3 and
    Q4. With A1..A4 the answer sets of Q1..Q4, and A1* / A1** the Q1 answers
    stored under ``Answer_serie`` 2 and 3 for the same combination (empty set
    when there is none), each row contains:

    * the four key columns,
    * the 0/1 checks ``?A1=A2``, ``?A1=A3+A4``, ``?A1>A3``, ``?A1>A4``, ``?A3∅A4``,
    * the Jaccard similarities ``J(A1-A2)``, ``J(A1-A34)``, ``J(A1-A1*)``,
      ``J(A1-A1**)``, rounded to 4 decimals,
    * the question texts ``Q1``..``Q4``,
    * the answers ``A1``..``A4``, ``A1*``, ``A1**`` as lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns Q_ID, Q_serie, action, dataset, llm, Answer,
        Question and Answer_serie. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per reported group, in the column order listed above.
    """
    group_keys = ["Q_ID", "action", "dataset", "llm"]
    variants = ("Q1", "Q2", "Q3", "Q4")

    # Index the Q1 answers once by (group key, Answer_serie) so the repeated
    # answers can be looked up directly instead of re-filtering the whole frame
    # for every group. setdefault keeps the first matching row.
    q1_rows = df[df["Q_serie"] == "Q1"]
    q1_answers = {}
    for *key, serie, answer in zip(
        *(q1_rows[col] for col in group_keys),
        q1_rows["Answer_serie"],
        q1_rows["Answer"],
    ):
        q1_answers.setdefault((tuple(key), serie), answer)

    summaries = []
    for keys, group in df[df["Answer_serie"] == 1].groupby(group_keys):
        first_answer = {}  # first answer seen per question variant
        q_map = {}         # question text per variant (last row wins)
        for serie, answer, question in zip(
            group["Q_serie"], group["Answer"], group["Question"]
        ):
            first_answer.setdefault(serie, answer)
            q_map[serie] = question

        # Only groups that answered all four variants are reported.
        if not set(variants) <= first_answer.keys():
            continue

        A1, A2, A3, A4 = (set(first_answer[v]) for v in variants)
        A34 = A3 | A4

        # NOTE(review): a missing repeat is represented by an empty set, so its
        # similarity score cannot be told apart from that of a genuinely empty
        # repeated answer - confirm this is intended before averaging these scores.
        A1_prime = set(q1_answers.get((tuple(keys), 2), ()))
        A1_double_prime = set(q1_answers.get((tuple(keys), 3), ()))

        # Logical checks
        consistency = {
            "?A1=A2": int(A1 == A2),
            "?A1=A3+A4": int(A1 == A34),
            "?A1>A3": int(A3.issubset(A1)),
            "?A1>A4": int(A4.issubset(A1)),
            "?A3∅A4": int(A3.isdisjoint(A4)),
        }

        # Jaccard similarities
        similarities = {
            "J(A1-A2)": round(jaccard_similarity(A1, A2), 4),
            "J(A1-A34)": round(jaccard_similarity(A1, A34), 4),
            "J(A1-A1*)": round(jaccard_similarity(A1, A1_prime), 4),
            "J(A1-A1**)": round(jaccard_similarity(A1, A1_double_prime), 4),
        }

        summaries.append({
            **dict(zip(group_keys, keys)),
            **consistency,
            **similarities,
            **{variant: q_map[variant] for variant in variants},
            "A1": list(A1),
            "A2": list(A2),
            "A3": list(A3),
            "A4": list(A4),
            "A1*": list(A1_prime),
            "A1**": list(A1_double_prime),
        })

    return pd.DataFrame(summaries)


In [311]:
# Build the per-(Q_ID, action, dataset, llm) consistency table from the loaded answers.
df_analysis = analysis(df_answers)
df_analysis

Unnamed: 0,Q_ID,action,dataset,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,J(A1-A2),...,Q1,Q2,Q3,Q4,A1,A2,A3,A4,A1*,A1**
0,0,classification,spinach,gpt-4.1-2025-04-14,0,0,1,0,1,0.3333,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[Seven Deadly Sins, The Human Face, Whoops Apo...","[Seven Deadly Sins (1970 television series), F...","[Seven Deadly Sins, The Human Face, Whoops Apo...",[The Secret Policeman's Ball 2006],"[The Secret Policeman's Ball, Seven Deadly Sin...","[Seven Deadly Sins (1970 television series), F..."
1,0,classification,spinach,gpt-4.1-mini-2025-04-14,1,0,1,0,1,1.0000,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[Monty Python's Flying Circus, A Fish Called W...","[Monty Python's Flying Circus, A Fish Called W...","[The Frost Report, Ripping Yarns, Fawlty Tower...",[A Fish Called Wanda (television adaptation)],"[The World of Wooster, Monty Python's Flying C...","[Monty Python's Flying Circus, A Fish Called W..."
2,0,classification,spinach,gpt-4.1-nano-2025-04-14,1,1,1,1,1,0.0000,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,[],[],[],[],[],[]
3,0,fixing,spinach,gpt-4.1-2025-04-14,0,1,1,1,1,0.6667,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[Hold the Sunset, Fawlty Towers, Monty Python'...","[Fawlty Towers, Monty Python's Flying Circus]","[Fawlty Towers, Monty Python's Flying Circus]",[Hold the Sunset],"[Hold the Sunset, Fawlty Towers, Monty Python'...","[Fawlty Towers, Monty Python's Flying Circus]"
4,0,fixing,spinach,gpt-4.1-mini-2025-04-14,0,0,1,0,1,0.6667,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[Monty Python's Flying Circus, A Fish Called W...","[Monty Python's Flying Circus, A Fish Called W...","[The Frost Report, Ripping Yarns, Fawlty Tower...",[A Fish Called Wanda (television pilot)],"[Monty Python's Flying Circus, A Fish Called W...","[Monty Python's Flying Circus, A Fish Called W..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,99,zero-shot,spinach,gpt-4.1-2025-04-14,1,1,1,1,1,1.0000,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Wisconsin]","[Kentucky, Iowa, Missouri]",[],[]
2502,99,zero-shot,spinach,gpt-4.1-mini-2025-04-14,1,0,0,1,0,1.0000,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Wisconsin, Michigan]","[Indiana, Kentucky, Iowa, Wisconsin, Missouri]",[],[]
2503,99,zero-shot,spinach,gpt-4.1-nano-2025-04-14,0,0,1,0,0,0.6250,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Arkansas, Tennessee, Indiana, Kentucky, Iowa,...","[Indiana, Wisconsin, Iowa, Missouri]","[Kentucky, Missouri, Tennessee]",[],[]
2504,99,zero-shot,spinach,gpt-4o,1,1,1,1,1,1.0000,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Kentucky, Iowa, Wisconsin, Missouri]","[Indiana, Wisconsin]","[Kentucky, Iowa, Missouri]",[],[]


In [313]:
def summary(df_analysis):
    """
    Compute group-wise means from the output of `analysis(df)`.

    Rows are grouped by (dataset, action, llm). Averaged per group are the five
    consistency flags, the four Jaccard similarities, any column whose name
    starts with ``p_value_`` and, for each of A1..A4, the share of rows whose
    answer is an empty list (``A1_empty_ratio`` .. ``A4_empty_ratio``).
    Rows in which any Jaccard column equals -1 are excluded beforehand.

    Parameters
    ----------
    df_analysis : pandas.DataFrame
        Must provide the group columns, the consistency and Jaccard columns
        and the answer columns A1..A4. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per group with the means rounded to 4 decimals.
    """
    group_cols = ["dataset", "action", "llm"]

    # Columns to average
    consistency_cols = ["?A1=A2", "?A1=A3+A4", "?A1>A3", "?A1>A4", "?A3∅A4"]
    jaccard_cols = ["J(A1-A2)", "J(A1-A34)", "J(A1-A1*)", "J(A1-A1**)"]
    pval_cols = [col for col in df_analysis.columns if col.startswith("p_value_")]
    metric_cols = consistency_cols + jaccard_cols + pval_cols

    answer_cols = ["A1", "A2", "A3", "A4"]
    empty_cols = [f"{a}_empty_ratio" for a in answer_cols]

    # Work on a copy so the helper columns below are not added to the caller's frame.
    df_valid = df_analysis.copy()

    # 0/1 flag per answer column; its group mean is the empty-answer ratio.
    for answer_col, flag_col in zip(answer_cols, empty_cols):
        df_valid[flag_col] = df_valid[answer_col].apply(
            lambda x: int(isinstance(x, list) and len(x) == 0)
        )

    # Filter out rows with invalid Jaccard scores (-1 in any Jaccard column).
    df_valid = df_valid[(df_valid[jaccard_cols] != -1).all(axis=1)]

    # Compute means grouped by dataset, action, and llm
    return (
        df_valid
        .groupby(group_cols)[metric_cols + empty_cols]
        .mean()
        .reset_index()
        .round(4)
    )


In [314]:
# Aggregate the per-question table into mean scores per (dataset, action, llm).
# NOTE(review): "df_summery" is presumably a typo for "df_summary"; the name is
# left unchanged here in case other cells refer to it.
df_summery = summary(df_analysis)
df_summery

Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,J(A1-A2),J(A1-A34),J(A1-A1*),J(A1-A1**),A1_empty_ratio,A2_empty_ratio,A3_empty_ratio,A4_empty_ratio
0,spinach,classification,gpt-4.1-2025-04-14,0.4124,0.5155,0.9485,0.7113,0.8763,0.7204,0.8272,0.7598,0.7187,0.0103,0.0515,0.0515,0.1443
1,spinach,classification,gpt-4.1-mini-2025-04-14,0.3867,0.4467,0.9667,0.72,0.8867,0.5929,0.6855,0.5664,0.5971,0.1,0.1067,0.14,0.2867
2,spinach,classification,gpt-4.1-nano-2025-04-14,0.8267,0.6667,0.84,0.8867,0.9867,0.0589,0.0617,0.0,0.0087,0.9,0.82,0.74,0.8733
3,spinach,fixing,gpt-4.1-2025-04-14,0.6133,0.6333,0.9467,0.7067,0.94,0.8176,0.8535,0.8417,0.8455,0.04,0.0333,0.0733,0.1133
4,spinach,fixing,gpt-4.1-mini-2025-04-14,0.5067,0.52,0.8867,0.7333,0.8467,0.7439,0.8074,0.7916,0.7759,0.0133,0.02,0.04,0.1267
5,spinach,fixing,gpt-4.1-nano-2025-04-14,0.24,0.3333,0.6467,0.5667,0.5667,0.3337,0.4644,0.5406,0.3253,0.1733,0.1667,0.14,0.1867
6,spinach,fixing,gpt-4o,0.5842,0.5644,0.9109,0.7129,0.8911,0.4864,0.5482,0.4952,0.4782,0.2871,0.3861,0.297,0.3861
7,spinach,fixing,o3,0.1967,0.3443,0.9508,0.3934,0.8689,0.5327,0.7311,0.5594,0.5181,0.0492,0.0492,0.0328,0.0656
8,spinach,wikidata,gpt-4.1-2025-04-14,0.5,0.2133,0.5667,0.44,0.6467,0.798,0.62,0.0,0.0,0.0,0.0,0.04,0.0933
9,spinach,wikidata,gpt-4.1-mini-2025-04-14,0.44,0.1533,0.4867,0.4267,0.4867,0.7297,0.567,0.0,0.0,0.0,0.0,0.0067,0.0467
