# Evaluation

### Answers 

In [59]:
import os
import json
import pandas as pd
from utils import jaccard_similarity
import datetime

def get_answer_set(df, q_serie, task):
    match = df[(df["Q_serie"] == q_serie) & (df["task"] == task)]
    if not match.empty:
        return set(match["Answer"].values[0])
    return set()

def load_question(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path, sep="\t", encoding="utf-8")
    # df["type"] = df["type"].apply(lambda x: str(x) if not pd.isna(x) else "0")
    return df



In [60]:
def load_all_questions(root_dir, datasets, languages):
    """
    Load and merge question files from multiple datasets and languages.

    Args:
        root_dir (str): Base directory containing the question files.
        datasets (list): List of dataset names.
        languages (list): List of language codes.
        load_questions_fn (Callable): Function to load a TSV file into a DataFrame.

    Returns:
        pd.DataFrame: Merged DataFrame with original index stored as 'q_index',
                      and columns 'dataset' and 'lang' added.
    """
    all_dfs = []

    for dataset in datasets:
        for lang in languages:
            question_path = os.path.join(root_dir, "data", "Dataset", lang, f"{dataset}.tsv")
            if not os.path.exists(question_path):
                print(f"File not found: {question_path}")
                continue

            df = load_question(question_path)
            df = df.copy()
            df["q_index"] = df.index
            df["dataset"] = dataset
            df["lang"] = lang

            all_dfs.append(df)

    return pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()



In [61]:
def load_answers(folder: str, datasets, llms, actions, tasks, languages, questions) -> pd.DataFrame:
    df_answers = pd.DataFrame(columns=["Q_ID", "Q_serie", "action", "task", "dataset", "lang","llm"])

    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(folder)
        for file in files if file.endswith(".json")
    ]

    print(f"JSON files found: {len(json_files)}")

    for file in json_files:
        if not file.split("/")[-1].startswith("Q"):
            continue
        elements = file.replace("_", "/").replace(".json", "").split("/")
        question = next((q for q in questions if q in elements), None)
        action = next((a for a in actions if a in elements), "zero-shot")
        task = next((t for t in tasks if t in elements), None)
        dataset = next((d for d in datasets if d in elements), None)
        lang = next((l for l in languages if l in elements), None)
        llm = next((l for l in llms if l in elements), None)

        if all([question, action, task, dataset, llm]):
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            df = pd.DataFrame([{"Q_ID": key, "Answer": value} for key, value in data.items()])
            df["Q_serie"] = question
            df["action"] = action
            df["task"] = task
            df["dataset"] = dataset
            df["llm"] = llm
            df["lang"] = lang
            df_answers = pd.concat([df_answers, df], ignore_index=True)

    return df_answers

In [62]:
def enrich_answers(df_answers, df_questions):
    df_answers["Question"] = df_answers.apply(
        lambda x: df_questions.loc[
            (df_questions["q_index"] == int(x["Q_ID"])) &
            (df_questions["dataset"] == x["dataset"])
        ][x["Q_serie"]].values[0]
        if not df_questions.loc[
            (df_questions["q_index"] == int(x["Q_ID"])) &
            (df_questions["dataset"] == x["dataset"]) 
        ].empty else None,
        axis=1
    )

    df_answers.drop_duplicates(
        subset=["Q_ID", "Q_serie", "action", "task", "dataset", "llm"],
        inplace=True
    )
    df_answers["Answer"] = df_answers["Answer"].apply(lambda x: x if isinstance(x, list) else [])
    df_answers.reset_index(drop=True, inplace=True)
    return df_answers

In [228]:
def analysis(df):
    rows = []
    group_keys = ["Q_ID", "action", "dataset", "llm"]
    grouped = df.groupby(group_keys)

    for keys, group in grouped: 
        if set(group["Q_serie"]) >= {"Q1", "Q2", "Q3", "Q4"}:
            action = group["action"].values[0]
            if action in ["zero-shot", "wikidata"]:
                A1 = get_answer_set(group, "Q1", "equal")
                A2 = get_answer_set(group, "Q2", "equal")
                A3 = get_answer_set(group, "Q3", "sup-sub")
                A4 = get_answer_set(group, "Q4", "minus")

                A1_prime = None
                A1_double_prime = None

                similarities = {
                    "J(A1-A2)": round(jaccard_similarity(A1, A2), 4),
                    "J(A1-A34)": round(jaccard_similarity(A1, A3.union(A4)), 4),
                    "J(A1-A1*)": None,
                    "J(A1-A1**)": None,
                    "J(A1*-A1**)": None
                    }
                consistency = {
                    "?A1=A2": int(A1 == A2),
                    "?A1=A3+A4": int(A1 == A3.union(A4)),
                    "?A1>A3": int(A3.issubset(A1)),
                    "?A1>A4": int(A4.issubset(A1)),
                    "?A3∅A4": int(A3.isdisjoint(A4)),
                    "?A1=A1*": None,
                    "?A1=A1**": None,
                    "?A1*=A1**": None
                    }
            elif action in ['classification','fixing']:
                # Usage
                A1_equal = get_answer_set(group, "Q1", "equal")
                A1_contain = get_answer_set(group, "Q1", "sup-sub")
                A1_minus = get_answer_set(group, "Q1", "minus")
                A2_equal = get_answer_set(group, "Q2", "equal")
                A3_contain = get_answer_set(group, "Q3", "sup-sub")
                A3_minus = get_answer_set(group, "Q3", "minus")
                A4_minus = get_answer_set(group, "Q4", "minus")
                similarities = {
                    "J(A1-A2)": round(jaccard_similarity(A1_equal, A2_equal), 4),
                    "J(A1-A34)": round(jaccard_similarity(A1_minus, A3_minus.union(A4_minus)), 4),
                    "J(A1-A1*)": round(jaccard_similarity(A1_equal, A1_contain), 4),
                    "J(A1-A1**)": round(jaccard_similarity(A1_equal, A1_minus), 4),
                    "J(A1*-A1**)": round(jaccard_similarity(A1_contain, A1_minus), 4)
                    }
                consistency = {
                    "?A1=A2": int(A1_equal == A2_equal),
                    "?A1=A3+A4": int(A1_minus == A3_minus.union(A4_minus)),
                    "?A1>A3": int(A3_contain.issubset(A1_contain)),
                    "?A1>A4": int(A4_minus.issubset(A1_minus)),
                    "?A3∅A4": int(A3_minus.isdisjoint(A4_minus)),
                    "?A1=A1*": int(A1_equal == A1_contain),
                    "?A1=A1**": int(A1_equal == A1_minus),
                    "?A1*=A1**": int(A1_contain == A1_minus)
                    }

                A1 = A1_equal
                A2 = A2_equal
                A3 = A3_contain
                A4 = A4_minus
                A1_prime = list(A1_contain)
                A1_double_prime = list(A1_minus)
                
            q_map = {
                row["Q_serie"]: row["Question"]
                for _, row in group.iterrows()
                if row["Q_serie"] in {"Q1", "Q2", "Q3", "Q4"}
            }

            row = {
                "Q_ID": keys[0], "action": keys[1], "dataset": keys[2], "llm": keys[3],
                **consistency, **similarities,
                "Q1": q_map.get("Q1", ""), "Q2": q_map.get("Q2", ""),
                "Q3": q_map.get("Q3", ""), "Q4": q_map.get("Q4", ""),
                "A1": list(A1), "A2": list(A2), "A3": list(A3), "A4": list(A4),
                "A1*": A1_prime, "A1**": A1_double_prime
            }
            rows.append(row)

    return pd.DataFrame(rows)

In [229]:
def summary(df_analysis):
    group_cols = ["dataset", "action", "llm"]
    consistency_cols = ["?A1=A2", "?A1=A3+A4", "?A1>A3", "?A1>A4", "?A3∅A4", "?A1=A1*", "?A1=A1**","?A1*=A1**"]
    jaccard_cols = ["J(A1-A2)", "J(A1-A34)", "J(A1-A1*)", "J(A1-A1**)","J(A1*-A1**)"]
    pval_cols = [col for col in df_analysis.columns if col.startswith("p_value_")]
    metric_cols = consistency_cols + jaccard_cols + pval_cols

    for a in ["A1", "A2", "A3", "A4"]:
        df_analysis[f"idk_{a}"] = df_analysis[a].apply(lambda x: int(
        (isinstance(x, list) and len(x) == 0)       # []
        or (x == "idk")                             # "idk"
        or (isinstance(x, list) and x == ["idk"])   # ["idk"]
    ))

    empty_cols = [f"idk_{a}" for a in ["A1", "A2", "A3", "A4"]]


    df_summary = (
        df_analysis
        .groupby(group_cols)[metric_cols + empty_cols]
        .mean()
        .reset_index()
        .round(4)
    )
    group_cols_overall = ["action", "llm"]
    df_summary_extend = (
        df_analysis
        .groupby(group_cols_overall)[metric_cols + empty_cols]
        .mean()
        .reset_index()
        .round(4)
    )
    df_summary_extend["dataset"] = "overall"
    
    df_summary = pd.concat([df_summary, df_summary_extend], ignore_index=True)
    df_summary["?A1=A1(ave)"] = df_summary[["?A1=A1*", "?A1=A1**","?A1*=A1**"]].mean(axis=1).round(4)
    df_summary["J_A1_ave"] = df_summary[["J(A1-A1*)", "J(A1-A1**)", "J(A1*-A1**)"]].mean(axis=1).round(4)
    return df_summary

In [65]:
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
datasets=["spinach", "qawiki",'synthetic']
# llms = ['gpt-4.1-2025-04-14', 'gpt-4.1-mini-2025-04-14', 'gpt-4.1-nano-2025-04-14', 
#         'gpt-4o','o3','gpt-5-nano',"gpt-5-mini","gpt-5",
#         "gemini-2.0-flash","gemini-2.5-flash","gemini-2.5-pro",
#         "grok-3-mini","deepseek-chat","deepseek-reasoner","llama3.1:8b","llama3.3:70b"]
llms = ['gpt-5']
actions = ["fixing", "classification", "wikidata"]
tasks = ['equal', 'sup-sub', "minus"]
languages = ['en']

df_questions = load_all_questions(root_dir, datasets, languages)


In [66]:
df_questions

Unnamed: 0,usparql,Q1,Q2,Q3,Q4,type,q_index,dataset,lang,q1,q2,Type,Unnamed: 7,0: fully containment
0,no_usparql_13,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,,0,spinach,en,,,,,
1,no_usparql_17,Give me all current communist countries.,What countries have a communist government?,Give me all communist countries in Asia.,Give me all communist countries outside of Asia.,,1,spinach,en,,,,,
2,no_usparql_26,Which politicians were married to a German per...,Which politicians had a spouse who was German?,Which politicians were married to a German woman?,Which politicians were married to a German man...,,2,spinach,en,,,,,
3,no_usparql_29,Give me all soccer clubs in Spain that play in...,Can you list every soccer club located in Spai...,Please tell me all the Spanish football clubs ...,Please tell me all the Spanish football clubs ...,,3,spinach,en,,,,,
4,no_usparql_32,Which telecommunications organizations are loc...,Which telecommunications organizations are bas...,Which telecommunications organizations are loc...,Which telecommunications organizations are loc...,,4,spinach,en,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,,In which universities do Nobel Prize Laureates...,Which universities have Nobel laureates on the...,Which universities have Nobel laureates in Eco...,Which universities have Nobel laureates on the...,,145,synthetic,en,,,,,
446,,With which countries does India have ongoing b...,Which countries currently have territorial dis...,With which countries that are nuclear powers d...,With which countries that are not nuclear powe...,,146,synthetic,en,,,,,
447,,What countries on the Arabian Peninsula are co...,Which countries are in the Middle East and on ...,What countries on the Arabian Peninsula are pa...,What countries on the Arabian Peninsula are pa...,,147,synthetic,en,,,,,
448,,What rivers over 500 kilometers long flow into...,Which rivers over 500 kms in length feed into ...,What rivers over 500 kilometers long flow into...,What rivers over 500 kilometers long flow into...,,148,synthetic,en,,,,,


In [67]:
df_answers = load_answers(
    folder=root_dir + "/data/answers/",
    datasets = datasets,
    llms=llms,
    actions=actions,
    tasks=tasks,
    languages=languages,
    questions=["Q1", "Q2", "Q3", "Q4"]
)

df_answers = enrich_answers(df_answers, df_questions)

JSON files found: 1185


In [68]:
df_answers

Unnamed: 0,Q_ID,Q_serie,action,task,dataset,lang,llm,Answer,Question
0,0,Q1,classification,sup-sub,spinach,,gpt-5,"[At Last the 1948 Show, Monty Python's Flying ...",Which television shows were created by John Cl...
1,1,Q1,classification,sup-sub,spinach,,gpt-5,"[People's Republic of China, Republic of Cuba,...",Give me all current communist countries.
2,2,Q1,classification,sup-sub,spinach,,gpt-5,[idk],Which politicians were married to a German per...
3,3,Q1,classification,sup-sub,spinach,,gpt-5,"[Futbol Club Barcelona, Real Madrid Club de Fú...",Give me all soccer clubs in Spain that play in...
4,4,Q1,classification,sup-sub,spinach,,gpt-5,[idk],Which telecommunications organizations are loc...
...,...,...,...,...,...,...,...,...,...
9895,146,Q1,wikidata,equal,qawiki,,gpt-5,"[Álvaro Dias, Cabo Daciolo, Ciro Gomes, Felipe...",Who has run against Jair Bolsonaro for Preside...
9896,147,Q1,wikidata,equal,qawiki,,gpt-5,"[Lee Hae-chan, Han Myeong-sook, Han Duck-soo, ...",Who has served as Prime Minister of South Kore...
9897,148,Q1,wikidata,equal,qawiki,,gpt-5,"[Marie Curie, Linus Pauling]",Who has won more than one type of Nobel Prize?
9898,149,Q1,wikidata,equal,qawiki,,gpt-5,[idk],Who were the candidates in the 1972 United Sta...


In [230]:
df_analysis = analysis(df_answers)

In [231]:
print(df_analysis.columns)
df_analysis

Index(['Q_ID', 'action', 'dataset', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3',
       '?A1>A4', '?A3∅A4', '?A1=A1*', '?A1=A1**', '?A1*=A1**', 'J(A1-A2)',
       'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)', 'J(A1*-A1**)', 'Q1', 'Q2', 'Q3',
       'Q4', 'A1', 'A2', 'A3', 'A4', 'A1*', 'A1**'],
      dtype='object')


Unnamed: 0,Q_ID,action,dataset,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,?A1=A1*,...,Q1,Q2,Q3,Q4,A1,A2,A3,A4,A1*,A1**
0,0,classification,qawiki,gpt-5,1,1,1,1,1,1.0,...,In which countries are tepuis found?,In which countries are tepuis located?,In which Spanish-speaking countries are tepuis...,In which countries that are not Spanish-speaki...,"[Venezuela, Guyana, Brazil]","[Venezuela, Guyana, Brazil]",[],"[Guyana, Brazil]","[Venezuela, Guyana, Brazil]","[Venezuela, Guyana, Brazil]"
1,0,classification,spinach,gpt-5,1,1,1,1,1,0.0,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[At Last the 1948 Show, Fawlty Towers, Monty P...","[At Last the 1948 Show, Fawlty Towers, Monty P...",[],[],"[At Last the 1948 Show, Fawlty Towers, Monty P...","[At Last the 1948 Show, Fawlty Towers, Monty P..."
2,0,classification,synthetic,gpt-5,1,1,1,1,1,1.0,...,Name the films for which Quentin Tarantino was...,Which movies were directed by Quentin Tarantino?,Name the films directed by Quentin Tarantino s...,Name the films directed by Quentin Tarantino n...,"[The Hateful Eight, Django Unchained, Kill Bil...","[The Hateful Eight, Django Unchained, Kill Bil...",[],"[The Hateful Eight, Django Unchained, Kill Bil...","[The Hateful Eight, Django Unchained, Kill Bil...","[The Hateful Eight, Django Unchained, Kill Bil..."
3,0,fixing,qawiki,gpt-5,1,1,1,1,1,0.0,...,In which countries are tepuis found?,In which countries are tepuis located?,In which Spanish-speaking countries are tepuis...,In which countries that are not Spanish-speaki...,"[Suriname, Guyana, Brazil, Colombia, Venezuela]","[Suriname, Guyana, Brazil, Colombia, Venezuela]","[Colombia, Venezuela]","[Suriname, Guyana, Brazil]","[Colombia, Venezuela, Guyana, Brazil]","[Suriname, Guyana, Brazil, Colombia, Venezuela]"
4,0,fixing,spinach,gpt-5,0,1,1,1,1,0.0,...,Which television shows were created by John Cl...,What television programs did John Cleese create?,Which television shows were created by John Cl...,Which television shows were created by John Cl...,"[Fawlty Towers, Monty Python's Flying Circus]","[At Last the 1948 Show, Fawlty Towers, Monty P...","[At Last the 1948 Show, Monty Python's Fliegen...",[],"[At Last the 1948 Show, Monty Python's Fliegen...","[At Last the 1948 Show, Fawlty Towers, Monty P..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,99,wikidata,spinach,gpt-5,1,1,1,1,1,,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Wisconsin]","[Missouri, Iowa, Kentucky]",,
1796,99,wikidata,synthetic,gpt-5,1,0,0,1,1,,...,Which colors are on the flag of both Germany a...,What are the colors used for both the German a...,What primary colors are used for both the Germ...,What non-primary colors are used for both the ...,"[black, red]","[black, red]","[yellow, red]",[black],,
1797,99,zero-shot,qawiki,gpt-5,1,1,1,1,0,,...,What languages are pro-drop?,What languages drop pronouns?,Which pro-drop languages use Latin script?,Which pro-drop languages do not use Latin script?,[idk],[idk],[idk],[idk],,
1798,99,zero-shot,spinach,gpt-5,1,1,1,1,1,,...,Which states border Illinois?,What states share a border with Illinois?,Which states in the Great Lakes region border ...,Which states not in the Great Lakes region bor...,"[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Wisconsin]","[Missouri, Iowa, Kentucky]",,


In [232]:
df_summary = summary(df_analysis)
df_summary

Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,?A1=A1*,?A1=A1**,...,J(A1-A34),J(A1-A1*),J(A1-A1**),J(A1*-A1**),idk_A1,idk_A2,idk_A3,idk_A4,?A1=A1(ave),J_A1_ave
0,qawiki,classification,gpt-5,0.9133,0.9267,1.0,0.9733,0.52,0.5267,0.6067,...,0.9644,0.7286,0.7324,0.746,0.4133,0.4067,1.0,0.5333,0.58,0.7357
1,qawiki,fixing,gpt-5,0.8933,0.8933,0.9533,0.9867,0.62,0.7067,0.7067,...,0.9191,0.822,0.8343,0.8067,0.5133,0.5267,0.5067,0.58,0.7,0.821
2,qawiki,wikidata,gpt-5,0.82,0.64,0.7867,0.7667,0.5,,,...,0.7825,,,,0.66,0.6467,0.54,0.5933,,
3,qawiki,zero-shot,gpt-5,0.68,0.5667,0.7267,0.74,0.5867,,,...,0.7421,,,,0.5467,0.5267,0.44,0.5267,,
4,spinach,classification,gpt-5,0.9333,0.86,1.0,0.9733,0.7,0.62,0.6067,...,0.9308,0.7761,0.787,0.7876,0.3,0.2933,1.0,0.3867,0.6133,0.7836
5,spinach,fixing,gpt-5,0.94,0.84,0.94,0.9733,0.78,0.7267,0.62,...,0.8977,0.8527,0.7902,0.816,0.3533,0.3533,0.3333,0.3867,0.6667,0.8196
6,spinach,wikidata,gpt-5,0.7533,0.5867,0.7533,0.7533,0.62,,,...,0.7381,,,,0.4533,0.5,0.3733,0.46,,
7,spinach,zero-shot,gpt-5,0.6333,0.46,0.6667,0.6733,0.7133,,,...,0.6965,,,,0.38,0.3667,0.3,0.34,,
8,synthetic,classification,gpt-5,0.8933,0.8867,1.0,0.9733,0.8867,0.6,0.5867,...,0.9285,0.8146,0.7942,0.8114,0.0933,0.0933,1.0,0.1667,0.6,0.8067
9,synthetic,fixing,gpt-5,0.88,0.9,0.9667,0.9933,0.9267,0.5667,0.56,...,0.9482,0.7964,0.7859,0.7853,0.1267,0.1333,0.12,0.1467,0.5578,0.7892


### Relation analysis

In [124]:
import os
import pandas as pd
# Relation Classification
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__))) + "/data/answers/zero-shot/"
datasets=["spinach", "qawiki",'synthetic']
# llms = ['gpt-4.1-2025-04-14', 'gpt-4.1-mini-2025-04-14', 'gpt-4.1-nano-2025-04-14', 
#         'gpt-4o','o3','gpt-5-nano',"gpt-5-mini","gpt-5",
#         "gemini-2.0-flash","gemini-2.5-flash","gemini-2.5-pro",
#         "grok-3-mini","deepseek-chat","deepseek-reasoner","llama3.1:8b","llama3.3:70b"]
llms = ['gpt-5']


In [125]:
import json

def load_relations(root_dir, datasets, llms):
    """
        DataFrame with columns: ["Q_ID", "dataset", "llm", "R(1-2)", "R(1-3)", "R(1-4)", "R(3-4)", "R(1-34)"]
    """
    # find JSON files
    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(root_dir)
        for file in files
        if file.startswith("Relation") and file.endswith(".json")
    ]
    print(f"JSON files found: {len(json_files)}")

    # initialize dataframe
    df_relation = pd.DataFrame(
        columns=["Q_ID", "dataset", "llm", "R(1-2)", "R(1-3)", "R(1-4)", "R(3-4)", "R(1-34)"]
    )

    for file in json_files:
        elements = file.replace("_", "/").replace(".json", "").split("/")
        dataset = next((d for d in datasets if d in elements), None)
        llm = next((l for l in llms if l in elements), None)

        if all([dataset, llm]):
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # transform dict into rows
            rows = [
                {
                    "dataset": dataset,
                    "llm": llm,
                    "Q_ID": key,
                    "R(1-2)": value[0],
                    "R(1-3)": value[1],
                    "R(1-4)": value[2],
                    "R(3-4)": value[3],
                    "R(1-34)": value[4],
                }
                for key, value in data.items()
            ]
            df_relation = pd.concat([df_relation, pd.DataFrame(rows)], ignore_index=True)

    return df_relation



In [126]:
df_relation = load_relations(root_dir, datasets, llms)
df_relation


JSON files found: 37


Unnamed: 0,Q_ID,dataset,llm,R(1-2),R(1-3),R(1-4),R(3-4),R(1-34)
0,0,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
1,1,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
2,2,spinach,gpt-5,Equivalence,Contains,Contains,Overlap,ContainedBy
3,3,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
4,4,spinach,gpt-5,Contains,Contains,Contains,Disjoint,Equivalence
...,...,...,...,...,...,...,...,...
445,145,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
446,146,qawiki,gpt-5,Equivalence,Contains,Contains,Overlap,ContainedBy
447,147,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
448,148,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence


In [127]:
def relation_summary(df_relation):
    df_relation_summery = pd.DataFrame(
            columns=["dataset", "llm", "R(1-2)", "R(1-3)", "R(1-4)", "R(3-4)", "R(1-34)"]
        )

    group_keys = ["dataset", "llm"]
    grouped = df_relation.groupby(group_keys)

    for keys, group in grouped: 
        row = {
            "dataset": keys[0],
            "llm": keys[1],
            "R(1-2)":  round((group["R(1-2)"]  == "Equivalence").mean(), 4),
            "R(1-3)":  round((group["R(1-3)"]  == "Contains").mean(),    4),
            "R(1-4)":  round((group["R(1-4)"]  == "Contains").mean(),    4),
            "R(3-4)":  round((group["R(3-4)"]  == "Disjoint").mean(),    4),
            "R(1-34)": round((group["R(1-34)"] == "Equivalence").mean(), 4),
        }
        df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)

    group_keys = ["llm"]
    grouped = df_relation.groupby(group_keys)

    for key, group in grouped: 
        row = {
            "llm": key[0],
            "dataset": "overall",
            "R(1-2)":  round((group["R(1-2)"]  == "Equivalence").mean(), 4),
            "R(1-3)":  round((group["R(1-3)"]  == "Contains").mean(),    4),
            "R(1-4)":  round((group["R(1-4)"]  == "Contains").mean(),    4),
            "R(3-4)":  round((group["R(3-4)"]  == "Disjoint").mean(),    4),
            "R(1-34)": round((group["R(1-34)"] == "Equivalence").mean(), 4),
        }
        df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)
    return df_relation_summery

In [128]:
df_relation_summery = relation_summary(df_relation)
df_relation_summery

  df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)


Unnamed: 0,dataset,llm,R(1-2),R(1-3),R(1-4),R(3-4),R(1-34)
0,qawiki,gpt-5,0.8733,0.98,0.98,0.9733,0.9467
1,spinach,gpt-5,0.92,0.9867,0.9867,0.9467,0.9133
2,synthetic,gpt-5,0.8867,0.9733,0.9867,0.9733,0.9733
3,overall,gpt-5,0.8933,0.98,0.9844,0.9644,0.9444


In [33]:
import pandas as pd

CANONICAL_LABELS = [
    "Equivalence", "Contains", "ContainedBy", "Overlap", "Disjoint", "Unknown", "Else"
]

GT = {
    "R(1-2)":  "Equivalence",
    "R(1-3)":  "Contains",
    "R(1-4)":  "Contains",
    "R(3-4)":  "Disjoint",
    "R(1-34)": "Equivalence",
}


def _normalize_pred(x: object) -> str:
    if pd.isna(x):
        return "Unknown"
    s = str(x).strip()
    if s in CANONICAL_LABELS:
        return s
    return "Else"

def per_model_confusions(
    df_relation: pd.DataFrame,
    relation_cols=None,
    include_overall: bool = True,
    round_digits: int = 4,
):
    """
    Build complete confusion matrices per (llm, dataset) and per relation column.
    Adds an 'overall' dataset per llm if include_overall=True.
    
    Returns
    -------
    cms_counts : pd.DataFrame
        MultiIndex rows: (llm, dataset, relation, True)
        Columns: Equivalence, Contains, ContainedBy, Overlap, Disjoint, Unknown, Else (counts)
    cms_ratio : pd.DataFrame
        Same shape, row-normalized ratios.
    """
    if relation_cols is None:
        relation_cols = list(GT.keys())

    needed = {"dataset", "llm", *relation_cols}
    missing = needed - set(df_relation.columns)
    if missing:
        raise ValueError(f"df_relation missing columns: {missing}")

    rows_counts, rows_ratio, idx = [], [], []

    # 1) Per (llm, dataset)
    for (llm, dataset), group in df_relation.groupby(["llm", "dataset"], dropna=False):
        n_group = len(group)
        for rel in relation_cols:
            truth = GT[rel]
            y_pred = group[rel].map(_normalize_pred)
            counts = y_pred.value_counts()
            row_counts = [int(counts.get(lbl, 0)) for lbl in CANONICAL_LABELS]
            row_ratio = [(c / n_group) if n_group > 0 else 0.0 for c in row_counts]
            rows_counts.append(row_counts)
            rows_ratio.append(row_ratio)
            idx.append((llm, dataset, rel, truth))

    # 2) Per llm (overall across datasets)
    if include_overall:
        for llm, group in df_relation.groupby("llm", dropna=False):
            n_group = len(group)
            for rel in relation_cols:
                truth = GT[rel]
                y_pred = group[rel].map(_normalize_pred)
                counts = y_pred.value_counts()
                row_counts = [int(counts.get(lbl, 0)) for lbl in CANONICAL_LABELS]
                row_ratio = [(c / n_group) if n_group > 0 else 0.0 for c in row_counts]
                rows_counts.append(row_counts)
                rows_ratio.append(row_ratio)
                idx.append((llm, "overall", rel, truth))

    index = pd.MultiIndex.from_tuples(idx, names=["llm", "dataset", "relation", "True"])
    cms_counts = pd.DataFrame(rows_counts, index=index, columns=CANONICAL_LABELS)
    cms_ratio  = pd.DataFrame(rows_ratio,  index=index, columns=CANONICAL_LABELS)
    if round_digits is not None:
        cms_ratio = cms_ratio.round(round_digits)

    return cms_counts, cms_ratio


In [46]:
import pandas as pd

# Reuse your existing constants if already defined
CANONICAL_LABELS = [
    "Equivalence", "Contains", "ContainedBy", "Overlap", "Disjoint", "Unknown", "Else"
]

def build_confusion_table(cms_counts: pd.DataFrame,
                          cms_ratio: pd.DataFrame,
                          round_digits: int = 4) -> pd.DataFrame:
    """
    Create one tidy table:
      llm | dataset | relation | True | Accuracy | Size | Equivalence | Contains | ... | Else
    where each label column is 'ratio(count)' and Accuracy is for the True label as 'ratio(count)'.
    """
    records = []
    for idx in cms_counts.index:
        llm, dataset, relation, true_label = idx
        counts_row = cms_counts.loc[idx]
        ratio_row  = cms_ratio.loc[idx]

        N = int(counts_row.sum())
        acc_ratio = float(ratio_row.get(true_label, 0.0))
        acc_count = int(counts_row.get(true_label, 0))

        row = {
            "llm": llm,
            "dataset": dataset,
            "relation": relation,
            "True": true_label,
            "Accuracy": f"{acc_ratio:.{round_digits}f}({acc_count})",
            "Size": N,
        }

        # Add each predicted label as ratio(count)
        for lbl in CANONICAL_LABELS:
            r = float(ratio_row.get(lbl, 0.0))
            c = int(counts_row.get(lbl, 0))
            row[lbl] = f"{r:.{round_digits}f}({c})"

        records.append(row)

    out = pd.DataFrame.from_records(records)
    # nice ordering
    cols = ["llm", "dataset", "relation", "True", "Accuracy", "Size"] + CANONICAL_LABELS
    return out[cols]


def confusion_table_from_df(df_relation: pd.DataFrame,
                            relation_cols=None,
                            include_overall: bool = True,
                            round_digits: int = 4) -> pd.DataFrame:
    """
    Convenience wrapper: calls your per_model_confusions(...) then builds the table.
    """
    # uses the per_model_confusions you already have
    cms_counts, cms_ratio = per_model_confusions(
        df_relation,
        relation_cols=relation_cols,
        include_overall=include_overall,
        round_digits=round_digits,
    )
    return build_confusion_table(cms_counts, cms_ratio, round_digits)


In [56]:
# From your df_relation:
table = confusion_table_from_df(df_relation, include_overall=True, round_digits=4)

# Save if you want:
table.to_csv("../output/relation_summary.csv", index=False)
table.to_excel("../output/relation_summary.xlsx", index=False)


In [49]:
table

Unnamed: 0,llm,dataset,relation,True,Accuracy,Size,Equivalence,Contains,ContainedBy,Overlap,Disjoint,Unknown,Else
0,gemini-2.0-flash,qawiki,R(1-2),Equivalence,0.9667(145),150,0.9667(145),0.0133(2),0.0000(0),0.0200(3),0.0000(0),0.0000(0),0.0000(0)
1,gemini-2.0-flash,qawiki,R(1-3),Contains,0.8667(130),150,0.0067(1),0.8667(130),0.1067(16),0.0200(3),0.0000(0),0.0000(0),0.0000(0)
2,gemini-2.0-flash,qawiki,R(1-4),Contains,0.1600(24),150,0.0000(0),0.1600(24),0.0267(4),0.7267(109),0.0867(13),0.0000(0),0.0000(0)
3,gemini-2.0-flash,qawiki,R(3-4),Disjoint,0.9400(141),150,0.0000(0),0.0067(1),0.0000(0),0.0533(8),0.9400(141),0.0000(0),0.0000(0)
4,gemini-2.0-flash,qawiki,R(1-34),Equivalence,0.8400(126),150,0.8400(126),0.0000(0),0.0000(0),0.0933(14),0.0667(10),0.0000(0),0.0000(0)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,o3,overall,R(1-2),Equivalence,0.9133(411),450,0.9133(411),0.0356(16),0.0378(17),0.0000(0),0.0022(1),0.0111(5),0.0000(0)
236,o3,overall,R(1-3),Contains,0.9733(438),450,0.0111(5),0.9733(438),0.0022(1),0.0022(1),0.0022(1),0.0089(4),0.0000(0)
237,o3,overall,R(1-4),Contains,0.9733(438),450,0.0111(5),0.9733(438),0.0000(0),0.0044(2),0.0089(4),0.0022(1),0.0000(0)
238,o3,overall,R(3-4),Disjoint,0.9467(426),450,0.0044(2),0.0067(3),0.0044(2),0.0200(9),0.9467(426),0.0178(8),0.0000(0)


In [50]:
df_relation

Unnamed: 0,Q_ID,dataset,llm,R(1-2),R(1-3),R(1-4),R(3-4),R(1-34)
0,0,spinach,grok-3-mini,Equivalence,Contains,Contains,Disjoint,Equivalence
1,1,spinach,grok-3-mini,Equivalence,Contains,Contains,Disjoint,Equivalence
2,2,spinach,grok-3-mini,Equivalence,Contains,Contains,Disjoint,Equivalence
3,3,spinach,grok-3-mini,Equivalence,Contains,Contains,Disjoint,Equivalence
4,4,spinach,grok-3-mini,Contains,Contains,Contains,Disjoint,Equivalence
...,...,...,...,...,...,...,...,...
5373,145,qawiki,gemini-2.5-pro,Equivalence,Contains,Contains,Disjoint,Equivalence
5374,146,qawiki,gemini-2.5-pro,Equivalence,Contains,Contains,Overlap,ContainedBy
5375,147,qawiki,gemini-2.5-pro,Equivalence,Contains,Contains,Disjoint,Equivalence
5376,148,qawiki,gemini-2.5-pro,Equivalence,Contains,Contains,Disjoint,Equivalence


In [None]:
import datetime

relation_file_format = datetime.datetime.now().strftime("relations_%Y-%m-%d_%H-%M.csv")
summary_file_format = datetime.datetime.now().strftime("relation_summary_%Y-%m-%d_%H-%M.csv")
summary_file_format_excel = datetime.datetime.now().strftime("relation_summary_%Y-%m-%d_%H-%M.xlsx")
output_folder = "../output/"

df_relation.to_csv(os.path.join(output_folder, relation_file_format), index=False)
df_relation_summery.to_csv(os.path.join(output_folder, summary_file_format), index=False)
df_relation_summery.to_excel("../output/relation_summary.xlsx", index=False)
df_relation_summery.to_excel(os.path.join(output_folder, summary_file_format_excel), index=False)


### Relation Identification and Consistency? 

for each of question in re-classification, there is the relation-classification. 

Add more columns [R(1-2), R(1-3), R(1-4), R(3-4),R(1-34)] in df_analysis to show the identified relation in re-classification. 

Maybe also add the initial relation in zero-shot at this column. 

Based on df_analysis do the summary for both zero-shot and re-classication in terms of relation and consistency. 

In [134]:
# def load_relation_ref(root_dir, datasets, llms):
datasets = ["spinach", "qawiki", 'synthetic']
llms = ['gpt-5']
tasks = ['equal', 'sup-sub', "minus"]
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__))) + "/data/answers/rel_classification_and_questions/"
json_files = [
    os.path.join(root, file)
    for root, _, files in os.walk(root_dir)
    for file in files
    if not file.startswith("Q") and file.endswith(".json")
]
df_relation_clf = pd.DataFrame(
        columns=["Q_ID", "dataset", "llm", "R(1-2)", "R(1-3)", "R(1-34)"]
    )

task_to_col = {
    "equal":   "R(1-2)",
    "sup-sub": "R(1-3)",
    "minus":   "R(1-34)",
}
for file in json_files:
    elements = file.replace("_", "/").replace(".json", "").split("/")
    dataset = next((d for d in datasets if d in elements), None)
    llm = next((l for l in llms if l in elements), None)
    task = next((t for t in tasks if t in elements), None)
    col = task_to_col.get(task)

    if all([dataset, llm, task]):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        # transform dict into rows
        rows = []
        for qid, rel in data.items():
            if qid in df_relation_clf["Q_ID"].values and \
            dataset in df_relation_clf["dataset"].values and \
            llm in df_relation_clf["llm"].values: 
                df_relation_clf.loc[
                    (df_relation_clf["Q_ID"] == qid) & 
                    (df_relation_clf["dataset"] == dataset) & 
                    (df_relation_clf["llm"] == llm), col
                ] = rel
            else:
                row = {
                        "dataset": dataset,
                        "llm": llm,
                        "Q_ID": qid,
                        col: rel if col else None,
                    }
                rows.append(row)
                
        df_relation_clf = pd.concat([df_relation_clf, pd.DataFrame(rows)], ignore_index=True)
        df_relation_clf["action"] = "classification"
df_relation_clf

Unnamed: 0,Q_ID,dataset,llm,R(1-2),R(1-3),R(1-34),action
0,0,spinach,gpt-5,equivalence,contains,equivalence,classification
1,1,spinach,gpt-5,equivalence,contains,equivalence,classification
2,2,spinach,gpt-5,equivalence,contains,containedby,classification
3,3,spinach,gpt-5,equivalence,contains,equivalence,classification
4,4,spinach,gpt-5,contains,contains,equivalence,classification
...,...,...,...,...,...,...,...
445,146,qawiki,gpt-5,equivalence,contains,equivalence,classification
446,147,qawiki,gpt-5,equivalence,contains,equivalence,classification
447,148,qawiki,gpt-5,equivalence,contains,equivalence,classification
448,149,qawiki,gpt-5,equivalence,contains,equivalence,classification


In [120]:
import re
import math
import unicodedata
import pandas as pd
from typing import Any, Iterable

CANONICAL_LABELS = [
    "Equivalence", "Contains", "ContainedBy", "Overlap", "Disjoint", "Unknown", "Else"
]

# --- helpers ---------------------------------------------------------------

_UNKNOWN_TOKENS = {
    "unknown","unk","n/a","na","none","null","nil","idk","don't know","dont know",
    "cannot determine","can’t determine","cant determine","unsure","uncertain",
    "not sure","not given","not specified","ambiguous"
}

def _first_nonempty_str(it: Iterable[Any]) -> str | None:
    for x in it:
        if x is None: 
            continue
        s = str(x).strip()
        if s:
            return s
    return None

def _pick_from_dict(d: dict) -> str | None:
    # common shapes: {"label": "..."} or {"relation": "..."} or {label: score, ...}
    for k in ("label","relation","pred","class"):
        if k in d and isinstance(d[k], (str, int, float)):
            return str(d[k])
    # try best-score key if numeric
    try:
        numeric = {k: float(v) for k, v in d.items() if isinstance(v, (int, float, str)) and str(v).replace('.','',1).lstrip('-').isdigit()}
        if numeric:
            return max(numeric, key=numeric.get)
    except Exception:
        pass
    # else first key
    if d:
        return str(next(iter(d.keys())))
    return None

def _clean_text(s: str) -> str:
    # Unicode normalize (e.g., different hyphens, spaces)
    s = unicodedata.normalize("NFKC", s)
    # drop parenthetical scores etc: "Equivalence (0.91)" -> "Equivalence"
    s = re.sub(r"\(.*?\)", "", s)
    # collapse spaces & hyphens around keywords
    s = re.sub(r"[-_]+", "-", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _is_unknown_like(s_lower: str) -> bool:
    return s_lower in _UNKNOWN_TOKENS

# --- main normalizer -------------------------------------------------------

def normalize_relation(pred: Any) -> str:
    """
    Map various prediction shapes/phrases/symbols into canonical relation labels.
    Returns one of CANONICAL_LABELS.
    """
    # None / NaN / empty -> Unknown
    if pred is None or (isinstance(pred, float) and math.isnan(pred)):
        return "Unknown"

    # list/tuple: pick first sensible string
    if isinstance(pred, (list, tuple)):
        s = _first_nonempty_str(pred)
        if not s:
            return "Unknown"
        pred = s

    # dict: try to extract label
    if isinstance(pred, dict):
        s = _pick_from_dict(pred)
        if not s:
            return "Unknown"
        pred = s

    # now string
    s_raw = str(pred)
    s = _clean_text(s_raw)
    s_lower = s.casefold()

    # quick exact canonical pass
    if s in CANONICAL_LABELS:
        return s

    # unknown-likes
    if _is_unknown_like(s_lower):
        return "Unknown"

    # guard: special negation patterns first (so "not disjoint" -> Overlap)
    if re.search(r"\bnot\s+disjoint\b", s_lower) or re.search(r"\bnon[-\s]?disjoint\b", s_lower):
        return "Overlap"
    if re.search(r"\bnot\s+overlap(ping)?\b", s_lower):
        return "Disjoint"

    # --- detect by symbols/phrases ---
    # Equivalence
    if re.search(r"\beq(uiv(alent|alence)?)\b", s_lower) or \
       re.search(r"\bequal(s)?\b", s_lower) or \
       re.search(r"\bsame(\s+set)?\b", s_lower) or \
       re.search(r"a\s*=\s*b", s_lower) or "≡" in s or "↔" in s:
        return "Equivalence"

    # Contains (A ⊃ B; superset; includes)
    if "⊃" in s or "⊇" in s or \
       re.search(r"\bsuper\s*set\b", s_lower) or \
       re.search(r"\bsuperset\s+of\b", s_lower) or \
       re.search(r"\bcontain(s|ment)?\b", s_lower) or \
       re.search(r"\bincludes?\b", s_lower) or \
       re.search(r"\b(a|set\s*a)?\s*includes?\s*(b|set\s*b)\b", s_lower):
        return "Contains"

    # ContainedBy (A ⊂ B; subset; contained by; is in)
    if "⊂" in s or "⊆" in s or \
       re.search(r"\bsub\s*set\b", s_lower) or \
       re.search(r"\bsubset\s+of\b", s_lower) or \
       re.search(r"\bcontained\s*by\b", s_lower) or \
       re.search(r"\bis\s+in\b", s_lower) or \
       re.search(r"\bbelongs\s+to\b", s_lower):
        return "ContainedBy"

    # Disjoint (A ∩ B = ∅; no overlap)
    if "∩" in s and ("∅" in s or "= 0" in s_lower) or \
       re.search(r"\bdis[-\s]?joint\b", s_lower) or \
       re.search(r"\bno\s+(overlap|intersection)\b", s_lower) or \
       re.search(r"\bmutual(ly)?\s+exclusive\b", s_lower) or \
       re.search(r"\bnon[-\s]?overlap(ping)?\b", s_lower):
        return "Disjoint"

    # Overlap (A ∩ B ≠ ∅; intersect; partial overlap)
    if "∩" in s and ("≠" in s or "!= " in s_lower) or \
       re.search(r"\boverlap(ping)?\b", s_lower) or \
       re.search(r"\bintersect(s|ion)?\b", s_lower) or \
       re.search(r"\b(non[-\s]?empty|some)\s+intersection\b", s_lower) or \
       re.search(r"\bshare(s)?\s+(elements|items|members)\b", s_lower):
        return "Overlap"

    # If the string literally says "unknown" in any decorative way, catch it late too
    if "unknown" in s_lower:
        return "Unknown"

    # Otherwise:
    return "Else"

# --- convenience wrappers ---------------------------------------------------

def normalize_relation_series(s: pd.Series) -> pd.Series:
    return s.apply(normalize_relation)

def normalize_relation_cols(df: pd.DataFrame, cols: list[str], inplace: bool = False, suffix: str = "_norm") -> pd.DataFrame:
    """
    Normalize multiple relation columns in a DataFrame.
    - If inplace=False, returns a copy with new normalized columns appended (col+suffix).
    - If inplace=True, overwrites the original columns.
    """
    target = df if inplace else df.copy()
    for c in cols:
        norm = target[c].apply(normalize_relation)
        if inplace:
            target[c] = norm
        else:
            target[c + suffix] = norm
    return target


In [135]:
import json
import pandas as pd
from pathlib import Path

datasets = ["spinach", "qawiki", "synthetic"]
llms     = ["gpt-5"]
tasks    = ["equal", "sup-sub", "minus"]

task_to_col = {
    "equal":   "R(1-2)",
    "sup-sub": "R(1-3)",
    "minus":   "R(1-34)",
}

def load_relation_clf(root_dir: str | None = None) -> pd.DataFrame:
    # Resolve default roo

    # Find JSON files (exclude those starting with 'Q')
    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(root_dir)
        for file in files
        if not file.startswith("Q") and file.endswith(".json")
    ]
    rows_map: dict[tuple[str, str, str], dict] = {}

    for file in json_files:
        parts = file.replace("_", "/").replace(".json", "").split("/")
        dataset = next((d for d in datasets if d in parts), None)
        llm     = next((l for l in llms     if l in parts), None)
        task    = next((t for t in tasks    if t in parts), None)
        col     = task_to_col.get(task)

        if not (dataset and llm and col):
            continue

        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        for qid, rel in data.items():
            # allow value to be list/tuple or scalar
            pred = rel[0] if isinstance(rel, (list, tuple)) and len(rel) else rel
            
            key = (qid, dataset, llm)
            row = rows_map.setdefault(key, {"Q_ID": qid, "dataset": dataset, "llm": llm})
            row[col] = normalize_relation(pred)

    # Materialize dataframe
    df = pd.DataFrame(rows_map.values())
    for c in ["R(1-2)", "R(1-3)", "R(1-34)"]:
        if c not in df.columns:
            df[c] = pd.NA

    df = df[["Q_ID", "dataset", "llm", "R(1-2)", "R(1-3)", "R(1-34)"]]
    df["action"] = "classification"
    return df.sort_values(["llm", "dataset", "Q_ID"]).reset_index(drop=True)

# Use it
df_relation_clf = load_relation_clf(root_dir)
df_relation_clf


Unnamed: 0,Q_ID,dataset,llm,R(1-2),R(1-3),R(1-34),action
0,0,qawiki,gpt-5,Equivalence,Contains,Equivalence,classification
1,1,qawiki,gpt-5,Equivalence,Contains,Equivalence,classification
2,10,qawiki,gpt-5,ContainedBy,Equivalence,ContainedBy,classification
3,100,qawiki,gpt-5,Equivalence,Contains,Equivalence,classification
4,101,qawiki,gpt-5,Equivalence,Contains,Equivalence,classification
...,...,...,...,...,...,...,...
445,95,synthetic,gpt-5,Equivalence,Contains,Equivalence,classification
446,96,synthetic,gpt-5,Equivalence,Contains,Equivalence,classification
447,97,synthetic,gpt-5,Equivalence,Disjoint,Equivalence,classification
448,98,synthetic,gpt-5,Equivalence,Contains,Equivalence,classification


In [136]:
df_relation

Unnamed: 0,Q_ID,dataset,llm,R(1-2),R(1-3),R(1-4),R(3-4),R(1-34)
0,0,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
1,1,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
2,2,spinach,gpt-5,Equivalence,Contains,Contains,Overlap,ContainedBy
3,3,spinach,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
4,4,spinach,gpt-5,Contains,Contains,Contains,Disjoint,Equivalence
...,...,...,...,...,...,...,...,...
445,145,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
446,146,qawiki,gpt-5,Equivalence,Contains,Contains,Overlap,ContainedBy
447,147,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence
448,148,qawiki,gpt-5,Equivalence,Contains,Contains,Disjoint,Equivalence


In [151]:
import pandas as pd

def merge_relations_by_action(df_analysis, df_relation, df_relation_clf):
    keys = ["Q_ID", "dataset", "llm"]
    rel_cols = ["R(1-2)", "R(1-3)", "R(1-4)", "R(3-4)", "R(1-34)"]

    # Ensure target columns exist in df_analysis
    for c in rel_cols:
        if c not in df_analysis.columns:
            df_analysis[c] = pd.NA

    # Deduplicate right tables on keys
    df_rel = df_relation[keys + rel_cols].drop_duplicates(subset=keys)

    # df_relation_clf may have only a subset of rel_cols; align to full set
    clf_cols = [c for c in rel_cols if c in df_relation_clf.columns]
    df_rel_clf_aligned = (
        df_relation_clf[keys + clf_cols]
        .drop_duplicates(subset=keys)
        .reindex(columns=keys + rel_cols)  # add missing relation cols as NaN
    )

    # Fill zero-shot rows from df_relation
    m_zs = df_analysis["action"].eq("zero-shot")
    if m_zs.any():
        zs_merge = df_analysis.loc[m_zs, keys].merge(df_rel, on=keys, how="left")
        df_analysis.loc[m_zs, rel_cols] = zs_merge[rel_cols].values

    # Fill classification rows from df_relation_clf
    m_cls = df_analysis["action"].eq("classification")
    if m_cls.any():
        cls_merge = df_analysis.loc[m_cls, keys].merge(df_rel_clf_aligned, on=keys, how="left")
        df_analysis.loc[m_cls, rel_cols] = cls_merge[rel_cols].values
    df_analysis =  df_analysis.replace({None: pd.NA}).convert_dtypes()
    return df_analysis

# Usage
df_analysis = merge_relations_by_action(df_analysis, df_relation, df_relation_clf)
df_analysis


Unnamed: 0,Q_ID,action,dataset,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,J(A1-A2),...,A2,A3,A4,A1*,A1**,R(1-2),R(1-3),R(1-34),R(1-4),R(3-4)
0,0,classification,qawiki,gpt-5,1,1,1,1,1,1.0,...,"[Venezuela, Guyana, Brazil]",[],"[Guyana, Brazil]","[Venezuela, Guyana, Brazil]","[Venezuela, Guyana, Brazil]",Equivalence,Contains,Equivalence,,
1,0,classification,spinach,gpt-5,1,1,1,1,1,1.0,...,"[At Last the 1948 Show, Fawlty Towers, Monty P...",[],[],"[At Last the 1948 Show, Fawlty Towers, Monty P...","[At Last the 1948 Show, Fawlty Towers, Monty P...",Equivalence,Contains,Equivalence,,
2,0,classification,synthetic,gpt-5,1,1,1,1,1,1.0,...,"[The Hateful Eight, Django Unchained, Kill Bil...",[],"[The Hateful Eight, Django Unchained, Kill Bil...","[The Hateful Eight, Django Unchained, Kill Bil...","[The Hateful Eight, Django Unchained, Kill Bil...",Equivalence,Contains,Equivalence,,
3,0,fixing,qawiki,gpt-5,1,1,1,1,1,1.0,...,"[Suriname, Guyana, Brazil, Colombia, Venezuela]","[Colombia, Venezuela]","[Suriname, Guyana, Brazil]","[Colombia, Venezuela, Guyana, Brazil]","[Suriname, Guyana, Brazil, Colombia, Venezuela]",,,,,
4,0,fixing,spinach,gpt-5,0,1,1,1,1,0.6667,...,"[At Last the 1948 Show, Fawlty Towers, Monty P...","[At Last the 1948 Show, Monty Python's Fliegen...",[],"[At Last the 1948 Show, Monty Python's Fliegen...","[At Last the 1948 Show, Fawlty Towers, Monty P...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,99,wikidata,spinach,gpt-5,1,1,1,1,1,1.0,...,"[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Wisconsin]","[Missouri, Iowa, Kentucky]",,,,,,,
1796,99,wikidata,synthetic,gpt-5,1,0,0,1,1,1.0,...,"[black, red]","[yellow, red]",[black],,,,,,,
1797,99,zero-shot,qawiki,gpt-5,1,1,1,1,0,1.0,...,[idk],[idk],[idk],,,Equivalence,Contains,Equivalence,Contains,Disjoint
1798,99,zero-shot,spinach,gpt-5,1,1,1,1,1,1.0,...,"[Indiana, Missouri, Wisconsin, Iowa, Kentucky]","[Indiana, Wisconsin]","[Missouri, Iowa, Kentucky]",,,Equivalence,Contains,Equivalence,Contains,Disjoint


In [152]:
df_analysis.columns

Index(['Q_ID', 'action', 'dataset', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3',
       '?A1>A4', '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',
       'J(A1*-A1**)', 'Q1', 'Q2', 'Q3', 'Q4', 'A1', 'A2', 'A3', 'A4', 'A1*',
       'A1**', 'R(1-2)', 'R(1-3)', 'R(1-34)', 'R(1-4)', 'R(3-4)'],
      dtype='object')

#### Analysis Correct Relations to Consistency and vice versa

In [234]:
df_summary = summary(df_analysis)
df_summary

Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,?A1=A1*,?A1=A1**,...,J(A1-A34),J(A1-A1*),J(A1-A1**),J(A1*-A1**),idk_A1,idk_A2,idk_A3,idk_A4,?A1=A1(ave),J_A1_ave
0,qawiki,classification,gpt-5,0.9133,0.9267,1.0,0.9733,0.52,0.5267,0.6067,...,0.9644,0.7286,0.7324,0.746,0.4133,0.4067,1.0,0.5333,0.58,0.7357
1,qawiki,fixing,gpt-5,0.8933,0.8933,0.9533,0.9867,0.62,0.7067,0.7067,...,0.9191,0.822,0.8343,0.8067,0.5133,0.5267,0.5067,0.58,0.7,0.821
2,qawiki,wikidata,gpt-5,0.82,0.64,0.7867,0.7667,0.5,,,...,0.7825,,,,0.66,0.6467,0.54,0.5933,,
3,qawiki,zero-shot,gpt-5,0.68,0.5667,0.7267,0.74,0.5867,,,...,0.7421,,,,0.5467,0.5267,0.44,0.5267,,
4,spinach,classification,gpt-5,0.9333,0.86,1.0,0.9733,0.7,0.62,0.6067,...,0.9308,0.7761,0.787,0.7876,0.3,0.2933,1.0,0.3867,0.6133,0.7836
5,spinach,fixing,gpt-5,0.94,0.84,0.94,0.9733,0.78,0.7267,0.62,...,0.8977,0.8527,0.7902,0.816,0.3533,0.3533,0.3333,0.3867,0.6667,0.8196
6,spinach,wikidata,gpt-5,0.7533,0.5867,0.7533,0.7533,0.62,,,...,0.7381,,,,0.4533,0.5,0.3733,0.46,,
7,spinach,zero-shot,gpt-5,0.6333,0.46,0.6667,0.6733,0.7133,,,...,0.6965,,,,0.38,0.3667,0.3,0.34,,
8,synthetic,classification,gpt-5,0.8933,0.8867,1.0,0.9733,0.8867,0.6,0.5867,...,0.9285,0.8146,0.7942,0.8114,0.0933,0.0933,1.0,0.1667,0.6,0.8067
9,synthetic,fixing,gpt-5,0.88,0.9,0.9667,0.9933,0.9267,0.5667,0.56,...,0.9482,0.7964,0.7859,0.7853,0.1267,0.1333,0.12,0.1467,0.5578,0.7892


In [200]:
import pandas as pd

def update_summary_by_relations(
    df_analysis: pd.DataFrame,
    df_summary: pd.DataFrame,
    task: str = "zero-shot",
    # Ground truth labels per relation
    relation_truths: dict[str, str] = None,
    # Metrics to average per relation: {relation: [(metric_column_name, output_label_prefix or None), ...]}
    # If output_label_prefix is None, we use the metric column name directly and append (+)/(-).
    metric_spec: dict[str, list[tuple[str, str | None]]] = None,
) -> pd.DataFrame:
    """
    For each (dataset, llm) and 'overall' per llm, compute mean metrics on
    positive vs negative rows for each relation, and write into df_summary.
    
    Returns
    -------
    df_summary : pd.DataFrame (mutated copy)
    """
    # Defaults
    if relation_truths is None:
        relation_truths = {
            "R(1-2)":  "Equivalence",
            "R(1-3)":  "Contains",
            "R(1-4)":  "Contains",
            "R(3-4)":  "Disjoint",
            "R(1-34)": "Equivalence",
        }

    if metric_spec is None:
        metric_spec = {
            "R(1-2)":  [("?A1=A2", None), ("J(A1-A2)", "J(1-2)")],
            "R(1-3)":  [("?A1>A3", None)],
            "R(1-4)":  [("?A1>A4", None), ("J(A1-A4)", "J(1-4)")],
            "R(3-4)":  [("?A3∅A4",None),("J(A3-A4)", "J(3-4)")],  # natural for Disjoint: J should be near 0
            "R(1-34)": [("?A1=A3+A4", None), ("J(A1-A34)", "J(1-34)")],
        }

    # Work on a copy to avoid accidental view issues
    out = df_summary.copy()
    df_temp = df_analysis[df_analysis["action"] == task]

    def set_means(mask, group, rel_col, truth_label):
        # Split pos/neg
        pos = group[group[rel_col] == truth_label]
        neg = group[group[rel_col] != truth_label]

        for metric_col, prefix in metric_spec.get(rel_col, []):
            if metric_col not in group.columns:
                # silently skip missing metrics
                continue

            # Compute means (NaN if empty)
            pos_mean = pos[metric_col].mean() if len(pos) else pd.NA
            neg_mean = neg[metric_col].mean() if len(neg) else pd.NA

            # Build output column names
            if prefix is None:
                col_pos = f"{metric_col}(+)"
                col_neg = f"{metric_col}(-)"
            else:
                col_pos = f"{prefix}+"
                col_neg = f"{prefix}-"

            # Ensure columns exist
            if col_pos not in out.columns:
                out[col_pos] = pd.NA
            if col_neg not in out.columns:
                out[col_neg] = pd.NA

            # Assign
            out.loc[mask, col_pos] = pos_mean
            out.loc[mask, col_neg] = neg_mean

    # Per (dataset, llm)
    for (dataset, llm), group in df_temp.groupby(["dataset", "llm"]):
        mask_common = (
            (out["action"] == task)
            & (out["dataset"] == dataset)
            & (out["llm"] == llm)
        )
        for rel_col, truth_label in relation_truths.items():
            # Skip relation columns that aren't present
            if rel_col not in group.columns:
                continue
            set_means(mask_common, group, rel_col, truth_label)

    # Overall per llm
    for llm, group in df_temp.groupby("llm"):
        mask_overall = ((out["dataset"] == "overall") 
                        & (out["llm"] == llm)
                        & (out["action"] == task))
    
        for rel_col, truth_label in relation_truths.items():
            if rel_col not in group.columns:
                continue
            set_means(mask_overall, group, rel_col, truth_label)

    return out


In [204]:
df_summary = summary(df_analysis)

df_summary = update_summary_by_relations(
    df_analysis=df_analysis,
    df_summary=df_summary,
    task="zero-shot",  # or "classification"
    # you can omit relation_truths & metric_spec to use the defaults shown above
)

df_summary = update_summary_by_relations(
    df_analysis=df_analysis,
    df_summary=df_summary,
    task="classification",  # or "classification"
    # you can omit relation_truths & metric_spec to use the defaults shown above
)
print(df_summary.shape)
print(df_summary.columns)
df_summary


(16, 32)
Index(['dataset', 'action', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4',
       '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',
       'J(A1*-A1**)', 'idk_A1', 'idk_A2', 'idk_A3', 'idk_A4', 'J_A1_ave',
       '?A1=A2(+)', '?A1=A2(-)', 'J(1-2)+', 'J(1-2)-', '?A1>A3(+)',
       '?A1>A3(-)', '?A1>A4(+)', '?A1>A4(-)', '?A3∅A4(+)', '?A3∅A4(-)',
       '?A1=A3+A4(+)', '?A1=A3+A4(-)', 'J(1-34)+', 'J(1-34)-'],
      dtype='object')


Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,J(A1-A2),J(A1-A34),...,?A1>A3(+),?A1>A3(-),?A1>A4(+),?A1>A4(-),?A3∅A4(+),?A3∅A4(-),?A1=A3+A4(+),?A1=A3+A4(-),J(1-34)+,J(1-34)-
0,qawiki,classification,gpt-5,0.9133,0.9267,1.0,0.9733,0.52,0.9636,0.9644,...,1.0,1.0,,,,,0.93007,0.857143,0.963338,0.9864
1,qawiki,fixing,gpt-5,0.8933,0.8933,0.9533,0.9867,0.62,0.9418,0.9191,...,,,,,,,,,,
2,qawiki,wikidata,gpt-5,0.82,0.64,0.7867,0.7667,0.5,0.8907,0.7825,...,,,,,,,,,,
3,qawiki,zero-shot,gpt-5,0.68,0.5667,0.7267,0.74,0.5867,0.7996,0.7421,...,0.721088,1.0,0.741497,0.666667,0.60274,0.0,0.56338,0.625,0.735675,0.855812
4,spinach,classification,gpt-5,0.9333,0.86,1.0,0.9733,0.7,0.9625,0.9308,...,1.0,1.0,,,,,0.892086,0.454545,0.942163,0.787336
5,spinach,fixing,gpt-5,0.94,0.84,0.94,0.9733,0.78,0.9592,0.8977,...,,,,,,,,,,
6,spinach,wikidata,gpt-5,0.7533,0.5867,0.7533,0.7533,0.62,0.8193,0.7381,...,,,,,,,,,,
7,spinach,zero-shot,gpt-5,0.6333,0.46,0.6667,0.6733,0.7133,0.7885,0.6965,...,0.668919,0.5,0.675676,0.5,0.739437,0.25,0.452555,0.538462,0.702392,0.634269
8,synthetic,classification,gpt-5,0.8933,0.8867,1.0,0.9733,0.8867,0.9599,0.9285,...,1.0,1.0,,,,,0.903448,0.4,0.939812,0.599
9,synthetic,fixing,gpt-5,0.88,0.9,0.9667,0.9933,0.9267,0.9697,0.9482,...,,,,,,,,,,


#### P-values

In [None]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

PREDICATES = ['?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4', '?A3∅A4']
P_COLS     = ['p(A1=A2)', 'p(A1=A3+A4)', 'p(A1>A3)', 'p(A1>A4)', 'p(A3∅A4)']

def _mcnemar_p(z, a):
    """Exact McNemar p-value for two paired 0/1 vectors (same length)."""
    n10 = ((z == 1) & (a == 0)).sum()
    n01 = ((z == 0) & (a == 1)).sum()
    if (n10 + n01) == 0:
        # no discordant pairs -> identical performance
        return 1.0, n10, n01
    res = mcnemar([[0, n01], [n10, 0]], exact=True)
    return float(res.pvalue), n10, n01

def _compare_group(g, dataset_label):
    """Run action-vs-zero-shot McNemar for all predicates within group g."""
    out_rows = []
    if 'zero-shot' not in set(g['action']):
        return pd.DataFrame(out_rows)

    base = g[g['action'] == 'zero-shot'][['Q_ID'] + PREDICATES].copy()

    for action, g_act in g.groupby('action'):
        merged = base.merge(
            g_act[['Q_ID'] + PREDICATES], 
            on='Q_ID', 
            suffixes=('_zero', '_act')
        )
        if merged.empty:
            continue

        row = {
            'dataset': dataset_label,
            'llm': g['llm'].iloc[0],
            'action': action
        }

        for pred, pcol in zip(PREDICATES, P_COLS):
            z = merged[f'{pred}_zero']
            a = merged[f'{pred}_act']
            pval, n10, n01 = _mcnemar_p(z, a)
            row[pcol] = pval
            # row[f'{pcol}_winner'] = (
            #     'action' if n01 > n10 else 
            #     'zero-shot' if n10 > n01 else 
            #     'tie'
            # )

        out_rows.append(row)

    return pd.DataFrame(out_rows)

def compute_pvals(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each (dataset, llm, action) compute McNemar p(action vs zero-shot).
    Includes zero-shot itself (p=1, tie).
    Also adds an 'overall' dataset (pooled across datasets per llm).
    Returns a wide DataFrame with p-values (4 decimals) and winner columns.
    """
    frames = []

    # per-dataset
    for (dataset, llm), g in df.groupby(['dataset', 'llm']):
        frames.append(_compare_group(g, dataset))

    # overall (pool datasets) per llm
    for llm, g in df.groupby('llm'):
        frames.append(_compare_group(g, 'overall'))

    res = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(
        columns=['dataset','llm','action'] + P_COLS
    )

    return res.sort_values(['dataset', 'llm', 'action'], ignore_index=True)



In [222]:
df_pval = compute_pvals(df_analysis)
df_pval


Unnamed: 0,dataset,llm,action,p(A1=A2),p(A1=A3+A4),p(A1>A3),p(A1>A4),p(A3∅A4)
0,overall,gpt-5,classification,4.528857e-72,9.877907999999999e-127,6.8791049999999995e-136,1.15067e-95,0.230875
1,overall,gpt-5,fixing,2.453163e-72,1.439585e-114,1.863491e-85,7.762493e-109,0.000601
2,overall,gpt-5,wikidata,3.176076e-13,1.191867e-10,4.601966e-08,8.012738e-07,0.000264
3,overall,gpt-5,zero-shot,1.0,1.0,1.0,1.0,1.0
4,qawiki,gpt-5,classification,7.878384e-08,1.187939e-14,9.094947e-13,2.841261e-09,0.087159
5,qawiki,gpt-5,fixing,9.430375e-07,1.541878e-12,1.076842e-09,1.455192e-10,0.48685
6,qawiki,gpt-5,wikidata,0.0001037158,0.09887175,0.1995909,0.6075914,0.014633
7,qawiki,gpt-5,zero-shot,1.0,1.0,1.0,1.0,1.0
8,spinach,gpt-5,classification,1.358992e-09,2.256225e-16,1.776357e-15,4.355627e-12,0.850554
9,spinach,gpt-5,fixing,3.481659e-13,1.641048e-15,1.000444e-11,6.82121e-13,0.021271


In [224]:
df_summery = df_summary.merge(df_pval, on=["dataset","llm","action"], how="left")
df_summery

Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,J(A1-A2),J(A1-A34),...,?A3∅A4(-),?A1=A3+A4(+),?A1=A3+A4(-),J(1-34)+,J(1-34)-,p(A1=A2),p(A1=A3+A4),p(A1>A3),p(A1>A4),p(A3∅A4)
0,qawiki,classification,gpt-5,0.9133,0.9267,1.0,0.9733,0.52,0.9636,0.9644,...,,0.93007,0.857143,0.963338,0.9864,7.878384e-08,1.187939e-14,9.094947e-13,2.841261e-09,0.087159
1,qawiki,fixing,gpt-5,0.8933,0.8933,0.9533,0.9867,0.62,0.9418,0.9191,...,,,,,,9.430375e-07,1.541878e-12,1.076842e-09,1.455192e-10,0.48685
2,qawiki,wikidata,gpt-5,0.82,0.64,0.7867,0.7667,0.5,0.8907,0.7825,...,,,,,,0.0001037158,0.09887175,0.1995909,0.6075914,0.014633
3,qawiki,zero-shot,gpt-5,0.68,0.5667,0.7267,0.74,0.5867,0.7996,0.7421,...,0.0,0.56338,0.625,0.735675,0.855812,1.0,1.0,1.0,1.0,1.0
4,spinach,classification,gpt-5,0.9333,0.86,1.0,0.9733,0.7,0.9625,0.9308,...,,0.892086,0.454545,0.942163,0.787336,1.358992e-09,2.256225e-16,1.776357e-15,4.355627e-12,0.850554
5,spinach,fixing,gpt-5,0.94,0.84,0.94,0.9733,0.78,0.9592,0.8977,...,,,,,,3.481659e-13,1.641048e-15,1.000444e-11,6.82121e-13,0.021271
6,spinach,wikidata,gpt-5,0.7533,0.5867,0.7533,0.7533,0.62,0.8193,0.7381,...,,,,,,0.007915897,0.002563208,0.04095959,0.1189205,0.006611
7,spinach,zero-shot,gpt-5,0.6333,0.46,0.6667,0.6733,0.7133,0.7885,0.6965,...,0.25,0.452555,0.538462,0.702392,0.634269,1.0,1.0,1.0,1.0,1.0
8,synthetic,classification,gpt-5,0.8933,0.8867,1.0,0.9733,0.8867,0.9599,0.9285,...,,0.903448,0.4,0.939812,0.599,4.290179e-13,1.4128409999999999e-19,3.469447e-18,4.618528e-14,0.663624
9,synthetic,fixing,gpt-5,0.88,0.9,0.9667,0.9933,0.9267,0.9697,0.9482,...,,,,,,4.618528e-14,5.361785e-21,1.187939e-14,4.440892e-16,0.078354


In [225]:
df_summery.columns

Index(['dataset', 'action', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4',
       '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',
       'J(A1*-A1**)', 'idk_A1', 'idk_A2', 'idk_A3', 'idk_A4', 'J_A1_ave',
       '?A1=A2(+)', '?A1=A2(-)', 'J(1-2)+', 'J(1-2)-', '?A1>A3(+)',
       '?A1>A3(-)', '?A1>A4(+)', '?A1>A4(-)', '?A3∅A4(+)', '?A3∅A4(-)',
       '?A1=A3+A4(+)', '?A1=A3+A4(-)', 'J(1-34)+', 'J(1-34)-', 'p(A1=A2)',
       'p(A1=A3+A4)', 'p(A1>A3)', 'p(A1>A4)', 'p(A3∅A4)'],
      dtype='object')

# TODO:

1. relation-classification analysis

    In the zero-shot, there are relation classification to see 
 - if the LLMs able to understand the relation between answers correctly. 
 - if incorrect, normally which ones misclassified to which ones? Probably confusion matrix here. 
    
2. internal inconsistency

    We observe that even we ask the exactly same questions multiple times, there are always with difference/inconsistency. 
 - how much these internal inconsistency they are? 
 - how to exclude them into the inconsistency between questions, say properly evaluate the inconsistency caused by questions.
 - the impact factor of these internal inconsistency to answers inconsistency? 

3. *correct relation identification lead consistency answers? 

    We have the relation classification in classification-and-question actions
- the inconsistency if LLMs identify relation correctly, as well as when they identify incorrectly;
- the impact factor of this relation identification to inconsistency? 

4. can we build the impact factors model to attribute the cause of inconsistency? 
 We can conclude some factors such as internal inconsistency, relation-identification capability (semantic understanding to text), ...

5. how can we mitigate the inconsistency? 

    we tried several actions, and let's do further detailed analysis to actions and consequencs. 
- wikidata: consistency improved, the trade-off is more "idk" answers.

6. *an overall analysis include all datasets together. 


7. explain of how we get the data, and the properties of each datasets. 

8. *empty ratio need be include "idk", rename is no answers or idk. 

9. *compute p-values.
