In [81]:
import importlib
import json
import pandas as pd
from pathlib import Path
from collections import Counter
import re


In [254]:
def verdict_count(models, modes, folder_dir, filename_suffix):
    """Tally true/false/maybe verdicts per model, mode and verdict type.

    For every (model, mode) pair the JSON file
    ``<folder_dir>/<model>_<mode><filename_suffix>`` is loaded and, for each
    of the three verdict fields, the non-empty verdict values are counted.

    Parameters
    ----------
    models : iterable of str
        Model names used as the first part of each file name.
    modes : iterable of str
        Mode names used as the second part of each file name.
    folder_dir : str or path-like
        Directory that holds the result files.
    filename_suffix : str
        Text appended after ``<model>_<mode>`` to form the file name.

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame has one row per (model, mode, verdict type) with the
        columns ``model``, ``mode``, ``verdict_type``, ``true``, ``false``,
        ``maybe`` and ``total``; ``total`` counts every non-empty verdict,
        including values other than the three named ones.  The dict maps
        each mode to the number of items in the first file read for it.
    """
    # Output label -> key under which that verdict is stored in each item.
    field_by_type = {
        "basic": "response_basic_verdict",
        "internal": "response_internal_verdict",
        "claim": "response_claim_verdict",
    }

    base_dir = Path(folder_dir)
    samples_per_mode = {}
    records = []

    for model in models:
        for mode in modes:
            result_path = base_dir / (f"{model}_{mode}" + filename_suffix)
            with result_path.open("r", encoding="utf-8") as handle:
                items = json.load(handle)

            # Only the first file seen for a mode defines its sample count.
            samples_per_mode.setdefault(mode, len(items))

            for vtype, field in field_by_type.items():
                tally = Counter()
                for item in items:
                    verdict = item.get(field)
                    if verdict:  # missing / empty verdicts are not counted
                        tally[verdict] += 1

                records.append({
                    "model": model,
                    "mode": mode,
                    "verdict_type": vtype,
                    "true": tally.get("true", 0),
                    "false": tally.get("false", 0),
                    "maybe": tally.get("maybe", 0),
                    "total": sum(tally.values()),
                })

    return pd.DataFrame(records), samples_per_mode



In [251]:
def verdict_table_to_latex_stacked(dfs, section_names, n_dict, modes_dict,
                                   models=None, verdict_types=None):
    """Render several verdict-count tables as one stacked LaTeX ``table*``.

    Each section contributes a title row, a header row naming its modes
    (with sample counts), a True/False/Maybe sub-header, and one data row
    per (model, verdict type) showing percentages of that row's ``total``.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One frame per section with the columns ``model``, ``mode``,
        ``verdict_type``, ``true``, ``false``, ``maybe`` and ``total``.
    section_names : sequence of str
        Section titles, paired positionally with ``dfs``.
    n_dict : dict
        ``section name -> {mode -> sample count}``; the count is printed in
        the mode header as ``(n = ...)``.
    modes_dict : dict
        ``section name -> list of modes`` giving the column groups, in order.
    models : sequence of str, optional
        Models to show, in row order.  Defaults to
        ``["llama", "mistral", "qwen"]``.
    verdict_types : sequence of str, optional
        Verdict types to show per model, in row order.  Defaults to
        ``["basic", "internal", "claim"]``.

    Returns
    -------
    str
        The LaTeX source, lines joined with newlines.

    Notes
    -----
    * The column layout is derived from the mode lists, so sections with a
      number of modes other than three still produce a consistent table.
    * A model without any rows in a section is skipped entirely (no stray
      ``\\hline``), and the alternating row shading follows the model groups
      that are actually emitted.
    * The output uses ``\\rowcolor``, so the LaTeX document must provide it
      (e.g. via the ``colortbl`` package).
    """
    if models is None:
        models = ["llama", "mistral", "qwen"]
    if verdict_types is None:
        verdict_types = ["basic", "internal", "claim"]

    def pct(x, total):
        # Guard against sections where no verdict was counted at all.
        return 100.0 * x / total if total > 0 else 0.0

    sections = list(zip(dfs, section_names))

    # Width of the table = widest section; falls back to the historical
    # three-mode layout when there are no sections.
    widest = max((len(modes_dict[name]) for _, name in sections), default=3)
    total_cols = 1 + 3 * widest

    latex = [
        r"\begin{table*}[t]",
        r"\centering",
        r"\small",
        r"\setlength{\tabcolsep}{4pt}",
        r"\begin{tabular}{l" + "|ccc" * widest + "}",
        r"\hline",
    ]

    for df, section in sections:
        modes = modes_dict[section]
        cnts = n_dict[section]

        latex.append(r"\hline")
        latex.append(
            r"\rowcolor{violet!55}\multicolumn{%d}{c}{\textbf{%s}} \\"
            % (total_cols, section)
        )
        latex.append(r"\hline")

        # Mode header: every group but the last gets a right-hand rule.
        latex.append(
            r"\rowcolor{violet!40} "
            + " ".join(
                r"& \multicolumn{3}{c%s}{%s (n = %d)}"
                % ("|" if i < len(modes) - 1 else "", mode, cnts[mode])
                for i, mode in enumerate(modes)
            )
            + r" \\"
        )
        latex.append(
            r"\rowcolor{violet!40} Model, prompt type"
            + " & True & False & Maybe" * len(modes)
            + r" \\"
        )
        latex.append(r"\hline")

        groups_emitted = 0
        for model in models:
            model_lines = []

            for vtype in verdict_types:
                row_df = df[(df["model"] == model) & (df["verdict_type"] == vtype)]
                if row_df.empty:
                    continue

                cells = [f"{model}, {vtype}"]
                for mode in modes:
                    sub = row_df[row_df["mode"] == mode]
                    if len(sub) == 0:
                        cells += ["--", "--", "--"]
                    else:
                        record = sub.iloc[0]
                        total = record["total"]
                        cells += [
                            f"{pct(record[key], total):.1f}"
                            for key in ("true", "false", "maybe")
                        ]

                model_lines.append(" & ".join(cells) + r" \\")

            if not model_lines:
                # Nothing to show for this model: no rows, no rule.
                continue

            # Shade every second emitted model group.
            prefix = r"\rowcolor{violet!10}" if groups_emitted % 2 == 1 else ""
            latex.extend(prefix + line for line in model_lines)
            latex.append(r"\hline")
            groups_emitted += 1

    latex.append(r"\end{tabular}")
    latex.append(
        r"\caption{Verdict distribution (\%) for all models, prompt types, and prompt styles.}"
    )
    latex.append(r"\label{tab:verdict_distribution}")
    latex.append(r"\end{table*}")

    return "\n".join(latex)


## Summary of verdict distributions by prompt style (verdict only / bullet first / inverse bullet first) ##

In [223]:


# Count verdicts for the result files stored under the "verdict only" folder.
dir_verdict_only = "../data/no_split_verdict_only/"
filename_suffix_verdict_only = "_only_verdict_result.json"
models_verdict_only = ["llama", "mistral", "qwen"]
modes_verdict_only = ["2support", "2refute", "mix"]

df_verdict_only, cnt_by_mode_verdict_only = verdict_count(
    models=models_verdict_only,
    modes=modes_verdict_only,
    folder_dir=dir_verdict_only,
    filename_suffix=filename_suffix_verdict_only,
)
cnt_by_mode_verdict_only

{'2support': 245, '2refute': 563, 'mix': 183}

In [217]:
#df_verdict_only

In [221]:
# Count verdicts for the result files stored under the "bullet first" folder.
dir_bullet_first = "../data/no_split_bullet_first/"
filename_suffix_bullet_first = "_bullet_first_result.json"
models_bullet_first = ["llama", "mistral", "qwen"]
modes_bullet_first = ["2support", "2refute", "mix"]

df_bullet_first, cnt_by_mode_bullet_first = verdict_count(
    models=models_bullet_first,
    modes=modes_bullet_first,
    folder_dir=dir_bullet_first,
    filename_suffix=filename_suffix_bullet_first,
)
cnt_by_mode_bullet_first

{'2support': 245, '2refute': 563, 'mix': 183}

In [229]:
# Count verdicts for the result files stored under the "inverse bullet first" folder.
dir_inverse_bullet_first = "../data/no_split_inverse_bullet_first/"
filename_suffix_inverse_bullet_first = "_inverse_bullet_first_result.json"
models_inverse_bullet_first = ["llama", "mistral", "qwen"]
modes_inverse_bullet_first = ["2support", "2refute", "mix"]

df_inverse_bullet_first, cnt_by_mode_inverse_bullet_first = verdict_count(
    models=models_inverse_bullet_first,
    modes=modes_inverse_bullet_first,
    folder_dir=dir_inverse_bullet_first,
    filename_suffix=filename_suffix_inverse_bullet_first,
)
cnt_by_mode_inverse_bullet_first

{'2support': 245, '2refute': 563, 'mix': 183}

In [263]:



# Stack the three prompt-style result frames into a single LaTeX table.
latex_table = verdict_table_to_latex_stacked(
    dfs=[df_verdict_only, df_bullet_first, df_inverse_bullet_first],
    section_names=["Verdict only", "Bullet first", "Bullet first, inversed evidence"],
    n_dict={
        "Verdict only": cnt_by_mode_verdict_only,
        "Bullet first": cnt_by_mode_bullet_first,
        "Bullet first, inversed evidence": cnt_by_mode_inverse_bullet_first,
    },
    # Every section here uses the same three modes, in the same order.
    modes_dict={
        section: ["2support", "2refute", "mix"]
        for section in ("Verdict only", "Bullet first", "Bullet first, inversed evidence")
    },
)

print(latex_table)


\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\hline
\hline
\rowcolor{violet!55}\multicolumn{10}{c}{\textbf{Verdict only}} \\
\hline
\rowcolor{violet!40} & \multicolumn{3}{c|}{2support (n = 245)} & \multicolumn{3}{c|}{2refute (n = 563)} & \multicolumn{3}{c}{mix (n = 183)} \\
\rowcolor{violet!40} Model, prompt type & True & False & Maybe & True & False & Maybe & True & False & Maybe \\
\hline
llama, basic & 51.4 & 25.3 & 23.3 & 4.4 & 94.7 & 0.9 & 3.3 & 85.2 & 11.5 \\
llama, internal & 57.1 & 30.6 & 12.2 & 4.1 & 95.4 & 0.5 & 3.8 & 89.1 & 7.1 \\
llama, claim & 4.5 & 49.0 & 46.5 & 0.9 & 78.7 & 20.4 & 0.0 & 72.7 & 27.3 \\
\hline
\rowcolor{violet!10}mistral, basic & 76.7 & 20.8 & 2.4 & 5.5 & 94.5 & 0.0 & 7.1 & 91.3 & 1.6 \\
\rowcolor{violet!10}mistral, internal & 77.1 & 20.4 & 2.4 & 5.5 & 94.5 & 0.0 & 7.1 & 91.3 & 1.6 \\
\rowcolor{violet!10}mistral, claim & 15.9 & 38.8 & 45.3 & 5.0 & 76.7 & 18.3 & 10.9 & 62.3 & 26.8 \\
\hline
qwen, basic & 50.

## Summary of verdict distributions split by fact-check label ##

In [259]:
# Count verdicts for the result files stored under split_by_verdict/true.
# Note that the second mode in this split is "1refute", not "2refute".
dir_true = "../data/split_by_verdict/true/"
filename_suffix_true = "_true_bullet_first_result.json"
models_true = ["llama", "mistral", "qwen"]
modes_true = ["2support", "1refute", "mix"]

df_true, cnt_by_mode_true = verdict_count(
    models=models_true,
    modes=modes_true,
    folder_dir=dir_true,
    filename_suffix=filename_suffix_true,
)
cnt_by_mode_true

{'2support': 110, '1refute': 11, 'mix': 8}

In [257]:
# Count verdicts for the result files stored under split_by_verdict/false.
dir_false = "../data/split_by_verdict/false/"
filename_suffix_false = "_false_bullet_first_result.json"
models_false = ["llama", "mistral", "qwen"]
modes_false = ["2support", "2refute", "mix"]

df_false, cnt_by_mode_false = verdict_count(
    models=models_false,
    modes=modes_false,
    folder_dir=dir_false,
    filename_suffix=filename_suffix_false,
)
cnt_by_mode_false

{'2support': 56, '2refute': 520, 'mix': 161}

In [267]:
# Count verdicts for the result files stored under split_by_verdict/half_true.
dir_half_true = "../data/split_by_verdict/half_true/"
filename_suffix_half_true = "_half_true_bullet_first_result.json"
models_half_true = ["llama", "mistral", "qwen"]
modes_half_true = ["2support", "2refute", "mix"]

df_half_true, cnt_by_mode_half_true = verdict_count(
    models=models_half_true,
    modes=modes_half_true,
    folder_dir=dir_half_true,
    filename_suffix=filename_suffix_half_true,
)
cnt_by_mode_half_true

{'2support': 25, '2refute': 4, 'mix': 7}

In [269]:


# Stack the three per-label result frames into a single LaTeX table.
latex_table = verdict_table_to_latex_stacked(
    dfs=[df_true, df_false, df_half_true],
    section_names=["Factcheck True", "Factcheck False", "Factcheck Half True"],
    n_dict={
        "Factcheck True": cnt_by_mode_true,
        "Factcheck False": cnt_by_mode_false,
        "Factcheck Half True": cnt_by_mode_half_true,
    },
    # The "Factcheck True" section uses "1refute" where the others use "2refute".
    modes_dict={
        "Factcheck True": ["2support", "1refute", "mix"],
        "Factcheck False": ["2support", "2refute", "mix"],
        "Factcheck Half True": ["2support", "2refute", "mix"],
    },
)

print(latex_table)


\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\hline
\hline
\rowcolor{violet!55}\multicolumn{10}{c}{\textbf{Factcheck True}} \\
\hline
\rowcolor{violet!40} & \multicolumn{3}{c|}{2support (n = 110)} & \multicolumn{3}{c|}{1refute (n = 11)} & \multicolumn{3}{c}{mix (n = 8)} \\
\rowcolor{violet!40} Model, prompt type & True & False & Maybe & True & False & Maybe & True & False & Maybe \\
\hline
llama, basic & 77.3 & 14.5 & 8.2 & 9.1 & 63.6 & 27.3 & 50.0 & 25.0 & 25.0 \\
llama, internal & 82.7 & 16.4 & 0.9 & 18.2 & 81.8 & 0.0 & 62.5 & 37.5 & 0.0 \\
llama, claim & 12.7 & 34.5 & 52.7 & 0.0 & 9.1 & 90.9 & 0.0 & 12.5 & 87.5 \\
\hline
\rowcolor{violet!10}mistral, basic & 94.5 & 3.6 & 1.8 & 45.5 & 54.5 & 0.0 & 62.5 & 12.5 & 25.0 \\
\rowcolor{violet!10}mistral, internal & 95.5 & 2.7 & 1.8 & 54.5 & 36.4 & 9.1 & 62.5 & 12.5 & 25.0 \\
\rowcolor{violet!10}mistral, claim & 26.6 & 8.3 & 65.1 & 0.0 & 18.2 & 81.8 & 0.0 & 25.0 & 75.0 \\
\hline
qwen, basic & 