## Import libraries

In [1]:
import os

import jsonlines
import numpy as np

In [2]:
# Build and print a LaTeX table of mean accuracy and setwise inconsistency
# for every (dataset, model, prompting strategy) combination found under
# `result_dir`. Combinations without a usable score file are shown as "-".
result_dir = "../../../results_shared"
dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

# Both matrices are indexed [dataset, model, strategy]; -1 marks a cell
# for which no usable result could be loaded.
matrix_shape = (len(dataset_names), len(model_names), len(prompting_strategies))
accuracy_matrix = np.zeros(matrix_shape)
inconsistency_matrix = np.zeros(matrix_shape)
for i, dataset_name in enumerate(dataset_names):
    for j, model_name in enumerate(model_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            # Read score file
            input_path = f"{result_dir}/{dataset_name}/{model_name}/{prompting_strategy}_predictions.jsonl"
            try:
                with jsonlines.open(input_path) as fin:
                    accuracy_list, inconsistency_list = [], []
                    for example in fin.iter():
                        accuracy_list.append(example["accuracy"]["mean"])
                        # An example counts as inconsistent when its mean consistency is below 1.
                        inconsistency_list.append(1.0 * (example["consistency"]["mean"] < 1))
                if not accuracy_list:
                    # np.mean([]) would yield NaN and end up printed in the table;
                    # treat an empty score file like a missing one instead.
                    raise ValueError(f"no examples in {input_path}")
                accuracy_matrix[i, j, k] = np.mean(accuracy_list)
                inconsistency_matrix[i, j, k] = np.mean(inconsistency_list)
            except Exception:
                # Deliberate best effort: any failure to load a combination
                # (missing file, malformed record, ...) marks the cell as unavailable.
                accuracy_matrix[i, j, k] = -1
                inconsistency_matrix[i, j, k] = -1

# From here on the strategies are referred to by their display labels.
prompting_strategies = ["Zero-shot", "Zero-shot CoT", "Few-shot", "Few-shot CoT"]

# reported[d, m, s] is True when the cell is printed as numbers rather than "-".
# A cell is hidden when no result was loaded, or by the per-model rules below.
reported = accuracy_matrix >= 0
for model_idx, model in enumerate(model_names):
    for strat_idx, strategy_label in enumerate(prompting_strategies):
        if model == "Llama-3.1-8B" and "Zero-shot" in strategy_label:
            reported[:, model_idx, strat_idx] = False
        if model == "DeepSeek-R1-Distill-Llama-8B" and "CoT" not in strategy_label:
            reported[:, model_idx, strat_idx] = False

# Create LaTeX table string
latex_code = "\\begin{table*}[t]\n"
latex_code += "\\centering\n"
latex_code += "\\caption{Mean accuracy (left) and setwise inconsistency (right) across different tasks, models and prompting strategies. Blue values indicate performance improvement over zero-shot (or the leftmost strategy if not available), while red values denote performance drop.}\n"
latex_code += "\\label{tab:accuracy_inconsistency}\n"
# One centered column per prompting strategy after the Task and Model columns.
latex_code += "\\begin{tabular}{c|c|" + "c" * len(prompting_strategies) + "}\n"
latex_code += "\\toprule\n"
latex_code += "Task & Model & " + " & ".join(prompting_strategies) + " \\\\\n"
latex_code += "\\midrule\\midrule\n"

# Fill the table with data: one group of rows per task, one row per model.
task_blocks = []
for task_idx, task in enumerate(dataset_names):
    # The task label spans exactly as many rows as there are models.
    block = f"\\multirow{{{len(model_names)}}}{{*}}{{{task}}}\n"
    for model_idx, model in enumerate(model_names):
        # Index of the strategy every other strategy in this row is compared against.
        base_idx = 1 if task == "GSM8K" else 0
        if model == "Llama-3.1-8B":
            base_idx = 3 if task == "GSM8K" else 2
        if model == "DeepSeek-R1-Distill-Llama-8B":
            base_idx = 1
        row_reported = reported[task_idx, model_idx]
        if not row_reported[base_idx] and row_reported.any():
            # The preferred baseline is unavailable: fall back to the leftmost
            # reported strategy (np.argmax returns the first True), as the
            # caption states, instead of comparing against the -1 sentinel.
            base_idx = int(np.argmax(row_reported))
        base_acc = round(accuracy_matrix[task_idx, model_idx, base_idx], 2)
        base_inc = round(inconsistency_matrix[task_idx, model_idx, base_idx], 2)

        cells = [f"& {model} "]
        for strat_idx in range(len(prompting_strategies)):
            if not row_reported[strat_idx]:
                cells.append("& - ")
                continue

            acc = round(accuracy_matrix[task_idx, model_idx, strat_idx], 2)
            inc = round(inconsistency_matrix[task_idx, model_idx, strat_idx], 2)
            acc_str = f"{acc:.2f}"
            inc_str = f"{inc:.2f}"
            # Blue marks an improvement over the baseline (higher accuracy,
            # lower inconsistency); red marks a regression.
            if acc > base_acc:
                acc_str = "\\blue{" + acc_str + "}"
            elif acc < base_acc:
                acc_str = "\\red{" + acc_str + "}"
            if inc < base_inc:
                inc_str = "\\blue{" + inc_str + "}"
            elif inc > base_inc:
                inc_str = "\\red{" + inc_str + "}"
            cells.append(f"& {acc_str} | {inc_str} ")
        block += "      " + "".join(cells) + "\\\\\n"
    task_blocks.append(block)

# Separate task groups with a midrule; joining avoids having to strip a
# trailing "\midrule" afterwards (str.rstrip removes a character set, not a suffix).
latex_code += "\\midrule\n".join(task_blocks)

# Close LaTeX table
latex_code += "\\bottomrule\n"
latex_code += "\\end{tabular}\n"
latex_code += "\\end{table*}"

# Display the generated LaTeX table code
print(latex_code)

# with open("accuracy_inconsistency_table.txt", "w") as fout:
#     fout.write(latex_code)

\begin{table*}[t]
\centering
\caption{Mean accuracy (left) and setwise inconsistency (right) across different tasks, models and prompting strategies. Blue values indicate performance improvement over zero-shot (or the leftmost strategy if not available), while red values denote performance drop.}
\label{tab:accuracy_inconsistency}
\begin{tabular}{c|c|cccc}
\toprule
Task & Model & Zero-shot & Zero-shot CoT & Few-shot & Few-shot CoT \\
\midrule\midrule
\multirow{6}{*}{CommonsenseQA}
      & Llama-3.1-8B-Instruct & 0.75 | 0.10 & \blue{0.76} | \red{0.25} & \red{0.68} | \red{0.51} & - \\
\midrule
\multirow{6}{*}{QASC}
      & Llama-3.1-8B-Instruct & 0.82 | 0.09 & 0.82 | \red{0.19} & \red{0.61} | \red{0.84} & \red{0.68} | \red{0.74} \\
\midrule
\multirow{6}{*}{100TFQA}
      & Llama-3.1-8B-Instruct & 0.70 | 0.10 & \blue{0.72} | \red{0.26} & 0.70 | \red{0.24} & \red{0.68} | \red{0.48} \\
\midrule
\multirow{6}{*}{GSM8K}
      & Llama-3.1-8B-Instruct & - & 0.54 | 0.75 & - & \blue{0.79} | \blue{

## Main table

In [5]:
# Build and print a LaTeX table of mean accuracy and setwise inconsistency
# from the "*_finetuned_predictions.jsonl" score files under `result_dir`,
# for every (dataset, model, prompting strategy) combination. Combinations
# without a usable score file are shown as "-".
result_dir = "../../../results"
dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

# Both matrices are indexed [dataset, model, strategy]; -1 marks a cell
# for which no usable result could be loaded.
matrix_shape = (len(dataset_names), len(model_names), len(prompting_strategies))
accuracy_matrix = np.zeros(matrix_shape)
inconsistency_matrix = np.zeros(matrix_shape)
for i, dataset_name in enumerate(dataset_names):
    for j, model_name in enumerate(model_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            # Read score file
            input_path = f"{result_dir}/{dataset_name}/{model_name}/{prompting_strategy}_finetuned_predictions.jsonl"
            try:
                with jsonlines.open(input_path) as fin:
                    accuracy_list, inconsistency_list = [], []
                    for example in fin.iter():
                        accuracy_list.append(example["accuracy"]["mean"])
                        # An example counts as inconsistent when its mean consistency is below 1.
                        inconsistency_list.append(1.0 * (example["consistency"]["mean"] < 1))
                if not accuracy_list:
                    # np.mean([]) would yield NaN and end up printed in the table;
                    # treat an empty score file like a missing one instead.
                    raise ValueError(f"no examples in {input_path}")
                accuracy_matrix[i, j, k] = np.mean(accuracy_list)
                inconsistency_matrix[i, j, k] = np.mean(inconsistency_list)
            except Exception:
                # Deliberate best effort: any failure to load a combination
                # (missing file, malformed record, ...) marks the cell as unavailable.
                accuracy_matrix[i, j, k] = -1
                inconsistency_matrix[i, j, k] = -1

# From here on the strategies are referred to by their display labels.
prompting_strategies = ["Zero-shot", "Zero-shot CoT", "Few-shot", "Few-shot CoT"]

# reported[d, m, s] is True when the cell is printed as numbers rather than "-".
# A cell is hidden when no result was loaded, or by the per-model rules below.
reported = accuracy_matrix >= 0
for model_idx, model in enumerate(model_names):
    for strat_idx, strategy_label in enumerate(prompting_strategies):
        if model == "Llama-3.1-8B" and "Zero-shot" in strategy_label:
            reported[:, model_idx, strat_idx] = False
        if model == "DeepSeek-R1-Distill-Llama-8B" and "CoT" not in strategy_label:
            reported[:, model_idx, strat_idx] = False

# Create LaTeX table string
latex_code = "\\begin{table*}[t]\n"
latex_code += "\\centering\n"
latex_code += "\\caption{Mean accuracy (left) and setwise inconsistency (right) across different tasks, models and prompting strategies. Blue values indicate performance improvement over zero-shot (or the leftmost strategy if not available), while red values denote performance drop.}\n"
latex_code += "\\label{tab:accuracy_inconsistency}\n"
# One centered column per prompting strategy after the Task and Model columns.
latex_code += "\\begin{tabular}{c|c|" + "c" * len(prompting_strategies) + "}\n"
latex_code += "\\toprule\n"
latex_code += "Task & Model & " + " & ".join(prompting_strategies) + " \\\\\n"
latex_code += "\\midrule\\midrule\n"

# Fill the table with data: one group of rows per task, one row per model.
task_blocks = []
for task_idx, task in enumerate(dataset_names):
    # The task label spans exactly as many rows as there are models.
    block = f"\\multirow{{{len(model_names)}}}{{*}}{{{task}}}\n"
    for model_idx, model in enumerate(model_names):
        # Index of the strategy every other strategy in this row is compared against.
        base_idx = 1 if task == "GSM8K" else 0
        if model == "Llama-3.1-8B":
            base_idx = 3 if task == "GSM8K" else 2
        if model == "DeepSeek-R1-Distill-Llama-8B":
            base_idx = 1
        row_reported = reported[task_idx, model_idx]
        if not row_reported[base_idx] and row_reported.any():
            # The preferred baseline is unavailable: fall back to the leftmost
            # reported strategy (np.argmax returns the first True), as the
            # caption states, instead of comparing against the -1 sentinel.
            base_idx = int(np.argmax(row_reported))
        base_acc = round(accuracy_matrix[task_idx, model_idx, base_idx], 2)
        base_inc = round(inconsistency_matrix[task_idx, model_idx, base_idx], 2)

        cells = [f"& {model} "]
        for strat_idx in range(len(prompting_strategies)):
            if not row_reported[strat_idx]:
                cells.append("& - ")
                continue

            acc = round(accuracy_matrix[task_idx, model_idx, strat_idx], 2)
            inc = round(inconsistency_matrix[task_idx, model_idx, strat_idx], 2)
            acc_str = f"{acc:.2f}"
            inc_str = f"{inc:.2f}"
            # Blue marks an improvement over the baseline (higher accuracy,
            # lower inconsistency); red marks a regression.
            if acc > base_acc:
                acc_str = "\\blue{" + acc_str + "}"
            elif acc < base_acc:
                acc_str = "\\red{" + acc_str + "}"
            if inc < base_inc:
                inc_str = "\\blue{" + inc_str + "}"
            elif inc > base_inc:
                inc_str = "\\red{" + inc_str + "}"
            cells.append(f"& {acc_str} | {inc_str} ")
        block += "      " + "".join(cells) + "\\\\\n"
    task_blocks.append(block)

# Separate task groups with a midrule; joining avoids having to strip a
# trailing "\midrule" afterwards (str.rstrip removes a character set, not a suffix).
latex_code += "\\midrule\n".join(task_blocks)

# Close LaTeX table
latex_code += "\\bottomrule\n"
latex_code += "\\end{tabular}\n"
latex_code += "\\end{table*}"

# Display the generated LaTeX table code
print(latex_code)

# with open("accuracy_inconsistency_table.txt", "w") as fout:
#     fout.write(latex_code)

\begin{table*}[t]
\centering
\caption{Mean accuracy (left) and setwise inconsistency (right) across different tasks, models and prompting strategies. Blue values indicate performance improvement over zero-shot (or the leftmost strategy if not available), while red values denote performance drop.}
\label{tab:accuracy_inconsistency}
\begin{tabular}{c|c|cccc}
\toprule
Task & Model & Zero-shot & Zero-shot CoT & Few-shot & Few-shot CoT \\
\midrule\midrule
\multirow{6}{*}{CommonsenseQA}
      & Llama-3.1-8B-Instruct & - & - & - & - \\
\midrule
\multirow{6}{*}{QASC}
      & Llama-3.1-8B-Instruct & - & - & - & - \\
\midrule
\multirow{6}{*}{100TFQA}
      & Llama-3.1-8B-Instruct & - & - & - & - \\
\midrule
\multirow{6}{*}{GSM8K}
      & Llama-3.1-8B-Instruct & - & - & - & - \\
\bottomrule
\end{tabular}
\end{table*}
