## Import libraries

In [None]:
import os
import json

import numpy as np
from scipy.spatial.distance import hamming
from scipy.stats import spearmanr, pearsonr

In [None]:
def format_distance_3bits(a, b):
    """Return the Hamming distance between two 3-bit format indices.

    Each index is interpreted as a 3-bit binary code; the distance is the
    number of bit positions in which the two codes differ.

    Args:
        a: First format index, an integer in ``[0, 8)``.
        b: Second format index, an integer in ``[0, 8)``.

    Returns:
        The number of differing bits, an ``int`` between 0 and 3.

    Raises:
        ValueError: If either index does not fit in 3 bits. Without this
            check, two out-of-range indices would still be compared and
            yield a mis-scaled distance instead of an error.
    """
    if not (0 <= a < 8 and 0 <= b < 8):
        raise ValueError(f"format indices must be in [0, 8), got {a} and {b}")
    # XOR leaves a 1 exactly where the two codes disagree; counting those
    # bits gives the distance exactly, with no string or float round-trip.
    return bin(a ^ b).count("1")

In [None]:
def format_distance_7bits(a, b):
    """Return the Hamming distance between two 7-bit format indices.

    Each index is interpreted as a 7-bit binary code; the distance is the
    number of bit positions in which the two codes differ.

    Args:
        a: First format index, an integer in ``[0, 128)``.
        b: Second format index, an integer in ``[0, 128)``.

    Returns:
        The number of differing bits, an ``int`` between 0 and 7.

    Raises:
        ValueError: If either index does not fit in 7 bits. Without this
            check, two out-of-range indices would still be compared and
            yield a mis-scaled distance instead of an error.
    """
    if not (0 <= a < 128 and 0 <= b < 128):
        raise ValueError(f"format indices must be in [0, 128), got {a} and {b}")
    # XOR leaves a 1 exactly where the two codes disagree; counting those
    # bits gives the distance exactly, with no string or float round-trip.
    return bin(a ^ b).count("1")

In [None]:
# Root directory of the score files read by the analysis cells, laid out as
# <result_dir>/<dataset>/<model>/<strategy>_score.json. The path is relative,
# so it is resolved against the current working directory.
result_dir = "../../../results"

## Correlation with consistency

In [None]:
# Correlate format distance with pairwise consistency for every
# (dataset, model, prompting strategy) combination, then render the Spearman
# results as a LaTeX table.
os.makedirs("correlation_analysis_table", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Phi-3.5-mini-instruct", "Phi-3.5-vision-instruct", "Llama-3.1-8B", "Llama-3.1-8B-Instruct", "DeepSeek-R1-Distill-Llama-8B", "gpt-4o-2024-11-20"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]
num_formats = NUM_FORMATS = 8

# Stored in corr_matrix / p_matrix when a combination could not be scored.
# -2 lies outside the valid range of both a correlation coefficient and a
# p-value, so it cannot collide with a real result.
_MISSING = -2


def _load_pairwise_values(score_path, num_formats, distance_fn):
    """Read one score file and return paired consistency/distance samples.

    The file's ``"consistency"`` mapping is expected to use keys whose first
    and last ``_``-separated tokens are the two format indices; the ``"mean"``
    entry is ignored. Pairs absent from the file keep the initial value 1 in
    both matrices.

    Args:
        score_path: Path of the ``*_score.json`` file.
        num_formats: Number of formats (side length of the pair matrices).
        distance_fn: Callable mapping two format indices to their distance.

    Returns:
        Two 1-D arrays (consistency, distance) holding the strict upper
        triangle of the pair matrices in row-major order.
    """
    with open(score_path, "r", encoding="utf-8") as fin:
        score = json.load(fin)

    consistency = np.ones((num_formats, num_formats))
    distance = np.ones((num_formats, num_formats))
    for key, value in score["consistency"].items():
        if key == "mean":
            continue
        tokens = key.split("_")
        a, b = int(tokens[0]), int(tokens[-1])
        consistency[a, b] = consistency[b, a] = value
        distance[a, b] = distance[b, a] = distance_fn(a, b)

    # Each unordered pair is counted once (ii < jj).
    upper = np.triu_indices(num_formats, k=1)
    return consistency[upper], distance[upper]


def _format_cell(corr, p, model, strategy_label):
    """Return the LaTeX fragment for one (model, strategy) table cell."""
    # Anything below -1.5 can only be the missing-result sentinel.
    if corr < -1.5:
        return "& - "
    # These model/strategy combinations are always reported as "-".
    if model == "Llama-3.1-8B" and "Zero-shot" in strategy_label:
        return "& - "
    if model == "DeepSeek-R1-Distill-Llama-8B" and "CoT" not in strategy_label:
        return "& - "
    # Very small p-values would print as 0.000 in fixed notation.
    p_str = f"{p:.3e}" if p < 0.001 else f"{p:.3f}"
    return f"& {corr:.2f} ({p_str}) "


result_shape = (len(dataset_names), len(model_names), len(prompting_strategies))
corr_matrix, p_matrix = np.zeros(result_shape), np.zeros(result_shape)
for i, dataset_name in enumerate(dataset_names):
    for j, model_name in enumerate(model_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"

            # Prepare pairwise consistency / format distance samples (macro)
            score_path = os.path.join(output_dir, f"{prompting_strategy}_score.json")
            try:
                pairwise_consistency_list, format_distance_list = _load_pairwise_values(
                    score_path, num_formats, format_distance_3bits
                )
            except (OSError, ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                # A missing file is skipped quietly; an unreadable or
                # malformed one is reported so it is not mistaken for a
                # combination that simply has no results.
                if not isinstance(e, FileNotFoundError):
                    print(f"Skipping {score_path}: {e!r}")
                corr_matrix[i][j][k] = _MISSING
                p_matrix[i][j][k] = _MISSING
                continue

            # Macro analysis
            pearson_corr, pearson_p = pearsonr(pairwise_consistency_list, format_distance_list)
            spearman_corr, spearman_p = spearmanr(pairwise_consistency_list, format_distance_list)

            # Print correlation results
            print(f"{dataset_name} / {model_name} / {prompting_strategy}")
            print("Macro analysis")
            print(f"Pearson correlation: {pearson_corr:.2f} (p-value: {pearson_p:.3e})")
            print(f"Spearman correlation: {spearman_corr:.2f} (p-value: {spearman_p:.3e})")
            print()

            # Only the Spearman result goes into the table.
            corr_matrix[i][j][k] = spearman_corr
            p_matrix[i][j][k] = spearman_p

# From here on the strategies are referred to by their display labels.
prompting_strategies = ["Zero-shot", "Zero-shot CoT", "Few-shot", "Few-shot CoT"]

# Table preamble and header row.
latex_parts = [
    "\\begin{table}[ht]\n",
    "\\centering\n",
    "\\caption{Correlation analysis between format distance and pairwise consistency. Each cell represents Spearman correlation coefficient and p-value (in parenthesis).}\n",
    "\\label{tab:format_distance_correlation}\n",
    "\\begin{tabular}{c|c|" + "c" * len(prompting_strategies) + "}\n",
    "\\toprule\n",
    "Task & Model & " + " & ".join(prompting_strategies) + " \\\\\n",
    "\\midrule\\midrule\n",
]

# One block of rows per task; the task name spans all of its model rows.
task_blocks = []
for task_idx, task in enumerate(dataset_names):
    rows = [f"\\multirow{{{len(model_names)}}}{{*}}{{{task}}}\n"]
    for model_idx, model in enumerate(model_names):
        cells = "".join(
            _format_cell(
                corr_matrix[task_idx, model_idx, strat_idx],
                p_matrix[task_idx, model_idx, strat_idx],
                model,
                strategy_label,
            )
            for strat_idx, strategy_label in enumerate(prompting_strategies)
        )
        rows.append(f"      & {model} {cells}\\\\\n")
    task_blocks.append("".join(rows))

# Separate task blocks with a rule, but put none after the last block
# (the bottom rule follows it directly).
latex_parts.append("\\midrule\n".join(task_blocks))

# Close LaTeX table
latex_parts.append("\\bottomrule\n")
latex_parts.append("\\end{tabular}\n")
latex_parts.append("\\end{table}")
latex_code = "".join(latex_parts)

# Display the generated LaTeX table code
print(latex_code)

# with open(f"correlation_analysis_table/correlation_with_consistency_table.txt", "w") as fout:
#     fout.write(latex_code)

### Extended analysis with 128 formats

In [None]:
# Extended analysis: the same distance/consistency correlation, computed over
# 128 formats (7-bit format codes), rendered as LaTeX table rows.
os.makedirs("correlation_analysis_table", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC"]
# model_names = ["Phi-3.5-mini-instruct", "Llama-3.1-8B-Instruct", "gpt-4o-2024-11-20"]
model_names = ["Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot_ext"]
num_formats = NUM_FORMATS = 128

# Stored in corr_matrix / p_matrix when a combination could not be scored.
# -2 lies outside the valid range of both a correlation coefficient and a
# p-value, so it cannot collide with a real result.
_MISSING = -2


def _load_pairwise_values(score_path, num_formats, distance_fn):
    """Read one score file and return paired consistency/distance samples.

    The file's ``"consistency"`` mapping is expected to use keys whose first
    and last ``_``-separated tokens are the two format indices; the ``"mean"``
    entry is ignored. Pairs absent from the file keep the initial value 1 in
    both matrices.

    Args:
        score_path: Path of the ``*_score.json`` file.
        num_formats: Number of formats (side length of the pair matrices).
        distance_fn: Callable mapping two format indices to their distance.

    Returns:
        Two 1-D arrays (consistency, distance) holding the strict upper
        triangle of the pair matrices in row-major order.
    """
    with open(score_path, "r", encoding="utf-8") as fin:
        score = json.load(fin)

    consistency = np.ones((num_formats, num_formats))
    distance = np.ones((num_formats, num_formats))
    for key, value in score["consistency"].items():
        if key == "mean":
            continue
        tokens = key.split("_")
        a, b = int(tokens[0]), int(tokens[-1])
        consistency[a, b] = consistency[b, a] = value
        distance[a, b] = distance[b, a] = distance_fn(a, b)

    # Each unordered pair is counted once (ii < jj).
    upper = np.triu_indices(num_formats, k=1)
    return consistency[upper], distance[upper]


def _format_cell(corr, p, model, strategy_label):
    """Return the LaTeX fragment for one (model, strategy) table cell."""
    # Anything below -1.5 can only be the missing-result sentinel.
    if corr < -1.5:
        return "& - "
    # These model/strategy combinations are always reported as "-".
    if model == "Llama-3.1-8B" and "Zero-shot" in strategy_label:
        return "& - "
    if model == "DeepSeek-R1-Distill-Llama-8B" and "CoT" not in strategy_label:
        return "& - "
    # Very small p-values would print as 0.000 in fixed notation.
    p_str = f"{p:.3e}" if p < 0.001 else f"{p:.3f}"
    return f"& {corr:.2f} ({p_str}) "


result_shape = (len(dataset_names), len(model_names), len(prompting_strategies))
corr_matrix, p_matrix = np.zeros(result_shape), np.zeros(result_shape)
for i, dataset_name in enumerate(dataset_names):
    for j, model_name in enumerate(model_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"

            # Prepare pairwise consistency / format distance samples (macro)
            score_path = os.path.join(output_dir, f"{prompting_strategy}_score.json")
            try:
                pairwise_consistency_list, format_distance_list = _load_pairwise_values(
                    score_path, num_formats, format_distance_7bits
                )
            except (OSError, ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                # A missing file is skipped quietly; an unreadable or
                # malformed one is reported so it is not mistaken for a
                # combination that simply has no results.
                if not isinstance(e, FileNotFoundError):
                    print(f"Skipping {score_path}: {e!r}")
                corr_matrix[i][j][k] = _MISSING
                p_matrix[i][j][k] = _MISSING
                continue

            # Macro analysis
            pearson_corr, pearson_p = pearsonr(pairwise_consistency_list, format_distance_list)
            spearman_corr, spearman_p = spearmanr(pairwise_consistency_list, format_distance_list)

            # Print correlation results
            print(f"{dataset_name} / {model_name} / {prompting_strategy}")
            print("Macro analysis")
            print(f"Pearson correlation: {pearson_corr:.2f} (p-value: {pearson_p:.3e})")
            print(f"Spearman correlation: {spearman_corr:.2f} (p-value: {spearman_p:.3e})")
            print()

            # Only the Spearman result goes into the table.
            corr_matrix[i][j][k] = spearman_corr
            p_matrix[i][j][k] = spearman_p

# From here on the strategy is referred to by its display label.
prompting_strategies = ["Zero-shot"]

# Build the table rows only (no preamble); every task block, including the
# last one, is followed by a rule.
task_blocks = []
for task_idx, task in enumerate(dataset_names):
    # The task name spans exactly as many rows as there are models.
    rows = [f"\\multirow{{{len(model_names)}}}{{*}}{{{task}}}\n"]
    for model_idx, model in enumerate(model_names):
        cells = "".join(
            _format_cell(
                corr_matrix[task_idx, model_idx, strat_idx],
                p_matrix[task_idx, model_idx, strat_idx],
                model,
                strategy_label,
            )
            for strat_idx, strategy_label in enumerate(prompting_strategies)
        )
        rows.append(f"      & {model} {cells}\\\\\n")
    rows.append("\\midrule\n")
    task_blocks.append("".join(rows))
latex_code = "".join(task_blocks)

# Display the generated LaTeX table code
print(latex_code)

# with open(f"correlation_analysis_table/ext-correlation_with_consistency_table.txt", "w") as fout:
#     fout.write(latex_code)