## Import libraries

In [None]:
import os
import json

import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.spatial.distance import hamming

In [None]:
# Each format id "0".."7" is a 3-bit number whose bits (most significant
# first) select the separator, the casing and the space option.
FORMAT_TYPE_MAP = {
    str(4 * sep_bit + 2 * case_bit + space_bit): {
        "separator": separator,
        "casing": casing,
        "space": space,
    }
    for sep_bit, separator in enumerate((":", ": "))
    for case_bit, casing in enumerate(("capitalize", "upper"))
    for space_bit, space in enumerate(("", " "))
}
# Same ids mapped to their [separator, casing, space] bit vector.
format_config = {
    str(format_id): [(format_id >> shift) & 1 for shift in (2, 1, 0)]
    for format_id in range(8)
}

In [None]:
def get_configs(format_factor_number):
    """Return the keys "i_j" of every format pair with i < j.

    The number of formats is 2 ** format_factor_number and must equal the
    module-level NUM_FORMATS (checked with an assert).
    """
    total = 2 ** format_factor_number
    assert total == NUM_FORMATS
    pair_keys = []
    for low in range(total):
        # Starting at low + 1 yields exactly the pairs with low < high.
        for high in range(low + 1, total):
            pair_keys.append(f"{low}_{high}")
    return pair_keys

def get_format_distance(f1, f2):
    """Return the number of format factors on which two formats differ.

    Args:
        f1: key of the first format in the module-level ``format_config``.
        f2: key of the second format in ``format_config``.

    Returns:
        The sum of absolute element-wise differences of the two factor
        vectors (their Hamming distance, since the entries are 0/1).

    Raises:
        KeyError: if either key is not present in ``format_config``.
    """
    vec1 = format_config[f1]
    vec2 = format_config[f2]
    # Pair the vectors directly instead of indexing with the global
    # ``format_num``, which is only assigned in a later cell.
    return sum(abs(a - b) for a, b in zip(vec1, vec2))

In [None]:
# Root directory of the score files; the cells below read
# {result_dir}/{dataset_name}/{model_name}/{prompting_strategy}_score.json
# (and ..._ext_score.json for the extended analysis).
result_dir = "../../../results"

## Correlation with consistency (bin chart)

In [None]:
# Directory that receives one PDF per model.
os.makedirs("bin_chart", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Phi-3.5-mini-instruct", "Phi-3.5-vision-instruct", "Llama-3.1-8B", "Llama-3.1-8B-Instruct", "DeepSeek-R1-Distill-Llama-8B", "gpt-4o-2024-11-20"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]
NUM_FORMATS = 8  # number of formats; get_configs asserts 2 ** format_num == NUM_FORMATS
format_num = 3  # number of binary format factors

# Read score files. Combinations whose file is missing or unreadable are
# skipped on purpose (best effort); their subplot simply stays empty.
scores = {}
for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            input_path = f"{result_dir}/{dataset_name}/{model_name}/{prompting_strategy}_score.json"
            try:
                with open(input_path) as fin:
                    scores[(model_name, dataset_name, prompting_strategy)] = json.load(fin)
            except (OSError, ValueError):
                # OSError: file missing/unreadable; ValueError: invalid JSON.
                continue

# Distance of every format pair "i_j". The ids are split on "_" rather than
# taken by character position, so multi-digit ids would also be handled.
pairwise_distance = {
    pair: get_format_distance(*pair.split("_"))
    for pair in get_configs(format_num)
}

y_ticks = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

for model_name in model_names:
    fig, axes = plt.subplots(nrows=len(dataset_names), ncols=len(prompting_strategies), figsize=(20, 20))
    fig.subplots_adjust(hspace=0.5)
    for d_j, dataset_name in enumerate(dataset_names):
        for p_k, prompting_strategy in enumerate(prompting_strategies):
            score = scores.get((model_name, dataset_name, prompting_strategy))
            if score is None:
                continue

            # Group the pairwise consistency values by format distance.
            dist_consistency = defaultdict(list)
            for pair, distance in pairwise_distance.items():
                dist_consistency[distance].append(score["consistency"][pair])

            # Walk the distances in sorted order so every bar is drawn at
            # the distance it actually belongs to, independent of the
            # insertion order of the grouping dict.
            x_vals = sorted(dist_consistency)
            y_vals = []
            y_errs = []
            for distance in x_vals:
                values = dist_consistency[distance]
                mean = sum(values) / len(values)
                # Population standard deviation (divides by N).
                std = (sum((x - mean) ** 2 for x in values) / len(values)) ** 0.5
                y_vals.append(mean)
                y_errs.append(std)

            ax = axes[d_j, p_k]
            ax.bar(x_vals, y_vals, yerr=y_errs, capsize=5, color="mediumpurple")
            ax.set_title(f"{dataset_name} - {prompting_strategy}")
            ax.set_xlabel("Format Distance")
            ax.set_ylabel("Consistency")
            ax.set_xticks(x_vals)
            ax.set_yticks(y_ticks)
            ax.set_ylim(0.45, 1.0)
            ax.grid(axis='y', linestyle='--', alpha=0.5)
    # fig.suptitle(f"{model_name} : Consistency Across Format Distances")
    fig.tight_layout()
    fig.savefig(f"bin_chart/format-distance_{model_name}.pdf")

### Extended analysis with 128 formats

In [None]:
NUM_FORMATS = 128  # number of formats in the extended analysis
FORMAT_ELE_NUM = 7  # number of binary format elements (2 ** 7 == 128)


def get_configs(format_ele_num):
    """Return the keys "i_j" of every format pair with i < j.

    The number of formats is 2 ** format_ele_num and must equal the
    module-level NUM_FORMATS (checked with an assert).
    """
    num_formats = pow(2, format_ele_num)
    assert num_formats == NUM_FORMATS
    return [f"{i}_{j}" for i in range(num_formats) for j in range(i + 1, num_formats)]


def get_format_distance(f1, f2, format_num=FORMAT_ELE_NUM):
    """Return the Hamming distance between two integer format ids.

    Args:
        f1: first format id, an int in ``range(2 ** format_num)``.
        f2: second format id, an int in ``range(2 ** format_num)``.
        format_num: number of binary format elements encoded in each id.

    Returns:
        The number of differing bits, as a float (the previous
        implementation also returned a floating-point value).

    Raises:
        ValueError: if an id does not fit into ``format_num`` bits.
    """
    limit = 1 << format_num
    if not (0 <= f1 < limit and 0 <= f2 < limit):
        raise ValueError(
            f"format ids must be in range(0, {limit}), got {f1} and {f2}"
        )
    # XOR leaves a 1 exactly at the bit positions where the ids differ, so
    # the count is exact and honours ``format_num`` instead of a fixed
    # 7-character binary string.
    return float(bin(f1 ^ f2).count("1"))

In [None]:
# Directory that receives one PDF per (dataset, model, strategy).
os.makedirs("bin_chart", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC"]
model_names = ["Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot"]

# (substring of the model name, short name used in the output file name);
# checked in order, so the Instruct variant must precede the base Llama.
MODEL_SHORT_NAMES = (
    ("R1", "R1"),
    ("Llama-3.1-8B-Instruct", "Llama-Instruct"),
    ("Llama-3.1-8B", "Llama-Base"),
    ("Phi-3.5-mini-instruct", "Phi-mini"),
    ("Phi-3.5-vision-instruct", "Phi-vision"),
    ("gpt", "Gpt"),
)
# Strategy markers appended to the file name, in this order.
STRATEGY_TAGS = ("cot", "com", "rar", "ref", "guided")

# Read score files. Combinations whose file is missing or unreadable are
# skipped on purpose (best effort).
scores = {}
for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            input_path = f"{result_dir}/{dataset_name}/{model_name}/{prompting_strategy}_ext_score.json"
            try:
                with open(input_path) as fin:
                    scores[(model_name, dataset_name, prompting_strategy)] = json.load(fin)
            except (OSError, ValueError):
                # OSError: file missing/unreadable; ValueError: invalid JSON.
                continue

# Distance of every format pair "i_j". get_format_distance returns a
# floating-point value; it is rounded to an int so that grouping keys and
# x-axis ticks are exact integers.
pairwise_distance = {}
for pair in get_configs(FORMAT_ELE_NUM):
    first, second = pair.split("_")
    pairwise_distance[pair] = int(round(get_format_distance(int(first), int(second))))

for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            score = scores.get((model_name, dataset_name, prompting_strategy))
            if score is None:
                continue

            # Group the pairwise consistency values by format distance.
            dist_consistency = defaultdict(list)
            for pair, distance in pairwise_distance.items():
                dist_consistency[distance].append(score["consistency"][pair])

            # Walk the distances in sorted order so every bar is drawn at
            # the distance it actually belongs to, independent of the
            # insertion order of the grouping dict.
            x_vals = sorted(dist_consistency)
            y_vals = []
            y_errs = []
            for distance in x_vals:
                values = dist_consistency[distance]
                mean = sum(values) / len(values)
                # Population standard deviation (divides by N).
                std = (sum((x - mean) ** 2 for x in values) / len(values)) ** 0.5
                y_vals.append(mean)
                y_errs.append(std)

            # Plot settings
            plt.figure(figsize=(10, 6))
            plt.bar(x_vals, y_vals, yerr=y_errs, capsize=5, color="mediumpurple")
            plt.xlabel("Format Distance")
            plt.ylabel("Consistency")
            plt.xticks(x_vals)
            ticks = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1]
            plt.yticks(ticks)
            plt.ylim(0.9, 1)
            plt.grid(axis='y', linestyle='--', alpha=0.5)
            plt.title(f"Consistency Across Format Distances-{model_name}-{dataset_name} ({prompting_strategy})")
            plt.tight_layout()

            # Short labels for the output file name. An unrecognised model
            # falls back to its full name, and an unrecognised shot setting
            # to an empty prefix, instead of None.
            mn = next(
                (short for needle, short in MODEL_SHORT_NAMES if needle in model_name),
                model_name,
            )
            dn = "CQA" if "Commonsense" in dataset_name else dataset_name
            if "zero" in prompting_strategy:
                shots = "zs"
            elif "few" in prompting_strategy:
                shots = "fs"
            else:
                shots = ""
            strategy = "".join(f"-{tag}" for tag in STRATEGY_TAGS if tag in prompting_strategy)

            ps = shots + strategy
            plt.savefig(f"bin_chart/EXformat_distance-{dn}_{mn}_{ps}.pdf")