## Import libraries

In [None]:
import os
import jsonlines

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Global matplotlib font sizes shared by every figure produced below.
plt.rcParams["font.size"] = 16  # base font size
plt.rcParams["axes.labelsize"] = 18  # axis labels
plt.rcParams["axes.titlesize"] = 20  # figure title
plt.rcParams["xtick.labelsize"] = 16  # x-axis tick labels
plt.rcParams["ytick.labelsize"] = 16  # y-axis tick labels
plt.rcParams["legend.fontsize"] = 16  # legend entries

In [None]:
# Root directory (relative path) of the experiment outputs. The cells below
# look for result files under "<result_dir>/<dataset_name>/<model_name>/".
result_dir = "../../../results"

## Correlation with consistency (bin chart)

In [None]:
# Bin chart of set-wise consistency against mean answer confidence, one PDF per
# (model, dataset, prompting strategy) combination that has result files.
os.makedirs("bin_chart", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Llama-3.1-8B-Instruct", "gpt-4o-2024-11-20"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

# An example contributes to the chart only when a confidence was recovered for
# exactly this many prompt formats.
expected_num_formats = 8

# Confidence bin edges and the tick labels shown on the x-axis: every bin is
# left-closed / right-open except the last one, which includes its upper edge.
bins = np.array([0.0, 0.8, 0.9, 1.0])
bin_labels = [f"[{bins[i]:.2f}, {bins[i+1]:.2f})" for i in range(len(bins)-2)] + [f"[{bins[len(bins)-2]:.2f}, {bins[len(bins)-1]:.2f}]"]

# Edges actually handed to pd.cut(right=False). The last edge is nudged to the
# next representable float so that a confidence of exactly 1.0 still falls in
# the final bin, matching the closed "]" of its label.
cut_edges = bins.copy()
cut_edges[-1] = np.nextafter(bins[-1], np.inf)

for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"
            predictions_path = os.path.join(output_dir, f"{prompting_strategy}_predictions.jsonl")
            raw_predictions_path = os.path.join(output_dir, f"{prompting_strategy}_raw_predictions.jsonl")
            try:
                # Per-example predictions (indexed by format id below) and mean consistency.
                id_predictions_map, id_consistency_map = {}, {}
                with jsonlines.open(predictions_path) as fin:
                    for example in fin.iter():
                        id_predictions_map[example["id"]] = example["predictions"]
                        id_consistency_map[example["id"]] = example["consistency"]["mean"]

                X, Y, Z = [], [], []
                with jsonlines.open(raw_predictions_path) as fin:
                    for example in fin.iter():
                        confidences = []
                        for format_id, top_tokens in example["top_tokens"].items():
                            # Scan the entries from last to first and stop at the first one
                            # whose leading token equals the prediction for this format.
                            # NOTE(review): presumably one entry per generated position, each
                            # a ranked candidate list -- confirm against the file producer.
                            for ii, top_tokenss in enumerate(top_tokens[::-1]):
                                if top_tokenss[0] != id_predictions_map[example["id"]][format_id]:
                                    continue
                                if "top_probs" in example:
                                    confidence = example["top_probs"][format_id][-(ii+1)][0]
                                else:
                                    # Only log-probabilities are stored: convert to a probability.
                                    confidence = np.exp(example["top_logprobs"][format_id][-(ii+1)][0])
                                confidences.append(confidence)
                                break
                        if len(confidences) != expected_num_formats:
                            # At least one format had no entry matching its prediction.
                            continue
                        mean_confidence = np.mean(confidences)
                        X.append(mean_confidence)
                        Y.append(id_consistency_map[example["id"]])
                        # Indicator: 1.0 when the mean consistency is (numerically) 1.
                        Z.append(1.0*(id_consistency_map[example["id"]] >= 0.99))
                X, Y, Z = np.array(X), np.array(Y), np.array(Z)
                Y = Z # set-wise consistency
            except Exception as e:
                # Best effort: a combination whose result files are missing or
                # malformed is skipped, but the reason is reported instead of hidden.
                print(f"Skipping {dataset_name} / {model_name} / {prompting_strategy}: {type(e).__name__}: {e}")
                continue

            # Assign each confidence value to a bin (left-closed, as the labels state)
            df = pd.DataFrame({"Confidence": X, "Consistency": Y})
            df["Bin"] = pd.cut(df["Confidence"], bins=cut_edges, labels=bin_labels, right=False)

            # Mean and sample variance of consistency per bin. observed=False keeps
            # empty bins on the axis and avoids relying on the deprecated default.
            grouped = df.groupby("Bin", observed=False)["Consistency"]
            bin_means = grouped.mean()
            bin_vars = grouped.var()

            # Plot bar chart; the error bars show the per-bin variance
            fig = plt.figure(figsize=(10, 6))
            plt.bar(bin_means.index, bin_means.values, yerr=bin_vars.values, capsize=5, color="royalblue")

            # Labels and formatting
            plt.xlabel("Confidence Bins")
            plt.ylabel("Consistency")
            # The -0.01 tick lies below the lower y-limit set on the next line.
            plt.yticks([-0.01, 0.00, 1.00])
            plt.ylim(0.0, 1.0)
            plt.title("Consistency Across Confidence Levels")
            plt.grid(axis="y", linestyle="--", alpha=0.5)

            plt.tight_layout()
            plt.savefig(f"bin_chart/confidence_correlation_{dataset_name}_{model_name}_{prompting_strategy}.pdf")
            plt.show()
            # Release the figure so figures do not pile up across combinations.
            plt.close(fig)

### Extended analysis with 128 formats

In [None]:
# Same bin chart as the 8-format analysis, computed from the "_ext" result
# files in which every example is expected to have 128 prompt formats.
os.makedirs("bin_chart", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC"]
model_names = ["Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot"]

# An example contributes to the chart only when a confidence was recovered for
# exactly this many prompt formats.
expected_num_formats = 128

# Confidence bin edges and the tick labels shown on the x-axis: every bin is
# left-closed / right-open except the last one, which includes its upper edge.
bins = np.array([0.0, 0.8, 0.9, 1.0])
bin_labels = [f"[{bins[i]:.2f}, {bins[i+1]:.2f})" for i in range(len(bins)-2)] + [f"[{bins[len(bins)-2]:.2f}, {bins[len(bins)-1]:.2f}]"]

# Edges actually handed to pd.cut(right=False). The last edge is nudged to the
# next representable float so that a confidence of exactly 1.0 still falls in
# the final bin, matching the closed "]" of its label.
cut_edges = bins.copy()
cut_edges[-1] = np.nextafter(bins[-1], np.inf)

for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"
            predictions_path = os.path.join(output_dir, f"{prompting_strategy}_ext_predictions.jsonl")
            raw_predictions_path = os.path.join(output_dir, f"{prompting_strategy}_ext_raw_predictions.jsonl")
            try:
                # Per-example predictions (indexed by format id below) and mean consistency.
                id_predictions_map, id_consistency_map = {}, {}
                with jsonlines.open(predictions_path) as fin:
                    for example in fin.iter():
                        id_predictions_map[example["id"]] = example["predictions"]
                        id_consistency_map[example["id"]] = example["consistency"]["mean"]

                X, Y, Z = [], [], []
                with jsonlines.open(raw_predictions_path) as fin:
                    for example in fin.iter():
                        confidences = []
                        for format_id, top_tokens in example["top_tokens"].items():
                            # Scan the entries from last to first and stop at the first one
                            # whose leading token equals the prediction for this format.
                            # NOTE(review): presumably one entry per generated position, each
                            # a ranked candidate list -- confirm against the file producer.
                            for ii, top_tokenss in enumerate(top_tokens[::-1]):
                                if top_tokenss[0] != id_predictions_map[example["id"]][format_id]:
                                    continue
                                if "top_probs" in example:
                                    confidence = example["top_probs"][format_id][-(ii+1)][0]
                                else:
                                    # Only log-probabilities are stored: convert to a probability.
                                    confidence = np.exp(example["top_logprobs"][format_id][-(ii+1)][0])
                                confidences.append(confidence)
                                break
                        if len(confidences) != expected_num_formats:
                            # At least one format had no entry matching its prediction.
                            continue
                        mean_confidence = np.mean(confidences)
                        X.append(mean_confidence)
                        Y.append(id_consistency_map[example["id"]])
                        # Indicator: 1.0 when the mean consistency is (numerically) 1.
                        Z.append(1.0*(id_consistency_map[example["id"]] >= 0.99))
                X, Y, Z = np.array(X), np.array(Y), np.array(Z)
                Y = Z # set-wise consistency
            except Exception as e:
                # Best effort: a combination whose result files are missing or
                # malformed is skipped, but the reason is reported instead of hidden.
                print(f"Skipping {dataset_name} / {model_name} / {prompting_strategy} (ext): {type(e).__name__}: {e}")
                continue

            # Assign each confidence value to a bin (left-closed, as the labels state)
            df = pd.DataFrame({"Confidence": X, "Consistency": Y})
            df["Bin"] = pd.cut(df["Confidence"], bins=cut_edges, labels=bin_labels, right=False)

            # Mean and sample variance of consistency per bin. observed=False keeps
            # empty bins on the axis and avoids relying on the deprecated default.
            grouped = df.groupby("Bin", observed=False)["Consistency"]
            bin_means = grouped.mean()
            bin_vars = grouped.var()

            # Plot bar chart; the error bars show the per-bin variance
            fig = plt.figure(figsize=(10, 6))
            plt.bar(bin_means.index, bin_means.values, yerr=bin_vars.values, capsize=5, color="royalblue")

            # Labels and formatting
            plt.xlabel("Confidence Bins")
            plt.ylabel("Consistency")
            # The -0.01 tick lies below the lower y-limit set on the next line.
            plt.yticks([-0.01, 0.00, 1.00])
            plt.ylim(0.0, 1.0)
            plt.title("Consistency Across Confidence Levels")
            plt.grid(axis="y", linestyle="--", alpha=0.5)

            plt.tight_layout()
            plt.savefig(f"bin_chart/confidence_correlation_{dataset_name}_{model_name}_{prompting_strategy}_EXT.pdf")
            plt.show()
            # Release the figure so figures do not pile up across combinations.
            plt.close(fig)