## Import libraries

In [1]:
import os
import json

import numpy as np
from scipy.spatial.distance import euclidean, hamming
from scipy.stats import spearmanr, pearsonr

In [2]:
def read_memmap(filepath):
    with open(filepath.replace(".dat", ".conf"), "r") as fin_config:
        memmap_configs = json.load(fin_config)
        return np.memmap(filepath, mode="r", shape=tuple(memmap_configs["shape"]), dtype=memmap_configs["dtype"])

In [3]:
def format_distance_3bits(a, b):
    a = list(f"{a:03b}")
    b = list(f"{b:03b}")
    return 3*hamming(a, b)

In [4]:
result_dir = "../../../results"

## Correlation analysis

### Correlation with consistency (last layer)

In [5]:
os.makedirs(f"correlation_analysis_table", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Phi-3.5-mini-instruct", "Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

corr_matrix, p_matrix = np.zeros((len(model_names), len(dataset_names), len(prompting_strategies))), np.zeros((len(model_names), len(dataset_names), len(prompting_strategies)))
for i, model_name in enumerate(model_names):
    for j, dataset_name in enumerate(dataset_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"

            layer_wise_path = os.path.join(output_dir, f"{prompting_strategy}_layer_wise_hidden_states.dat")
            head_wise_path = os.path.join(output_dir, f"{prompting_strategy}_head_wise_hidden_states.dat")

            try:
                layer_wise_hidden_states = read_memmap(layer_wise_path)
            except Exception as e:
                # print(e)
                corr_matrix[i][j][k] = -2
                p_matrix[i][j][k] = -2
                continue
            num_samples, num_formats, num_layers, hidden_size = layer_wise_hidden_states.shape

            # Prepare pairwise consistency matrix (macro)
            score_path = os.path.join(output_dir, f"{prompting_strategy}_score.json")
            with open(score_path, "r") as fin:
                score = json.load(fin)
            pairwise_consistency_matrix = np.ones((num_formats, num_formats))
            for key, value in score["consistency"].items():
                if key == "mean":
                    continue
                split_key = key.split("_")
                ii, jj = int(split_key[0]), int(split_key[-1])
                pairwise_consistency_matrix[ii][jj] = value
                pairwise_consistency_matrix[jj][ii] = value

            for layer_idx in range(num_layers):
                if layer_idx != num_layers - 1:
                    continue

                # Step 1: Prepare input
                X = layer_wise_hidden_states[:,:,layer_idx,:]
                Y = np.tile(np.arange(num_formats), num_samples)
                X_mean = X.mean(axis=0)
                
                # Step 2: Macro analysis
                pairwise_distance_list, pairwise_consistency_list = [], []
                for ii in range(num_formats):
                    for jj in range(num_formats):
                        if ii < jj:
                            dist = euclidean(X_mean[ii], X_mean[jj])
                            pairwise_distance_list.append(dist)
                            pairwise_consistency_list.append(pairwise_consistency_matrix[ii][jj])

                pearson_corr, pearson_p = pearsonr(pairwise_distance_list, pairwise_consistency_list)
                spearman_corr, spearman_p = spearmanr(pairwise_distance_list, pairwise_consistency_list)

                # Print correlation results
                print(f"{dataset_name} / {model_name} / {prompting_strategy}")
                print(f"Layer {layer_idx}")
                print("Macro analysis")
                print(f"Pearson correlation: {pearson_corr:.2f} (p-value: {pearson_p:.3e})")
                print(f"Spearman correlation: {spearman_corr:.2f} (p-value: {spearman_p:.3e})")
                print()

                corr_matrix[i][j][k] = spearman_corr
                p_matrix[i][j][k] = spearman_p

prompting_strategies = ["Zero-shot", "Zero-shot CoT", "Few-shot", "Few-shot CoT"]
# Create LaTeX table string
latex_code = "\\begin{table}[ht]\n"
latex_code += "\\centering\n"
latex_code += "\\caption{Correlation analysis between last layer embedding distance and pairwise consistency. Each cell represents Spearman correlation coefficient and p-value (in parenthesis).}\n"
latex_code += "\\label{tab:last_layer_embedding_distance_correlation}\n"
latex_code += "\\begin{tabular}{c|c|cccc}\n"
latex_code += "\\toprule\n"
latex_code += "Model & Task & " + " & ".join(prompting_strategies) + " \\\\\n"
latex_code += "\\midrule\\midrule\n"

# Fill the table with data
for model_idx, model in enumerate(model_names):
    latex_code += f"\\multirow{{4}}{{*}}{{{model}}}\n"
    for task_idx, task in enumerate(dataset_names):
        latex_code += "      "
        latex_code += f"& {task} "
        for strat_idx in range(len(prompting_strategies)):
            corr = corr_matrix[model_idx, task_idx, strat_idx]
            p = p_matrix[model_idx, task_idx, strat_idx]

            # Skip conditions
            if corr < -1.5:
                latex_code += f"& - "
                continue

            corr_str = f"{corr:.2f}"
            p_str = f"{p:.3e}" if p < 0.001 else f"{p:.3f}"
            
            latex_code += f"& {corr_str} ({p_str}) "
        latex_code += "\\\\\n"
    latex_code += "\\midrule\n"

# Close LaTeX table
latex_code = latex_code.rstrip("\\midrule\n")  # Remove last midrule
latex_code += "\\\\\n"
latex_code += "\\bottomrule\n"
latex_code += "\\end{tabular}\n"
latex_code += "\\end{table}"

# Display the generated LaTeX table code
print(latex_code)

with open(f"correlation_analysis_table/last_layer_embedding_distance_correlation_with_consistency_table.txt", "w") as fout:
    fout.write(latex_code)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 31
Macro analysis
Pearson correlation: -0.60 (p-value: 7.451e-04)
Spearman correlation: -0.60 (p-value: 8.310e-04)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot-cot
Layer 31
Macro analysis
Pearson correlation: -0.82 (p-value: 7.295e-08)
Spearman correlation: -0.82 (p-value: 1.279e-07)

CommonsenseQA / Phi-3.5-mini-instruct / few-shot
Layer 31
Macro analysis
Pearson correlation: -0.12 (p-value: 5.558e-01)
Spearman correlation: -0.06 (p-value: 7.668e-01)

QASC / Phi-3.5-mini-instruct / zero-shot
Layer 31
Macro analysis
Pearson correlation: -0.39 (p-value: 4.068e-02)
Spearman correlation: -0.34 (p-value: 7.291e-02)

QASC / Phi-3.5-mini-instruct / zero-shot-cot
Layer 31
Macro analysis
Pearson correlation: -0.76 (p-value: 2.314e-06)
Spearman correlation: -0.62 (p-value: 4.291e-04)

QASC / Phi-3.5-mini-instruct / few-shot
Layer 31
Macro analysis
Pearson correlation: -0.58 (p-value: 1.158e-03)
Spearman correlation: -0.54 (p-value: 2

### Correlation with consistency (best layer)

In [6]:
os.makedirs(f"correlation_analysis_table", exist_ok=True)

dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Phi-3.5-mini-instruct", "Llama-3.1-8B-Instruct"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

NUM_LAYERS = 32
corr_matrix, p_matrix = np.zeros((len(model_names), len(dataset_names), len(prompting_strategies), NUM_LAYERS)), np.zeros((len(model_names), len(dataset_names), len(prompting_strategies), NUM_LAYERS))
for i, model_name in enumerate(model_names):
    for j, dataset_name in enumerate(dataset_names):
        for k, prompting_strategy in enumerate(prompting_strategies):
            output_dir = f"{result_dir}/{dataset_name}/{model_name}"

            layer_wise_path = os.path.join(output_dir, f"{prompting_strategy}_layer_wise_hidden_states.dat")
            head_wise_path = os.path.join(output_dir, f"{prompting_strategy}_head_wise_hidden_states.dat")

            try:
                layer_wise_hidden_states = read_memmap(layer_wise_path)
            except Exception as e:
                print(e)
                # corr_matrix[i][j][k] = -2
                # p_matrix[i][j][k] = -2
                continue
            num_samples, num_formats, num_layers, hidden_size = layer_wise_hidden_states.shape

            # Prepare pairwise consistency matrix (macro)
            score_path = os.path.join(output_dir, f"{prompting_strategy}_score.json")
            with open(score_path, "r") as fin:
                score = json.load(fin)
            pairwise_consistency_matrix = np.ones((num_formats, num_formats))
            for key, value in score["consistency"].items():
                if key == "mean":
                    continue
                split_key = key.split("_")
                ii, jj = int(split_key[0]), int(split_key[-1])
                pairwise_consistency_matrix[ii][jj] = value
                pairwise_consistency_matrix[jj][ii] = value

            for layer_idx in range(num_layers):
                # Step 1: Prepare input
                X = layer_wise_hidden_states[:,:,layer_idx,:]
                Y = np.tile(np.arange(num_formats), num_samples)
                X_mean = X.mean(axis=0)
                
                # Step 2: Macro analysis
                pairwise_distance_list, pairwise_consistency_list = [], []
                for ii in range(num_formats):
                    for jj in range(num_formats):
                        if ii < jj:
                            dist = euclidean(X_mean[ii], X_mean[jj])
                            pairwise_distance_list.append(dist)
                            pairwise_consistency_list.append(pairwise_consistency_matrix[ii][jj])

                pearson_corr, pearson_p = pearsonr(pairwise_distance_list, pairwise_consistency_list)
                spearman_corr, spearman_p = spearmanr(pairwise_distance_list, pairwise_consistency_list)

                # Print correlation results
                print(f"{dataset_name} / {model_name} / {prompting_strategy}")
                print(f"Layer {layer_idx}")
                print("Macro analysis")
                print(f"Pearson correlation: {pearson_corr:.2f} (p-value: {pearson_p:.3e})")
                print(f"Spearman correlation: {spearman_corr:.2f} (p-value: {spearman_p:.3e})")
                print()

                # if spearman_corr < corr_matrix[i][j][k]:
                corr_matrix[i][j][k][layer_idx] = spearman_corr
                p_matrix[i][j][k][layer_idx] = spearman_p

# prompting_strategies = ["Zero-shot", "Zero-shot CoT", "Few-shot", "Few-shot CoT"]
# # Create LaTeX table string
# latex_code = "\\begin{table}[ht]\n"
# latex_code += "\\centering\n"
# latex_code += "\\caption{Correlation analysis between best layer embedding distance and pairwise consistency. A layer in which the correlation is maximized is selected as the best layer. Each cell represents Spearman correlation coefficient and p-value (in parenthesis).}\n"
# latex_code += "\\label{tab:best_layer_embedding_distance_correlation}\n"
# latex_code += "\\begin{tabular}{c|c|cccc}\n"
# latex_code += "\\toprule\n"
# latex_code += "Model & Task & " + " & ".join(prompting_strategies) + " \\\\\n"
# latex_code += "\\midrule\\midrule\n"

# # Fill the table with data
# for model_idx, model in enumerate(model_names):
#     latex_code += f"\\multirow{{4}}{{*}}{{{model}}}\n"
#     for task_idx, task in enumerate(dataset_names):
#         latex_code += "      "
#         latex_code += f"& {task} "
#         for strat_idx in range(len(prompting_strategies)):
#             corr = corr_matrix[model_idx, task_idx, strat_idx]
#             p = p_matrix[model_idx, task_idx, strat_idx]

#             # Skip conditions
#             if corr < -1.5:
#                 latex_code += f"& - "
#                 continue

#             corr_str = f"{corr:.2f}"
#             p_str = f"{p:.3e}" if p < 0.001 else f"{p:.3f}"
            
#             latex_code += f"& {corr_str} ({p_str}) "
#         latex_code += "\\\\\n"
#     latex_code += "\\midrule\n"

# # Close LaTeX table
# latex_code = latex_code.rstrip("\\midrule\n")  # Remove last midrule
# latex_code += "\\\\\n"
# latex_code += "\\bottomrule\n"
# latex_code += "\\end{tabular}\n"
# latex_code += "\\end{table}"

# # Display the generated LaTeX table code
# print(latex_code)

# with open(f"correlation_analysis_table/best_layer_embedding_distance_correlation_with_consistency_table.txt", "w") as fout:
#     fout.write(latex_code)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 0
Macro analysis
Pearson correlation: -0.37 (p-value: 4.973e-02)
Spearman correlation: -0.44 (p-value: 1.794e-02)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 1
Macro analysis
Pearson correlation: -0.38 (p-value: 4.808e-02)
Spearman correlation: -0.39 (p-value: 3.992e-02)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 2
Macro analysis
Pearson correlation: -0.12 (p-value: 5.297e-01)
Spearman correlation: -0.15 (p-value: 4.436e-01)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 3
Macro analysis
Pearson correlation: -0.19 (p-value: 3.219e-01)
Spearman correlation: -0.19 (p-value: 3.377e-01)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 4
Macro analysis
Pearson correlation: -0.24 (p-value: 2.214e-01)
Spearman correlation: -0.29 (p-value: 1.342e-01)

CommonsenseQA / Phi-3.5-mini-instruct / zero-shot
Layer 5
Macro analysis
Pearson correlation: -0.27 (p-value: 1.617e-01)
Spearman correlation: -0

In [21]:
agg_corr_matrix = np.zeros((len(model_names), len(dataset_names), len(prompting_strategies), 6))

for i in range(len(model_names)):
    for j in range(len(dataset_names)):
        for k in range(len(prompting_strategies)):
            agg_corr_matrix[i][j][k][0] = corr_matrix[i][j][k].mean()
            agg_corr_matrix[i][j][k][1] = corr_matrix[i][j][k].std()
            agg_corr_matrix[i][j][k][2] = corr_matrix[i][j][k].min()
            agg_corr_matrix[i][j][k][3] = corr_matrix[i][j][k].max()
            agg_corr_matrix[i][j][k][4] = corr_matrix[i][j][k].argmin()
            agg_corr_matrix[i][j][k][5] = corr_matrix[i][j][k].argmax()

In [22]:
print('mean')
print(agg_corr_matrix[:,:,:,0])
print()

print('std')
print(agg_corr_matrix[:,:,:,1])
print()

print('min (best)')
print(agg_corr_matrix[:,:,:,2])
print()

print('argmin (best)')
print(agg_corr_matrix[:,:,:,4])
print()

print('max (worst)')
print(agg_corr_matrix[:,:,:,3])
print()

print('argmax (worst)')
print(agg_corr_matrix[:,:,:,5])
print()

mean
[[[-0.53408073 -0.70535095 -0.32020576  0.        ]
  [-0.36485402 -0.57703079 -0.59071586 -0.23680448]
  [-0.41255926 -0.36752502 -0.20267659  0.16303725]
  [ 0.         -0.35942561  0.         -0.50362384]]

 [[-0.33402801 -0.35706433 -0.25541575  0.        ]
  [-0.2537906  -0.39696294 -0.32867181 -0.19619476]
  [-0.10794023 -0.1220619  -0.35652805 -0.30595691]
  [ 0.         -0.11208958  0.         -0.40820752]]]

std
[[[0.14125284 0.11262076 0.17894259 0.        ]
  [0.1258162  0.11374226 0.0917844  0.0386227 ]
  [0.16137619 0.15401828 0.09256529 0.07963961]
  [0.         0.13888996 0.         0.0696427 ]]

 [[0.0895974  0.1353686  0.07136158 0.        ]
  [0.11865015 0.12954246 0.115158   0.13732236]
  [0.11696958 0.07837558 0.06635095 0.05466457]
  [0.         0.1025006  0.         0.0955328 ]]]

min (best)
[[[-0.717676   -0.87535975 -0.56586768  0.        ]
  [-0.52461375 -0.6923401  -0.74283813 -0.30904455]
  [-0.87106527 -0.56649778 -0.38775749 -0.0472542 ]
  [ 0.        