# Single Format Element Consistency Analysis

This notebook analyzes the consistency between format pairs that differ by only one formatting element.

## Format Analysis Strategy

We analyze 8 different formats represented as 3-bit binary numbers:
- Format 0: 000, Format 1: 001, Format 2: 010, Format 3: 011
- Format 4: 100, Format 5: 101, Format 6: 110, Format 7: 111

### Analysis Pairs by Position:
- **Last bit effect** (rightmost): (000,001), (010,011), (100,101), (110,111)
- **Middle bit effect**: (000,010), (001,011), (100,110), (101,111)
- **First bit effect** (leftmost): (000,100), (001,101), (010,110), (011,111)

In [None]:
import os
import json
import jsonlines
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
from collections import defaultdict

# Import our analysis functions
from single_format_element_consistency import (
    get_format_pairs_by_position,
    analyze_single_element_consistency,
    analyze_dataset_model_combination
)

# Plot appearance for the whole notebook: reset to matplotlib's defaults first
# (this must come before the overrides below), then set the default figure
# size and the seaborn colour palette.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

## 1. Understand Format Pairs

In [None]:
# Fetch the single-bit-difference format pairs, grouped by bit position, and
# list each pair both as 3-bit binary strings and as format numbers.
pairs_by_position = get_format_pairs_by_position()

print("Format pairs that differ by exactly one bit:\n")
for bit_position, bit_pairs in pairs_by_position.items():
    heading = bit_position.replace('_', ' ').title()
    print(f"{heading} effect pairs:")
    for fmt_a, fmt_b in bit_pairs:
        print(f"  ({fmt_a:03b}, {fmt_b:03b}) = (Format {fmt_a}, Format {fmt_b})")
    # Blank line between position groups
    print()

## 2. Analyze Sample Data

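Let's start by analyzing one specific dataset-model combination to understand the results. (The example cell below is currently commented out; uncomment it to run the single-combination analysis.)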

In [None]:
# # Analyze the DeepSeek model on 100TFQA with few-shot CoT
# sample_results = analyze_dataset_model_combination(
#     dataset_name="100TFQA",
#     model_name="DeepSeek-R1-Distill-Llama-8B", 
#     prompting_strategy="few-shot-cot"
# )

# if sample_results:
#     print("Sample Analysis Results for 100TFQA/DeepSeek-R1-Distill-Llama-8B/few-shot-cot:\n")
    
#     for position, results in sample_results.items():
#         print(f"{position.replace('_', ' ').title()} Effect:")
#         print(f"  Overall mean consistency: {results.get('mean', 0.0):.3f}")
        
#         for pair, consistency in results.items():
#             if pair != "mean":
#                 print(f"  Pair {pair}: {consistency:.3f}")
#         print()
# else:
#     print("Could not load sample data. Please check file paths.")

## 3. Comprehensive Analysis Across All Combinations

In [None]:
# Experimental grid: every dataset x model x prompting strategy is analysed.
dataset_names = ["100TFQA", "CommonsenseQA", "GSM8K", "QASC", "MMLU-Pro-Law-100Q"]
model_names = [
    "DeepSeek-R1-Distill-Llama-8B",
    "gpt-4o-2024-11-20",
    "Llama-3.1-8B",
    "Llama-3.1-8B-Instruct",
    "Llama-3.1-70B-Instruct",
    "Phi-3.5-mini-instruct",
    "Phi-3.5-vision-instruct",
]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

# all_results: "<dataset>_<model>_<strategy>" -> raw result dict
# summary_data: one flat record per (combination, bit position), for pandas
all_results = {}
summary_data = []

for dataset_name in dataset_names:
    for model_name in model_names:
        for prompting_strategy in prompting_strategies:
            print(f"Analyzing {dataset_name}/{model_name}/{prompting_strategy}...")

            results = analyze_dataset_model_combination(
                dataset_name, model_name, prompting_strategy
            )
            # Combinations for which the analysis returned None are skipped entirely
            if results is None:
                continue

            key = f"{dataset_name}_{model_name}_{prompting_strategy}"
            all_results[key] = results

            # NOTE(review): the column is labelled "Mean_Consistency" but stores
            # 1 - mean; confirm what results[position]["mean"] measures.
            summary_data.extend(
                {
                    "Dataset": dataset_name,
                    "Model": model_name,
                    "Strategy": prompting_strategy,
                    "Position": position,
                    "Mean_Consistency": 1 - results[position]["mean"],
                }
                for position in ("first_bit", "middle_bit", "last_bit")
                if position in results
            )

print(f"\nAnalyzed {len(all_results)} combinations.")
print(f"Collected {len(summary_data)} data points.")

## 4. Summary Statistics and Visualization

In [None]:
# Turn the flat summary records into a DataFrame and print descriptive
# statistics per bit position and overall.
if not summary_data:
    print("No data collected for analysis.")
else:
    df_summary = pd.DataFrame(summary_data)
    consistency_scores = df_summary['Mean_Consistency']

    # Per-position statistics
    print("Summary Statistics by Position:")
    stat_names = ['mean', 'std', 'min', 'max', 'count']
    position_stats = df_summary.groupby('Position')['Mean_Consistency'].agg(stat_names)
    print(position_stats)

    # Statistics pooled over all positions
    print("\nOverall Statistics:")
    print(f"Overall mean consistency: {consistency_scores.mean():.3f}")
    print(f"Overall standard deviation: {consistency_scores.std():.3f}")

In [None]:
# For every combination present in all_results, collect the per-position
# "mean" values in the fixed order [last_bit, middle_bit, first_bit].
# Keys are visited in dataset -> model -> strategy order.
grid_keys = [
    f"{dataset_name}_{model_name}_{prompting_strategy}"
    for dataset_name in dataset_names
    for model_name in model_names
    for prompting_strategy in prompting_strategies
]

aaa = {
    key: [
        all_results[key][position]["mean"]
        for position in ("last_bit", "middle_bit", "first_bit")
    ]
    for key in grid_keys
    if key in all_results
}

In [None]:
from itertools import combinations
from scipy.stats import spearmanr
import numpy as np
from collections import defaultdict

# Requires `aaa` from the previous cell: key -> [last_bit, middle_bit, first_bit] mean values
def parse_key(key):
    """Split a "<task>_<model>_<prompting>" key into its three components.

    Only the first two underscores act as separators; any further underscores
    stay inside the prompting component. A key with exactly two components
    yields an empty prompting string, and a key without any underscore raises
    IndexError.

    Returns:
        tuple: (task, model, prompting)
    """
    pieces = key.split('_', 2)
    task, model = pieces[0], pieces[1]
    prompting = pieces[2] if len(pieces) > 2 else ''
    return task, model, prompting

def get_ranking(values):
    """Return the descending rank of each element of ``values``.

    Position ``i`` of the result holds the rank of ``values[i]``: 0 for the
    largest value, 1 for the next distinct value, and so on. Tied values
    share the same rank.

    The previous implementation returned ``np.argsort(-values)``, which is the
    ordering permutation (which index comes first), not the rank of each
    element. Correlating two such orderings with Spearman's rho does not give
    the rank correlation of the underlying values: for [1, 3, 2] vs [2, 3, 1]
    the true rho is 0.5, but the orderings correlate at -1.0.

    Args:
        values: sequence of numbers.

    Returns:
        list[int]: one rank per input element, in input order.
    """
    negated = -np.asarray(values, dtype=float)
    # np.unique sorts the distinct values ascending; the inverse index of each
    # element is therefore its dense rank among the negated values, i.e. its
    # descending rank among the originals.
    _, dense_ranks = np.unique(negated, return_inverse=True)
    return np.ravel(dense_ranks).tolist()

# Step 1: Bucket every entry of `aaa` under three single-factor groupings.
#   'A': entries sharing the same model     (task and prompting vary)
#   'B': entries sharing the same task      (model and prompting vary)
#   'C': entries sharing the same prompting (task and model vary)
# NOTE(review): earlier comments described two-factor grouping (e.g. fixing
# model + prompting for 'A'); the code groups on a single factor. Confirm
# which of the two is intended.
groupings = {label: defaultdict(list) for label in ('A', 'B', 'C')}

for key, values in aaa.items():
    task, model, prompting = parse_key(key)
    entry = (key, values)
    for label, factor in (('A', model), ('B', task), ('C', prompting)):
        groupings[label][factor].append(entry)

# Step 2: Compute spearmanr
def compute_group_stats(group_dict):
    """Average the pairwise Spearman correlation between rankings within groups.

    For every group with at least two entries, each entry's values are turned
    into a ranking with ``get_ranking`` and every unordered pair of entries in
    that group is correlated with ``scipy.stats.spearmanr``. Correlations from
    all groups are pooled before averaging.

    Args:
        group_dict: mapping of group key -> list of ``(entry_key, values)``
            tuples.

    Returns:
        tuple: ``(group_count, comparison_count, mean_correlation)`` where
        ``group_count`` is the number of groups with at least two entries,
        ``comparison_count`` is the number of pairs that produced a finite
        correlation, and ``mean_correlation`` is their mean, or ``None`` when
        there were none.
    """
    correlations = []
    group_count = 0
    for entries in group_dict.values():
        # A single entry has nothing to be compared with
        if len(entries) < 2:
            continue
        group_count += 1
        rankings = {entry_key: get_ranking(values) for entry_key, values in entries}
        for key_a, key_b in combinations(rankings, 2):
            r, _ = spearmanr(rankings[key_a], rankings[key_b])
            # spearmanr yields NaN (never None) when an input is constant;
            # one NaN would turn the pooled mean into NaN, so skip those pairs.
            if np.isfinite(r):
                correlations.append(r)
    mean_correlation = np.mean(correlations) if correlations else None
    return group_count, len(correlations), mean_correlation

# Step 3: Report the pooled statistics for each grouping condition
for label in ['A', 'B', 'C']:
    groups, comparisons, avg_spearmanr = compute_group_stats(groupings[label])
    # compute_group_stats returns None as the average when there were no
    # comparisons; formatting None with ":.3f" would raise TypeError.
    avg_text = "n/a" if avg_spearmanr is None else f"{avg_spearmanr:.3f}"
    print(f"Condition {label}:")
    print(f"  #Groups: {groups}")
    print(f"  #Pairwise comparisons: {comparisons}")
    print(f"  Avg spearman correlation coefficient: {avg_text}")
    print()

In [None]:
# Figure 1: distribution of the consistency score for each bit position.
# Relies on df_summary built in the summary-statistics cell.
if summary_data:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Position', y='Mean_Consistency', data=df_summary)
    plt.xlabel('Format Element Position')
    plt.ylabel('Mean Consistency')
    plt.title('Consistency Distribution by Format Element Position')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Figure 2: one Dataset x Model heatmap per bit position, side by side.
    plt.figure(figsize=(15, 8))

    for panel_number, position in enumerate(["first_bit", "middle_bit", "last_bit"], start=1):
        # The panel is created even when there is no data for this position
        plt.subplot(1, 3, panel_number)

        position_data = df_summary[df_summary['Position'] == position]
        if position_data.empty:
            continue

        # Average over prompting strategies for each (Dataset, Model) cell
        pivot_table = position_data.pivot_table(
            index='Dataset',
            columns='Model',
            values='Mean_Consistency',
            aggfunc='mean'
        )

        sns.heatmap(pivot_table, annot=True, fmt='.3f', cmap='viridis')
        plt.title(f'{position.replace("_", " ").title()} Effect')
        plt.xlabel('Model')
        plt.ylabel('Dataset')

    plt.tight_layout()
    plt.show()

In [None]:
# Figure 3: consistency per prompting strategy, with one box per bit position.
# Relies on df_summary built in the summary-statistics cell.
if summary_data:
    plt.figure(figsize=(12, 8))

    sns.boxplot(x='Strategy', y='Mean_Consistency', hue='Position', data=df_summary)
    plt.xlabel('Prompting Strategy')
    plt.ylabel('Mean Consistency')
    plt.title('Consistency by Prompting Strategy and Format Element Position')
    plt.xticks(rotation=45)
    # Retitle the legend that seaborn created for the hue variable
    plt.legend(title='Format Element Position')
    plt.tight_layout()
    plt.show()

## 5. Detailed Analysis for Specific Combinations

In [None]:
# Report the summary rows with the highest and the lowest Mean_Consistency.
if all_results:
    print("Detailed Analysis Examples:\n")

    if summary_data:
        df_summary = pd.DataFrame(summary_data)
        scores = df_summary['Mean_Consistency']

        # (heading, row label) for the two extremes, highest first
        extreme_rows = [
            ("Highest consistency case:", scores.idxmax()),
            ("\nLowest consistency case:", scores.idxmin()),
        ]

        for heading, row_label in extreme_rows:
            case = df_summary.loc[row_label]
            print(heading)
            print(f"  {case['Dataset']}/{case['Model']}/{case['Strategy']}")
            print(f"  Position: {case['Position']}, Consistency: {case['Mean_Consistency']:.3f}")

## 6. Statistical Analysis

In [None]:
# Non-parametric comparison of the consistency scores across bit positions:
# a Kruskal-Wallis H-test over all three, then pairwise Mann-Whitney U tests.
# NOTE(review): each dataset/model/strategy combination contributes one row
# per position, so the three samples are paired; both tests used here treat
# them as independent samples. Confirm this is the intended analysis.
if summary_data:
    from scipy import stats

    df_summary = pd.DataFrame(summary_data)

    # One Series of scores per bit position
    scores_by_position = {
        position: df_summary.loc[df_summary['Position'] == position, 'Mean_Consistency']
        for position in ("first_bit", "middle_bit", "last_bit")
    }
    first_bit_data = scores_by_position["first_bit"]
    middle_bit_data = scores_by_position["middle_bit"]
    last_bit_data = scores_by_position["last_bit"]

    print("Statistical Comparisons (Kruskal-Wallis H-test):")

    # The tests only run when every position has at least one observation
    if all(len(scores) > 0 for scores in scores_by_position.values()):
        h_stat, p_value = stats.kruskal(first_bit_data, middle_bit_data, last_bit_data)
        print("\nOverall position comparison:")
        print(f"  H-statistic: {h_stat:.3f}")
        print(f"  p-value: {p_value:.3e}")

        position_pairs = [
            ("first_bit", "middle_bit"),
            ("first_bit", "last_bit"),
            ("middle_bit", "last_bit"),
        ]

        print("\nPairwise comparisons (Mann-Whitney U test):")
        for name1, name2 in position_pairs:
            u_stat, p_val = stats.mannwhitneyu(
                scores_by_position[name1],
                scores_by_position[name2],
                alternative='two-sided',
            )
            print(f"  {name1} vs {name2}: U={u_stat:.3f}, p={p_val:.3e}")

## 7. Save Results

In [None]:
# Persist the analysis outputs under results/:
#   - the raw per-combination results as JSON
#   - the flat summary table as CSV, plus one CSV per bit position
os.makedirs("results", exist_ok=True)

# Paths actually written, so the final message reflects what happened
saved_paths = []

if all_results:
    detailed_path = "results/single_element_consistency_detailed.json"
    with open(detailed_path, "w") as f:
        json.dump(all_results, f, indent=2)
    saved_paths.append(detailed_path)

if summary_data:
    df_summary = pd.DataFrame(summary_data)
    summary_path = "results/single_element_consistency_summary.csv"
    df_summary.to_csv(summary_path, index=False)
    saved_paths.append(summary_path)

    # Position-specific summaries; positions without rows produce no file
    for position in ["first_bit", "middle_bit", "last_bit"]:
        position_data = df_summary[df_summary['Position'] == position]
        if not position_data.empty:
            position_path = f"results/{position}_consistency_summary.csv"
            position_data.to_csv(position_path, index=False)
            saved_paths.append(position_path)

# Previously the success message was printed even when nothing was written
if saved_paths:
    print("Results saved to 'results/' directory.")
else:
    print("No results to save.")

## Summary

This analysis measures the consistency between format pairs that differ by exactly one formatting element, allowing us to understand the individual effect of each formatting component on model consistency.