In [None]:
# Mount Google Drive at /content/drive (Colab-only; google.colab is not
# available outside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/Independent\ Study/fingerprints

In [None]:
import pandas as pd
# NOTE(review): 'Path to concerned csv' looks like a placeholder, not a real
# path — point it at the predictions CSV before running.
# Later cells read a "label" column and one "<model>_pred" column per model
# from this frame.
df = pd.read_csv('Path to concerned csv')
df.head()

# Approach 1: Confusion Matrix Fingerprinting

This approach treats each model's confusion matrix as a unique "fingerprint"

that captures its systematic error patterns and prediction behavior.

## Key Concepts:

*   **Fingerprint**: 16-dimensional vector from flattened 4×4 confusion matrix
*   **Identification Method**: Cosine similarity matching
*   **Goal**: Identify which model produced a set of predictions

---


**Cell 1 – Setup and Imports**

Initializes the Python environment, imports all required numerical, machine learning,

and visualization libraries, and suppresses non-critical warnings to keep logs clean.

In [None]:
# Purpose: Import required libraries and configure the runtime environment.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("âœ“ Libraries imported successfully")

**Cell 2 – Load Data and Extract Model Information**

Automatically discovers all evaluated models by scanning prediction columns and

prints a concise dataset summary. This avoids hardcoding model names and ensures

scalability to larger model sets.

In [None]:
# Purpose: Identify model prediction columns and summarize dataset structure.

# `df` must already have been loaded by an earlier cell.

# Extract all model names from columns.
# Only the trailing suffix is stripped: str.replace('_pred', '') would also
# delete a '_pred' that occurs inside a model name, and the shortened name
# would then no longer map back to its column when f"{name}_pred" is rebuilt.
# Non-string column labels are skipped because they have no .endswith().

PRED_SUFFIX = '_pred'
pred_cols = [
    col for col in df.columns
    if isinstance(col, str) and col.endswith(PRED_SUFFIX)
]
model_names = [col[:-len(PRED_SUFFIX)] for col in pred_cols]

print("=" * 80)
print("DATASET OVERVIEW")
print("=" * 80)
print(f"Total samples: {len(df)}")
print(f"Number of models: {len(model_names)}")
print(f"\nModels found:")
for i, name in enumerate(model_names, 1):
    print(f"  {i:2d}. {name}")
print()

**Cell 3 – Data Quality Check**

Validates prediction ranges for each model, quantifies invalid outputs, and reports

per-model data hygiene to prevent corrupted or misleading fingerprints.

In [None]:
# Purpose: Verify prediction validity and quantify invalid label occurrences.

# Column-name lookups keyed by model name.

model_pred_cols = {name: f"{name}_pred" for name in model_names}
model_correct_cols = {name: f"{name}_correct" for name in model_names}

n_classes = 4

print("=" * 80)
print("DATA QUALITY CHECK")
print("=" * 80)
print(f"{'Model':<30s} {'Invalid':<10s} {'Valid':<10s} {'Invalid %':<10s}")
print("-" * 80)

quality_report = []
n_rows_total = len(df)
for name in model_names:
    col = model_pred_cols[name]
    # A prediction is valid only when it lies in [0, n_classes). Counting the
    # valid rows and subtracting (instead of testing "< 0 or >= n_classes")
    # also flags missing predictions: every comparison against NaN is False,
    # so the old test silently counted NaN rows as valid.
    is_valid = (df[col] >= 0) & (df[col] < n_classes)
    n_valid = int(is_valid.sum())
    n_invalid = n_rows_total - n_valid
    # Guard the percentage so an empty frame reports 0 % rather than dividing by zero.
    pct_invalid = 100 * n_invalid / n_rows_total if n_rows_total else 0.0
    quality_report.append({
        'model': name,
        'invalid': n_invalid,
        'valid': n_valid,
        'pct_invalid': pct_invalid
    })
    print(f"{name:<30s} {n_invalid:<10d} {n_valid:<10d} {pct_invalid:<10.2f}%")

print()

**Cell 4 – Train/Test Split**

Splits the dataset into disjoint training and test partitions. Fingerprints are

constructed exclusively on the training set to prevent information leakage during

identification evaluation.

In [None]:
# Purpose: Partition data to prevent fingerprint leakage during evaluation.

banner = "=" * 80
print(banner)
print("TRAIN/TEST SPLIT")
print(banner)

# 80 % train / 20 % test, shuffled with a fixed seed so the split is repeatable.
# IMPORTANT: reference fingerprints are computed ONLY on the training part;
# the test part is reserved for identification experiments.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
train_df, test_df = train_test_split(df, **split_options)

for split_label, split_frame in (
    ("Training set size", train_df),
    ("Test set size", test_df),
):
    print(f"{split_label}: {len(split_frame)}")
print()

**Cell 5 – Confusion Matrix Fingerprinting Function**

Defines the core fingerprint primitive: a normalized confusion histogram that captures

systematic prediction tendencies and structured error behavior for each model.

In [None]:
# Purpose: Define the core transformation from predictions to fingerprint vectors.

def confusion_hist(gold_labels, pred_labels, n_classes=4):
    """
    Compute a confusion matrix and return it as a normalized histogram.
    This creates a 'fingerprint' of a model's error patterns.

    Pairs in which either label is outside [0, n_classes), is not a whole
    number, or is NaN are ignored. Float-typed inputs are accepted (a
    prediction column that contains NaN is float, and float scalars cannot be
    used directly as array indices). If the two inputs differ in length, only
    the leading common part is used.

    Args:
        gold_labels: Ground truth labels (any 1-D sequence of numbers)
        pred_labels: Model predictions (any 1-D sequence of numbers)
        n_classes: Number of classes (4 for HellaSwag)

    Returns:
        v: Flattened, normalized confusion histogram, float32, length
           n_classes**2 (16-d for 4 classes); all zeros when no pair is valid
        conf: Raw confusion matrix, int64, shape (n_classes, n_classes);
              conf[i, j] = number of times true label was i and prediction was j
    """
    gold = np.asarray(gold_labels, dtype=np.float64).ravel()
    pred = np.asarray(pred_labels, dtype=np.float64).ravel()

    # Pair the inputs positionally and drop any unmatched tail.
    n_pairs = min(gold.size, pred.size)
    gold, pred = gold[:n_pairs], pred[:n_pairs]

    # NaN fails every comparison below, so missing values are masked out too.
    with np.errstate(invalid='ignore'):
        valid = (
            (gold >= 0) & (gold < n_classes)
            & (pred >= 0) & (pred < n_classes)
            & (gold == np.floor(gold)) & (pred == np.floor(pred))
        )

    gold_idx = gold[valid].astype(np.int64)
    pred_idx = pred[valid].astype(np.int64)

    # Encode each (gold, pred) pair as one flat cell index and count them all
    # at once instead of looping row by row in Python.
    conf = (
        np.bincount(gold_idx * n_classes + pred_idx, minlength=n_classes * n_classes)
        .reshape(n_classes, n_classes)
        .astype(np.int64)
    )

    # Flatten and normalize to create a probability distribution
    v = conf.flatten().astype(np.float32)
    v /= v.sum() + 1e-8  # Add epsilon to avoid division by zero

    return v, conf

print("âœ“ Confusion fingerprinting function defined")

**Cell 6 – Compute Model Fingerprints**

Computes one reference fingerprint per model using only training data and reports

baseline accuracies to contextualize fingerprint quality.

In [None]:
# Purpose: Generate reference fingerprints for each model using training data.

rule = "=" * 80
print(rule)
print("COMPUTING MODEL FINGERPRINTS")
print(rule)
print("(Using training data only to avoid leakage)\n")

train_gold = train_df["label"].to_numpy()

fingerprints = {}  # model name -> normalized flattened confusion histogram
conf_mats = {}     # model name -> raw confusion-count matrix

for name, col in ((m, model_pred_cols[m]) for m in model_names):
    fingerprints[name], conf_mats[name] = confusion_hist(
        train_gold, train_df[col].to_numpy(), n_classes
    )

    # Training accuracy = diagonal mass over all counted pairs (0 if none were counted).
    counted = conf_mats[name].sum()
    acc = conf_mats[name].diagonal().sum() / counted if counted > 0 else 0
    print(f"{name:<30s} Accuracy: {acc:.3f}")

print(f"\nâœ“ Fingerprints computed for {len(fingerprints)} models")
print()

**Cell 7 – Visualize Individual Confusion Matrices**

Displays normalized confusion matrices for a subset of models to enable qualitative

inspection of model-specific error patterns.

In [None]:
# Purpose: Qualitatively inspect per-model error patterns.

print("=" * 80)
print("VISUALIZING CONFUSION MATRICES")
print("=" * 80)

# Show first 6 models as example

n_display = min(6, len(model_names))
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for idx, name in enumerate(model_names[:n_display]):
    cm = conf_mats[name]

    # Normalize by row (true label) to show distribution of predictions
    cm_norm = cm.astype(float)
    row_sums = cm_norm.sum(axis=1, keepdims=True)
    cm_norm = cm_norm / (row_sums + 1e-8)

    ax = axes[idx]
    sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues',
                ax=ax, cbar=True, vmin=0, vmax=1)

    acc = cm.diagonal().sum()/cm.sum() if cm.sum() > 0 else 0
    ax.set_title(f'{name}\nAcc: {acc:.3f}', fontsize=11)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

# The grid always has 6 panels; blank out the ones that received no model so
# a run with fewer than 6 models does not show empty axes frames.
for unused_ax in axes[n_display:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.savefig('confusion_matrices_sample.png', dpi=150, bbox_inches='tight')
print("âœ“ Saved: confusion_matrices_sample.png")
plt.show()

**Cell 8 – Complete Confusion Matrix Grid**

Visualizes confusion matrices for all models simultaneously, facilitating comparison

and revealing structural similarities across model families.

In [None]:
# Purpose: Visualize confusion matrices for all models at once.

n_models = len(model_names)
n_cols = 4
# At least one row, so an empty model list still yields a valid (blank) figure
# instead of plt.subplots(0, 4) raising.
n_rows = max(1, (n_models + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of Axes, whatever the grid shape.
# The previous `[axes]` fallback for a single model wrapped the 4-element axes
# array in a list, so axes[0] was an ndarray rather than an Axes.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
axes = axes.flatten()

for idx, name in enumerate(model_names):
    cm = conf_mats[name]

    # Normalize by row
    cm_norm = cm.astype(float)
    row_sums = cm_norm.sum(axis=1, keepdims=True)
    cm_norm = cm_norm / (row_sums + 1e-8)

    ax = axes[idx]
    sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues',
                ax=ax, cbar=True, vmin=0, vmax=1)

    acc = cm.diagonal().sum()/cm.sum() if cm.sum() > 0 else 0
    ax.set_title(f'{name}\nAcc: {acc:.3f}', fontsize=10)
    ax.set_xlabel('Predicted', fontsize=9)
    ax.set_ylabel('True', fontsize=9)

# Hide unused subplots

for idx in range(n_models, len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.savefig('confusion_matrices_all.png', dpi=150, bbox_inches='tight')
print("âœ“ Saved: confusion_matrices_all.png")
plt.show()

**Cell 9 – Fingerprint Similarity Analysis**

Computes pairwise cosine similarity between fingerprints to quantify behavioral

proximity and divergence among models.

In [None]:
# Purpose: Quantify similarity between model fingerprints via cosine similarity.

divider = "=" * 80
print(divider)
print("FINGERPRINT SIMILARITY ANALYSIS")
print(divider)

# One row per model, in model_names order.
family_matrix = np.stack([fingerprints[name] for name in model_names])

# Square matrix of pairwise cosine similarities between the rows.
similarity_matrix = cosine_similarity(family_matrix)

# Every unordered pair (i < j) with its similarity, highest similarity first.
sim_pairs = sorted(
    (
        (model_names[i], model_names[j], similarity_matrix[i, j])
        for i in range(len(model_names))
        for j in range(i + 1, len(model_names))
    ),
    key=lambda pair: pair[2],
    reverse=True,
)

for heading, shown_pairs in (
    ("\nMost Similar Model Pairs:", sim_pairs[:5]),
    ("\nMost Different Model Pairs:", sim_pairs[-5:]),
):
    print(heading)
    for name1, name2, sim in shown_pairs:
        print(f"  {name1} <-> {name2}: {sim:.4f}")

print()

# Heatmap of the full similarity matrix, labelled with model names on both axes.
heatmap_options = dict(
    annot=True, fmt='.3f', cmap='RdYlGn',
    xticklabels=model_names, yticklabels=model_names,
    vmin=0, vmax=1, center=0.5, linewidths=0.5,
)
plt.figure(figsize=(14, 12))
sns.heatmap(similarity_matrix, **heatmap_options)
plt.title(
    'Confusion Fingerprint Similarity Between Models\n(Cosine Similarity)',
    fontsize=16,
    fontweight='bold',
)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('fingerprint_similarity.png', dpi=150, bbox_inches='tight')
print("âœ“ Saved: fingerprint_similarity.png")
plt.show()

**Cell 10 – Model Identification System**

Implements the inference-time identification procedure that matches unknown prediction

sets to the closest reference fingerprint.

In [None]:
# Purpose: Define the inference-time model identification procedure.

print("=" * 80)
print("MODEL IDENTIFICATION SYSTEM")
print("=" * 80)

# Get test set ground truth
# (labels of the held-out split; the identification cells index into this
# array with the same positions they use for a model's test predictions)

test_gold = test_df["label"].to_numpy()

def identify_family(sub_gold, sub_pred):
    """
    Guess which model produced a set of predictions.

    The probe's confusion fingerprint is compared, by cosine similarity,
    against every row of the module-level `family_matrix`; the model whose
    reference fingerprint scores highest is returned.

    Args:
        sub_gold: Ground truth labels for the probe items
        sub_pred: Predictions for the same items, in the same order

    Returns:
        (name, sims): the best-matching entry of `model_names`, and the array
        of similarities to every reference fingerprint (in `model_names` order)

    NOTE(review): when no (gold, pred) pair in the probe is valid the
    fingerprint is all zeros; the similarities are then presumably all equal
    and argmax would fall back to the first model — confirm.
    """
    probe_fingerprint = confusion_hist(sub_gold, sub_pred, n_classes)[0]

    # cosine_similarity expects 2-D input, so present the probe as a single row.
    scores = cosine_similarity(probe_fingerprint[np.newaxis, :], family_matrix)[0]
    winner = np.argmax(scores)

    return model_names[winner], scores

print("âœ“ Model identification system ready")
print()

**Cell 11 – Single-Model Identification Demonstration**

Demonstrates identification behavior as a function of probe size using a fixed target

model to illustrate robustness under limited observations.

In [None]:
# Purpose: Demonstrate identification behavior as probe size increases.

print("Testing identification with different probe sizes...\n")

test_model = model_names[0]
test_col = model_pred_cols[test_model]
test_pred = test_df[test_col].to_numpy()

# Dedicated seeded generator so this demo prints the same result on every
# Restart & Run All (the global np.random state is unseeded).
demo_rng = np.random.default_rng(42)

# Test with increasing probe sizes

for n_probe in [5, 10, 20, 50, 100]:
    if n_probe > len(test_df):
        # Sampling without replacement cannot draw more rows than exist.
        print(f"Skipping {n_probe} probes: test set has only {len(test_df)} rows\n")
        continue

    idx = demo_rng.choice(len(test_df), size=n_probe, replace=False)
    sub_gold = test_gold[idx]
    sub_pred = test_pred[idx]

    identified, sims = identify_family(sub_gold, sub_pred)

    print(f"With {n_probe:3d} probes from {test_model}:")
    print(f"  Identified as: {identified}")
    print(f"  Correct: {identified == test_model}")
    # "Confidence" is the highest cosine similarity among the reference fingerprints.
    print(f"  Confidence: {sims.max():.4f}")
    print()

**Cell 12 – Comprehensive Probe Size Experiments**

Systematically evaluates identification accuracy across probe sizes and random trials

to estimate sample complexity for reliable model identification.

In [None]:
# Purpose: Systematically evaluate identification accuracy vs. probe size.

print("=" * 80)
print("PROBE SIZE EXPERIMENTS")
print("=" * 80)
print("Evaluating identification accuracy vs. probe set size...\n")

# Single seeded generator used by eval_probe_size for all of its draws.
# Because the generator's state advances with every draw, the reported
# numbers depend on the order in which models and probe sizes are evaluated.
rng = np.random.default_rng(42)

def eval_probe_size(model_name, n_probe=20, n_trials=100):
    """
    Estimate how often a model is correctly identified from a small probe.

    Each trial draws a random subset of test rows (without replacement, using
    the module-level `rng`), identifies the model from its predictions on
    those rows, and checks whether the answer is `model_name`.

    Args:
        model_name: Model whose test predictions are probed
        n_probe: Rows per probe; capped at the size of the test set
        n_trials: Number of independent random probes

    Returns:
        Fraction of trials in which the model was identified correctly.
    """
    preds = test_df[model_pred_cols[model_name]].to_numpy()
    pool_size = len(test_df)
    draw_size = min(n_probe, pool_size)

    hits = 0
    for _ in range(n_trials):
        picked = rng.choice(pool_size, size=draw_size, replace=False)
        guess = identify_family(test_gold[picked], preds[picked])[0]
        hits += guess == model_name

    return hits / n_trials

# Sweep every model over every probe size.
# results[model] holds one accuracy per entry of probe_sizes, in the same order.

probe_sizes = [5, 10, 20, 50, 100]
results = {}

for model_name in model_names:
    per_size = results[model_name] = []
    print(f"{model_name}:")

    for n_probe in probe_sizes:
        acc = eval_probe_size(model_name, n_probe=n_probe, n_trials=100)
        per_size.append(acc)
        print(f"  {n_probe:3d} probes: {acc:.3f} accuracy")
    print()

**Cell 13 – Probe Size Results Visualization**

Plots identification accuracy versus probe size to characterize convergence behavior

and inter-model variability.

In [None]:
# Purpose: Plot identification accuracy as a function of probe size.

bar = "=" * 80
print(bar)
print("VISUALIZING PROBE SIZE RESULTS")
print(bar)

# One line per model: x = probe size, y = fraction of correct identifications.
fig, ax = plt.subplots(figsize=(14, 8))

for model_name, accs in results.items():
    ax.plot(probe_sizes, accs, marker='o', label=model_name,
            linewidth=2, markersize=6)

ax.set_xlabel('Number of Probe Samples', fontsize=14)
ax.set_ylabel('Identification Accuracy', fontsize=14)
ax.set_title('Model Identification Accuracy vs. Probe Set Size',
             fontsize=16, fontweight='bold')
# Legend sits outside the axes, to the right, so it does not cover the curves.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 1.05)
fig.tight_layout()
fig.savefig('probe_size_accuracy.png', dpi=150, bbox_inches='tight')
print("âœ“ Saved: probe_size_accuracy.png")
plt.show()

**Cell 14 – Performance vs. Identifiability Analysis**

Examines the relationship between task accuracy and fingerprint identifiability,

including correlation analysis to assess dependence.

In [None]:
# Purpose: Relate model accuracy to identifiability.

from scipy.stats import pearsonr, spearmanr

print("=" * 80)
print("IDENTIFICATION ACCURACY BY MODEL PERFORMANCE")
print("=" * 80)

# Get model accuracies (test split, computed over valid predictions only)

accuracies = {}
for name in model_names:
    col = model_pred_cols[name]
    test_pred = test_df[col].to_numpy()
    mask = (test_pred >= 0) & (test_pred < n_classes)

    # Every model gets an entry. Skipping models without any valid prediction
    # made the `accuracies[name]` lookup below raise KeyError; 0.0 matches the
    # `accuracies.get(name, 0)` fallback used elsewhere in this notebook.
    if mask.sum() > 0:
        accuracies[name] = (test_gold[mask] == test_pred[mask]).mean()
    else:
        accuracies[name] = 0.0

# Create scatter plot: model accuracy vs identification accuracy

# Look the 20-probe column up by value, so a change to probe_sizes fails
# loudly here instead of silently plotting a different probe size.
idx_20 = probe_sizes.index(20)
acc_20 = [results[name][idx_20] for name in model_names]  # 20-probe accuracy
model_accs = [accuracies[name] for name in model_names]

plt.figure(figsize=(12, 8))
plt.scatter(model_accs, acc_20, s=200, alpha=0.6, edgecolors='black', linewidth=2)

for i, name in enumerate(model_names):
    plt.annotate(name, (model_accs[i], acc_20[i]),
                 xytext=(5, 5), textcoords='offset points',
                 fontsize=9, fontweight='bold')

plt.xlabel('Model Accuracy on Test Set', fontsize=14)
plt.ylabel('Identification Accuracy (20 probes)', fontsize=14)
plt.title('Model Performance vs. Identifiability', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('performance_vs_identifiability.png', dpi=150, bbox_inches='tight')
print("âœ“ Saved: performance_vs_identifiability.png")
plt.show()

# Calculate correlation

# A correlation needs at least two points; report NaN rather than letting
# scipy raise when fewer than two models are present.
if len(model_names) >= 2:
    pearson_r, pearson_p = pearsonr(model_accs, acc_20)
    spearman_r, spearman_p = spearmanr(model_accs, acc_20)
else:
    pearson_r = pearson_p = spearman_r = spearman_p = float('nan')

print(f"\nCorrelation Analysis:")
print(f"  Pearson correlation:  r={pearson_r:.3f}, p={pearson_p:.4f}")
print(f"  Spearman correlation: r={spearman_r:.3f}, p={spearman_p:.4f}")
print()

**Cell 15 – Confusion Structure Analysis**

Analyzes off-diagonal confusion mass to quantify how dispersed each model’s error

distribution is, providing insight into fingerprint distinctiveness.

In [None]:
# Purpose: Compare models by distribution of systematic errors.

line = "=" * 80
print(line)
print("CONFUSION MATRIX DISTANCE ANALYSIS")
print(line)

# Off-diagonal mass = share of counted (gold, pred) pairs that are errors,
# i.e. total normalized mass minus the mass on the diagonal.

off_diag_mass = {}
for name, cm in ((m, conf_mats[m]) for m in model_names):
    cm_norm = cm / (cm.sum() + 1e-8)
    off_diag_mass[name] = cm_norm.sum() - cm_norm.diagonal().sum()

# Largest error mass first.

sorted_models = sorted(off_diag_mass.items(), key=lambda item: item[1], reverse=True)

print("\nModels by Error Distribution (off-diagonal mass):")
print(f"{'Model':<30s} {'Error Mass':<12s} {'Accuracy':<10s}")
print("-" * 52)
table_rows = [
    f"{name:<30s} {mass:<12.4f} {accuracies.get(name, 0):<10.3f}"
    for name, mass in sorted_models
]
for table_row in table_rows:
    print(table_row)

print()

**Cell 16 – Summary Statistics**

Aggregates key performance and identification metrics into concise summaries,

highlighting best- and worst-case identifiability.

In [None]:
# Purpose: Summarize identification and performance results.

edge = "=" * 80
print(edge)
print("SUMMARY STATISTICS")
print(edge)

print("\nðŸ“Š MODEL ACCURACIES:")
print(f"{'Model':<30s} {'Accuracy':<10s}")
print("-" * 40)
# Highest accuracy first.
for name in sorted(accuracies, key=accuracies.get, reverse=True):
    print(f"{name:<30s} {accuracies[name]:.3f}")

print("\nðŸŽ¯ IDENTIFICATION PERFORMANCE:")
# Positions 0..4 of each results[model] list hold the 5/10/20/50/100-probe
# accuracies; average each position across models.
avg_acc_5, avg_acc_10, avg_acc_20, avg_acc_50, avg_acc_100 = (
    np.mean([results[m][slot] for m in model_names]) for slot in range(5)
)

for row_label, avg_value in (
    ("  With 5 probes:   ", avg_acc_5),
    ("  With 10 probes:  ", avg_acc_10),
    ("  With 20 probes:  ", avg_acc_20),
    ("  With 50 probes:  ", avg_acc_50),
    ("  With 100 probes: ", avg_acc_100),
):
    print(f"{row_label}{avg_value:.3f} average accuracy")

print("\nðŸ“ˆ BEST/WORST IDENTIFIABLE MODELS:")
# Rank models by their accuracy at position 2 (the 20-probe setting).
id_20 = {name: results[name][2] for name in model_names}
sorted_id = sorted(id_20.items(), key=lambda item: item[1], reverse=True)

best_name, best_score = sorted_id[0]
worst_name, worst_score = sorted_id[-1]
print(f"  Most identifiable:  {best_name:30s} ({best_score:.3f})")
print(f"  Least identifiable: {worst_name:30s} ({worst_score:.3f})")

print("\n" + edge)
print("âœ… APPROACH 1 ANALYSIS COMPLETE")
print(edge)

**Cell 17 – Export Results**

Persists all core metrics and identification results to disk to support reproducibility

and downstream analysis.

In [None]:
# Purpose: Persist summary metrics for downstream analysis.

# Assemble the table column by column; insertion order is the CSV column order.

summary_columns = {
    'model': model_names,
    'accuracy': [accuracies.get(name, 0) for name in model_names],
}
# Position k of each results[model] list is the accuracy for the k-th probe size.
for slot, size in enumerate((5, 10, 20, 50, 100)):
    summary_columns[f'id_{size}_probes'] = [results[name][slot] for name in model_names]
summary_columns['off_diagonal_mass'] = [off_diag_mass[name] for name in model_names]

# Most accurate model first.
results_df = pd.DataFrame(summary_columns).sort_values('accuracy', ascending=False)
results_df.to_csv('approach1_results.csv', index=False)
print("âœ“ Saved: approach1_results.csv")
print("\nResults DataFrame:")
print(results_df)