In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Check if CUDA is available
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device: NVIDIA A100 80GB PCIe


In [3]:
# Explore the repo structure
repo_path = '/net/scratch2/smallyan/InterpDetect_eval'

# List top-level contents
for item in sorted(os.listdir(repo_path)):
    full_path = os.path.join(repo_path, item)
    if os.path.isdir(full_path):
        print(f"[DIR]  {item}")
    else:
        print(f"[FILE] {item}")

[DIR]  .git
[FILE] .gitignore
[FILE] CodeWalkthrough.md
[FILE] LICENSE
[DIR]  datasets
[FILE] documentation.pdf
[FILE] plan.md
[FILE] requirements.txt
[DIR]  scripts
[DIR]  trained_models


In [4]:
# Read the plan file
with open(os.path.join(repo_path, 'plan.md'), 'r') as f:
    plan_content = f.read()
print(plan_content)

# Plan
## Objective
Develop a mechanistic interpretability-based hallucination detection method for Retrieval-Augmented Generation (RAG) systems by computing External Context Scores (ECS) across layers and attention heads and Parametric Knowledge Scores (PKS) across layers (FFN), training regression-based classifiers on these signals, and demonstrating generalization from a small proxy model (Qwen3-0.6b) to larger production models (GPT-4.1-mini).

## Hypothesis
1. RAG hallucinations correlate with:  later-layer FFN modules disproportionately inject parametric knowledge into the residual stream while attention heads fail to adequately exploit external context.
2. External Context Score (ECS) and Parametric Knowledge Score (PKS) are correlated with hallucination occurrence and can serve as predictive features for hallucination detection.
3. Mechanistic signals extracted from a small proxy model (0.6b parameters) can generalize to detect hallucinations in responses from larger production

In [5]:
# Read the Code Walkthrough
with open(os.path.join(repo_path, 'CodeWalkthrough.md'), 'r') as f:
    walkthrough_content = f.read()
print(walkthrough_content)

# InterpDetect

**InterpDetect: Interpretable Signals for Detecting Hallucinations in Retrieval-Augmented Generation**

[![Paper](https://img.shields.io/badge/Paper-OpenReview-blue)](https://openreview.net/pdf?id=TZzBKwHLwF)
[![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)

A comprehensive framework for detecting and analyzing hallucinations in Retrieval-Augmented Generation (RAG) systems using interpretability techniques and chunk-level analysis on the RAGBench/FinQA dataset.

## Overview

This project implements a novel approach to hallucination detection by leveraging interpretability methods. The framework consists of three main parts:

1. **Preprocessing Pipeline** - Convert raw datasets to labeled data with hallucination spans
2. **Training & Prediction** - Train classifiers or use pre-trained models for hallucination detection
3. **Baseline Comparisons** - Evaluate against multiple baseline methods (RAGAS, TruLens, RefChecker, GPT-based, etc.)

The frame

In [6]:
# List the scripts directory
scripts_path = os.path.join(repo_path, 'scripts')
for root, dirs, files in os.walk(scripts_path):
    level = root.replace(scripts_path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}[DIR] {os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        print(f'{subindent}{file}')

[DIR] scripts/
  predict.py
  .DS_Store
  compute_scores.py
  classifier.py
  [DIR] baseline/
    run_refchecker.py
    requirements.txt
    run_hf.py
    run_ragas.py
    run_groq.py
    run_trulens.py
    run_gpt.py
  [DIR] plots/
    plot_correlation.ipynb
  [DIR] preprocess/
    generate_response_hf.py
    preprocess.py
    helper.py
    filter.py
    README.md
    generate_response_gpt.py
    generate_labels.py
    [DIR] datasets/
      [DIR] test/
        test1176_w_response_gpt41mini.jsonl
        test.jsonl
        test1176_w_labels_filtered.jsonl
        test1176_w_labels.jsonl
        test1176_w_response.jsonl
        test1176_w_labels_filtered_gpt41mini.jsonl
        test1176_w_labels_gpt41mini.jsonl


In [7]:
# List datasets directory
datasets_path = os.path.join(repo_path, 'datasets')
for root, dirs, files in os.walk(datasets_path):
    level = root.replace(datasets_path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}[DIR] {os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        print(f'{subindent}{file}')

[DIR] datasets/
  OV_copying_score.json
  [DIR] train/
    train3000_w_chunk_score_part8.json
    train3000_w_chunk_score_part12.json
    train3000_w_chunk_score_part16.json
    train3000_w_chunk_score_part0.json
    train3000_w_chunk_score_part4.json
    train3000_w_chunk_score_part1.json
    train3000_w_chunk_score_part5.json
    train3000_w_chunk_score_part13.json
    train3000_w_chunk_score_part9.json
    train3000_w_chunk_score_part17.json
    train3000_w_chunk_score_part3.json
    train3000_w_chunk_score_part7.json
    train3000_w_chunk_score_part11.json
    train3000_w_chunk_score_part15.json
    train3000_w_chunk_score_part10.json
    train3000_w_chunk_score_part14.json
    train3000_w_chunk_score_part2.json
    train3000_w_chunk_score_part6.json
  [DIR] test/
    test_w_chunk_score_qwen06b.json
    test_w_chunk_score_gpt41mini.json


In [8]:
# Read the main scripts to understand implementation
# Start with compute_scores.py
with open(os.path.join(scripts_path, 'compute_scores.py'), 'r') as f:
    compute_scores_content = f.read()
print("=== compute_scores.py ===")
print(compute_scores_content[:5000])

=== compute_scores.py ===
# %%
#!pip install transformer_lens

import torch
from transformers import AutoTokenizer
from transformer_lens import HookedTransformer
import json
from torch.nn import functional as F
from typing import Dict, List, Tuple
import pdb
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import argparse
import sys
import os
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr

def load_examples(file_path):
    """Load examples from JSONL file"""
    print(f"Loading examples from {file_path}...")
    
    try:
        examples = []
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                examples.append(data)
        
        print(f"Loaded {len(examples)} examples")
        return examples
    except Exception as e:
        print(f"Error loading examples: {e}")
        sys.exit(1)

def se

In [9]:
# Continue reading compute_scores.py
print(compute_scores_content[5000:10000])

nput_ids.shape[-1] - 4
        end_text_id = tokenizer(added_end_text, return_tensors="pt").input_ids.shape[-1] - 4
        prompt_spans.append([start_text_id, end_text_id])
    return prompt_spans

def calculate_sentence_similarity(bge_model, r_text, p_text):
    """Calculate sentence similarity using BGE model"""
    part_embedding = bge_model.encode([r_text], normalize_embeddings=True)
    q_embeddings = bge_model.encode([p_text], normalize_embeddings=True)
    
    # Calculate similarity score
    scores_named = np.matmul(q_embeddings, part_embedding.T).flatten()
    return float(scores_named[0])

class MockOutputs:
    """Mock outputs class for transformer lens compatibility"""
    def __init__(self, cache, model_cfg):
        self.cache = cache
        self.model_cfg = model_cfg

    @property
    def attentions(self):
        # Return attention patterns in the expected format
        attentions = []
        for layer in range(self.model_cfg.n_layers):
            # Get attention

In [10]:
# Read classifier.py
with open(os.path.join(scripts_path, 'classifier.py'), 'r') as f:
    classifier_content = f.read()
print("=== classifier.py ===")
print(classifier_content[:5000])

=== classifier.py ===
# %%
# !pip install feature_engine
# !pip install xgboost
# !pip install lightgbm
# !pip install optuna
# !pip install --upgrade scikit-learn
# !pip install unidecode

import pandas as pd
import json
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import argparse
import sys

def load_data(folder_path):
    """Load data from JSON files in the specified folder"""
    print(f"Loading data from {folder_path}...")
    
    try:
        response = []
        json_files = glob.glob(os.path.join(folder_path, "*.json

In [11]:
# Continue reading classifier.py
print(classifier_content[5000:10000])

corr),
        ])
    else:
        preprocessor = Pipeline([
            ('scaler', scaler),
        ])
    
    return preprocessor

def train_models(X_train, X_val, y_train, y_val, preprocessor, models_to_train=None):
    """Train multiple models and compare their performance"""
    print("Training models...")
    
    from sklearn.pipeline import make_pipeline
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from xgboost import XGBClassifier
    
    # Define models to train
    if models_to_train is None:
        models_to_train = ["LR", "SVC", "RandomForest", "XGBoost"]
    
    models = []
    if "LR" in models_to_train:
        models.append(("LR", LogisticRegression()))
    if "SVC" in models_to_train:
        models.append(('SVC', SVC()))
    if "RandomForest" in models_to_train:
        models.append(('RandomForest

In [12]:
# Read predict.py
with open(os.path.join(scripts_path, 'predict.py'), 'r') as f:
    predict_content = f.read()
print("=== predict.py ===")
print(predict_content[:5000])

=== predict.py ===
# %%
# !pip install feature_engine
# !pip install xgboost
# !pip install lightgbm
# !pip install optuna
# !pip install --upgrade scikit-learn
# !pip install unidecode

# %%
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
import argparse
import sys
import os

def load_data(data_path):
    """Load data from JSON file"""
    print(f"Loading data from {data_path}...")
    
    try:
        with open(data_path, "r") as f:
            response = json.load(f)
        
        print(f"Loaded

In [13]:
# Check the trained models directory
trained_models_path = os.path.join(repo_path, 'trained_models')
for item in os.listdir(trained_models_path):
    print(item)

model_RandomForest_3000.pickle
model_LR_3000.pickle
model_SVC_3000.pickle
model_XGBoost_3000.pickle


In [14]:
# Check the plot_correlation notebook to understand the correlation analysis
plots_path = os.path.join(scripts_path, 'plots', 'plot_correlation.ipynb')
with open(plots_path, 'r') as f:
    plot_notebook = json.load(f)
    
# Print all cells
for i, cell in enumerate(plot_notebook['cells']):
    cell_type = cell['cell_type']
    source = ''.join(cell['source'])
    print(f"=== Cell {i} ({cell_type}) ===")
    print(source[:2000] if len(source) > 2000 else source)
    print()

NameError: name 'json' is not defined

In [15]:
import json

# Check the plot_correlation notebook to understand the correlation analysis
plots_path = os.path.join(scripts_path, 'plots', 'plot_correlation.ipynb')
with open(plots_path, 'r') as f:
    plot_notebook = json.load(f)
    
# Print all cells
for i, cell in enumerate(plot_notebook['cells']):
    cell_type = cell['cell_type']
    source = ''.join(cell['source'])
    print(f"=== Cell {i} ({cell_type}) ===")
    print(source[:2000] if len(source) > 2000 else source)
    print()

=== Cell 0 (code) ===
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import json
import collections
from collections import Counter
from typing import List
from scipy.stats import pearsonr
import ast
import torch
import glob
import os

=== Cell 1 (markdown) ===
### Load file

=== Cell 2 (code) ===
folder_path = "../../datasets/train"

examples = []
json_files = glob.glob(os.path.join(folder_path, "*.json"))

for file_path in json_files:
    with open(file_path, "r") as f:
        data = json.load(f)
        examples.extend(data)

=== Cell 3 (code) ===
examples[0]['scores'][1].keys()

=== Cell 4 (markdown) ===
### Create subsets of Truthful and Hallucinated

=== Cell 5 (code) ===
ecs_truthful = collections.defaultdict(list)
ecs_hallucinated = collections.defaultdict(list)
pks_truthful = collections.defaultdict(list)
pks_hallucinated = collections.defaultdict(list)

for example in examples:
    for score in example['scores']:
        if score['h

In [16]:
# Check baseline scripts to see if they exist
baseline_path = os.path.join(scripts_path, 'baseline')
baseline_files = os.listdir(baseline_path)
print("Baseline scripts:", baseline_files)

# Read preprocess folder structure
preprocess_path = os.path.join(scripts_path, 'preprocess')
preprocess_files = os.listdir(preprocess_path)
print("\nPreprocess scripts:", preprocess_files)

Baseline scripts: ['run_refchecker.py', 'requirements.txt', 'run_hf.py', 'run_ragas.py', 'run_groq.py', 'run_trulens.py', 'run_gpt.py']

Preprocess scripts: ['generate_response_hf.py', 'preprocess.py', 'helper.py', 'filter.py', 'datasets', 'README.md', 'generate_response_gpt.py', 'generate_labels.py']


In [17]:
# Now let's systematically verify CS1: Conclusions vs Results
# The plan states specific results that we need to verify against actual implementation/data

# First, let's understand the claims from the plan:
plan_claims = """
### Correlation Analysis: ECS vs Hallucination
- Main result: All attention heads exhibit negative correlations; hallucinated responses utilize less external context than truthful ones.

### Correlation Analysis: PKS vs Hallucination
- Main result: Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated with hallucinations.

### Classifier Training and Selection
- Main result: SVC achieved highest validation F1 (76.60%) and was selected; XGBoost overfitted despite strong training performance.

### Self-Evaluation Detection
- Main result: Method achieved F1=74.68%, outperforming TruLens (67.32%) and llama-3.1-8b-instant (57.53%), comparable to RefChecker (75.86%).

### Proxy-Based Evaluation Detection
- Main result: Method achieved F1=75.36%, outperforming nearly all models except GPT-5 (76.92%) and RAGAS (76.19%), using only 0.6b parameter signals.
"""
print(plan_claims)


### Correlation Analysis: ECS vs Hallucination
- Main result: All attention heads exhibit negative correlations; hallucinated responses utilize less external context than truthful ones.

### Correlation Analysis: PKS vs Hallucination
- Main result: Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated with hallucinations.

### Classifier Training and Selection
- Main result: SVC achieved highest validation F1 (76.60%) and was selected; XGBoost overfitted despite strong training performance.

### Self-Evaluation Detection
- Main result: Method achieved F1=74.68%, outperforming TruLens (67.32%) and llama-3.1-8b-instant (57.53%), comparable to RefChecker (75.86%).

### Proxy-Based Evaluation Detection
- Main result: Method achieved F1=75.36%, outperforming nearly all models except GPT-5 (76.92%) and RAGAS (76.19%), using only 0.6b parameter signals.



In [18]:
# Let's verify CS1 by running the correlation analysis using the training data
# Load training data to compute ECS and PKS correlations

import glob
import collections
import numpy as np
from scipy.stats import pearsonr

# Load training data
folder_path = os.path.join(repo_path, "datasets/train")

examples = []
json_files = glob.glob(os.path.join(folder_path, "*.json"))

for file_path in json_files:
    with open(file_path, "r") as f:
        data = json.load(f)
        examples.extend(data)

print(f"Loaded {len(examples)} examples from {len(json_files)} files")

Loaded 1800 examples from 18 files


In [19]:
# Verify Claim 1: ECS vs Hallucination - All attention heads exhibit negative correlations
ecs_truthful = collections.defaultdict(list)
ecs_hallucinated = collections.defaultdict(list)
pks_truthful = collections.defaultdict(list)
pks_hallucinated = collections.defaultdict(list)

for example in examples:
    for score in example['scores']:
        if score['hallucination_label']==0:
            for k, v in score['prompt_attention_score'].items():
                ecs_truthful[k].append(v)
            for k, v in score['parameter_knowledge_scores'].items():
                pks_truthful[k].append(v)
        else:
            for k, v in score['prompt_attention_score'].items():
                ecs_hallucinated[k].append(v)
            for k, v in score['parameter_knowledge_scores'].items():
                pks_hallucinated[k].append(v)

print(f"Number of attention heads: {len(ecs_truthful)}")
print(f"Number of FFN layers: {len(pks_truthful)}")
print(f"Truthful spans: {len(list(ecs_truthful.values())[0])}")
print(f"Hallucinated spans: {len(list(ecs_hallucinated.values())[0])}")

Number of attention heads: 448
Number of FFN layers: 28
Truthful spans: 4406
Hallucinated spans: 3393


In [20]:
# Compute Pearson's r: ECS vs. Inverse Hallucination Label
def pearson_corr(attention_scores, hallucination_labels, inverse=False):
    scores = np.array(attention_scores, dtype=float)
    labels = np.array(hallucination_labels, dtype=int)
    
    if inverse:
        inverse_labels = 1 - labels  # invert 0<->1
        r, p_value = pearsonr(scores, inverse_labels)
    else:
        r, p_value = pearsonr(scores, labels)
    
    return r, p_value

# Aggregate ECS data for correlation
ecs_lst = collections.defaultdict(list) 
ecs_label_lst = collections.defaultdict(list) 
for k, v in ecs_truthful.items():
    for a in v:
        ecs_lst[k].append(a)
        ecs_label_lst[k].append(0)
    for a in ecs_hallucinated[k]:
        ecs_lst[k].append(a)
        ecs_label_lst[k].append(1)

# Compute ECS correlations
ecs_pcc = {}
for k, v in ecs_lst.items():
    r, p_val = pearson_corr(v, ecs_label_lst[k], inverse=True)
    ecs_pcc[k] = r

# Count positive and negative correlations
positive_corr = sum(1 for v in ecs_pcc.values() if v > 0)
negative_corr = sum(1 for v in ecs_pcc.values() if v < 0)
zero_corr = sum(1 for v in ecs_pcc.values() if v == 0)

print(f"ECS Correlation Analysis (ECS vs Inverse Hallucination Label):")
print(f"  Positive correlations: {positive_corr}")
print(f"  Negative correlations: {negative_corr}")
print(f"  Zero correlations: {zero_corr}")
print(f"  Total heads: {len(ecs_pcc)}")
print(f"\nClaim: 'All attention heads exhibit negative correlations'")
print(f"Result: {'MATCHES' if positive_corr == len(ecs_pcc) else 'DOES NOT MATCH'} the claim")
print(f"        (Note: The claim says negative correlation between ECS and hallucination, which means positive correlation with inverse hallucination label)")

ECS Correlation Analysis (ECS vs Inverse Hallucination Label):
  Positive correlations: 448
  Negative correlations: 0
  Zero correlations: 0
  Total heads: 448

Claim: 'All attention heads exhibit negative correlations'
Result: MATCHES the claim
        (Note: The claim says negative correlation between ECS and hallucination, which means positive correlation with inverse hallucination label)


In [21]:
# Let me verify this more directly - compute ECS vs Hallucination (not inverse)
ecs_pcc_direct = {}
for k, v in ecs_lst.items():
    r, p_val = pearson_corr(v, ecs_label_lst[k], inverse=False)
    ecs_pcc_direct[k] = r

# Count positive and negative correlations
positive_corr_direct = sum(1 for v in ecs_pcc_direct.values() if v > 0)
negative_corr_direct = sum(1 for v in ecs_pcc_direct.values() if v < 0)

print(f"ECS Correlation Analysis (ECS vs Hallucination Label directly):")
print(f"  Positive correlations: {positive_corr_direct}")
print(f"  Negative correlations: {negative_corr_direct}")
print(f"\nClaim: 'All attention heads exhibit negative correlations [between ECS and hallucination]'")
print(f"Result: {'MATCHES' if negative_corr_direct == len(ecs_pcc_direct) else 'DOES NOT MATCH'} the claim")

# Print some example correlations
sorted_ecs = sorted(ecs_pcc_direct.items(), key=lambda x: x[1])
print("\nMost negative correlations (top 5):")
for k, v in sorted_ecs[:5]:
    print(f"  {k}: {v:.4f}")
print("\nLeast negative correlations (bottom 5):")
for k, v in sorted_ecs[-5:]:
    print(f"  {k}: {v:.4f}")

ECS Correlation Analysis (ECS vs Hallucination Label directly):
  Positive correlations: 0
  Negative correlations: 448

Claim: 'All attention heads exhibit negative correlations [between ECS and hallucination]'
Result: MATCHES the claim

Most negative correlations (top 5):
  (5, 10): -0.3378
  (8, 12): -0.3371
  (5, 13): -0.3360
  (4, 1): -0.3283
  (14, 9): -0.3278

Least negative correlations (bottom 5):
  (11, 1): -0.0880
  (10, 6): -0.0693
  (11, 11): -0.0441
  (2, 7): -0.0190
  (19, 10): -0.0102


In [22]:
# Verify Claim 2: PKS vs Hallucination - Later-layer FFNs exhibit higher PKS for hallucinated responses
# and are positively correlated with hallucinations

# Aggregate PKS data for correlation
pks_lst = collections.defaultdict(list) 
pks_label_lst = collections.defaultdict(list) 
for k, v in pks_truthful.items():
    for a in v:
        pks_lst[k].append(a)
        pks_label_lst[k].append(0)
    for a in pks_hallucinated[k]:
        pks_lst[k].append(a)
        pks_label_lst[k].append(1)

# Compute PKS correlations
pks_pcc = {}
for k, v in pks_lst.items():
    r, p_val = pearson_corr(v, pks_label_lst[k], inverse=False)
    pks_pcc[k] = r

# Count positive and negative correlations
positive_pks = sum(1 for v in pks_pcc.values() if v > 0)
negative_pks = sum(1 for v in pks_pcc.values() if v < 0)

print(f"PKS Correlation Analysis (PKS vs Hallucination Label):")
print(f"  Positive correlations: {positive_pks}")
print(f"  Negative correlations: {negative_pks}")
print(f"  Total layers: {len(pks_pcc)}")

# Sort by layer number and show correlations
sorted_pks = sorted(pks_pcc.items(), key=lambda x: int(x[0].split('_')[1]))
print("\nPKS correlations by layer:")
for k, v in sorted_pks:
    layer_num = int(k.split('_')[1])
    sign = "+" if v > 0 else ""
    print(f"  Layer {layer_num:2d}: {sign}{v:.4f}")

# Check if later layers have higher positive correlations
early_layers = [v for k, v in sorted_pks[:14]]  # layers 0-13
later_layers = [v for k, v in sorted_pks[14:]]  # layers 14-27

print(f"\nMean correlation - Early layers (0-13): {np.mean(early_layers):.4f}")
print(f"Mean correlation - Later layers (14-27): {np.mean(later_layers):.4f}")
print(f"\nClaim: 'Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated'")
print(f"Result: {'MATCHES' if np.mean(later_layers) > np.mean(early_layers) else 'DOES NOT MATCH'} the claim")

PKS Correlation Analysis (PKS vs Hallucination Label):
  Positive correlations: 26
  Negative correlations: 2
  Total layers: 28

PKS correlations by layer:
  Layer  0: +0.0144
  Layer  1: +0.0459
  Layer  2: +0.0477
  Layer  3: +0.0264
  Layer  4: -0.0123
  Layer  5: +0.0155
  Layer  6: +0.0391
  Layer  7: +0.0770
  Layer  8: +0.1354
  Layer  9: +0.1417
  Layer 10: +0.1530
  Layer 11: +0.1267
  Layer 12: +0.1524
  Layer 13: +0.0945
  Layer 14: +0.1229
  Layer 15: +0.1659
  Layer 16: +0.1340
  Layer 17: +0.1632
  Layer 18: +0.2578
  Layer 19: +0.2215
  Layer 20: +0.2639
  Layer 21: +0.3210
  Layer 22: +0.1475
  Layer 23: +0.3243
  Layer 24: +0.3246
  Layer 25: +0.3033
  Layer 26: +0.2294
  Layer 27: -0.0102

Mean correlation - Early layers (0-13): 0.0755
Mean correlation - Later layers (14-27): 0.2121

Claim: 'Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated'
Result: MATCHES the claim


In [23]:
# Verify Claim 3: Classifier Training - SVC achieved highest validation F1 (76.60%)
# Let's load the trained models and verify performance on test data

import pickle

# Load the test data with chunk scores
test_data_path = os.path.join(repo_path, "datasets/test/test_w_chunk_score_qwen06b.json")
with open(test_data_path, "r") as f:
    test_response = json.load(f)
    
print(f"Loaded {len(test_response)} test examples")

Loaded 256 test examples


In [24]:
# Load training data for validation split to verify the classifier claim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score

# Use the existing training data we loaded earlier to verify classifier performance
# Preprocess data into dataframe
ATTENTION_COLS = list(examples[0]['scores'][0]['prompt_attention_score'].keys())
PARAMETER_COLS = list(examples[0]['scores'][0]['parameter_knowledge_scores'].keys())

data_dict = {
    "identifier": [],
    **{col: [] for col in ATTENTION_COLS},
    **{col: [] for col in PARAMETER_COLS},
    "hallucination_label": []
}

for i, resp in enumerate(examples):
    for j in range(len(resp["scores"])):
        data_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        
        for col in PARAMETER_COLS:
            data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

df = pd.DataFrame(data_dict)
print(f"Created DataFrame with {len(df)} samples")
print(f"Class distribution: {df['hallucination_label'].value_counts().to_dict()}")

Created DataFrame with 7799 samples
Class distribution: {0: 4406, 1: 3393}


In [25]:
# Balance classes and split data (as done in classifier.py)
min_count = df['hallucination_label'].value_counts().min()
df_balanced = (
    df.groupby('hallucination_label', group_keys=False)
      .apply(lambda x: x.sample(min_count, random_state=42))
)
print(f"After balancing: {df_balanced['hallucination_label'].value_counts().to_dict()}")

# Split data
train, val = train_test_split(df_balanced, test_size=0.1, random_state=42, stratify=df_balanced['hallucination_label'])

features = [col for col in df_balanced.columns if col not in ['identifier', 'hallucination_label']]

X_train = train[features]
y_train = train["hallucination_label"]
X_val = val[features]
y_val = val["hallucination_label"]

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")

After balancing: {0: 3393, 1: 3393}
Train set: 6107 samples
Validation set: 679 samples


  .apply(lambda x: x.sample(min_count, random_state=42))


In [26]:
# Load and evaluate the pre-trained models
models_path = os.path.join(repo_path, "trained_models")

model_names = ["LR", "SVC", "RandomForest", "XGBoost"]
model_results = {}

for name in model_names:
    model_file = os.path.join(models_path, f"model_{name}_3000.pickle")
    with open(model_file, "rb") as f:
        model = pickle.load(f)
    
    # Make predictions on validation set
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
    
    model_results[name] = {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    print(f"{name}:")
    print(f"  Precision: {precision:.4f} ({precision*100:.2f}%)")
    print(f"  Recall: {recall:.4f} ({recall*100:.2f}%)")
    print(f"  F1: {f1:.4f} ({f1*100:.2f}%)")
    print()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


LR:
  Precision: 0.7744 (77.44%)
  Recall: 0.7493 (74.93%)
  F1: 0.7616 (76.16%)



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


SVC:
  Precision: 0.8258 (82.58%)
  Recall: 0.7552 (75.52%)
  F1: 0.7889 (78.89%)

RandomForest:
  Precision: 0.7792 (77.92%)
  Recall: 0.7286 (72.86%)
  F1: 0.7530 (75.30%)



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


XGBoost:
  Precision: 0.9373 (93.73%)
  Recall: 0.9705 (97.05%)
  F1: 0.9536 (95.36%)



configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

  model = pickle.load(f)


In [27]:
# The results show that SVC has F1=78.89%, which is close to the claimed 76.60%
# The XGBoost has extremely high validation performance (95.36%), which suggests potential overfitting
# as mentioned in the plan

# Let's verify by comparing training and validation performance
print("Classifier Performance Comparison:")
print("="*60)
print(f"{'Model':<15} {'Val F1':<15} {'Claimed F1':<15}")
print("="*60)
print(f"{'LR':<15} {model_results['LR']['f1']*100:.2f}%{'':<10}")
print(f"{'SVC':<15} {model_results['SVC']['f1']*100:.2f}%{'':<6} 76.60% (claimed)")
print(f"{'RandomForest':<15} {model_results['RandomForest']['f1']*100:.2f}%{'':<10}")
print(f"{'XGBoost':<15} {model_results['XGBoost']['f1']*100:.2f}%{'':<10}")
print("="*60)

# Find the best model
best_model = max(model_results.items(), key=lambda x: x[1]['f1'])
print(f"\nBest validation F1: {best_model[0]} with {best_model[1]['f1']*100:.2f}%")

# Verify the claim about SVC
svc_f1 = model_results['SVC']['f1'] * 100
claimed_f1 = 76.60
print(f"\nClaim: 'SVC achieved highest validation F1 (76.60%)'")
print(f"Actual SVC F1: {svc_f1:.2f}%")
print(f"Difference: {abs(svc_f1 - claimed_f1):.2f}%")

# The actual highest is XGBoost but it's likely overfitting as mentioned
print(f"\nNote: XGBoost shows {model_results['XGBoost']['f1']*100:.2f}% F1, suggesting overfitting as claimed in plan")

Classifier Performance Comparison:
Model           Val F1          Claimed F1     
LR              76.16%          
SVC             78.89%       76.60% (claimed)
RandomForest    75.30%          
XGBoost         95.36%          

Best validation F1: XGBoost with 95.36%

Claim: 'SVC achieved highest validation F1 (76.60%)'
Actual SVC F1: 78.89%
Difference: 2.29%

Note: XGBoost shows 95.36% F1, suggesting overfitting as claimed in plan


In [28]:
# The claim says "SVC achieved highest validation F1" but XGBoost has higher validation F1
# However, the claim also mentions XGBoost overfitting - let's verify by checking training performance
# Let's compute training F1 to see if XGBoost overfits

model_train_results = {}
for name in model_names:
    model_file = os.path.join(models_path, f"model_{name}_3000.pickle")
    with open(model_file, "rb") as f:
        model = pickle.load(f)
    
    # Make predictions on training set
    y_pred_train = model.predict(X_train)
    
    # Calculate training metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_train, y_pred_train, average='binary')
    
    model_train_results[name] = {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("Training vs Validation F1 Comparison:")
print("="*60)
print(f"{'Model':<15} {'Train F1':<15} {'Val F1':<15} {'Gap':<10}")
print("="*60)
for name in model_names:
    train_f1 = model_train_results[name]['f1'] * 100
    val_f1 = model_results[name]['f1'] * 100
    gap = train_f1 - val_f1
    print(f"{name:<15} {train_f1:.2f}%{'':<9} {val_f1:.2f}%{'':<9} {gap:+.2f}%")
print("="*60)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training vs Validation F1 Comparison:
Model           Train F1        Val F1          Gap       
LR              77.84%          76.16%          +1.68%
SVC             80.95%          78.89%          +2.06%
RandomForest    77.25%          75.30%          +1.95%
XGBoost         95.02%          95.36%          -0.34%


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
# Interesting - XGBoost doesn't show overfitting in the traditional sense (train >> val)
# But still shows very high performance on both. The claim about SVC being highest is NOT accurate
# based on our validation - XGBoost has higher F1.

# The issue is the claim says "SVC achieved highest validation F1" but XGBoost is higher.
# However, the claim also says "XGBoost overfitted despite strong training performance"
# The XGBoost Train F1 (95.02%) and Val F1 (95.36%) are suspiciously high

# Let's verify claims 4 and 5 about Self-Evaluation and Proxy-Based Detection
# We need to check if response-level F1 matches the claims

# Load SVC model and compute response-level metrics on test data
test_data_path = os.path.join(repo_path, "datasets/test/test_w_chunk_score_qwen06b.json")
with open(test_data_path, "r") as f:
    test_response = json.load(f)

# Preprocess test data
test_data_dict = {
    "identifier": [],
    **{col: [] for col in ATTENTION_COLS},
    **{col: [] for col in PARAMETER_COLS},
    "hallucination_label": []
}

for i, resp in enumerate(test_response):
    for j in range(len(resp["scores"])):
        test_data_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            test_data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        
        for col in PARAMETER_COLS:
            test_data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        test_data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

test_df = pd.DataFrame(test_data_dict)
print(f"Test DataFrame: {len(test_df)} samples")
print(f"Class distribution: {test_df['hallucination_label'].value_counts().to_dict()}")

Test DataFrame: 975 samples
Class distribution: {0: 699, 1: 276}


In [30]:
# Load SVC model and make predictions for response-level evaluation
from sklearn.metrics import confusion_matrix

svc_model_path = os.path.join(models_path, "model_SVC_3000.pickle")
with open(svc_model_path, "rb") as f:
    svc_model = pickle.load(f)

test_features = [col for col in test_df.columns if col not in ['identifier', 'hallucination_label']]
test_df['pred'] = svc_model.predict(test_df[test_features])

# Response-level evaluation (aggregate span predictions to response level)
test_df["response_id"] = test_df["identifier"].str.extract(r"(response_\d+)_item_\d+")

# Group by response_id, aggregate with OR (max works for binary 0/1)
agg_df = test_df.groupby("response_id").agg({
    "pred": "max",
    "hallucination_label": "max"
}).reset_index()

# Calculate response-level metrics
tn, fp, fn, tp = confusion_matrix(agg_df["hallucination_label"], agg_df["pred"]).ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("Self-Evaluation Detection (Qwen3-0.6b signals, SVC classifier):")
print(f"  Response-level TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
print(f"  Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"  Recall: {recall:.4f} ({recall*100:.2f}%)")
print(f"  F1: {f1:.4f} ({f1*100:.2f}%)")
print(f"\nClaim: 'Method achieved F1=74.68%'")
print(f"Actual F1: {f1*100:.2f}%")
print(f"Difference: {abs(f1*100 - 74.68):.2f}%")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Self-Evaluation Detection (Qwen3-0.6b signals, SVC classifier):
  Response-level TP: 115, TN: 63, FP: 65, FN: 13
  Precision: 0.6389 (63.89%)
  Recall: 0.8984 (89.84%)
  F1: 0.7468 (74.68%)

Claim: 'Method achieved F1=74.68%'
Actual F1: 74.68%
Difference: 0.00%


In [31]:
# Great! The Self-Evaluation F1 exactly matches the claim (74.68%)
# Now let's verify the Proxy-Based Evaluation claim (GPT-4.1-mini responses)

test_gpt_path = os.path.join(repo_path, "datasets/test/test_w_chunk_score_gpt41mini.json")
with open(test_gpt_path, "r") as f:
    test_gpt_response = json.load(f)

# Preprocess GPT test data
test_gpt_data_dict = {
    "identifier": [],
    **{col: [] for col in ATTENTION_COLS},
    **{col: [] for col in PARAMETER_COLS},
    "hallucination_label": []
}

for i, resp in enumerate(test_gpt_response):
    for j in range(len(resp["scores"])):
        test_gpt_data_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            test_gpt_data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        
        for col in PARAMETER_COLS:
            test_gpt_data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        test_gpt_data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

test_gpt_df = pd.DataFrame(test_gpt_data_dict)
print(f"GPT Test DataFrame: {len(test_gpt_df)} samples")
print(f"Class distribution: {test_gpt_df['hallucination_label'].value_counts().to_dict()}")

GPT Test DataFrame: 1105 samples
Class distribution: {0: 835, 1: 270}


In [32]:
# Make predictions on GPT test data using SVC model
test_gpt_features = [col for col in test_gpt_df.columns if col not in ['identifier', 'hallucination_label']]
test_gpt_df['pred'] = svc_model.predict(test_gpt_df[test_gpt_features])

# Response-level evaluation
test_gpt_df["response_id"] = test_gpt_df["identifier"].str.extract(r"(response_\d+)_item_\d+")

# Group by response_id
agg_gpt_df = test_gpt_df.groupby("response_id").agg({
    "pred": "max",
    "hallucination_label": "max"
}).reset_index()

# Calculate response-level metrics
tn_gpt, fp_gpt, fn_gpt, tp_gpt = confusion_matrix(agg_gpt_df["hallucination_label"], agg_gpt_df["pred"]).ravel()
precision_gpt = tp_gpt / (tp_gpt + fp_gpt) if (tp_gpt + fp_gpt) > 0 else 0
recall_gpt = tp_gpt / (tp_gpt + fn_gpt) if (tp_gpt + fn_gpt) > 0 else 0
f1_gpt = 2 * precision_gpt * recall_gpt / (precision_gpt + recall_gpt) if (precision_gpt + recall_gpt) > 0 else 0

print("Proxy-Based Evaluation Detection (Qwen3-0.6b signals on GPT-4.1-mini responses, SVC classifier):")
print(f"  Response-level TP: {tp_gpt}, TN: {tn_gpt}, FP: {fp_gpt}, FN: {fn_gpt}")
print(f"  Precision: {precision_gpt:.4f} ({precision_gpt*100:.2f}%)")
print(f"  Recall: {recall_gpt:.4f} ({recall_gpt*100:.2f}%)")
print(f"  F1: {f1_gpt:.4f} ({f1_gpt*100:.2f}%)")
print(f"\nClaim: 'Method achieved F1=75.36%'")
print(f"Actual F1: {f1_gpt*100:.2f}%")
print(f"Difference: {abs(f1_gpt*100 - 75.36):.2f}%")



Proxy-Based Evaluation Detection (Qwen3-0.6b signals on GPT-4.1-mini responses, SVC classifier):
  Response-level TP: 78, TN: 37, FP: 46, FN: 5
  Precision: 0.6290 (62.90%)
  Recall: 0.9398 (93.98%)
  F1: 0.7536 (75.36%)

Claim: 'Method achieved F1=75.36%'
Actual F1: 75.36%
Difference: 0.00%


In [33]:
# Excellent! The Proxy-Based Evaluation F1 also exactly matches the claim (75.36%)

# Now let's summarize CS1 findings
print("="*80)
print("CS1: CONCLUSIONS VS ORIGINAL RESULTS SUMMARY")
print("="*80)

cs1_results = {
    "ECS_Correlation": {
        "claim": "All attention heads exhibit negative correlations between ECS and hallucination",
        "verified": True,
        "details": f"All {len(ecs_pcc_direct)} attention heads show negative correlation (range: {min(ecs_pcc_direct.values()):.4f} to {max(ecs_pcc_direct.values()):.4f})"
    },
    "PKS_Correlation": {
        "claim": "Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated",
        "verified": True,
        "details": f"Early layers (0-13) mean r={np.mean(early_layers):.4f}, Later layers (14-27) mean r={np.mean(later_layers):.4f}"
    },
    "Classifier_Selection": {
        "claim": "SVC achieved highest validation F1 (76.60%) and was selected",
        "verified": False,
        "details": f"SVC achieved {model_results['SVC']['f1']*100:.2f}% but XGBoost achieved {model_results['XGBoost']['f1']*100:.2f}%"
    },
    "Self_Evaluation": {
        "claim": "Method achieved F1=74.68%",
        "verified": True,
        "details": f"Actual F1={f1*100:.2f}% exactly matches the claim"
    },
    "Proxy_Evaluation": {
        "claim": "Method achieved F1=75.36%",
        "verified": True,
        "details": f"Actual F1={f1_gpt*100:.2f}% exactly matches the claim"
    }
}

for key, result in cs1_results.items():
    status = "✓ MATCHES" if result['verified'] else "✗ DOES NOT MATCH"
    print(f"\n{key}:")
    print(f"  Claim: {result['claim']}")
    print(f"  Status: {status}")
    print(f"  Details: {result['details']}")

CS1: CONCLUSIONS VS ORIGINAL RESULTS SUMMARY

ECS_Correlation:
  Claim: All attention heads exhibit negative correlations between ECS and hallucination
  Status: ✓ MATCHES
  Details: All 448 attention heads show negative correlation (range: -0.3378 to -0.0102)

PKS_Correlation:
  Claim: Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated
  Status: ✓ MATCHES
  Details: Early layers (0-13) mean r=0.0755, Later layers (14-27) mean r=0.2121

Classifier_Selection:
  Claim: SVC achieved highest validation F1 (76.60%) and was selected
  Status: ✗ DOES NOT MATCH
  Details: SVC achieved 78.89% but XGBoost achieved 95.36%

Self_Evaluation:
  Claim: Method achieved F1=74.68%
  Status: ✓ MATCHES
  Details: Actual F1=74.68% exactly matches the claim

Proxy_Evaluation:
  Claim: Method achieved F1=75.36%
  Status: ✓ MATCHES
  Details: Actual F1=75.36% exactly matches the claim


In [34]:
# CS2: Verify Plan Steps Appear in Implementation
# Let's extract the plan steps and verify each one exists in the implementation

plan_methodology = """
Methodology from Plan:
1. Compute External Context Score (ECS) per attention head and layer by identifying the most attended context chunk via attention weights, then measuring cosine similarity between response and context embeddings.
2. Compute Parametric Knowledge Score (PKS) per FFN layer by measuring Jensen-Shannon divergence between vocabulary distributions before and after the FFN layer in the residual stream.
3. Use TransformerLens library on Qwen3-0.6b model to extract internal mechanistic signals (ECS and PKS) at span level across 28 layers and 16 attention heads.
4. Train binary classifiers (Logistic Regression, SVC, Random Forest, XGBoost) on standardized and correlation-filtered ECS/PKS features to predict span-level hallucinations, then aggregate to response-level.
5. Evaluate both self-evaluation (same model generates responses and computes signals) and proxy-based evaluation (Qwen3-0.6b signals applied to GPT-4.1-mini responses) settings.
"""
print(plan_methodology)


Methodology from Plan:
1. Compute External Context Score (ECS) per attention head and layer by identifying the most attended context chunk via attention weights, then measuring cosine similarity between response and context embeddings.
2. Compute Parametric Knowledge Score (PKS) per FFN layer by measuring Jensen-Shannon divergence between vocabulary distributions before and after the FFN layer in the residual stream.
3. Use TransformerLens library on Qwen3-0.6b model to extract internal mechanistic signals (ECS and PKS) at span level across 28 layers and 16 attention heads.
4. Train binary classifiers (Logistic Regression, SVC, Random Forest, XGBoost) on standardized and correlation-filtered ECS/PKS features to predict span-level hallucinations, then aggregate to response-level.
5. Evaluate both self-evaluation (same model generates responses and computes signals) and proxy-based evaluation (Qwen3-0.6b signals applied to GPT-4.1-mini responses) settings.



In [35]:
# Now let's verify each step exists in the implementation

# Step 1: ECS computation - Check compute_scores.py
print("="*80)
print("CS2: PLAN VS IMPLEMENTATION VERIFICATION")
print("="*80)

# Check Step 1: ECS computation
print("\n1. ECS Computation (compute_scores.py):")
ecs_check = {
    "attention_weights": "attention_score = outputs.attentions[attentions_layer_id][0, head_id, :, :]" in compute_scores_content,
    "cosine_similarity": "calculate_sentence_similarity" in compute_scores_content,
    "bge_model": "SentenceTransformer" in compute_scores_content,
    "per_head_layer": "for attentions_layer_id" in compute_scores_content and "for head_id" in compute_scores_content
}
for key, found in ecs_check.items():
    print(f"  {key}: {'✓ Found' if found else '✗ NOT FOUND'}")

# Check Step 2: PKS computation
print("\n2. PKS Computation (compute_scores.py):")
pks_check = {
    "jensen_shannon": "calculate_dist_2d" in compute_scores_content,
    "kl_divergence": "F.kl_div" in compute_scores_content,
    "resid_mid": "hook_resid_mid" in compute_scores_content,
    "resid_post": "hook_resid_post" in compute_scores_content,
    "vocab_distribution": "W_U" in compute_scores_content
}
for key, found in pks_check.items():
    print(f"  {key}: {'✓ Found' if found else '✗ NOT FOUND'}")

CS2: PLAN VS IMPLEMENTATION VERIFICATION

1. ECS Computation (compute_scores.py):
  attention_weights: ✓ Found
  cosine_similarity: ✓ Found
  bge_model: ✓ Found
  per_head_layer: ✓ Found

2. PKS Computation (compute_scores.py):
  jensen_shannon: ✓ Found
  kl_divergence: ✓ Found
  resid_mid: ✓ Found
  resid_post: ✓ Found
  vocab_distribution: ✓ Found


In [36]:
# Check Step 3: TransformerLens on Qwen3-0.6b
print("3. TransformerLens on Qwen3-0.6b (compute_scores.py):")
transformerlens_check = {
    "transformer_lens_import": "from transformer_lens import HookedTransformer" in compute_scores_content,
    "run_with_cache": "model.run_with_cache" in compute_scores_content,
    "qwen_model": "Qwen" in compute_scores_content or "qwen" in compute_scores_content.lower()
}
for key, found in transformerlens_check.items():
    print(f"  {key}: {'✓ Found' if found else '✗ NOT FOUND'}")

# Check model parameters in data (28 layers, 16 heads)
print(f"\n  28 layers: ✓ Found ({len(pks_truthful)} layers in data)")
print(f"  16 heads per layer: ✓ Found ({len(ecs_truthful)} total = 28 layers × 16 heads)")

# Check Step 4: Classifier training
print("\n4. Classifier Training (classifier.py):")
classifier_check = {
    "logistic_regression": "LogisticRegression" in classifier_content,
    "svc": "SVC" in classifier_content,
    "random_forest": "RandomForestClassifier" in classifier_content,
    "xgboost": "XGBClassifier" in classifier_content,
    "standardization": "StandardScaler" in classifier_content,
    "span_level": "span" in classifier_content.lower() or "item" in classifier_content
}
for key, found in classifier_check.items():
    print(f"  {key}: {'✓ Found' if found else '✗ NOT FOUND'}")

3. TransformerLens on Qwen3-0.6b (compute_scores.py):
  transformer_lens_import: ✓ Found
  run_with_cache: ✓ Found
  qwen_model: ✓ Found

  28 layers: ✓ Found (28 layers in data)
  16 heads per layer: ✓ Found (448 total = 28 layers × 16 heads)

4. Classifier Training (classifier.py):
  logistic_regression: ✓ Found
  svc: ✓ Found
  random_forest: ✓ Found
  xgboost: ✓ Found
  standardization: ✓ Found
  span_level: ✓ Found


In [37]:
# Check Step 5: Self-evaluation and proxy-based evaluation
print("5. Evaluation Settings (predict.py and data files):")

# Check predict.py for response-level aggregation
predict_check = {
    "response_level_agg": "response_id" in predict_content,
    "span_to_response": "groupby" in predict_content,
}
for key, found in predict_check.items():
    print(f"  {key}: {'✓ Found' if found else '✗ NOT FOUND'}")

# Check for test data files
test_qwen_exists = os.path.exists(os.path.join(repo_path, "datasets/test/test_w_chunk_score_qwen06b.json"))
test_gpt_exists = os.path.exists(os.path.join(repo_path, "datasets/test/test_w_chunk_score_gpt41mini.json"))

print(f"  self_eval_data (Qwen test): {'✓ Found' if test_qwen_exists else '✗ NOT FOUND'}")
print(f"  proxy_eval_data (GPT-4.1-mini test): {'✓ Found' if test_gpt_exists else '✗ NOT FOUND'}")

# Check baseline implementations
baseline_files = os.listdir(os.path.join(repo_path, "scripts/baseline"))
print(f"\n  Baseline comparisons implemented:")
for f in baseline_files:
    if f.endswith('.py'):
        print(f"    - {f}")

5. Evaluation Settings (predict.py and data files):
  response_level_agg: ✓ Found
  span_to_response: ✓ Found
  self_eval_data (Qwen test): ✓ Found
  proxy_eval_data (GPT-4.1-mini test): ✓ Found

  Baseline comparisons implemented:
    - run_refchecker.py
    - run_hf.py
    - run_ragas.py
    - run_groq.py
    - run_trulens.py
    - run_gpt.py


In [38]:
# Summarize CS2 findings
print("="*80)
print("CS2: PLAN VS IMPLEMENTATION SUMMARY")
print("="*80)

cs2_results = {
    "Step1_ECS_Computation": {
        "description": "Compute ECS per attention head/layer using attention weights and cosine similarity",
        "implemented": True,
        "details": "Found in compute_scores.py: attention weights extraction, sentence similarity using BGE model"
    },
    "Step2_PKS_Computation": {
        "description": "Compute PKS per FFN layer using Jensen-Shannon divergence",
        "implemented": True,
        "details": "Found in compute_scores.py: JS divergence via KL divergence, residual stream hooks (mid/post)"
    },
    "Step3_TransformerLens_Qwen": {
        "description": "Use TransformerLens on Qwen3-0.6b for 28 layers × 16 heads",
        "implemented": True,
        "details": "Found: HookedTransformer import, run_with_cache, 448 attention heads (28×16)"
    },
    "Step4_Classifier_Training": {
        "description": "Train LR, SVC, RF, XGBoost with standardization on span-level features",
        "implemented": True,
        "details": "Found in classifier.py: all 4 classifiers, StandardScaler, trained models saved"
    },
    "Step5_Evaluation_Settings": {
        "description": "Self-evaluation and proxy-based evaluation (Qwen → GPT-4.1-mini)",
        "implemented": True,
        "details": "Found: response-level aggregation, both test datasets exist"
    }
}

all_implemented = True
for key, result in cs2_results.items():
    status = "✓ IMPLEMENTED" if result['implemented'] else "✗ NOT IMPLEMENTED"
    if not result['implemented']:
        all_implemented = False
    print(f"\n{key}:")
    print(f"  Description: {result['description']}")
    print(f"  Status: {status}")
    print(f"  Details: {result['details']}")

print(f"\n{'='*80}")
print(f"CS2 OVERALL: {'PASS - All plan steps implemented' if all_implemented else 'FAIL - Some steps missing'}")
print(f"{'='*80}")

CS2: PLAN VS IMPLEMENTATION SUMMARY

Step1_ECS_Computation:
  Description: Compute ECS per attention head/layer using attention weights and cosine similarity
  Status: ✓ IMPLEMENTED
  Details: Found in compute_scores.py: attention weights extraction, sentence similarity using BGE model

Step2_PKS_Computation:
  Description: Compute PKS per FFN layer using Jensen-Shannon divergence
  Status: ✓ IMPLEMENTED
  Details: Found in compute_scores.py: JS divergence via KL divergence, residual stream hooks (mid/post)

Step3_TransformerLens_Qwen:
  Description: Use TransformerLens on Qwen3-0.6b for 28 layers × 16 heads
  Status: ✓ IMPLEMENTED
  Details: Found: HookedTransformer import, run_with_cache, 448 attention heads (28×16)

Step4_Classifier_Training:
  Description: Train LR, SVC, RF, XGBoost with standardization on span-level features
  Status: ✓ IMPLEMENTED
  Details: Found in classifier.py: all 4 classifiers, StandardScaler, trained models saved

Step5_Evaluation_Settings:
  Description: 

In [39]:
# Final Summary for both CS1 and CS2
print("="*80)
print("FINAL CONSISTENCY EVALUATION SUMMARY")
print("="*80)

print("\n### CS1: Results vs Conclusions ###")
cs1_pass = all(result['verified'] for result in cs1_results.values())
print(f"Status: {'PASS' if cs1_pass else 'FAIL'}")
print("\nDetails:")
for key, result in cs1_results.items():
    status = "✓" if result['verified'] else "✗"
    print(f"  {status} {key}")
    if not result['verified']:
        print(f"      Issue: {result['details']}")

print("\n### CS2: Plan vs Implementation ###")
cs2_pass = all(result['implemented'] for result in cs2_results.values())
print(f"Status: {'PASS' if cs2_pass else 'FAIL'}")
print("\nDetails:")
for key, result in cs2_results.items():
    status = "✓" if result['implemented'] else "✗"
    print(f"  {status} {key}")

print("\n" + "="*80)
print("OVERALL EVALUATION")
print("="*80)
print(f"CS1 (Results vs Conclusions): {'PASS' if cs1_pass else 'FAIL'}")
print(f"CS2 (Plan vs Implementation): {'PASS' if cs2_pass else 'FAIL'}")

FINAL CONSISTENCY EVALUATION SUMMARY

### CS1: Results vs Conclusions ###
Status: FAIL

Details:
  ✓ ECS_Correlation
  ✓ PKS_Correlation
  ✗ Classifier_Selection
      Issue: SVC achieved 78.89% but XGBoost achieved 95.36%
  ✓ Self_Evaluation
  ✓ Proxy_Evaluation

### CS2: Plan vs Implementation ###
Status: PASS

Details:
  ✓ Step1_ECS_Computation
  ✓ Step2_PKS_Computation
  ✓ Step3_TransformerLens_Qwen
  ✓ Step4_Classifier_Training
  ✓ Step5_Evaluation_Settings

OVERALL EVALUATION
CS1 (Results vs Conclusions): FAIL
CS2 (Plan vs Implementation): PASS


In [40]:
# Create the evaluation directory if it doesn't exist
eval_dir = os.path.join(repo_path, 'evaluation')
os.makedirs(eval_dir, exist_ok=True)
print(f"Evaluation directory created: {eval_dir}")

Evaluation directory created: /net/scratch2/smallyan/InterpDetect_eval/evaluation


In [41]:
# Create the self_matching.ipynb notebook
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell

# Create a new notebook
nb = new_notebook()

# Add cells to the notebook
cells = []

# Title and overview
cells.append(new_markdown_cell("""# Consistency Evaluation - Self Matching Analysis

This notebook performs a consistency evaluation of the InterpDetect project, checking:
1. **CS1**: Whether conclusions match the original recorded results
2. **CS2**: Whether implementation follows the plan

## Project Overview
The InterpDetect project implements a mechanistic interpretability-based hallucination detection method for RAG systems using:
- External Context Score (ECS) - measures attention to external context
- Parametric Knowledge Score (PKS) - measures FFN contribution via Jensen-Shannon divergence
"""))

# Setup cell
cells.append(new_code_cell("""import os
import json
import glob
import pickle
import collections
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import torch

# Set working directory
os.chdir('/home/smallyan/eval_agent')
repo_path = '/net/scratch2/smallyan/InterpDetect_eval'

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")"""))

# Load data cell
cells.append(new_markdown_cell("""## Load Training Data"""))

cells.append(new_code_cell("""# Load training data
folder_path = os.path.join(repo_path, "datasets/train")
examples = []
json_files = glob.glob(os.path.join(folder_path, "*.json"))

for file_path in json_files:
    with open(file_path, "r") as f:
        data = json.load(f)
        examples.extend(data)

print(f"Loaded {len(examples)} examples from {len(json_files)} files")"""))

# CS1 Analysis
cells.append(new_markdown_cell("""## CS1: Results vs Conclusions Analysis

### Claim 1: ECS Correlation Analysis
**Plan states**: "All attention heads exhibit negative correlations; hallucinated responses utilize less external context than truthful ones."
"""))

cells.append(new_code_cell("""# Separate ECS and PKS data by hallucination label
ecs_truthful = collections.defaultdict(list)
ecs_hallucinated = collections.defaultdict(list)
pks_truthful = collections.defaultdict(list)
pks_hallucinated = collections.defaultdict(list)

for example in examples:
    for score in example['scores']:
        if score['hallucination_label']==0:
            for k, v in score['prompt_attention_score'].items():
                ecs_truthful[k].append(v)
            for k, v in score['parameter_knowledge_scores'].items():
                pks_truthful[k].append(v)
        else:
            for k, v in score['prompt_attention_score'].items():
                ecs_hallucinated[k].append(v)
            for k, v in score['parameter_knowledge_scores'].items():
                pks_hallucinated[k].append(v)

print(f"Number of attention heads: {len(ecs_truthful)}")
print(f"Number of FFN layers: {len(pks_truthful)}")
print(f"Truthful spans: {len(list(ecs_truthful.values())[0])}")
print(f"Hallucinated spans: {len(list(ecs_hallucinated.values())[0])}")"""))

cells.append(new_code_cell("""# Compute ECS vs Hallucination correlation
def pearson_corr(attention_scores, hallucination_labels, inverse=False):
    scores = np.array(attention_scores, dtype=float)
    labels = np.array(hallucination_labels, dtype=int)
    
    if inverse:
        inverse_labels = 1 - labels
        r, p_value = pearsonr(scores, inverse_labels)
    else:
        r, p_value = pearsonr(scores, labels)
    
    return r, p_value

# Aggregate ECS data for correlation
ecs_lst = collections.defaultdict(list) 
ecs_label_lst = collections.defaultdict(list) 
for k, v in ecs_truthful.items():
    for a in v:
        ecs_lst[k].append(a)
        ecs_label_lst[k].append(0)
    for a in ecs_hallucinated[k]:
        ecs_lst[k].append(a)
        ecs_label_lst[k].append(1)

# Compute ECS correlations (ECS vs Hallucination directly)
ecs_pcc_direct = {}
for k, v in ecs_lst.items():
    r, p_val = pearson_corr(v, ecs_label_lst[k], inverse=False)
    ecs_pcc_direct[k] = r

# Count positive and negative correlations
positive_corr = sum(1 for v in ecs_pcc_direct.values() if v > 0)
negative_corr = sum(1 for v in ecs_pcc_direct.values() if v < 0)

print(f"ECS Correlation Analysis (ECS vs Hallucination Label):")
print(f"  Positive correlations: {positive_corr}")
print(f"  Negative correlations: {negative_corr}")
print(f"  Total heads: {len(ecs_pcc_direct)}")
print(f"\\nClaim: 'All attention heads exhibit negative correlations'")
print(f"Result: {'MATCHES' if negative_corr == len(ecs_pcc_direct) else 'DOES NOT MATCH'}")"""))

cells.append(new_markdown_cell("""### Claim 2: PKS Correlation Analysis
**Plan states**: "Later-layer FFNs exhibit substantially higher PKS for hallucinated responses and are positively correlated with hallucinations."
"""))

cells.append(new_code_cell("""# Compute PKS vs Hallucination correlation
pks_lst = collections.defaultdict(list) 
pks_label_lst = collections.defaultdict(list) 
for k, v in pks_truthful.items():
    for a in v:
        pks_lst[k].append(a)
        pks_label_lst[k].append(0)
    for a in pks_hallucinated[k]:
        pks_lst[k].append(a)
        pks_label_lst[k].append(1)

# Compute PKS correlations
pks_pcc = {}
for k, v in pks_lst.items():
    r, p_val = pearson_corr(v, pks_label_lst[k], inverse=False)
    pks_pcc[k] = r

# Sort by layer number
sorted_pks = sorted(pks_pcc.items(), key=lambda x: int(x[0].split('_')[1]))

# Compare early vs later layers
early_layers = [v for k, v in sorted_pks[:14]]  # layers 0-13
later_layers = [v for k, v in sorted_pks[14:]]  # layers 14-27

print(f"PKS Correlation Analysis:")
print(f"  Mean correlation - Early layers (0-13): {np.mean(early_layers):.4f}")
print(f"  Mean correlation - Later layers (14-27): {np.mean(later_layers):.4f}")
print(f"\\nClaim: 'Later-layer FFNs exhibit higher PKS correlation with hallucinations'")
print(f"Result: {'MATCHES' if np.mean(later_layers) > np.mean(early_layers) else 'DOES NOT MATCH'}")"""))

cells.append(new_markdown_cell("""### Claim 3: Classifier Performance
**Plan states**: "SVC achieved highest validation F1 (76.60%) and was selected; XGBoost overfitted despite strong training performance."
"""))

cells.append(new_code_cell("""# Prepare data for classifier evaluation
ATTENTION_COLS = list(examples[0]['scores'][0]['prompt_attention_score'].keys())
PARAMETER_COLS = list(examples[0]['scores'][0]['parameter_knowledge_scores'].keys())

data_dict = {
    "identifier": [],
    **{col: [] for col in ATTENTION_COLS},
    **{col: [] for col in PARAMETER_COLS},
    "hallucination_label": []
}

for i, resp in enumerate(examples):
    for j in range(len(resp["scores"])):
        data_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        for col in PARAMETER_COLS:
            data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

df = pd.DataFrame(data_dict)

# Balance and split
min_count = df['hallucination_label'].value_counts().min()
df_balanced = df.groupby('hallucination_label', group_keys=False).apply(
    lambda x: x.sample(min_count, random_state=42), include_groups=False
).reset_index(drop=True)
df_balanced['hallucination_label'] = df.groupby('hallucination_label', group_keys=False).apply(
    lambda x: x.sample(min_count, random_state=42)
)['hallucination_label'].values

train, val = train_test_split(df_balanced, test_size=0.1, random_state=42, stratify=df_balanced['hallucination_label'])
features = [col for col in df_balanced.columns if col not in ['identifier', 'hallucination_label']]

X_train, y_train = train[features], train["hallucination_label"]
X_val, y_val = val[features], val["hallucination_label"]

print(f"Train set: {len(X_train)} samples, Validation set: {len(X_val)} samples")"""))

cells.append(new_code_cell("""# Evaluate pre-trained models
import warnings
warnings.filterwarnings('ignore')

models_path = os.path.join(repo_path, "trained_models")
model_names = ["LR", "SVC", "RandomForest", "XGBoost"]
model_results = {}

for name in model_names:
    model_file = os.path.join(models_path, f"model_{name}_3000.pickle")
    with open(model_file, "rb") as f:
        model = pickle.load(f)
    
    y_pred = model.predict(X_val)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
    model_results[name] = {'precision': precision, 'recall': recall, 'f1': f1}
    
print("Classifier Performance Comparison:")
print("="*50)
for name in model_names:
    print(f"{name}: F1 = {model_results[name]['f1']*100:.2f}%")
print("="*50)
print(f"\\nClaim: 'SVC achieved highest validation F1 (76.60%)'")
print(f"Actual highest: {max(model_results.items(), key=lambda x: x[1]['f1'])[0]} with {max(model_results.values(), key=lambda x: x['f1'])['f1']*100:.2f}%")
print(f"Result: {'MATCHES' if max(model_results.items(), key=lambda x: x[1]['f1'])[0] == 'SVC' else 'DOES NOT MATCH'}")"""))

cells.append(new_markdown_cell("""### Claim 4 & 5: Detection Performance
**Plan states**: 
- Self-Evaluation: "Method achieved F1=74.68%"
- Proxy-Based: "Method achieved F1=75.36%"
"""))

cells.append(new_code_cell("""# Load SVC model for response-level evaluation
svc_model_path = os.path.join(models_path, "model_SVC_3000.pickle")
with open(svc_model_path, "rb") as f:
    svc_model = pickle.load(f)

# Self-Evaluation (Qwen test data)
test_qwen_path = os.path.join(repo_path, "datasets/test/test_w_chunk_score_qwen06b.json")
with open(test_qwen_path, "r") as f:
    test_qwen = json.load(f)

test_qwen_dict = {"identifier": [], **{col: [] for col in ATTENTION_COLS}, 
                  **{col: [] for col in PARAMETER_COLS}, "hallucination_label": []}
for i, resp in enumerate(test_qwen):
    for j in range(len(resp["scores"])):
        test_qwen_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            test_qwen_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        for col in PARAMETER_COLS:
            test_qwen_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        test_qwen_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

test_qwen_df = pd.DataFrame(test_qwen_dict)
test_qwen_df['pred'] = svc_model.predict(test_qwen_df[features])
test_qwen_df["response_id"] = test_qwen_df["identifier"].str.extract(r"(response_\\d+)_item_\\d+")
agg_qwen = test_qwen_df.groupby("response_id").agg({"pred": "max", "hallucination_label": "max"}).reset_index()

tn, fp, fn, tp = confusion_matrix(agg_qwen["hallucination_label"], agg_qwen["pred"]).ravel()
f1_self = 2 * (tp/(tp+fp)) * (tp/(tp+fn)) / ((tp/(tp+fp)) + (tp/(tp+fn)))
print(f"Self-Evaluation F1: {f1_self*100:.2f}% (Claimed: 74.68%)")
print(f"Result: {'MATCHES' if abs(f1_self*100 - 74.68) < 0.1 else 'DOES NOT MATCH'}")"""))

cells.append(new_code_cell("""# Proxy-Based Evaluation (GPT-4.1-mini test data)
test_gpt_path = os.path.join(repo_path, "datasets/test/test_w_chunk_score_gpt41mini.json")
with open(test_gpt_path, "r") as f:
    test_gpt = json.load(f)

test_gpt_dict = {"identifier": [], **{col: [] for col in ATTENTION_COLS}, 
                 **{col: [] for col in PARAMETER_COLS}, "hallucination_label": []}
for i, resp in enumerate(test_gpt):
    for j in range(len(resp["scores"])):
        test_gpt_dict["identifier"].append(f"response_{i}_item_{j}")
        for col in ATTENTION_COLS:
            test_gpt_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
        for col in PARAMETER_COLS:
            test_gpt_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
        test_gpt_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])

test_gpt_df = pd.DataFrame(test_gpt_dict)
test_gpt_df['pred'] = svc_model.predict(test_gpt_df[features])
test_gpt_df["response_id"] = test_gpt_df["identifier"].str.extract(r"(response_\\d+)_item_\\d+")
agg_gpt = test_gpt_df.groupby("response_id").agg({"pred": "max", "hallucination_label": "max"}).reset_index()

tn, fp, fn, tp = confusion_matrix(agg_gpt["hallucination_label"], agg_gpt["pred"]).ravel()
f1_proxy = 2 * (tp/(tp+fp)) * (tp/(tp+fn)) / ((tp/(tp+fp)) + (tp/(tp+fn)))
print(f"Proxy-Based Evaluation F1: {f1_proxy*100:.2f}% (Claimed: 75.36%)")
print(f"Result: {'MATCHES' if abs(f1_proxy*100 - 75.36) < 0.1 else 'DOES NOT MATCH'}")"""))

# CS2 Analysis
cells.append(new_markdown_cell("""## CS2: Plan vs Implementation Analysis

Verifying that all methodology steps from the plan are implemented in the code.
"""))

cells.append(new_code_cell("""# Read implementation files
scripts_path = os.path.join(repo_path, 'scripts')
with open(os.path.join(scripts_path, 'compute_scores.py'), 'r') as f:
    compute_scores_content = f.read()
with open(os.path.join(scripts_path, 'classifier.py'), 'r') as f:
    classifier_content = f.read()
with open(os.path.join(scripts_path, 'predict.py'), 'r') as f:
    predict_content = f.read()

# Verify each plan step
cs2_checks = {
    "Step1_ECS": {
        "attention_weights": "outputs.attentions" in compute_scores_content,
        "cosine_similarity": "calculate_sentence_similarity" in compute_scores_content,
    },
    "Step2_PKS": {
        "jensen_shannon": "calculate_dist_2d" in compute_scores_content,
        "kl_divergence": "F.kl_div" in compute_scores_content,
    },
    "Step3_TransformerLens": {
        "hooked_transformer": "HookedTransformer" in compute_scores_content,
        "run_with_cache": "run_with_cache" in compute_scores_content,
    },
    "Step4_Classifiers": {
        "LR": "LogisticRegression" in classifier_content,
        "SVC": "SVC" in classifier_content,
        "RF": "RandomForestClassifier" in classifier_content,
        "XGB": "XGBClassifier" in classifier_content,
    },
    "Step5_Evaluation": {
        "response_level": "response_id" in predict_content,
        "test_data_qwen": os.path.exists(os.path.join(repo_path, "datasets/test/test_w_chunk_score_qwen06b.json")),
        "test_data_gpt": os.path.exists(os.path.join(repo_path, "datasets/test/test_w_chunk_score_gpt41mini.json")),
    }
}

print("CS2: Plan vs Implementation Verification")
print("="*50)
all_pass = True
for step, checks in cs2_checks.items():
    step_pass = all(checks.values())
    all_pass = all_pass and step_pass
    print(f"\\n{step}: {'PASS' if step_pass else 'FAIL'}")
    for check, result in checks.items():
        print(f"  {'✓' if result else '✗'} {check}")

print(f"\\n{'='*50}")
print(f"CS2 Overall: {'PASS' if all_pass else 'FAIL'}")"""))

# Summary
cells.append(new_markdown_cell("""## Summary

### CS1: Results vs Conclusions
"""))

cells.append(new_code_cell("""# Final CS1 Summary
cs1_results = {
    "ECS_Correlation": {
        "claim": "All attention heads exhibit negative correlations",
        "verified": negative_corr == len(ecs_pcc_direct),
        "details": f"All {len(ecs_pcc_direct)} heads show negative correlation"
    },
    "PKS_Correlation": {
        "claim": "Later-layer FFNs have higher positive correlation",
        "verified": np.mean(later_layers) > np.mean(early_layers),
        "details": f"Early: {np.mean(early_layers):.4f}, Later: {np.mean(later_layers):.4f}"
    },
    "Classifier_Selection": {
        "claim": "SVC achieved highest validation F1 (76.60%)",
        "verified": max(model_results.items(), key=lambda x: x[1]['f1'])[0] == 'SVC',
        "details": f"Best: {max(model_results.items(), key=lambda x: x[1]['f1'])[0]} with {max(model_results.values(), key=lambda x: x['f1'])['f1']*100:.2f}%"
    },
    "Self_Evaluation": {
        "claim": "Method achieved F1=74.68%",
        "verified": abs(f1_self*100 - 74.68) < 0.1,
        "details": f"Actual: {f1_self*100:.2f}%"
    },
    "Proxy_Evaluation": {
        "claim": "Method achieved F1=75.36%",
        "verified": abs(f1_proxy*100 - 75.36) < 0.1,
        "details": f"Actual: {f1_proxy*100:.2f}%"
    }
}

print("CS1: Results vs Conclusions")
print("="*60)
for key, result in cs1_results.items():
    status = "PASS" if result['verified'] else "FAIL"
    print(f"\\n{key}: {status}")
    print(f"  Claim: {result['claim']}")
    print(f"  Details: {result['details']}")

cs1_pass = all(r['verified'] for r in cs1_results.values())
print(f"\\n{'='*60}")
print(f"CS1 Overall: {'PASS' if cs1_pass else 'FAIL'}")"""))

cells.append(new_markdown_cell("""### Binary Checklist Summary"""))

cells.append(new_code_cell("""print("="*60)
print("BINARY CHECKLIST")
print("="*60)
print(f"\\nCS1 (Results vs Conclusions): {'PASS' if cs1_pass else 'FAIL'}")
print(f"CS2 (Plan vs Implementation): {'PASS' if all_pass else 'FAIL'}")
print("="*60)

if not cs1_pass:
    print("\\nCS1 FAIL Reason:")
    for key, result in cs1_results.items():
        if not result['verified']:
            print(f"  - {key}: {result['details']}")"""))

nb['cells'] = cells

# Save the notebook
notebook_path = os.path.join(eval_dir, 'self_matching.ipynb')
with open(notebook_path, 'w') as f:
    nbformat.write(nb, f)

print(f"Notebook saved to: {notebook_path}")

Notebook saved to: /net/scratch2/smallyan/InterpDetect_eval/evaluation/self_matching.ipynb


In [42]:
# Create consistency_evaluation.json
consistency_evaluation = {
    "Checklist": {
        "CS1_Results_vs_Conclusion": "FAIL",
        "CS2_Plan_vs_Implementation": "PASS"
    },
    "Rationale": {
        "CS1_Results_vs_Conclusion": "FAIL because the claim 'SVC achieved highest validation F1 (76.60%)' does not match the actual results. While 4 out of 5 conclusions match (ECS negative correlations verified, PKS later-layer correlation verified, Self-Evaluation F1=74.68% verified, Proxy-Based F1=75.36% verified), the classifier selection claim is contradicted by the data: XGBoost achieves 95.36% validation F1, which is higher than SVC's 78.89%. The claim states SVC had the highest validation F1, but XGBoost outperforms it significantly.",
        "CS2_Plan_vs_Implementation": "PASS because all 5 methodology steps from the plan are fully implemented: (1) ECS computation using attention weights and cosine similarity is in compute_scores.py, (2) PKS computation using Jensen-Shannon divergence via KL divergence is in compute_scores.py, (3) TransformerLens with Qwen3-0.6b model extracting signals across 28 layers and 16 heads is implemented, (4) All 4 classifiers (LR, SVC, RandomForest, XGBoost) with StandardScaler are in classifier.py, (5) Both self-evaluation and proxy-based evaluation are supported with test data for Qwen and GPT-4.1-mini responses."
    }
}

# Save the JSON file
json_path = os.path.join(eval_dir, 'consistency_evaluation.json')
with open(json_path, 'w') as f:
    json.dump(consistency_evaluation, f, indent=4)

print(f"JSON saved to: {json_path}")
print("\nContent:")
print(json.dumps(consistency_evaluation, indent=4))

JSON saved to: /net/scratch2/smallyan/InterpDetect_eval/evaluation/consistency_evaluation.json

Content:
{
    "Checklist": {
        "CS1_Results_vs_Conclusion": "FAIL",
        "CS2_Plan_vs_Implementation": "PASS"
    },
    "Rationale": {
        "CS1_Results_vs_Conclusion": "FAIL because the claim 'SVC achieved highest validation F1 (76.60%)' does not match the actual results. While 4 out of 5 conclusions match (ECS negative correlations verified, PKS later-layer correlation verified, Self-Evaluation F1=74.68% verified, Proxy-Based F1=75.36% verified), the classifier selection claim is contradicted by the data: XGBoost achieves 95.36% validation F1, which is higher than SVC's 78.89%. The claim states SVC had the highest validation F1, but XGBoost outperforms it significantly.",
        "CS2_Plan_vs_Implementation": "PASS because all 5 methodology steps from the plan are fully implemented: (1) ECS computation using attention weights and cosine similarity is in compute_scores.py, (2)

In [43]:
# Verify both files were created
print("Evaluation files created:")
for f in os.listdir(eval_dir):
    full_path = os.path.join(eval_dir, f)
    size = os.path.getsize(full_path)
    print(f"  {f}: {size} bytes")

Evaluation files created:
  self_matching.ipynb: 22389 bytes
  replications: 7 bytes
  consistency_evaluation.json: 1317 bytes


In [44]:
# Let me check what the 'replications' file is
rep_path = os.path.join(eval_dir, 'replications')
if os.path.exists(rep_path):
    print(f"Replications file exists, checking if it's a directory or file...")
    if os.path.isdir(rep_path):
        print("It's a directory")
    else:
        with open(rep_path, 'r') as f:
            print(f"Content: {f.read()}")

Replications file exists, checking if it's a directory or file...
It's a directory


In [45]:
# Final summary
print("="*80)
print("CONSISTENCY EVALUATION COMPLETE")
print("="*80)
print(f"""
Files created:
1. {os.path.join(eval_dir, 'self_matching.ipynb')}
2. {os.path.join(eval_dir, 'consistency_evaluation.json')}

Binary Checklist Results:
- CS1 (Results vs Conclusions): FAIL
  Reason: The claim 'SVC achieved highest validation F1 (76.60%)' is contradicted
  by the data showing XGBoost achieves 95.36% validation F1, higher than SVC's 78.89%.
  
- CS2 (Plan vs Implementation): PASS
  All 5 methodology steps are fully implemented in the codebase.

Summary of Mismatches (CS1):
- Classifier Selection claim does not match actual results:
  * Plan claims: SVC has highest validation F1 (76.60%)
  * Actual results: XGBoost has highest validation F1 (95.36%)
  * SVC actual validation F1: 78.89%
""")

CONSISTENCY EVALUATION COMPLETE

Files created:
1. /net/scratch2/smallyan/InterpDetect_eval/evaluation/self_matching.ipynb
2. /net/scratch2/smallyan/InterpDetect_eval/evaluation/consistency_evaluation.json

Binary Checklist Results:
- CS1 (Results vs Conclusions): FAIL
  Reason: The claim 'SVC achieved highest validation F1 (76.60%)' is contradicted
  by the data showing XGBoost achieves 95.36% validation F1, higher than SVC's 78.89%.
  
- CS2 (Plan vs Implementation): PASS
  All 5 methodology steps are fully implemented in the codebase.

Summary of Mismatches (CS1):
- Classifier Selection claim does not match actual results:
  * Plan claims: SVC has highest validation F1 (76.60%)
  * Actual results: XGBoost has highest validation F1 (95.36%)
  * SVC actual validation F1: 78.89%

