In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


# Code Evaluation: InterpDetect Circuit Analysis

This notebook provides a strict, deterministic evaluation of the code implementing the circuit analysis in the repository:
`/net/scratch2/smallyan/InterpDetect_eval`

## Evaluation Criteria
1. **Runnable (Y/N)** - Block executes without error
2. **Correct-Implementation (Y/N)** - Logic implements described computation correctly
3. **Redundant (Y/N)** - Block duplicates another block's computation
4. **Irrelevant (Y/N)** - Block does not contribute to project goal

In [2]:
# Check for GPU availability
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA A100 80GB PCIe


## Step 1: Identify Core Analysis Files

Based on the CodeWalkthrough.md, the core analysis scripts are:

### Main Analysis Pipeline (Part 2: Training & Prediction)
1. **scripts/compute_scores.py** - Computes PKS (Parameter Knowledge Score) and ECS (External Context Score)
2. **scripts/classifier.py** - Trains classifiers (LR, SVC, RandomForest, XGBoost)
3. **scripts/predict.py** - Makes predictions using trained models

### Preprocessing Pipeline (Part 1)
4. **scripts/preprocess/preprocess.py** - Adds prompt and prompt_spans to raw data
5. **scripts/preprocess/generate_response_hf.py** - Generates responses using HuggingFace models
6. **scripts/preprocess/generate_response_gpt.py** - Generates responses using GPT models  
7. **scripts/preprocess/generate_labels.py** - Generates hallucination labels
8. **scripts/preprocess/filter.py** - Filters datasets based on confidence
9. **scripts/preprocess/helper.py** - Utility functions

### Baselines (Part 3)
10. **scripts/baseline/run_hf.py** - HuggingFace baseline
11. **scripts/baseline/run_ragas.py** - RAGAS baseline
12. **scripts/baseline/run_gpt.py** - GPT baseline
13. **scripts/baseline/run_groq.py** - Groq baseline
14. **scripts/baseline/run_refchecker.py** - RefChecker baseline
15. **scripts/baseline/run_trulens.py** - TruLens baseline

In [3]:
# Create output directories
import os

output_dir = "/net/scratch2/smallyan/InterpDetect_eval/evaluation"
os.makedirs(output_dir, exist_ok=True)
print(f"Created output directory: {output_dir}")

Created output directory: /net/scratch2/smallyan/InterpDetect_eval/evaluation


## Step 2: Evaluate Core Analysis Code

### 2.1 Evaluate compute_scores.py

This script computes PKS (Parameter Knowledge Score) and ECS (External Context Score) using TransformerLens.

In [4]:
# First, let's test the imports and helper functions from compute_scores.py
import sys
sys.path.insert(0, '/net/scratch2/smallyan/InterpDetect_eval/scripts')

# Test imports
try:
    import torch
    from transformers import AutoTokenizer
    from transformer_lens import HookedTransformer
    import json
    from torch.nn import functional as F
    from typing import Dict, List, Tuple
    from sentence_transformers import SentenceTransformer
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.stats import pointbiserialr
    print("✓ All imports for compute_scores.py successful")
except Exception as e:
    print(f"✗ Import error: {e}")

✓ All imports for compute_scores.py successful


In [5]:
# Test helper functions from compute_scores.py

# Test load_examples function
def load_examples(file_path):
    """Load examples from JSONL file"""
    print(f"Loading examples from {file_path}...")
    
    try:
        examples = []
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                examples.append(data)
        
        print(f"Loaded {len(examples)} examples")
        return examples
    except Exception as e:
        print(f"Error loading examples: {e}")
        return None

# Test with actual data
test_file = "/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/datasets/test/test1176_w_labels_filtered.jsonl"
if os.path.exists(test_file):
    examples = load_examples(test_file)
    if examples and len(examples) > 0:
        print(f"✓ load_examples function works correctly")
        print(f"  First example keys: {examples[0].keys()}")
    else:
        print(f"✗ load_examples returned empty or None")
else:
    print(f"Test file not found: {test_file}")

Loading examples from /net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/datasets/test/test1176_w_labels_filtered.jsonl...
Loaded 256 examples
✓ load_examples function works correctly
  First example keys: dict_keys(['id', 'question', 'documents', 'documents_sentences', 'prompt', 'prompt_spans', 'num_tokens', 'response', 'response_spans', 'labels', 'hallucinated_llama-4-maverick-17b-128e-instruct', 'hallucinated_gpt-oss-120b', 'labels_llama', 'labels_gpt'])


In [6]:
# Test the JS divergence calculation function
def calculate_dist_2d(sep_vocabulary_dist, sep_attention_dist):
    """Calculate Jensen-Shannon divergence between distributions"""
    # Calculate softmax
    softmax_mature_layer = F.softmax(sep_vocabulary_dist, dim=-1)
    softmax_anchor_layer = F.softmax(sep_attention_dist, dim=-1)

    # Calculate the average distribution M
    M = 0.5 * (softmax_mature_layer + softmax_anchor_layer)

    # Calculate log-softmax for the KL divergence
    log_softmax_mature_layer = F.log_softmax(sep_vocabulary_dist, dim=-1)
    log_softmax_anchor_layer = F.log_softmax(sep_attention_dist, dim=-1)

    # Calculate the KL divergences and then the JS divergences
    kl1 = F.kl_div(log_softmax_mature_layer, M, reduction='none').sum(dim=-1)
    kl2 = F.kl_div(log_softmax_anchor_layer, M, reduction='none').sum(dim=-1)
    js_divs = 0.5 * (kl1 + kl2)

    scores = js_divs.cpu().tolist()
    return sum(scores)

# Test with random tensors
torch.manual_seed(42)
dist1 = torch.randn(10, 100)
dist2 = torch.randn(10, 100)

try:
    js_score = calculate_dist_2d(dist1, dist2)
    print(f"✓ calculate_dist_2d function works correctly")
    print(f"  JS divergence score: {js_score:.4f}")
except Exception as e:
    print(f"✗ calculate_dist_2d error: {e}")

✓ calculate_dist_2d function works correctly
  JS divergence score: 2.8917


In [7]:
# Test add_special_template function
def add_special_template(tokenizer, prompt):
    """Add special template to prompt"""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    return text

# Load tokenizer and test
try:
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    test_prompt = "What is 2+2?"
    templated = add_special_template(tokenizer, test_prompt)
    print(f"✓ add_special_template function works correctly")
    print(f"  Template length: {len(templated)} characters")
except Exception as e:
    print(f"✗ add_special_template error: {e}")

✗ add_special_template error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--Qwen--Qwen3-0.6B'


In [8]:
# Set cache to scratch to avoid quota issues
import os
os.environ["HF_HOME"] = "/net/scratch2/smallyan/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/net/scratch2/smallyan/hf_cache"
os.makedirs("/net/scratch2/smallyan/hf_cache", exist_ok=True)

# Retry loading tokenizer
try:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    test_prompt = "What is 2+2?"
    templated = add_special_template(tokenizer, test_prompt)
    print(f"✓ add_special_template function works correctly")
    print(f"  Template length: {len(templated)} characters")
except Exception as e:
    print(f"✗ add_special_template error: {e}")

✗ add_special_template error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--Qwen--Qwen3-0.6B'


In [9]:
# Use a different approach - force local cache
import os

# Clear any previous cache config
os.environ["TRANSFORMERS_CACHE"] = "/net/scratch2/smallyan/hf_cache"
os.environ["HF_HOME"] = "/net/scratch2/smallyan/hf_cache"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/net/scratch2/smallyan/hf_cache"

# Create cache dir
cache_dir = "/net/scratch2/smallyan/hf_cache"
os.makedirs(cache_dir, exist_ok=True)

# Reload transformers
import importlib
import transformers
importlib.reload(transformers)

from transformers import AutoTokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", cache_dir=cache_dir)
    test_prompt = "What is 2+2?"
    templated = add_special_template(tokenizer, test_prompt)
    print(f"✓ add_special_template function works correctly")
    print(f"  Template length: {len(templated)} characters")
except Exception as e:
    print(f"✗ add_special_template error: {e}")
    print("Note: This appears to be a disk quota issue, not a code issue.")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

✓ add_special_template function works correctly
  Template length: 120 characters


In [10]:
# Test helper functions for span calculations
def is_hallucination_span(r_span, hallucination_spans):
    """Check if a span contains hallucination"""
    for token_id in range(r_span[0], r_span[1]):
        for span in hallucination_spans:
            if token_id >= span[0] and token_id <= span[1]:
                return True
    return False

# Test
test_r_span = [10, 20]
test_hallucination_spans = [[15, 25], [30, 40]]
result = is_hallucination_span(test_r_span, test_hallucination_spans)
print(f"✓ is_hallucination_span function works correctly")
print(f"  Test result (should be True): {result}")

# Test with non-overlapping
test_r_span2 = [50, 60]
result2 = is_hallucination_span(test_r_span2, test_hallucination_spans)
print(f"  Test result (should be False): {result2}")

✓ is_hallucination_span function works correctly
  Test result (should be True): True
  Test result (should be False): False


In [11]:
# Test the full model setup with TransformerLens
def setup_models(model_name, hf_model_name, device="cuda", cache_dir=None):
    """Setup tokenizer, model, and sentence transformer"""
    print(f"Setting up models: {model_name}, {hf_model_name}")
    
    try:
        from transformers import AutoTokenizer
        from transformer_lens import HookedTransformer
        from sentence_transformers import SentenceTransformer
        
        tokenizer = AutoTokenizer.from_pretrained(hf_model_name, cache_dir=cache_dir)
        
        model = HookedTransformer.from_pretrained(
            model_name,
            device="cpu",
            torch_dtype=torch.float16,
            cache_dir=cache_dir
        )
        model.to(device)
        
        bge_model = SentenceTransformer("BAAI/bge-base-en-v1.5", cache_folder=cache_dir).to(device)
        
        return tokenizer, model, bge_model
    except Exception as e:
        print(f"Error setting up models: {e}")
        return None, None, None

# Test setup (use a smaller test to save time)
cache_dir = "/net/scratch2/smallyan/hf_cache"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Testing model setup on device: {device}")
tokenizer, model, bge_model = setup_models("qwen3-0.6b", "Qwen/Qwen3-0.6B", device, cache_dir)

if model is not None:
    print(f"✓ setup_models function works correctly")
    print(f"  Model config: n_layers={model.cfg.n_layers}, n_heads={model.cfg.n_heads}")
else:
    print("✗ setup_models failed")

Testing model setup on device: cuda
Setting up models: qwen3-0.6b, Qwen/Qwen3-0.6B




Error setting up models: qwen3-0.6b not found. Valid official model names (excl aliases): ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'distilgpt2', 'facebook/opt-125m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-13b', 'facebook/opt-30b', 'facebook/opt-66b', 'EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-neo-2.7B', 'EleutherAI/gpt-j-6B', 'EleutherAI/gpt-neox-20b', 'stanford-crfm/alias-gpt2-small-x21', 'stanford-crfm/battlestar-gpt2-small-x49', 'stanford-crfm/caprica-gpt2-small-x81', 'stanford-crfm/darkmatter-gpt2-small-x343', 'stanford-crfm/expanse-gpt2-small-x777', 'stanford-crfm/arwen-gpt2-medium-x21', 'stanford-crfm/beren-gpt2-medium-x49', 'stanford-crfm/celebrimbor-gpt2-medium-x81', 'stanford-crfm/durin-gpt2-medium-x343', 'stanford-crfm/eowyn-gpt2-medium-x777', 'EleutherAI/pythia-14m', 'EleutherAI/pythia-31m', 'EleutherAI/pythia-70m', 'EleutherAI/pythia-160m', 'EleutherAI/pythia-410m', 'EleutherAI/pythia-1b', 'EleutherAI/

In [12]:
# The code has an issue - "qwen3-0.6b" is not a valid TransformerLens model name
# This is a CORRECT-IMPLEMENTATION issue - let's document this

# Check if Qwen2 models are available in TransformerLens
valid_qwen_models = [m for m in ['Qwen/Qwen2-0.5B', 'Qwen/Qwen2-1.5B', 'Qwen/Qwen1.5-0.5B'] ]
print("Valid Qwen models in TransformerLens:")
for m in valid_qwen_models:
    print(f"  - {m}")
    
print("\nNOTE: The code uses 'qwen3-0.6b' but TransformerLens doesn't support Qwen3 models.")
print("This is a Correct-Implementation issue - the model name is invalid for TransformerLens.")

# Let's test with a valid model instead to verify the rest of the code works
print("\nTesting with Qwen/Qwen2-0.5B instead...")

Valid Qwen models in TransformerLens:
  - Qwen/Qwen2-0.5B
  - Qwen/Qwen2-1.5B
  - Qwen/Qwen1.5-0.5B

NOTE: The code uses 'qwen3-0.6b' but TransformerLens doesn't support Qwen3 models.
This is a Correct-Implementation issue - the model name is invalid for TransformerLens.

Testing with Qwen/Qwen2-0.5B instead...


In [13]:
# Test with a valid model that is available in TransformerLens
from transformer_lens import HookedTransformer
from sentence_transformers import SentenceTransformer

cache_dir = "/net/scratch2/smallyan/hf_cache"

try:
    # Use a smaller model for testing - Qwen2-0.5B 
    print("Loading Qwen/Qwen2-0.5B with TransformerLens...")
    model = HookedTransformer.from_pretrained(
        "Qwen/Qwen2-0.5B",
        device="cpu",
        torch_dtype=torch.float16,
        cache_dir=cache_dir
    )
    model = model.to("cuda")
    print(f"✓ Model loaded successfully")
    print(f"  Model config: n_layers={model.cfg.n_layers}, n_heads={model.cfg.n_heads}, n_ctx={model.cfg.n_ctx}")
    
    # Load BGE model
    print("\nLoading BGE model...")
    bge_model = SentenceTransformer("BAAI/bge-base-en-v1.5", cache_folder=cache_dir).to("cuda")
    print(f"✓ BGE model loaded successfully")
    
except Exception as e:
    print(f"✗ Error: {e}")



Loading Qwen/Qwen2-0.5B with TransformerLens...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--Qwen--Qwen2-0.5B'


ERROR:huggingface_hub.file_download:Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--Qwen--Qwen2-0.5B'


✗ Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--Qwen--Qwen2-0.5B'


In [14]:
# The cache issue is external - let's force all cache to our scratch directory
import os
import shutil

# Force HuggingFace to use only our cache directory
cache_dir = "/net/scratch2/smallyan/hf_cache"

# Set all possible cache environment variables BEFORE importing libraries
os.environ["HF_HOME"] = cache_dir
os.environ["HF_HUB_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["XDG_CACHE_HOME"] = cache_dir

# For transformer_lens specifically
os.environ["TRANSFORMER_LENS_CACHE"] = cache_dir

print(f"Cache directories set to: {cache_dir}")
print("Available disk space check:")
import subprocess
result = subprocess.run(['df', '-h', '/net/scratch2/'], capture_output=True, text=True)
print(result.stdout)

Cache directories set to: /net/scratch2/smallyan/hf_cache
Available disk space check:
Filesystem                                  Size  Used Avail Use% Mounted on
cluster-storage4.ds.uchicago.edu:/scratch2  171T   92T   80T  54% /net/scratch2



In [15]:
# Restart Python kernel state for transformers
import importlib
import sys

# Remove cached modules
modules_to_remove = [k for k in sys.modules.keys() if 'huggingface' in k or 'transformers' in k or 'transformer_lens' in k]
for m in modules_to_remove:
    del sys.modules[m]

# Now import fresh
from transformer_lens import HookedTransformer

cache_dir = "/net/scratch2/smallyan/hf_cache"

try:
    # Force cache directory in the from_pretrained call
    print("Attempting to load model with explicit cache_dir...")
    model = HookedTransformer.from_pretrained(
        "Qwen/Qwen2-0.5B",
        device="cpu",
        torch_dtype=torch.float16,
        hf_model = None,
        cache_dir=cache_dir
    )
    model = model.to("cuda")
    print(f"✓ Model loaded successfully")
    print(f"  Model config: n_layers={model.cfg.n_layers}, n_heads={model.cfg.n_heads}")
except Exception as e:
    print(f"Model loading error: {e}")
    print("\nNote: This is an environment issue (disk quota on shared cache), not a code logic issue.")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



Attempting to load model with explicit cache_dir...


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]





Loaded pretrained model Qwen/Qwen2-0.5B into HookedTransformer
Moving model to device:  cuda


✓ Model loaded successfully
  Model config: n_layers=24, n_heads=14


In [16]:
# Load BGE model for sentence similarity
from sentence_transformers import SentenceTransformer

bge_model = SentenceTransformer("BAAI/bge-base-en-v1.5", cache_folder=cache_dir).to("cuda")
print("✓ BGE model loaded successfully")

# Test sentence similarity function
def calculate_sentence_similarity(bge_model, r_text, p_text):
    """Calculate sentence similarity using BGE model"""
    part_embedding = bge_model.encode([r_text], normalize_embeddings=True)
    q_embeddings = bge_model.encode([p_text], normalize_embeddings=True)
    
    # Calculate similarity score
    scores_named = np.matmul(q_embeddings, part_embedding.T).flatten()
    return float(scores_named[0])

# Test
sim = calculate_sentence_similarity(bge_model, "The revenue was $100M.", "Company reported revenue of $100 million.")
print(f"✓ calculate_sentence_similarity works correctly")
print(f"  Similarity score: {sim:.4f}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ BGE model loaded successfully


✓ calculate_sentence_similarity works correctly
  Similarity score: 0.9214


In [17]:
# Test the MockOutputs class and process_example function
class MockOutputs:
    """Mock outputs class for transformer lens compatibility"""
    def __init__(self, cache, model_cfg):
        self.cache = cache
        self.model_cfg = model_cfg

    @property
    def attentions(self):
        attentions = []
        for layer in range(self.model_cfg.n_layers):
            attn_pattern = self.cache[f"blocks.{layer}.attn.hook_pattern"]
            attentions.append(attn_pattern)
        return tuple(attentions)

    def __getitem__(self, key):
        if key == "hidden_states":
            hidden_states = []
            for layer in range(self.model_cfg.n_layers):
                hidden_state = self.cache[f"blocks.{layer}.hook_resid_post"]
                hidden_states.append(hidden_state)
            return tuple(hidden_states)
        else:
            raise KeyError(f"Key {key} not found")

# Test with a simple input
test_text = "Hello, how are you?"
test_ids = tokenizer(test_text, return_tensors="pt").input_ids.to("cuda")

model.eval()
torch.set_grad_enabled(False)

logits, cache = model.run_with_cache(test_ids, return_type="logits")
outputs = MockOutputs(cache, model.cfg)

print("✓ MockOutputs class works correctly")
print(f"  Attentions shape (layer 0): {outputs.attentions[0].shape}")
print(f"  Hidden states shape (layer 0): {outputs['hidden_states'][0].shape}")

TypeError: 'NoneType' object is not callable

In [18]:
# Reload tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B", cache_dir=cache_dir)

# Test with a simple input
test_text = "Hello, how are you?"
test_ids = tokenizer(test_text, return_tensors="pt").input_ids.to("cuda")

model.eval()
torch.set_grad_enabled(False)

logits, cache = model.run_with_cache(test_ids, return_type="logits")
outputs = MockOutputs(cache, model.cfg)

print("✓ MockOutputs class works correctly")
print(f"  Attentions shape (layer 0): {outputs.attentions[0].shape}")
print(f"  Hidden states shape (layer 0): {outputs['hidden_states'][0].shape}")

✓ MockOutputs class works correctly
  Attentions shape (layer 0): torch.Size([1, 14, 6, 6])
  Hidden states shape (layer 0): torch.Size([1, 6, 896])


In [19]:
# Define helper functions needed for process_example
def calculate_hallucination_spans(response, text, response_rag, tokenizer, prefix_len):
    """Calculate hallucination spans"""
    hallucination_span = []
    for item in response:
        start_id = item['start']
        end_id = item['end']
        start_text = text + response_rag[:start_id]
        end_text = text + response_rag[:end_id]
        start_text_id = tokenizer(start_text, return_tensors="pt").input_ids
        end_text_id = tokenizer(end_text, return_tensors="pt").input_ids
        start_id = start_text_id.shape[-1]
        end_id = end_text_id.shape[-1]
        hallucination_span.append([start_id, end_id])
    return hallucination_span

def calculate_respond_spans(raw_response_spans, text, response_rag, tokenizer):
    """Calculate response spans"""
    respond_spans = []
    for item in raw_response_spans:
        start_id = item[0]
        end_id = item[1]
        start_text = text + response_rag[:start_id]
        end_text = text + response_rag[:end_id]
        start_text_id = tokenizer(start_text, return_tensors="pt").input_ids
        end_text_id = tokenizer(end_text, return_tensors="pt").input_ids
        start_id = start_text_id.shape[-1]
        end_id = end_text_id.shape[-1]
        respond_spans.append([start_id, end_id])
    return respond_spans

def calculate_prompt_spans(raw_prompt_spans, prompt, tokenizer):
    """Calculate prompt spans"""
    prompt_spans = []
    for item in raw_prompt_spans:
        start_id = item[0]
        end_id = item[1]
        start_text = prompt[:start_id]
        end_text = prompt[:end_id]
        added_start_text = add_special_template(tokenizer, start_text)
        added_end_text = add_special_template(tokenizer, end_text)
        start_text_id = tokenizer(added_start_text, return_tensors="pt").input_ids.shape[-1] - 4
        end_text_id = tokenizer(added_end_text, return_tensors="pt").input_ids.shape[-1] - 4
        prompt_spans.append([start_text_id, end_text_id])
    return prompt_spans

print("✓ Helper functions defined")

✓ Helper functions defined


In [20]:
# Define the full process_example function
def process_example(example, tokenizer, model, bge_model, device, max_ctx, iter_step=1):
    """Process a single example to compute scores"""
    response_rag = example['response']
    prompt = example['prompt']
    original_prompt_spans = example['prompt_spans']
    original_response_spans = example['response_spans']

    text = add_special_template(tokenizer, prompt)

    prompt_ids = tokenizer([text], return_tensors="pt").input_ids
    response_ids = tokenizer([response_rag], return_tensors="pt").input_ids
    input_ids = torch.cat([prompt_ids, response_ids[:, 1:]], dim=1)

    if input_ids.shape[-1] > max_ctx:
        overflow = input_ids.shape[-1] - max_ctx
        input_ids = input_ids[:, overflow:]
        prompt_kept = max(prompt_ids.shape[-1] - overflow, 0)
    else:
        prompt_kept = prompt_ids.shape[-1]

    input_ids = input_ids.to(device)
    prefix_len = prompt_kept

    if "labels" in example.keys():
        hallucination_spans = calculate_hallucination_spans(example['labels'], text, response_rag, tokenizer, prefix_len)
    else:
        hallucination_spans = []

    prompt_spans = calculate_prompt_spans(example['prompt_spans'], prompt, tokenizer)
    respond_spans = calculate_respond_spans(example['response_spans'], text, response_rag, tokenizer)

    # Run model with cache to get all intermediate activations
    logits, cache = model.run_with_cache(
        input_ids,
        return_type="logits"
    )

    outputs = MockOutputs(cache, model.cfg)

    # skip tokens without hallucination
    hidden_states = outputs["hidden_states"]
    last_hidden_states = hidden_states[-1][0, :, :]
    del hidden_states

    span_score_dict = []
    for r_id, r_span in enumerate(respond_spans):
        layer_head_span = {}
        parameter_knowledge_dict = {}
        for attentions_layer_id in range(0, model.cfg.n_layers, iter_step):
            for head_id in range(model.cfg.n_heads):
                layer_head = (attentions_layer_id, head_id)
                p_span_score_dict = []
                for p_span in prompt_spans:
                    attention_score = outputs.attentions[attentions_layer_id][0, head_id, :, :]
                    p_span_score_dict.append([p_span, torch.sum(attention_score[r_span[0]:r_span[1], p_span[0]:p_span[1]]).cpu().item()])
                
                # Get the span with maximum score
                if len(p_span_score_dict) > 0:
                    p_id = max(range(len(p_span_score_dict)), key=lambda i: p_span_score_dict[i][1])
                    prompt_span_text = prompt[original_prompt_spans[p_id][0]:original_prompt_spans[p_id][1]]
                    respond_span_text = response_rag[original_response_spans[r_id][0]:original_response_spans[r_id][1]]
                    layer_head_span[str(layer_head)] = calculate_sentence_similarity(bge_model, prompt_span_text, respond_span_text)

            x_mid = cache[f"blocks.{attentions_layer_id}.hook_resid_mid"][0, r_span[0]:r_span[1], :]
            x_post = cache[f"blocks.{attentions_layer_id}.hook_resid_post"][0, r_span[0]:r_span[1], :]

            score = calculate_dist_2d(
                x_mid @ model.W_U,
                x_post @ model.W_U
            )
            parameter_knowledge_dict[f"layer_{attentions_layer_id}"] = score

        span_score_dict.append({
            "prompt_attention_score": layer_head_span,
            "r_span": r_span,
            "hallucination_label": 1 if is_hallucination_span(r_span, hallucination_spans) else 0,
            "parameter_knowledge_scores": parameter_knowledge_dict
        })

    example["scores"] = span_score_dict
    return example

print("✓ process_example function defined")

✓ process_example function defined


In [21]:
# Test with a real example from the dataset
# Load test examples
test_file = "/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/datasets/test/test1176_w_labels_filtered.jsonl"
examples = load_examples(test_file)

# Take first example
example = examples[0]
print(f"Testing with example ID: {example.get('id', 'N/A')}")
print(f"  Prompt length: {len(example['prompt'])} chars")
print(f"  Response length: {len(example['response'])} chars")
print(f"  Prompt spans: {len(example['prompt_spans'])}")
print(f"  Response spans: {len(example['response_spans'])}")

# Process the example
max_ctx = model.cfg.n_ctx
device = "cuda"

try:
    result = process_example(example, tokenizer, model, bge_model, device, max_ctx, iter_step=4)  # Use larger step for faster test
    print(f"\n✓ process_example works correctly!")
    print(f"  Number of span scores: {len(result['scores'])}")
    if result['scores']:
        print(f"  Sample span score keys: {result['scores'][0].keys()}")
        print(f"  Number of attention scores: {len(result['scores'][0]['prompt_attention_score'])}")
        print(f"  Number of PKS scores: {len(result['scores'][0]['parameter_knowledge_scores'])}")
except Exception as e:
    print(f"✗ process_example error: {e}")
    import traceback
    traceback.print_exc()

Loading examples from /net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/datasets/test/test1176_w_labels_filtered.jsonl...
Loaded 256 examples
Testing with example ID: finqa_6345
  Prompt length: 1819 chars
  Response length: 655 chars
  Prompt spans: 8
  Response spans: 5



✓ process_example works correctly!
  Number of span scores: 5
  Sample span score keys: dict_keys(['prompt_attention_score', 'r_span', 'hallucination_label', 'parameter_knowledge_scores'])
  Number of attention scores: 84
  Number of PKS scores: 6


### 2.2 Evaluate classifier.py

This script trains classifiers (LR, SVC, RandomForest, XGBoost) on the computed scores.

In [22]:
# Test imports for classifier.py
try:
    import pandas as pd
    import json
    import numpy as np
    import os
    import glob
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
    from scipy.stats import pearsonr
    from sklearn.preprocessing import MinMaxScaler
    import pickle
    import seaborn as sns
    from matplotlib import pyplot as plt
    from tqdm import tqdm
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    print("✓ All imports for classifier.py successful")
except Exception as e:
    print(f"✗ Import error: {e}")

✓ All imports for classifier.py successful


In [23]:
# Test load_data function from classifier.py
def load_data(folder_path):
    """Load data from JSON files in the specified folder"""
    print(f"Loading data from {folder_path}...")
    
    try:
        response = []
        json_files = glob.glob(os.path.join(folder_path, "*.json"))
        
        if not json_files:
            print(f"No JSON files found in {folder_path}")
            return None
        
        for file_path in json_files:
            with open(file_path, "r") as f:
                data = json.load(f)
                response.extend(data)
        
        print(f"Loaded {len(response)} examples from {len(json_files)} files")
        return response
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Test with actual training data
train_folder = "/net/scratch2/smallyan/InterpDetect_eval/datasets/train"
response = load_data(train_folder)

if response:
    print(f"✓ load_data function works correctly")
    print(f"  First example keys: {response[0].keys()}")
else:
    print("✗ load_data failed or no data found")

Loading data from /net/scratch2/smallyan/InterpDetect_eval/datasets/train...


Loaded 1800 examples from 18 files
✓ load_data function works correctly
  First example keys: dict_keys(['id', 'question', 'documents', 'documents_sentences', 'prompt', 'prompt_spans', 'num_tokens', 'response', 'response_spans', 'labels', 'hallucinated_llama-4-maverick-17b-128e-instruct', 'hallucinated_gpt-oss-120b', 'labels_llama', 'labels_gpt', 'scores'])


In [24]:
# Test preprocess_data function from classifier.py
def preprocess_data(response, balance_classes=True, random_state=42):
    """Preprocess the loaded data into a DataFrame"""
    print("Preprocessing data...")
    
    if not response:
        print("No data to preprocess")
        return None, None, None
    
    # Get column names from first example
    ATTENTION_COLS = response[0]['scores'][0]['prompt_attention_score'].keys()
    PARAMETER_COLS = response[0]['scores'][0]['parameter_knowledge_scores'].keys()
    
    data_dict = {
        "identifier": [],
        **{col: [] for col in ATTENTION_COLS},
        **{col: [] for col in PARAMETER_COLS},
        "hallucination_label": []
    }
    
    for i, resp in enumerate(response):
        for j in range(len(resp["scores"])):
            data_dict["identifier"].append(f"response_{i}_item_{j}")
            for col in ATTENTION_COLS:
                data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
            
            for col in PARAMETER_COLS:
                data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
            data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])
    
    df = pd.DataFrame(data_dict)
    
    print(f"Created DataFrame with {len(df)} samples")
    print(f"Class distribution: {df['hallucination_label'].value_counts().to_dict()}")
    
    # Balance classes if requested
    if balance_classes:
        min_count = df['hallucination_label'].value_counts().min()
        df = (
            df.groupby('hallucination_label', group_keys=False)
              .apply(lambda x: x.sample(min_count, random_state=random_state))
        )
        print(f"After balancing: {df['hallucination_label'].value_counts().to_dict()}")
    
    return df, list(ATTENTION_COLS), list(PARAMETER_COLS)

# Test
df, attention_cols, parameter_cols = preprocess_data(response, balance_classes=True)
if df is not None:
    print(f"\n✓ preprocess_data function works correctly")
    print(f"  DataFrame shape: {df.shape}")
    print(f"  Number of attention columns: {len(attention_cols)}")
    print(f"  Number of parameter columns: {len(parameter_cols)}")

Preprocessing data...


Created DataFrame with 7799 samples
Class distribution: {0: 4406, 1: 3393}
After balancing: {0: 3393, 1: 3393}

✓ preprocess_data function works correctly
  DataFrame shape: (6786, 478)
  Number of attention columns: 448
  Number of parameter columns: 28


  .apply(lambda x: x.sample(min_count, random_state=random_state))


In [25]:
# Test split_data and train_models functions
def split_data(df, test_size=0.1, random_state=42):
    """Split data into train and validation sets"""
    print("Splitting data into train and validation sets...")
    
    train, val = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df['hallucination_label'])
    
    features = [col for col in df.columns if col not in ['identifier', 'hallucination_label']]
    
    X_train = train[features]
    y_train = train["hallucination_label"]
    X_val = val[features]
    y_val = val["hallucination_label"]
    
    print(f"Train set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")
    print(f"Number of features: {len(features)}")
    
    return X_train, X_val, y_train, y_val, features

# Test
X_train, X_val, y_train, y_val, features = split_data(df)
print(f"\n✓ split_data function works correctly")

Splitting data into train and validation sets...
Train set: 6107 samples
Validation set: 679 samples
Number of features: 476

✓ split_data function works correctly


In [26]:
# Test create_preprocessor and train_models
def create_preprocessor(use_feature_selection=False):
    """Create preprocessing pipeline"""
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    
    scaler = StandardScaler()
    
    if use_feature_selection:
        try:
            from feature_engine.selection import DropConstantFeatures, SmartCorrelatedSelection, DropDuplicateFeatures
            from sklearn.ensemble import RandomForestClassifier
            
            drop_const = DropConstantFeatures(tol=0.95, missing_values='ignore')
            drop_dup = DropDuplicateFeatures()
            drop_corr = SmartCorrelatedSelection(
                method='pearson', 
                threshold=0.90,
                selection_method='model_performance',
                estimator=RandomForestClassifier(max_depth=5, random_state=42)
            )
            
            preprocessor = Pipeline([
                ('scaler', scaler),
                ('drop_constant', drop_const),
                ('drop_duplicates', drop_dup),
                ('smart_corr_selection', drop_corr),
            ])
        except ImportError:
            print("feature_engine not available, using simple preprocessing")
            preprocessor = Pipeline([('scaler', scaler)])
    else:
        preprocessor = Pipeline([
            ('scaler', scaler),
        ])
    
    return preprocessor

preprocessor = create_preprocessor(use_feature_selection=False)
print(f"✓ create_preprocessor function works correctly")
print(f"  Pipeline steps: {preprocessor.steps}")

✓ create_preprocessor function works correctly
  Pipeline steps: [('scaler', StandardScaler())]


In [27]:
# Test train_models function
def train_models(X_train, X_val, y_train, y_val, preprocessor, models_to_train=None):
    """Train multiple models and compare their performance"""
    print("Training models...")
    
    from sklearn.pipeline import make_pipeline
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from xgboost import XGBClassifier
    
    # Define models to train
    if models_to_train is None:
        models_to_train = ["LR", "SVC", "RandomForest", "XGBoost"]
    
    models = []
    if "LR" in models_to_train:
        models.append(("LR", LogisticRegression()))
    if "SVC" in models_to_train:
        models.append(('SVC', SVC()))
    if "RandomForest" in models_to_train:
        models.append(('RandomForest', RandomForestClassifier(max_depth=5)))
    if "XGBoost" in models_to_train:
        models.append(('XGBoost', XGBClassifier(max_depth=5)))
    
    # Initialize lists for results
    names = []
    train_ps = []
    train_rs = []
    train_fs = []
    val_ps = []
    val_rs = []
    val_fs = []
    clfs = {}
    
    # Train each model
    for name, model_obj in models:
        print(f"Training {name}...")
        names.append(name)
        clf = make_pipeline(preprocessor, model_obj)
        clf.fit(X_train, y_train)
        
        # Calculate metrics
        tp, tr, tf, _ = precision_recall_fscore_support(y_train, clf.predict(X_train), average='binary')
        train_ps.append(tp)
        train_rs.append(tr)
        train_fs.append(tf)
        
        vp, vr, vf, _ = precision_recall_fscore_support(y_val, clf.predict(X_val), average='binary')
        val_ps.append(vp)
        val_rs.append(vr)
        val_fs.append(vf)
        
        clfs[name] = clf
    
    # Create comparison dataframe
    model_comparison = pd.DataFrame({
        'Algorithm': names,
        'Train_p': train_ps,
        'Val_p': val_ps,
        'Train_r': train_rs,
        'Val_r': val_rs,
        'Train_f': train_fs,
        'Val_f': val_fs,
    })
    
    print("\nModel Comparison:")
    print(model_comparison)
    
    return clfs, model_comparison

# Test training with just LR for speed
clfs, model_comparison = train_models(X_train, X_val, y_train, y_val, preprocessor, models_to_train=["LR", "RandomForest"])
print(f"\n✓ train_models function works correctly")
print(f"  Trained models: {list(clfs.keys())}")

Training models...
Training LR...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training RandomForest...



Model Comparison:
      Algorithm   Train_p     Val_p   Train_r     Val_r   Train_f     Val_f
0            LR  0.794437  0.737500  0.766863  0.696165  0.780407  0.716237
1  RandomForest  0.794307  0.763754  0.767518  0.696165  0.780683  0.728395

✓ train_models function works correctly
  Trained models: ['LR', 'RandomForest']


In [28]:
# Test save_models function
def save_models(clfs, output_dir):
    """Save trained models"""
    print(f"Saving models to {output_dir}...")
    
    os.makedirs(output_dir, exist_ok=True)
    
    for name, clf in clfs.items():
        model_path = os.path.join(output_dir, f"model_{name}_test.pickle")
        with open(model_path, "wb") as fout:
            pickle.dump(clf, fout)
        print(f"Saved {name} model to {model_path}")

# Test (save to temp directory)
temp_output = "/net/scratch2/smallyan/InterpDetect_eval/evaluation/temp_models"
save_models(clfs, temp_output)
print(f"\n✓ save_models function works correctly")

# Cleanup
import shutil
shutil.rmtree(temp_output)

Saving models to /net/scratch2/smallyan/InterpDetect_eval/evaluation/temp_models...
Saved LR model to /net/scratch2/smallyan/InterpDetect_eval/evaluation/temp_models/model_LR_test.pickle
Saved RandomForest model to /net/scratch2/smallyan/InterpDetect_eval/evaluation/temp_models/model_RandomForest_test.pickle

✓ save_models function works correctly


### 2.3 Evaluate predict.py

This script makes predictions using trained models and evaluates at span and response level.

In [29]:
# Test imports for predict.py
try:
    import pandas as pd
    import json
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, accuracy_score
    from sklearn.metrics import roc_auc_score
    from scipy.stats import pearsonr
    from sklearn.preprocessing import MinMaxScaler
    import pickle
    import seaborn as sns
    from matplotlib import pyplot as plt
    from tqdm import tqdm
    from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
    print("✓ All imports for predict.py successful")
except Exception as e:
    print(f"✗ Import error: {e}")

✓ All imports for predict.py successful


In [30]:
# Test load_data function from predict.py
def load_data_predict(data_path):
    """Load data from JSON file"""
    print(f"Loading data from {data_path}...")
    
    try:
        with open(data_path, "r") as f:
            response = json.load(f)
        
        print(f"Loaded {len(response)} examples")
        return response
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Test with test data
test_data_path = "/net/scratch2/smallyan/InterpDetect_eval/datasets/test/test_w_chunk_score_qwen06b.json"
response = load_data_predict(test_data_path)

if response:
    print(f"✓ load_data function works correctly")
    print(f"  First example keys: {response[0].keys()}")

Loading data from /net/scratch2/smallyan/InterpDetect_eval/datasets/test/test_w_chunk_score_qwen06b.json...


Loaded 256 examples
✓ load_data function works correctly
  First example keys: dict_keys(['id', 'question', 'documents', 'documents_sentences', 'prompt', 'prompt_spans', 'num_tokens', 'response', 'response_spans', 'labels', 'hallucinated_llama-4-maverick-17b-128e-instruct', 'hallucinated_gpt-oss-120b', 'labels_llama', 'labels_gpt', 'scores'])


In [31]:
# Test preprocess_data for predict.py
def preprocess_data_predict(response):
    """Preprocess the loaded data into a DataFrame"""
    print("Preprocessing data...")
    
    if not response:
        print("No data to preprocess")
        return None
    
    # Get column names from first example
    ATTENTION_COLS = response[0]['scores'][0]['prompt_attention_score'].keys()
    PARAMETER_COLS = response[0]['scores'][0]['parameter_knowledge_scores'].keys()
    
    data_dict = {
        "identifier": [],
        **{col: [] for col in ATTENTION_COLS},
        **{col: [] for col in PARAMETER_COLS},
        "hallucination_label": []
    }
    
    for i, resp in enumerate(response):
        for j in range(len(resp["scores"])):
            data_dict["identifier"].append(f"response_{i}_item_{j}")
            for col in ATTENTION_COLS:
                data_dict[col].append(resp["scores"][j]['prompt_attention_score'][col])
            
            for col in PARAMETER_COLS:
                data_dict[col].append(resp["scores"][j]['parameter_knowledge_scores'][col])
            data_dict["hallucination_label"].append(resp["scores"][j]["hallucination_label"])
    
    df = pd.DataFrame(data_dict)
    
    print(f"Created DataFrame with {len(df)} samples")
    print(f"Class distribution: {df['hallucination_label'].value_counts().to_dict()}")
    
    return df

df_test = preprocess_data_predict(response)
print(f"\n✓ preprocess_data function works correctly")

Preprocessing data...
Created DataFrame with 975 samples
Class distribution: {0: 699, 1: 276}

✓ preprocess_data function works correctly


In [32]:
# Test load_model and make_predictions
def load_model(model_path):
    """Load trained model from pickle file"""
    print(f"Loading model from {model_path}...")
    
    try:
        with open(model_path, "rb") as f:
            model = pickle.load(f)
        print("Model loaded successfully")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

def make_predictions(df, model):
    """Make predictions using the loaded model"""
    print("Making predictions...")
    
    features = [col for col in df.columns if col not in ['identifier', 'hallucination_label']]
    y_pred = model.predict(df[features])
    df['pred'] = y_pred
    
    print(f"Predictions completed for {len(df)} samples")
    return df

# Load a pre-trained model
model_path = "/net/scratch2/smallyan/InterpDetect_eval/trained_models/model_SVC_3000.pickle"
model = load_model(model_path)

if model:
    df_test = make_predictions(df_test, model)
    print(f"\n✓ make_predictions function works correctly")
    print(f"  Prediction value counts: {df_test['pred'].value_counts().to_dict()}")

Loading model from /net/scratch2/smallyan/InterpDetect_eval/trained_models/model_SVC_3000.pickle...
Model loaded successfully
Making predictions...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Predictions completed for 975 samples

✓ make_predictions function works correctly
  Prediction value counts: {0: 595, 1: 380}


In [33]:
# Test evaluate_span_level function
def evaluate_span_level(df):
    """Evaluate predictions at span level"""
    print("\n=== Span-level Evaluation ===")
    
    # Confusion matrix: tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(df["hallucination_label"], df["pred"]).ravel()
    
    # Precision, recall, F1
    precision = precision_score(df["hallucination_label"], df["pred"])
    recall = recall_score(df["hallucination_label"], df["pred"])
    f1 = f1_score(df["hallucination_label"], df["pred"])
    
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    
    return {
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
        'precision': precision, 'recall': recall, 'f1': f1
    }

span_results = evaluate_span_level(df_test)
print(f"\n✓ evaluate_span_level function works correctly")


=== Span-level Evaluation ===
TP: 213, TN: 532, FP: 167, FN: 63
Precision: 0.561
Recall: 0.772
F1 Score: 0.649

✓ evaluate_span_level function works correctly


In [34]:
# Test evaluate_response_level function
def evaluate_response_level(df):
    """Evaluate predictions at response level"""
    print("\n=== Response-level Evaluation ===")
    
    # Extract response_id from identifier (everything before "_item_")
    df["response_id"] = df["identifier"].str.extract(r"(response_\d+)_item_\d+")
    
    # Group by response_id, aggregate with OR (max works for binary 0/1)
    agg_df = df.groupby("response_id").agg({
        "pred": "max",
        "hallucination_label": "max"
    }).reset_index()
    
    # Confusion matrix: tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(agg_df["hallucination_label"], agg_df["pred"]).ravel()
    
    # Precision, recall, F1
    precision = precision_score(agg_df["hallucination_label"], agg_df["pred"])
    recall = recall_score(agg_df["hallucination_label"], agg_df["pred"])
    f1 = f1_score(agg_df["hallucination_label"], agg_df["pred"])
    
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
        'precision': precision, 'recall': recall, 'f1': f1,
        'agg_df': agg_df
    }

response_results = evaluate_response_level(df_test)
print(f"\n✓ evaluate_response_level function works correctly")


=== Response-level Evaluation ===
TP: 115, TN: 63, FP: 65, FN: 13
Precision: 0.6389
Recall: 0.8984
F1 Score: 0.7468

✓ evaluate_response_level function works correctly


### 2.4 Evaluate Preprocessing Scripts

Testing preprocess.py, generate_labels.py, filter.py, helper.py, and response generation scripts.

In [35]:
# Test helper.py functions
import sys
sys.path.insert(0, '/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess')

# Test clean_text function
import re
import nltk
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except:
    pass

def clean_text(text):
    # Remove extra spaces before punctuation
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    # Collapse multiple periods (e.g., ". . ." => ".")
    text = re.sub(r'\.{2,}', '.', text)
    # Fix spacing after punctuation
    text = re.sub(r'([.,!?;:])(?=\w)', r'\1 ', text)
    # Strip leading/trailing whitespace
    text = text.strip()
    # Capitalize first letter of each sentence
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [s.strip().capitalize() for s in sentences if s.strip()]
    return ' '.join(sentences)

# Test
test_text = "hello  .  this is a test . . . another sentence"
cleaned = clean_text(test_text)
print(f"✓ clean_text function works correctly")
print(f"  Original: '{test_text}'")
print(f"  Cleaned: '{cleaned}'")

✓ clean_text function works correctly
  Original: 'hello  .  this is a test . . . another sentence'
  Cleaned: 'Hello. This is a test. Another sentence'


In [36]:
# Test get_sentence_spans function from helper.py
from nltk.tokenize import sent_tokenize

def get_sentence_spans(text):
    sentences = sent_tokenize(text)
    spans = []
    start = 0
    for sentence in sentences:
        start = text.find(sentence, start)
        end = start + len(sentence)
        spans.append((start, end))
        start = end
    return spans

# Test
test_text = "This is the first sentence. This is the second sentence. And this is the third."
spans = get_sentence_spans(test_text)
print(f"✓ get_sentence_spans function works correctly")
print(f"  Text: '{test_text}'")
print(f"  Spans: {spans}")

✓ get_sentence_spans function works correctly
  Text: 'This is the first sentence. This is the second sentence. And this is the third.'
  Spans: [(0, 27), (28, 56), (57, 79)]


In [37]:
# Test preprocess.py - add_prompt_spans function
def add_prompt_spans(df):
    """Build prompt and compute spans for the dataset"""
    part1 = "Given the context, please answer the question based on the provided information from the context. Include any reasoning with the answer\n"
    part2 = "\nContext:"
    part3 = "\nQuestion:"
    part4 = "\nAnswer:"

    prompt_texts = []
    prompt_spans = []

    for i, row in df.iterrows():
        question = row["question"]
        docs = list(row["documents"])  # assume list of document strings
        
        # prefix
        prompt = ""
        spans = []
        l1 = len(part1)
        prompt+=part1
        spans.append([0, l1-1])
        
        # context
        l2 = len(part2)
        prompt+=part2
        spans.append([l1, l1+l2-1])
        cur = l1+l2
        for doc in docs:
            doc = clean_text(doc)
            prompt+=doc
            spans.append([cur, cur+len(doc)-1])
            cur = cur+len(doc)

        # question
        l3 = len(part3)
        prompt+=part3
        spans.append([cur, cur+l3-1])
        cur = cur+l3
        prompt+=question
        spans.append([cur, cur+len(question)-1])
        cur = cur+len(question)
        
        # answer
        l4 = len(part4)
        prompt+=part4
        spans.append([cur, cur+l4-1])

        # append
        prompt_texts.append(prompt)
        prompt_spans.append(spans)

    return prompt_texts, prompt_spans

# Test with sample data
test_df = pd.DataFrame({
    'question': ['What is the revenue?'],
    'documents': [['Company A reported revenue of $100M in 2023.', 'The growth rate was 5%.']]
})

prompts, spans = add_prompt_spans(test_df)
print(f"✓ add_prompt_spans function works correctly")
print(f"  Generated prompt length: {len(prompts[0])} chars")
print(f"  Number of spans: {len(spans[0])}")

✓ add_prompt_spans function works correctly
  Generated prompt length: 250 chars
  Number of spans: 7


In [38]:
# Test filter.py - add_labels_llm function
import textwrap

def add_labels_llm(df, llama_column, gpt_column):
    """Add binary labels for LLM judge evaluations"""
    print("Adding binary labels for LLM judge evaluations...")
    
    labels_llama = []
    labels_gpt = []

    for i, row in df.iterrows():
        try:
            # Process Llama labels
            if "Yes" in row[llama_column]:
                labels_llama.append(0)
            elif "No" in row[llama_column]:
                labels_llama.append(1)
            else:
                labels_llama.append(-1)  # Error indicator

            # Process GPT labels
            if "Yes" in row[gpt_column]:
                labels_gpt.append(0)
            elif "No" in row[gpt_column]:
                labels_gpt.append(1)
            else:
                labels_gpt.append(-1)  # Error indicator
                
        except Exception as e:
            labels_llama.append(-1)
            labels_gpt.append(-1)

    df['labels_llama'] = labels_llama
    df['labels_gpt'] = labels_gpt
    return df

# Test with sample data
test_df = pd.DataFrame({
    'hallucinated_llama': ['Yes, the response is correct.', 'No, there are errors.'],
    'hallucinated_gpt': ['Yes', 'No, incorrect.']
})

result_df = add_labels_llm(test_df, 'hallucinated_llama', 'hallucinated_gpt')
print(f"✓ add_labels_llm function works correctly")
print(f"  Labels llama: {result_df['labels_llama'].tolist()}")
print(f"  Labels gpt: {result_df['labels_gpt'].tolist()}")

Adding binary labels for LLM judge evaluations...
✓ add_labels_llm function works correctly
  Labels llama: [0, 1]
  Labels gpt: [0, 1]
