In [1]:
import os
os.chdir('/home/smallyan/eval_agent')

import torch
import sys
import json
import random

# Check CUDA
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Add the belief-tracking repo to path
repo_path = '/net/scratch2/smallyan/belief_tracking_eval'
sys.path.append(repo_path)

from src.dataset import Dataset, Sample

# Load synthetic entities
with open(os.path.join(repo_path, 'data', 'synthetic_entities', 'characters.json'), 'r') as f:
    all_characters = json.load(f)
with open(os.path.join(repo_path, 'data', 'synthetic_entities', 'bottles.json'), 'r') as f:
    all_objects = json.load(f)
with open(os.path.join(repo_path, 'data', 'synthetic_entities', 'drinks.json'), 'r') as f:
    all_states = json.load(f)

print(f"Loaded {len(all_characters)} characters, {len(all_objects)} objects, {len(all_states)} states")

CUDA available: True
CUDA device: NVIDIA A100 80GB PCIe
Loaded 103 characters, 21 objects, 23 states


# Generalizability Evaluation for Belief-Tracking Circuit

This notebook evaluates the generalizability of the belief-tracking circuit findings from "Language Models use Lookbacks to Track Beliefs" (Prakash et al., 2025).

## Evaluation Criteria:
- **GT1**: Model Generalization - Does the finding transfer to a new model?
- **GT2**: Data Generalization - Does the finding hold on new data instances?
- **GT3**: Method Generalization - Can the method be applied to another similar task?

## Original Findings:
1. **Answer Lookback Pointer**: Layers 34-52 encode pointer information at final token
2. **Answer Lookback Payload**: Layers 56+ encode the actual state token value
3. **Binding Address/Payload**: Layers 33-38 at state tokens
4. **Binding Source Reference**: Layers 20-34 encode character/object ordering IDs

**Original Models:** Llama-3-70B-Instruct, Llama-3.1-405B-Instruct

In [2]:
# Set HF cache to shared location
os.environ['HF_HOME'] = '/net/projects/chai-lab/shared_models'
os.environ['TRANSFORMERS_CACHE'] = '/net/projects/chai-lab/shared_models'

from nnsight import LanguageModel

# Load Llama-3.1-8B-Instruct - a smaller version of the Llama-3 family
# While this is in the same model family as the original, it's a different model size
# The original used 70B and 405B models - we'll use 8B
print("Loading Llama-3.1-8B-Instruct...")
model = LanguageModel(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="cuda:0",
    torch_dtype=torch.float16,
    local_files_only=True,
    dispatch=True,
)
print(f"Model loaded. Number of layers: {model.config.num_hidden_layers}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

There was a problem when trying to write in your cache folder (/net/projects/chai-lab/shared_models). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


Loading Llama-3.1-8B-Instruct...


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Llama-3.1-8B-Instruct is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [3]:
# Let's check what's actually in the hub directory and find the correct cache path
import subprocess
result = subprocess.run(['find', '/net/projects/chai-lab/shared_models/hub/', '-name', 'config.json', '-type', 'f'], 
                       capture_output=True, text=True)
print("Config files found:")
print(result.stdout[:3000] if result.stdout else result.stderr)

Config files found:



In [4]:
# Check the llama model directories
import os
llama_path = '/net/projects/chai-lab/shared_models/hub/models--meta-llama--Llama-3.1-8B-Instruct'
if os.path.exists(llama_path):
    print(f"Llama path exists: {llama_path}")
    for root, dirs, files in os.walk(llama_path):
        level = root.replace(llama_path, '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        if level < 3:  # Only show first 3 levels
            subindent = ' ' * 2 * (level + 1)
            for file in files[:10]:
                print(f'{subindent}{file}')
else:
    print("Llama path doesn't exist")
    # Check alternative path
    alt_path = '/net/projects/chai-lab/shared_models/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct'
    if os.path.exists(alt_path):
        print(f"Alternative path exists: {alt_path}")

Llama path exists: /net/projects/chai-lab/shared_models/hub/models--meta-llama--Llama-3.1-8B-Instruct
models--meta-llama--Llama-3.1-8B-Instruct/
  blobs/
    fc1cdddd6bfa91128d6e94ee73d0ce62bfcdb7af29e978ddcab30c66ae9ea7fa
    02ee80b6196926a5ad790a004d9efd6ab1ba6542
    2b1879f356aed350030bb40eb45ad362c89d9891096f79a3ab323d3ba5607668
    09d433f650646834a83c580877bd60c6d1f88f7755305c12576b5c7058f9af15
    db88166e2bc4c799fd5d1ae643b75e84d03ee70e
    0fd8120f1c6acddc268ebc2583058efaf699a771
    92ecfe1a2414458b4821ac8c13cf8cb70aed66b5eea8dc5ad9eeb4ff309d6d7b
    5cc5f00a5b203e90a27a3bd60d1ec393b07971e8
    0bb6fd75b3ad2fe988565929f329945262c2814e
    cc7276afd599de091142c6ed3005faf8a74aa257
  refs/
    main
  .no_exist/
    0e9e39f249a16976918f6564b8830bc894c89659/
      chat_template.jinja
      added_tokens.json
      model.safetensors
      tokenizer.model
  snapshots/
    0e9e39f249a16976918f6564b8830bc894c89659/
      model-00004-of-00004.safetensors
      config.json
      model-

In [5]:
# The model is cached, but there's a cache format issue. Let's try loading directly from the snapshot path
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch

snapshot_path = '/net/projects/chai-lab/shared_models/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659'

print("Loading config...")
config = AutoConfig.from_pretrained(snapshot_path)
print(f"Config loaded. Hidden size: {config.hidden_size}, Layers: {config.num_hidden_layers}")

print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(snapshot_path)
print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

Loading config...
Config loaded. Hidden size: 4096, Layers: 32

Loading tokenizer...


Tokenizer loaded. Vocab size: 128000


In [6]:
print("Loading model...")
model_hf = AutoModelForCausalLM.from_pretrained(
    snapshot_path,
    torch_dtype=torch.float16,
    device_map="cuda:0"
)
print(f"Model loaded successfully.")

Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully.


In [7]:
# Wrap with nnsight for intervention capabilities
from nnsight import LanguageModel

# Create nnsight wrapper
model = LanguageModel(model_hf, tokenizer=tokenizer)
print(f"NNsight model created. Num layers: {model.config.num_hidden_layers}")

NNsight model created. Num layers: 32


## GT1: Model Generalization Test

Testing whether the lookback mechanism findings generalize from Llama-3-70B-Instruct to Llama-3.1-8B-Instruct.

**Note:** While Llama-3.1-8B-Instruct is in the same model family, it is a different model size (8B vs 70B/405B) and was not used in the original experiments. The key test is whether the same layer-wise pattern of information flow (pointer in mid layers, payload in later layers) holds for this smaller model.

In [8]:
# First, test if the model can correctly answer belief tracking questions
random.seed(42)

# Create test samples
test_samples = []
for i in range(5):
    chars = random.sample(all_characters, 2)
    objs = random.sample(all_objects, 2)
    states = random.sample(all_states, 2)
    test_samples.append(Sample(template_idx=2, characters=chars, objects=objs, states=states))

dataset = Dataset(test_samples)

# Test each sample
print("Testing model on belief tracking task:")
print("=" * 60)

correct = 0
total = 0
for idx in range(len(test_samples)):
    item = dataset.__getitem__(idx, set_container=0, set_character=0)
    
    # Get model prediction
    inputs = tokenizer(item['prompt'], return_tensors='pt').to('cuda')
    with torch.no_grad():
        outputs = model_hf(**inputs)
        pred_token = outputs.logits[0, -1].argmax().item()
    
    pred_text = tokenizer.decode([pred_token]).strip().lower()
    expected = item['target'].strip().lower()
    
    is_correct = pred_text == expected
    correct += int(is_correct)
    total += 1
    
    print(f"Sample {idx+1}: Predicted='{pred_text}', Expected='{expected}' -> {'CORRECT' if is_correct else 'WRONG'}")

print(f"\nOverall accuracy: {correct}/{total} = {correct/total:.1%}")

Testing model on belief tracking task:


Sample 1: Predicted='wine', Expected='wine' -> CORRECT
Sample 2: Predicted='port', Expected='port' -> CORRECT
Sample 3: Predicted='wine', Expected='wine' -> CORRECT
Sample 4: Predicted='porter', Expected='porter' -> CORRECT
Sample 5: Predicted='laura', Expected='cocoa' -> WRONG

Overall accuracy: 4/5 = 80.0%


In [9]:
# Now let's test the key finding: Answer Lookback Pointer mechanism
# The original finding: patching residual stream at final token from layers 34-52 in a 70B model
# redirects the model to use a different state

# For the 8B model (32 layers), we'll scale the layer range proportionally:
# Original: layers 34-52 of 80 layers (~42.5%-65% depth) for pointer
# For 32 layers: approximately layers 14-21

# Create counterfactual pairs for testing
def create_reversed_counterfacts(n_samples=10):
    """Create pairs where sentence order is reversed"""
    samples = []
    for _ in range(n_samples):
        chars = random.sample(all_characters, 2)
        objs = random.sample(all_objects, 2)
        states = random.sample(all_states, 2)
        
        # Clean sample
        clean = Sample(template_idx=2, characters=chars, objects=objs, states=states)
        
        # Counterfactual: reverse the order
        cf = Sample(template_idx=2, 
                   characters=list(reversed(chars)), 
                   objects=list(reversed(objs)), 
                   states=list(reversed(states)))
        
        samples.append((clean, cf))
    return samples

random.seed(123)
cf_samples = create_reversed_counterfacts(10)
print(f"Created {len(cf_samples)} counterfactual pairs")

Created 10 counterfactual pairs


In [10]:
# Test the interchange intervention at different layers
# This tests the "pointer" mechanism - does patching the residual stream redirect to a different answer?

def run_iia_test(clean_prompt, cf_prompt, target_answer, layer_idx):
    """
    Run interchange intervention accuracy test.
    Patch residual stream from counterfactual to clean at final token position.
    Returns True if the model outputs the target (redirected) answer.
    """
    with torch.no_grad():
        # Get counterfactual hidden state at the specified layer
        with model.trace(cf_prompt):
            cf_hidden = model.model.layers[layer_idx].output[0][0, -1].clone().save()
        
        # Run clean prompt with patched hidden state
        with model.trace(clean_prompt):
            model.model.layers[layer_idx].output[0][0, -1] = cf_hidden
            pred_logits = model.lm_head.output[0, -1].save()
    
    pred_token = pred_logits.argmax().item()
    pred_text = tokenizer.decode([pred_token]).strip().lower()
    
    return pred_text == target_answer.strip().lower()

# Filter for samples where model gets both clean and counterfactual correct
valid_pairs = []
for clean, cf in cf_samples:
    clean_ds = Dataset([clean])
    cf_ds = Dataset([cf])
    
    # Get prompts (character 0 asking about container 0)
    clean_item = clean_ds.__getitem__(0, set_container=0, set_character=0)
    cf_item = cf_ds.__getitem__(0, set_container=1, set_character=1)  # Reversed indices
    
    # Test if model gets both correct
    inputs_clean = tokenizer(clean_item['prompt'], return_tensors='pt').to('cuda')
    inputs_cf = tokenizer(cf_item['prompt'], return_tensors='pt').to('cuda')
    
    with torch.no_grad():
        clean_pred = tokenizer.decode([model_hf(**inputs_clean).logits[0, -1].argmax().item()]).strip().lower()
        cf_pred = tokenizer.decode([model_hf(**inputs_cf).logits[0, -1].argmax().item()]).strip().lower()
    
    if clean_pred == clean_item['target'].lower() and cf_pred == cf_item['target'].lower():
        # Target is the "other" state (the one that would be retrieved if pointer is redirected)
        target = clean.states[1]  # The other state token
        valid_pairs.append((clean_item['prompt'], cf_item['prompt'], target, clean_item['target']))

print(f"Found {len(valid_pairs)} valid pairs where model answers both correctly")

Found 7 valid pairs where model answers both correctly


In [11]:
# Run IIA across all layers
from tqdm import tqdm

num_layers = model.config.num_hidden_layers
layer_iia = {}

print("Running Interchange Intervention Analysis across layers...")
for layer_idx in tqdm(range(num_layers)):
    correct = 0
    total = 0
    
    for clean_prompt, cf_prompt, target, original in valid_pairs:
        try:
            is_correct = run_iia_test(clean_prompt, cf_prompt, target, layer_idx)
            correct += int(is_correct)
            total += 1
        except Exception as e:
            print(f"Error at layer {layer_idx}: {e}")
            break
    
    if total > 0:
        layer_iia[layer_idx] = correct / total

print("\nIIA by layer:")
for layer, acc in sorted(layer_iia.items()):
    print(f"Layer {layer:2d}: {acc:.2f}")

Running Interchange Intervention Analysis across layers...


  0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 32/32 [00:00<00:00, 3514.75it/s]

Error at layer 0: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
Error at layer 1: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
Error at layer 2: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
Error at layer 3: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_speci




In [12]:
# Fix tokenizer padding token
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer pad_token set to eos_token")

Tokenizer pad_token set to eos_token


In [13]:
# Run IIA across all layers again
from tqdm import tqdm

num_layers = model.config.num_hidden_layers
layer_iia = {}

print("Running Interchange Intervention Analysis across layers...")
for layer_idx in tqdm(range(num_layers)):
    correct = 0
    total = 0
    
    for clean_prompt, cf_prompt, target, original in valid_pairs:
        try:
            is_correct = run_iia_test(clean_prompt, cf_prompt, target, layer_idx)
            correct += int(is_correct)
            total += 1
        except Exception as e:
            print(f"Error at layer {layer_idx}: {e}")
            continue
    
    if total > 0:
        layer_iia[layer_idx] = correct / total

print("\nIIA by layer (Answer Lookback Pointer Test):")
for layer, acc in sorted(layer_iia.items()):
    bar = "█" * int(acc * 20)
    print(f"Layer {layer:2d}: {acc:.2f} {bar}")

Running Interchange Intervention Analysis across layers...


  0%|          | 0/32 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  3%|▎         | 1/32 [00:12<06:37, 12.82s/it]

  6%|▋         | 2/32 [00:25<06:18, 12.63s/it]

  9%|▉         | 3/32 [00:38<06:12, 12.83s/it]

 12%|█▎        | 4/32 [00:51<05:57, 12.77s/it]

 16%|█▌        | 5/32 [01:03<05:43, 12.73s/it]