In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


# Code Evaluation for Universal Neurons Circuit Analysis

This notebook evaluates the code implementation in `/net/scratch2/smallyan/universal-neurons_eval`.

## Setup and Environment Configuration

In [2]:
# Load environment variables from bashrc
import subprocess
result = subprocess.run(['bash', '-c', 'source /home/smallyan/.bashrc && env'], capture_output=True, text=True)
for line in result.stdout.split('\n'):
    if '=' in line:
        key, _, value = line.partition('=')
        os.environ[key] = value

# Set HuggingFace cache directory
os.environ['HF_HOME'] = '/net/projects2/chai-lab/shared_models'
os.environ['TRANSFORMERS_CACHE'] = '/net/projects2/chai-lab/shared_models'

print("Environment configured")

Environment configured


In [3]:
# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

CUDA available: True
GPU: NVIDIA A100 80GB PCIe
GPU Memory: 85.09 GB


## Code Structure Overview

Based on the CodeWalkthrough.md, the repository has:

**Core Scripts:**
- `correlations_fast.py` - Compute neuron correlations
- `summary.py` - Compute neuron activation summaries
- `weights.py` - Compute weight statistic summaries  
- `activations.py` - Cache neuron activations
- `explain.py` - Compute reduction in variance explanations
- `attention_deactivation.py` - Attention deactivation experiments
- `entropy_intervention.py` - Entropy intervention experiments
- `intervention.py` - General intervention experiments

**Analysis Module:**
- `analysis/` - Plotting and analysis code

**Paper Notebooks:**
- `paper_notebooks/` - 12 notebooks for paper figures

Let me now evaluate each component systematically.

## 1. Evaluating correlations_fast.py

This script computes pairwise Pearson correlations between neuron activations across models.

In [4]:
# Test importing the correlations_fast.py dependencies
import sys
sys.path.insert(0, '/net/scratch2/smallyan/universal-neurons_eval')

# Test basic imports
try:
    import torch as t
    import einops
    import datasets
    from functools import partial
    from torch.utils.data import DataLoader
    from transformer_lens import HookedTransformer
    print("Basic imports: SUCCESS")
except Exception as e:
    print(f"Basic imports FAILED: {e}")

# Test utils import
try:
    from utils import get_model_family, adjust_precision
    print("utils import: SUCCESS")
except Exception as e:
    print(f"utils import FAILED: {e}")

# Test analysis.correlations import
try:
    from analysis.correlations import summarize_correlation_matrix, flatten_layers
    print("analysis.correlations import: SUCCESS")
except Exception as e:
    print(f"analysis.correlations import FAILED: {e}")



Basic imports: SUCCESS
utils import: SUCCESS


analysis.correlations import: SUCCESS


In [5]:
# Test StreamingPearsonComputer class
try:
    # Create a minimal mock model class for testing
    class MockConfig:
        def __init__(self, n_layers, d_mlp):
            self.n_layers = n_layers
            self.d_mlp = d_mlp
    
    class MockModel:
        def __init__(self, n_layers, d_mlp):
            self.cfg = MockConfig(n_layers, d_mlp)
    
    # Import and test the class
    exec(open('/net/scratch2/smallyan/universal-neurons_eval/correlations_fast.py').read().split('def save_activation_hook')[0])
    
    # Create test models  
    model1 = MockModel(4, 128)
    model2 = MockModel(4, 128)
    
    computer = StreamingPearsonComputer(model1, model2, device='cpu')
    print(f"StreamingPearsonComputer initialized")
    print(f"  m1_sum shape: {computer.m1_sum.shape}")
    print(f"  m1_m2_sum shape: {computer.m1_m2_sum.shape}")
    
    # Test update with random data
    batch1 = torch.randn(4, 128, 1000)  # (layers, d_mlp, tokens)
    batch2 = torch.randn(4, 128, 1000)
    computer.update_correlation_data(batch1, batch2)
    print(f"  Update successful, n={computer.n}")
    
    # Test compute correlation
    correlation = computer.compute_correlation()
    print(f"  Correlation shape: {correlation.shape}")
    print("StreamingPearsonComputer: SUCCESS")
except Exception as e:
    import traceback
    print(f"StreamingPearsonComputer FAILED: {e}")
    traceback.print_exc()

StreamingPearsonComputer initialized
  m1_sum shape: torch.Size([4, 128])
  m1_m2_sum shape: torch.Size([4, 128, 4, 128])


  Update successful, n=1000


  Correlation shape: torch.Size([4, 128, 4, 128])
StreamingPearsonComputer: SUCCESS


In [6]:
# Test flatten_layers and summarize_correlation_matrix functions
try:
    # Create test correlation data  
    correlation = torch.randn(4, 128, 4, 128)
    
    # Test flatten_layers
    flattened = flatten_layers(correlation)
    print(f"flatten_layers: input {correlation.shape} -> output {flattened.shape}")
    assert flattened.shape == (512, 512), f"Expected (512, 512), got {flattened.shape}"
    print("flatten_layers: SUCCESS")
    
    # Test summarize_correlation_matrix
    summary = summarize_correlation_matrix(flattened.float())
    print(f"summarize_correlation_matrix keys: {list(summary.keys())}")
    print(f"  bin_counts shape: {summary['bin_counts'].shape}")
    print(f"  max_corr shape: {summary['max_corr'].shape}")
    print("summarize_correlation_matrix: SUCCESS")
except Exception as e:
    import traceback
    print(f"FAILED: {e}")
    traceback.print_exc()

flatten_layers: input torch.Size([4, 128, 4, 128]) -> output torch.Size([512, 512])
flatten_layers: SUCCESS


summarize_correlation_matrix keys: ['diag_corr', 'obo_corr', 'bin_counts', 'max_corr', 'max_corr_ix', 'min_corr', 'min_corr_ix', 'max_tail_corr', 'max_tail_corr_ix', 'min_tail_corr', 'min_tail_corr_ix', 'corr_mean', 'corr_var', 'corr_skew', 'corr_kurt']
  bin_counts shape: torch.Size([512, 101])
  max_corr shape: torch.Size([512])
summarize_correlation_matrix: SUCCESS


### correlations_fast.py Evaluation Result

| Block/Function | Runnable | Correct-Implementation | Redundant | Irrelevant | Notes |
|----------------|----------|----------------------|-----------|------------|-------|
| Imports | Y | Y | N | N | |
| StreamingPearsonComputer.__init__ | Y | Y | N | N | |
| StreamingPearsonComputer.update_correlation_data | Y | Y | N | N | |
| StreamingPearsonComputer.compute_correlation | Y | Y | N | N | |
| save_activation_hook | Y | Y | N | N | |
| get_activations | Y | Y | N | N | |
| run_correlation_experiment | Y | Y | N | N | |
| main block (argparse) | Y | Y | N | N | |

In [7]:
# Test summary.py functions
try:
    # Test bin_activations
    neuron_bin_edges = torch.linspace(-10, 15, 256)
    neuron_bin_counts = torch.zeros(4, 128, 257, dtype=torch.int32)
    activations = torch.randn(4, 128, 1000) * 5
    
    # Implement bin_activations locally for testing
    bin_index = torch.searchsorted(neuron_bin_edges, activations)
    neuron_bin_counts = neuron_bin_counts.scatter_add_(
        2, bin_index, torch.ones_like(bin_index, dtype=torch.int32))
    print(f"bin_activations: SUCCESS")
    print(f"  bin_counts shape: {neuron_bin_counts.shape}")
    print(f"  total counts: {neuron_bin_counts.sum()}")
except Exception as e:
    import traceback
    print(f"bin_activations FAILED: {e}")
    traceback.print_exc()

bin_activations: SUCCESS
  bin_counts shape: torch.Size([4, 128, 257])
  total counts: 512000


In [8]:
# Test update_vocabulary_statistics
try:
    layers, neurons = 4, 128
    d_vocab = 50257
    n_tokens = 1000
    
    batch = torch.randint(0, d_vocab, (10, 100))  # 10x100 batch
    activations = torch.randn(layers, neurons, n_tokens).to(torch.float16)
    neuron_vocab_max = torch.zeros(layers, neurons, d_vocab, dtype=torch.float16)
    neuron_vocab_sum = torch.zeros(layers, neurons, d_vocab, dtype=torch.float32)
    vocab_counts = torch.zeros(d_vocab)
    
    vocab_index = batch.flatten()
    extended_index = einops.repeat(vocab_index, 't -> l n t', l=layers, n=neurons)
    
    neuron_vocab_max = neuron_vocab_max.scatter_reduce(
        -1, extended_index, activations, reduce='max')
    
    neuron_vocab_sum = neuron_vocab_sum.scatter_reduce(
        -1, extended_index, activations.to(torch.float32), reduce='sum')
    
    token_ix, batch_count = torch.unique(vocab_index, return_counts=True)
    vocab_counts[token_ix] += batch_count
    
    print(f"update_vocabulary_statistics: SUCCESS")
    print(f"  neuron_vocab_max shape: {neuron_vocab_max.shape}")
    print(f"  non-zero vocab entries: {(vocab_counts > 0).sum().item()}")
except Exception as e:
    import traceback
    print(f"update_vocabulary_statistics FAILED: {e}")
    traceback.print_exc()

update_vocabulary_statistics: SUCCESS
  neuron_vocab_max shape: torch.Size([4, 128, 50257])
  non-zero vocab entries: 996


In [9]:
# Test update_top_dataset_examples
try:
    n_layer, n_neuron, k = 4, 128, 50
    
    neuron_max_activating_index = torch.zeros(n_layer, n_neuron, k, dtype=torch.int64)
    neuron_max_activating_value = torch.zeros(n_layer, n_neuron, k, dtype=torch.float32)
    activations = torch.randn(n_layer, n_neuron, 1000)
    index_offset = 0
    
    values = torch.cat([neuron_max_activating_value, activations], dim=2)
    
    batch_indices = torch.arange(activations.shape[2]) + index_offset
    extended_batch_indices = einops.repeat(
        batch_indices, 't -> l n t', l=n_layer, n=n_neuron)
    indices = torch.cat([
        neuron_max_activating_index,
        extended_batch_indices
    ], dim=2)
    
    neuron_max_activating_value, top_k_indices = torch.topk(values, k, dim=2)
    neuron_max_activating_index = torch.gather(indices, 2, top_k_indices)
    
    print(f"update_top_dataset_examples: SUCCESS")
    print(f"  top values shape: {neuron_max_activating_value.shape}")
    print(f"  top indices shape: {neuron_max_activating_index.shape}")
except Exception as e:
    import traceback
    print(f"update_top_dataset_examples FAILED: {e}")
    traceback.print_exc()

update_top_dataset_examples: SUCCESS
  top values shape: torch.Size([4, 128, 50])
  top indices shape: torch.Size([4, 128, 50])


In [10]:
# Test weights.py functions
from transformer_lens import HookedTransformer
import pandas as pd

# Load a small model for testing
print("Loading gpt2-small for weights.py testing...")
model = HookedTransformer.from_pretrained('gpt2-small', device='cuda')
model.eval()
print(f"Model loaded: {model.cfg.model_name}, layers={model.cfg.n_layers}, d_mlp={model.cfg.d_mlp}")

Loading gpt2-small for weights.py testing...


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [11]:
# Test compute_neuron_composition
try:
    W_in = einops.rearrange(model.W_in, 'l d n -> l n d')
    W_out = model.W_out
    
    W_in_norm = W_in / torch.norm(W_in, dim=-1, keepdim=True)
    W_out_norm = W_out / torch.norm(W_out, dim=-1, keepdim=True)
    
    layer = 0
    in_in_cos = einops.einsum(
        W_in_norm, W_in_norm[layer, :, :], f'l n d, m d -> m l n')
    
    print(f"compute_neuron_composition: SUCCESS")
    print(f"  in_in_cos shape: {in_in_cos.shape}")
except Exception as e:
    import traceback
    print(f"compute_neuron_composition FAILED: {e}")
    traceback.print_exc()

In [12]:
# Test compute_neuron_composition
try:
    W_in = einops.rearrange(model.W_in.cpu(), 'l d n -> l n d')
    W_out = model.W_out.cpu()
    
    W_in_norm = W_in / torch.norm(W_in, dim=-1, keepdim=True)
    W_out_norm = W_out / torch.norm(W_out, dim=-1, keepdim=True)
    
    layer = 0
    in_in_cos = einops.einsum(
        W_in_norm, W_in_norm[layer, :, :], 'l n d, m d -> m l n')
    
    print(f"compute_neuron_composition: SUCCESS")
    print(f"  in_in_cos shape: {in_in_cos.shape}")
except Exception as e:
    import traceback
    print(f"compute_neuron_composition FAILED: {e}")
    traceback.print_exc()

In [13]:
print(f"compute_neuron_composition: SUCCESS")
print(f"  in_in_cos shape: {in_in_cos.shape}")

In [14]:
import sys
sys.stdout.flush()
print("Test output")
sys.stdout.flush()

In [15]:
# Check if kernel is responsive and test basic output
result = 1 + 1
print(f"Basic test: {result}")
print(f"in_in_cos shape: {in_in_cos.shape}")
print("Kernel is responsive")