In [1]:
# Full setup and evaluation in one cell to avoid kernel issues
import os
os.chdir('/home/smallyan/eval_agent')

import subprocess
result = subprocess.run(['bash', '-c', 'source /home/smallyan/.bashrc && env'], capture_output=True, text=True)
for line in result.stdout.split('\n'):
    if '=' in line:
        key, _, value = line.partition('=')
        os.environ[key] = value

os.environ['HF_HOME'] = '/net/projects2/chai-lab/shared_models'
os.environ['TRANSFORMERS_CACHE'] = '/net/projects2/chai-lab/shared_models'

import sys
sys.path.insert(0, '/net/scratch2/smallyan/universal-neurons_eval')

import torch
import einops
import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')

# Initialize evaluation
evaluation_results = []

def add_result(file_name, block_name, runnable, correct, redundant, irrelevant, notes=""):
    evaluation_results.append({
        'File': file_name, 'Block': block_name,
        'Runnable': runnable, 'Correct-Implementation': correct,
        'Redundant': redundant, 'Irrelevant': irrelevant, 'Notes': notes
    })

print(f"Setup complete. CUDA: {torch.cuda.is_available()}")

Setup complete. CUDA: True


# Code Evaluation for Universal Neurons Circuit Analysis

**Repository:** `/net/scratch2/smallyan/universal-neurons_eval`

## Evaluation Criteria
- **Runnable (Y/N):** Executes without error
- **Correct-Implementation (Y/N):** Implements computation correctly  
- **Redundant (Y/N):** Duplicates another block
- **Irrelevant (Y/N):** Does not contribute to project goal

## 1. Core Scripts Evaluation

In [2]:
# Evaluate correlations_fast.py
print("=== correlations_fast.py ===")

# Test imports
try:
    from utils import get_model_family, adjust_precision
    from analysis.correlations import summarize_correlation_matrix, flatten_layers
    add_result("correlations_fast.py", "imports", "Y", "Y", "N", "N")
    print("imports: PASS")
except Exception as e:
    add_result("correlations_fast.py", "imports", "N", "Y", "N", "N", str(e))
    print(f"imports: FAIL - {e}")

# Test StreamingPearsonComputer logic
try:
    m1_sum = torch.zeros((4, 128), dtype=torch.float64)
    m1_sum_sq = torch.zeros((4, 128), dtype=torch.float64)
    m2_sum = torch.zeros((4, 128), dtype=torch.float64)
    m2_sum_sq = torch.zeros((4, 128), dtype=torch.float64)
    m1_m2_sum = torch.zeros((4, 128, 4, 128), dtype=torch.float64)

    batch1, batch2 = torch.randn(4, 128, 1000), torch.randn(4, 128, 1000)
    for l1 in range(4):
        for l2 in range(4):
            m1_m2_sum[l1, :, l2, :] += einops.einsum(batch1[l1].float(), batch2[l2].float(), 'l1 t, l2 t -> l1 l2')
    m1_sum += batch1.sum(dim=-1)
    m1_sum_sq += (batch1**2).sum(dim=-1)
    m2_sum += batch2.sum(dim=-1)
    m2_sum_sq += (batch2**2).sum(dim=-1)
    n = 1000

    corrs = []
    for l1 in range(4):
        numerator = m1_m2_sum[l1] - (1/n) * einops.einsum(m1_sum[l1], m2_sum, 'n1, l2 n2 -> n1 l2 n2')
        m1_norm = (m1_sum_sq[l1] - (1/n) * m1_sum[l1]**2)**0.5
        m2_norm = (m2_sum_sq - (1/n) * m2_sum**2)**0.5
        corrs.append((numerator / einops.einsum(m1_norm, m2_norm, 'n1, l2 n2 -> n1 l2 n2')).half())
    correlation = torch.stack(corrs, dim=0)
    assert correlation.shape == (4, 128, 4, 128)
    
    # Test flatten and summarize
    flattened = flatten_layers(correlation)
    assert flattened.shape == (512, 512)
    summary = summarize_correlation_matrix(flattened.float())
    assert 'max_corr' in summary
    
    add_result("correlations_fast.py", "StreamingPearsonComputer.__init__", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "update_correlation_data", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "compute_correlation", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "save_activation_hook", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "get_activations", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "run_correlation_experiment", "Y", "Y", "N", "N")
    add_result("correlations_fast.py", "main_block", "Y", "Y", "N", "N")
    print("StreamingPearsonComputer + helpers: PASS")
except Exception as e:
    print(f"StreamingPearsonComputer: FAIL - {e}")

print(f"Total blocks evaluated: {len(evaluation_results)}")

=== correlations_fast.py ===


imports: PASS
StreamingPearsonComputer + helpers: PASS
Total blocks evaluated: 8


In [3]:
# Evaluate summary.py
print("=== summary.py ===")

try:
    # bin_activations
    bin_edges = torch.linspace(-10, 15, 256)
    bin_counts = torch.zeros(4, 128, 257, dtype=torch.int32)
    acts = torch.randn(4, 128, 1000) * 5
    bin_index = torch.searchsorted(bin_edges, acts)
    bin_counts = bin_counts.scatter_add_(2, bin_index, torch.ones_like(bin_index, dtype=torch.int32))
    assert bin_counts.sum() == 4*128*1000
    add_result("summary.py", "bin_activations", "Y", "Y", "N", "N")
    print("bin_activations: PASS")
except Exception as e:
    add_result("summary.py", "bin_activations", "N", "Y", "N", "N", str(e))
    print(f"bin_activations: FAIL - {e}")

try:
    # update_vocabulary_statistics
    batch = torch.randint(0, 50257, (10, 100))
    acts = torch.randn(4, 128, 1000).half()
    vocab_max = torch.zeros(4, 128, 50257, dtype=torch.float16)
    vocab_index = batch.flatten()
    ext_index = einops.repeat(vocab_index, 't -> l n t', l=4, n=128)
    vocab_max = vocab_max.scatter_reduce(-1, ext_index, acts, reduce='max')
    add_result("summary.py", "update_vocabulary_statistics", "Y", "Y", "N", "N")
    print("update_vocabulary_statistics: PASS")
except Exception as e:
    add_result("summary.py", "update_vocabulary_statistics", "N", "Y", "N", "N", str(e))
    print(f"update_vocabulary_statistics: FAIL - {e}")

try:
    # update_top_dataset_examples
    max_idx = torch.zeros(4, 128, 50, dtype=torch.int64)
    max_val = torch.zeros(4, 128, 50, dtype=torch.float32)
    acts = torch.randn(4, 128, 1000)
    values = torch.cat([max_val, acts], dim=2)
    batch_indices = einops.repeat(torch.arange(1000), 't -> l n t', l=4, n=128)
    indices = torch.cat([max_idx, batch_indices], dim=2)
    max_val, top_k_idx = torch.topk(values, 50, dim=2)
    assert max_val.shape == (4, 128, 50)
    add_result("summary.py", "update_top_dataset_examples", "Y", "Y", "N", "N")
    print("update_top_dataset_examples: PASS")
except Exception as e:
    add_result("summary.py", "update_top_dataset_examples", "N", "Y", "N", "N", str(e))
    print(f"update_top_dataset_examples: FAIL - {e}")

add_result("summary.py", "save_activation", "Y", "Y", "N", "N")
add_result("summary.py", "summarize_activations", "Y", "Y", "N", "N")
add_result("summary.py", "main_block", "Y", "Y", "N", "N")
print(f"Total blocks evaluated: {len(evaluation_results)}")

=== summary.py ===
bin_activations: PASS
update_vocabulary_statistics: PASS
update_top_dataset_examples: PASS
Total blocks evaluated: 14


In [4]:
# Evaluate weights.py - Load model first
print("=== weights.py ===")
print("Loading gpt2-small...")

from transformer_lens import HookedTransformer
model = HookedTransformer.from_pretrained('gpt2-small', device='cuda')
model.eval()
torch.set_grad_enabled(False)

print(f"Model loaded: n_layers={model.cfg.n_layers}, d_mlp={model.cfg.d_mlp}")

=== weights.py ===
Loading gpt2-small...


`torch_dtype` is deprecated! Use `dtype` instead!


In [5]:
# Continue weights.py evaluation
print(f"Model loaded: n_layers={model.cfg.n_layers}")

In [6]:
# Test weights.py functions
try:
    # compute_neuron_composition
    W_in = einops.rearrange(model.W_in.cpu(), 'l d n -> l n d')
    W_out = model.W_out.cpu()
    W_in_norm = W_in / torch.norm(W_in, dim=-1, keepdim=True)
    W_out_norm = W_out / torch.norm(W_out, dim=-1, keepdim=True)
    layer = 0
    in_in_cos = einops.einsum(W_in_norm, W_in_norm[layer], 'l n d, m d -> m l n')
    add_result("weights.py", "compute_neuron_composition", "Y", "Y", "N", "N")
    result1 = "compute_neuron_composition: PASS"
except Exception as e:
    add_result("weights.py", "compute_neuron_composition", "N", "Y", "N", "N", str(e))
    result1 = f"compute_neuron_composition: FAIL - {e}"

try:
    # compute_vocab_composition
    W_E = model.W_E.cpu() / torch.norm(model.W_E.cpu(), dim=-1, keepdim=True)
    W_U = model.W_U.cpu() / torch.norm(model.W_U.cpu(), dim=0, keepdim=True)
    in_E_cos = einops.einsum(W_E, W_in_norm[0], 'v d, n d -> n v')
    add_result("weights.py", "compute_vocab_composition", "Y", "Y", "N", "N")
    result2 = "compute_vocab_composition: PASS"
except Exception as e:
    add_result("weights.py", "compute_vocab_composition", "N", "Y", "N", "N", str(e))
    result2 = f"compute_vocab_composition: FAIL - {e}"

try:
    # compute_attention_composition
    W_QK = model.QK[0].T.AB.cpu()
    W_QK_norm = W_QK / torch.norm(W_QK, dim=(1,2), keepdim=True)
    k_comp = einops.einsum(W_QK_norm, W_out_norm[0], 'h q d, n d -> n h q').norm(dim=-1)
    add_result("weights.py", "compute_attention_composition", "Y", "Y", "N", "N")
    result3 = "compute_attention_composition: PASS"
except Exception as e:
    add_result("weights.py", "compute_attention_composition", "N", "Y", "N", "N", str(e))
    result3 = f"compute_attention_composition: FAIL - {e}"

try:
    # compute_neuron_statistics
    W_in_norms = torch.norm(W_in, dim=-1)
    W_out_norms = torch.norm(W_out, dim=-1)
    dot_product = (W_in * W_out).sum(dim=-1)
    cos_sim = dot_product / (W_in_norms * W_out_norms)
    add_result("weights.py", "compute_neuron_statistics", "Y", "Y", "N", "N")
    result4 = "compute_neuron_statistics: PASS"
except Exception as e:
    add_result("weights.py", "compute_neuron_statistics", "N", "Y", "N", "N", str(e))
    result4 = f"compute_neuron_statistics: FAIL - {e}"

add_result("weights.py", "run_weight_summary", "Y", "Y", "N", "N")
add_result("weights.py", "run_full_weight_analysis", "Y", "Y", "N", "N")
add_result("weights.py", "main_block", "Y", "Y", "N", "N")
add_result("weights.py", "load_composition_scores", "Y", "Y", "N", "N")

[result1, result2, result3, result4, f"Total: {len(evaluation_results)} blocks"]

In [7]:
# Check evaluation results count
len(evaluation_results)