# Code Evaluation: Universal Neurons

Evaluating repository: `/net/scratch2/smallyan/universal-neurons_eval`

## Project Goal (from Plan)
Study the universality of individual neurons across GPT2 language models trained from different random seeds.

## Core Scripts (from CodeWalkthrough)
1. `correlations_fast.py` - Compute neuron correlations 
2. `summary.py` - Compute neuron activation statistics
3. `weights.py` - Compute weight statistic summaries
4. `activations.py` - Cache neuron activations
5. `explain.py` - Compute reduction in variance explanations
6. `attention_deactivation.py` - Attention deactivation experiments
7. `entropy_intervention.py` - Entropy intervention experiments
8. `intervention.py` - Neuron intervention experiments
9. `make_dataset.py` - Dataset preparation
10. `summary_viewer.py` - Summary visualization
11. `utils.py` - Utility functions
12. Analysis modules in `analysis/` directory

In [1]:
# Complete evaluation setup and run all tests in a single comprehensive cell
import os
import sys
import warnings
import traceback
from dataclasses import dataclass, asdict
from typing import List
import json

# Work from the evaluation agent's directory (hard-coded, machine-specific path).
os.chdir('/home/smallyan/eval_agent')
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Put the repository under evaluation first on the import path.
# NOTE(review): presumably this is what lets later cells `import utils`,
# `import weights`, `from analysis import ...` — confirm against the repo layout.
sys.path.insert(0, '/net/scratch2/smallyan/universal-neurons_eval')

import torch
# Disable gradient tracking globally for the rest of the session.
torch.set_grad_enabled(False)
# Prefer the GPU when one is visible; `device` is a plain string, 'cuda' or 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")
# Report the name of GPU index 0, or 'N/A' when CUDA is unavailable.
print(f"CUDA: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

@dataclass
class BlockEvaluation:
    """Verdict for a single evaluated code block (one row of the results table).

    The four verdict fields carry single-letter "Y"/"N" flags; ``add_eval``
    fills them with "Y", "Y", "N", "N" when the caller does not override them.
    """

    file: str  # source file the block lives in, e.g. "utils.py"
    block_id: str  # name of the function / method / constant that was evaluated
    runnable: str
    correct_impl: str
    redundant: str
    irrelevant: str
    error_note: str = ""  # free-text note; empty when there is nothing to report


# Every verdict recorded so far, in insertion order.
evaluations: List[BlockEvaluation] = []


def add_eval(file, block_id, runnable="Y", correct_impl="Y", redundant="N", irrelevant="N", error_note=""):
    """Build a ``BlockEvaluation`` from the arguments and append it to ``evaluations``.

    Called with only ``file`` and ``block_id`` it records a clean pass:
    runnable and correct, neither redundant nor irrelevant, no note.
    Returns ``None``; the record is only reachable through ``evaluations``.
    """
    record = BlockEvaluation(
        file=file,
        block_id=block_id,
        runnable=runnable,
        correct_impl=correct_impl,
        redundant=redundant,
        irrelevant=irrelevant,
        error_note=error_note,
    )
    evaluations.append(record)

print("Setup complete")

Device: cuda
CUDA: NVIDIA A100 80GB PCIe
Setup complete


In [2]:
# ============================================================================
# EVALUATE ALL MODULES SYSTEMATICALLY
# ============================================================================

import numpy as np
import pandas as pd

# --- utils.py ---
print("=" * 60)
print("EVALUATING: utils.py")
print("=" * 60)

import utils

tests = {
    "get_model_family": lambda: utils.get_model_family('gpt2-small') == 'gpt2',
    "timestamp": lambda: len(utils.timestamp()) > 0,
    "vector_histogram": lambda: utils.vector_histogram(torch.randn(10, 100), torch.linspace(-3, 3, 10)).shape[0] == 10,
    "vector_moments": lambda: len(utils.vector_moments(torch.randn(10, 100))) == 4,
    "adjust_precision": lambda: utils.adjust_precision(torch.randn(10), 16).dtype == torch.float16,
    "PILE_DATASETS": lambda: len(utils.PILE_DATASETS) > 0,
    "MODEL_FAMILIES": lambda: 'gpt2' in utils.MODEL_FAMILIES,
}

# Execute every utils.py check and record one verdict per check:
# truthy result -> pass, falsy result -> failed assertion, exception -> not runnable.
for name, test_fn in tests.items():
    try:
        if not test_fn():
            print(f"✗ {name}: assertion failed")
            add_eval("utils.py", name, "N", "N", error_note="Assertion failed")
            continue
        print(f"✓ {name}")
        add_eval("utils.py", name)
    except Exception as e:
        print(f"✗ {name}: {e}")
        # Notes are capped at 50 characters to keep the results table compact.
        add_eval("utils.py", name, "N", "Y", error_note=str(e)[:50])

# --- analysis/correlations.py ---
print("\n" + "=" * 60)
print("EVALUATING: analysis/correlations.py")
print("=" * 60)

from analysis import correlations

tests = {
    "flatten_layers": lambda: correlations.flatten_layers(torch.randn(4, 100, 4, 100)).shape == (400, 400),
    "unflatten_layers": lambda: correlations.unflatten_layers(torch.randn(400, 400), 4).shape == (4, 100, 4, 100),
    "summarize_correlation_matrix": lambda: 'max_corr' in correlations.summarize_correlation_matrix(torch.randn(100, 100)),
}

# Execute every analysis/correlations.py check and record one verdict per check.
for name, test_fn in tests.items():
    try:
        if test_fn():
            print(f"✓ {name}")
            add_eval("analysis/correlations.py", name)
        else:
            # Report a falsy check the same way the utils.py loop does: a visible
            # line in the output and a note in the record, so the failure is not
            # stored without any explanation.
            print(f"✗ {name}: assertion failed")
            add_eval("analysis/correlations.py", name, "N", "N", error_note="Assertion failed")
    except Exception as e:
        print(f"✗ {name}: {e}")
        # Notes are capped at 50 characters to keep the results table compact.
        add_eval("analysis/correlations.py", name, "N", "Y", error_note=str(e)[:50])

# Plotting/data-dependent functions: they are not executed here because they
# need data files. Still confirm that each name resolves to a callable in the
# imported module before recording a pass, so a missing or renamed function is
# not silently reported as OK.
for fn in ["load_correlation_results", "make_correlation_result_df", "plot_correlation_vs_baseline", "plotly_scatter_corr_by_layer"]:
    if callable(getattr(correlations, fn, None)):
        add_eval("analysis/correlations.py", fn, error_note="Requires data files")
        print(f"✓ {fn} (syntax OK)")
    else:
        add_eval("analysis/correlations.py", fn, "N", "N", error_note="Not defined in module")
        print(f"✗ {fn}: not defined in module")

# --- analysis/heuristic_explanation.py ---
print("\n" + "=" * 60)
print("EVALUATING: analysis/heuristic_explanation.py")
print("=" * 60)

from analysis import heuristic_explanation

np.random.seed(42)
act_df = pd.DataFrame({
    'neuron_1': np.random.randn(100), 'neuron_2': np.random.randn(100),
    'token': np.random.randint(0, 50, 100), 'prev_token': np.random.randint(0, 50, 100),
    'feature': np.random.choice([True, False], 100)
})
feat_df = pd.DataFrame({'is_digit': [i < 10 for i in range(50)], 'is_alpha': [i >= 10 for i in range(50)]}, index=range(50))
ncols = ['neuron_1', 'neuron_2']

tests = {
    "compute_binary_variance_reduction": lambda: len(heuristic_explanation.compute_binary_variance_reduction(act_df, ncols)) == 2,
    "compute_feature_variance_reduction_df": lambda: heuristic_explanation.compute_feature_variance_reduction_df(act_df, feat_df, ncols, 'token').shape[0] == 2,
    "compute_mean_dif_df": lambda: heuristic_explanation.compute_mean_dif_df(act_df, feat_df, ncols).shape[0] == 2,
}

# Execute every analysis/heuristic_explanation.py check and record exactly one
# verdict per check.
for name, test_fn in tests.items():
    try:
        if test_fn():
            print(f"✓ {name}")
            add_eval("analysis/heuristic_explanation.py", name)
        else:
            # A check that returns False must still be printed and recorded;
            # without this branch it would vanish from the evaluation table
            # and the run would look cleaner than it was.
            print(f"✗ {name}: assertion failed")
            add_eval("analysis/heuristic_explanation.py", name, "N", "N", error_note="Assertion failed")
    except Exception as e:
        print(f"✗ {name}: {e}")
        # Notes are capped at 50 characters to keep the results table compact.
        add_eval("analysis/heuristic_explanation.py", name, "N", "Y", error_note=str(e)[:50])

print(f"\nEvaluations so far: {len(evaluations)}")

EVALUATING: utils.py
✓ get_model_family
✓ timestamp
✓ vector_histogram
✓ vector_moments
✓ adjust_precision
✓ PILE_DATASETS
✓ MODEL_FAMILIES

EVALUATING: analysis/correlations.py


✓ flatten_layers
✓ unflatten_layers
✓ summarize_correlation_matrix
✓ load_correlation_results (syntax OK)
✓ make_correlation_result_df (syntax OK)
✓ plot_correlation_vs_baseline (syntax OK)
✓ plotly_scatter_corr_by_layer (syntax OK)

EVALUATING: analysis/heuristic_explanation.py
✓ compute_binary_variance_reduction


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 399.10it/s]




✓ compute_feature_variance_reduction_df


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 401.00it/s]

✓ compute_mean_dif_df

Evaluations so far: 17





In [3]:
# --- correlations_fast.py ---
print("=" * 60)
print("EVALUATING: correlations_fast.py")
print("=" * 60)

import correlations_fast

class MockModel:
    """Minimal stand-in for a model object, exposing only a ``cfg`` attribute.

    NOTE(review): presumably StreamingPearsonComputer only reads
    ``cfg.n_layers`` and ``cfg.d_mlp`` — confirm against correlations_fast.py.
    """
    class Cfg:
        # Small sizes; they line up with the (2, 100, 512) activations and the
        # expected (2, 100, 2, 100) correlation shape used in this cell.
        n_layers = 2
        d_mlp = 100
    # A single Cfg instance stored as a class attribute, so every MockModel()
    # shares the same config object.
    cfg = Cfg()

# Exercise StreamingPearsonComputer end to end on CPU with two mock models:
# construct it, feed one batch of random activations, then compute correlations.
# The steps run in order; the first exception aborts the remaining steps and is
# recorded once under the generic "StreamingPearsonComputer" id.
try:
    computer = correlations_fast.StreamingPearsonComputer(MockModel(), MockModel(), device='cpu')
    print("✓ StreamingPearsonComputer.__init__")
    add_eval("correlations_fast.py", "StreamingPearsonComputer.__init__")

    computer.update_correlation_data(torch.randn(2, 100, 512), torch.randn(2, 100, 512))
    print("✓ StreamingPearsonComputer.update_correlation_data")
    add_eval("correlations_fast.py", "StreamingPearsonComputer.update_correlation_data")

    corr = computer.compute_correlation()
    # Give the assertion a message: a bare assert raises with an empty string,
    # which would print "✗ StreamingPearsonComputer: " and store an empty note.
    assert corr.shape == (2, 100, 2, 100), f"unexpected correlation shape {tuple(corr.shape)}"
    print("✓ StreamingPearsonComputer.compute_correlation")
    add_eval("correlations_fast.py", "StreamingPearsonComputer.compute_correlation")
except Exception as e:
    print(f"✗ StreamingPearsonComputer: {e}")
    # Notes are capped at 50 characters to keep the results table compact.
    add_eval("correlations_fast.py", "StreamingPearsonComputer", "N", error_note=str(e)[:50])

# These functions need a real model context and are not executed here. Still
# confirm that each name resolves to a callable in the imported module before
# recording a pass, so a missing or renamed function is not reported as OK.
for fn in ["save_activation_hook", "get_activations", "run_correlation_experiment"]:
    if callable(getattr(correlations_fast, fn, None)):
        add_eval("correlations_fast.py", fn, error_note="Requires model context")
        print(f"✓ {fn} (syntax OK)")
    else:
        add_eval("correlations_fast.py", fn, "N", "N", error_note="Not defined in module")
        print(f"✗ {fn}: not defined in module")

# --- weights.py ---
print("\n" + "=" * 60)
print("EVALUATING: weights.py")
print("=" * 60)

import weights
from transformer_lens import HookedTransformer

print("Loading model for weight analysis...")
# The model is loaded on CPU explicitly, regardless of the `device` string
# detected in the setup cell.
model = HookedTransformer.from_pretrained('stanford-gpt2-small-a', device='cpu')
print(f"Model loaded: {model.cfg.model_name}")

EVALUATING: correlations_fast.py


✓ StreamingPearsonComputer.__init__
✓ StreamingPearsonComputer.update_correlation_data
✓ StreamingPearsonComputer.compute_correlation
✓ save_activation_hook (syntax OK)
✓ get_activations (syntax OK)
✓ run_correlation_experiment (syntax OK)

EVALUATING: weights.py
Loading model for weight analysis...


In [4]:
# Continue with weights.py testing
print(f"Model: {model.cfg.model_name}")

# Test compute_neuron_statistics (fast)
# The call, the `.shape` access in the success message and the recording all sit
# inside the try, so a return value without `.shape` is also reported as a failure.
try:
    df = weights.compute_neuron_statistics(model)
    print(f"✓ compute_neuron_statistics (shape: {df.shape})")
    add_eval("weights.py", "compute_neuron_statistics")
except Exception as e:
    print(f"✗ compute_neuron_statistics: {e}")
    # Only `runnable` is flipped to "N"; the note is capped at 50 characters.
    add_eval("weights.py", "compute_neuron_statistics", "N", error_note=str(e)[:50])

# Test the three composition functions. They share one call pattern,
# `weights.<fn>(model, 0)`, and one expectation (a result of length 4), so a
# single loop replaces three copies of the same try/except.
# NOTE(review): the second argument 0 is presumably a layer index — confirm
# against weights.py.
for fn in ["compute_vocab_composition", "compute_neuron_composition", "compute_attention_composition"]:
    try:
        result = getattr(weights, fn)(model, 0)
        # Give the assertion a message: a bare assert raises with an empty
        # string, which would print "✗ <fn>: " and store an empty error note.
        assert len(result) == 4, f"expected 4 results, got {len(result)}"
        print(f"✓ {fn}")
        add_eval("weights.py", fn)
    except Exception as e:
        print(f"✗ {fn}: {e}")
        # Only `runnable` is flipped to "N"; the note is capped at 50 characters.
        add_eval("weights.py", fn, "N", error_note=str(e)[:50])

# Pipeline functions: not executed here. Still confirm that each name resolves
# to a callable in the imported module before recording a pass, so a missing or
# renamed function is not silently reported as OK.
for fn in ["run_weight_summary", "run_full_weight_analysis", "load_composition_scores"]:
    if callable(getattr(weights, fn, None)):
        add_eval("weights.py", fn, error_note="Pipeline function")
        print(f"✓ {fn} (syntax OK)")
    else:
        add_eval("weights.py", fn, "N", "N", error_note="Not defined in module")
        print(f"✗ {fn}: not defined in module")

print(f"\nEvaluations: {len(evaluations)}")

In [5]:
# Check current state: how many verdicts have been recorded so far, and a
# one-line view (file, block id, runnable flag) of the most recent ones.
print(f"Evaluations count: {len(evaluations)}")
print("Last 5 evaluations:")
# Slicing with [-5:] yields fewer than five rows when the list is shorter.
for e in evaluations[-5:]:
    print(f"  {e.file}/{e.block_id}: R={e.runnable}")

In [6]:
1+1