In [1]:
import os
import subprocess

# Run the rest of the session from the evaluation agent's directory.
os.chdir('/home/smallyan/eval_agent')

# Point the Hugging Face caches at the user's cache directory and make sure
# the transformers cache directory exists.
os.environ.update({
    'HF_HOME': '/home/smallyan/.cache/huggingface',
    'TRANSFORMERS_CACHE': '/home/smallyan/.cache/huggingface/transformers',
})
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)

# Dump the environment of a bash shell that has sourced the user's .bashrc,
# then copy across only the variables whose names contain TOKEN, API or KEY.
result = subprocess.run(
    ['bash', '-c', 'source /home/smallyan/.bashrc && env'],
    capture_output=True,
    text=True,
)
for entry in result.stdout.split('\n'):
    name, separator, setting = entry.partition('=')
    if not separator:
        continue  # no '=' on this line, so it is not a NAME=value pair
    if any(tag in name for tag in ('TOKEN', 'API', 'KEY')):
        os.environ[name] = setting

# Third-party imports are kept after the working-directory / environment setup.
import torch
import numpy as np
import pandas as pd
from scipy import stats

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Location of the repository under evaluation.
repo_path = '/net/scratch2/smallyan/universal-neurons_eval'
print(f"Repo path: {repo_path}")

Using device: cuda
Repo path: /net/scratch2/smallyan/universal-neurons_eval


# Generalizability Evaluation for Universal Neurons

This notebook evaluates whether the findings in the `universal-neurons_eval` repository generalize beyond the original experimental setting.

## Evaluation Checklist
- **GT1**: Generalization to a New Model
- **GT2**: Generalization to New Data  
- **GT3**: Method / Specificity Generalizability

## Key Findings from Original Work

1. **Universal neurons** are neurons that activate on the same inputs across models trained from different random seeds
2. Universal neurons have **excess correlation > 0.5** and comprise only 1-5% of neurons
3. They have **statistical signatures**: large negative input bias, high activation skew/kurtosis, high weight norm
4. **Models used**: GPT2-small, GPT2-medium, Pythia-160m
5. **Dataset**: Pile test set

In [2]:
# Read the universal-neuron and prediction-neuron tables that the original
# study saved for stanford-gpt2-medium-a.
interp_path = os.path.join(repo_path, 'dataframes', 'interpretable_neurons', 'stanford-gpt2-medium-a')
universal_csv = os.path.join(interp_path, 'universal.csv')
prediction_csv = os.path.join(interp_path, 'prediction_neurons.csv')
universal_neurons_df = pd.read_csv(universal_csv)
prediction_neurons_df = pd.read_csv(prediction_csv)

# Report how many rows each table holds.
print(f"Universal neurons: {len(universal_neurons_df)}")
print(f"Prediction neurons: {len(prediction_neurons_df)}")

# Show the ten rows with the largest excess_corr, restricted to the
# identifying columns and the summary statistics.
print("\nTop universal neurons (by excess correlation):")
summary_columns = ['layer', 'neuron', 'excess_corr', 'input_bias', 'skew', 'kurt']
top_universal = universal_neurons_df.nlargest(10, 'excess_corr')[summary_columns]
print(top_universal)

Universal neurons: 1211
Prediction neurons: 136

Top universal neurons (by excess correlation):
      layer  neuron  excess_corr  input_bias      skew       kurt
156       1     657     0.808985   -0.747676  2.851113  26.548020
16        0     553     0.794025   -0.365732  2.678056  18.010220
327       3    1352     0.791950   -2.007722  1.144166   6.914615
288       2    2130     0.787500   -0.563056  2.605676  15.836667
4         0     185     0.781975   -0.610551  1.985424  21.138607
173       1    1031     0.777950   -0.973660  2.880477  24.714020
182       1    1393     0.776575   -0.910241  3.044531  32.796566
127       0    3954     0.772575   -0.492060  4.159611  37.739380
1168     23    2517     0.770200   -0.295914  2.378946  13.762036
380       4    2534     0.768275   -1.329550  1.391916   8.143710


In [3]:
# Start with GPT2-small: it is the faster model to load, so it doubles as a
# check that the setup works.
import warnings

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Silence all Python warnings from here on.
warnings.filterwarnings('ignore')

cache_dir = '/home/smallyan/.cache/huggingface/transformers'

print("Loading GPT2-small...")
# Load the weights first, then move the model onto the selected device.
gpt2_small = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
gpt2_small = gpt2_small.to(device)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print(f"GPT2-small loaded: {gpt2_small.config.n_layer} layers, {gpt2_small.config.n_embd} embd")



Loading GPT2-small...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [4]:
# Report the loaded model's Python type and its configured layer count,
# one item per line.
print(
    f"GPT2-small type: {type(gpt2_small)}",
    f"Layers: {gpt2_small.config.n_layer}",
    sep='\n',
)

In [5]:
# Trivial smoke test: add 1 and 1, then echo the result.
x = sum((1, 1))
print(f"x = {x}")