In [1]:
import os

# Work from the eval_agent directory for the rest of the notebook.
os.chdir('/home/smallyan/eval_agent')

# Set the Hugging Face cache environment variables in one step, then make
# sure the transformers cache directory exists on disk.
os.environ.update({
    'HF_HOME': '/home/smallyan/.cache/huggingface',
    'TRANSFORMERS_CACHE': '/home/smallyan/.cache/huggingface/transformers',
})
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)

# Load credential-like environment variables exported by the user's .bashrc
import subprocess


def select_credential_vars(env_dump: str) -> dict:
    """Pick credential-like variables out of a NUL-separated environment dump.

    Args:
        env_dump: Text in the format produced by `env -0`, i.e. `NAME=value`
            records separated by NUL characters.

    Returns:
        A dict mapping variable name to value for every record whose name
        contains 'TOKEN', 'API' or 'KEY'. Values are kept verbatim, including
        any embedded newlines or '=' characters.
    """
    selected = {}
    for record in env_dump.split('\0'):
        key, sep, value = record.partition('=')
        # Records without '=' (such as the empty trailing one) hold no variable.
        if sep and ('TOKEN' in key or 'API' in key or 'KEY' in key):
            selected[key] = value
    return selected


# `env -0` terminates each record with NUL instead of a newline, so values that
# span several lines stay attached to their own variable instead of being
# misread as extra `NAME=value` lines.
shell_cmd = ['bash', '-c', 'source /home/smallyan/.bashrc && env -0']
try:
    result = subprocess.run(shell_cmd, capture_output=True, text=True)
except OSError as exc:
    # Loading credentials is best-effort: report the problem and carry on.
    print(f"Warning: could not start bash to read credentials: {exc}")
else:
    if result.returncode == 0:
        os.environ.update(select_credential_vars(result.stdout))
    else:
        print(f"Warning: sourcing .bashrc failed (exit {result.returncode}); no credentials loaded")

print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


# Generalizability Evaluation for Universal Neurons

This notebook evaluates whether the findings in the `universal-neurons_eval` repository generalize beyond the original experimental setting.

## Evaluation Checklist
- **GT1**: Generalization to a New Model
- **GT2**: Generalization to New Data  
- **GT3**: Method / Specificity Generalizability

## Key Findings from Original Work

1. **Universal neurons** are neurons that activate on the same inputs across different models trained from different random seeds
2. Universal neurons have **excess correlation > 0.5** and comprise only 1-5% of neurons
3. They have **statistical signatures**: large negative input bias, high activation skew/kurtosis, high weight norm
4. **Models used**: GPT2-small, GPT2-medium, Pythia-160m
5. **Dataset**: Pile test set

In [2]:
import torch
import numpy as np
import pandas as pd
from scipy import stats

# Query CUDA availability once and reuse the answer below.
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Report the name of GPU 0.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")

CUDA available: True
CUDA device: NVIDIA A100 80GB PCIe
Using device: cuda


In [3]:
# GT1 setup: GPT2-large was NOT used in the original study, so it is loaded
# here as the new model for the generalization test.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

cache_dir = '/home/smallyan/.cache/huggingface/transformers'

print("Loading GPT2-large (new model not used in original study)...")

# Fetch the weights first, then move the module onto the selected device.
gpt2_large = GPT2LMHeadModel.from_pretrained('gpt2-large', cache_dir=cache_dir)
gpt2_large = gpt2_large.to(device)

# The tokenizer comes from the same checkpoint and cache directory.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', cache_dir=cache_dir)

print(f"GPT2-large loaded: {gpt2_large.config.n_layer} layers, {gpt2_large.config.n_head} heads, {gpt2_large.config.n_embd} embd dim")



Loading GPT2-large (new model not used in original study)...


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

In [4]:
# Sanity check: print the loaded object's type and its layer count,
# one line each.
print(
    f"GPT2-large model type: {type(gpt2_large)}",
    f"Config: {gpt2_large.config.n_layer} layers",
    sep="\n",
)

In [5]:
# `from_pretrained` returns only after the download has finished (or raises),
# and the kernel runs one cell at a time, so sleeping here cannot help a
# download along. Instead, report whether the model object actually exists.
import time  # not used in this cell; kept in case later cells rely on it

print("Checking model status...")

# Look the model up by name so this cell also runs when loading never completed.
loaded_model = globals().get('gpt2_large')
model_ready = loaded_model is not None
if model_ready:
    print(f"GPT2-large is loaded on {next(loaded_model.parameters()).device}")
else:
    print("GPT2-large is not loaded; re-run the model loading cell")