In [1]:
import os

# Move the kernel process into the hard-coded workspace directory, then echo
# the directory the process actually ended up in.
_workspace = '/home/smallyan/eval_agent'
os.chdir(_workspace)
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Environment bootstrap for the generalization tests
import json
import os
import sys
from copy import deepcopy
from pathlib import Path

import numpy as np
import torch

# Pick the compute device: GPU when PyTorch reports one, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Make the repo's helper directories importable. Every insert targets
# position 0, so the directory listed last ("data") ends up first on sys.path.
REPO_ROOT = Path("/net/scratch2/smallyan/othello-world_eval")
for _subdir in ("mechanistic_interpretability", "data"):
    sys.path.insert(0, str(REPO_ROOT / _subdir))

# Point both HuggingFace cache variables at one scratch directory and make
# sure that directory exists
_hf_cache = '/tmp/hf_cache'
for _env_key in ('HF_HOME', 'HF_HUB_CACHE'):
    os.environ[_env_key] = _hf_cache
os.makedirs(_hf_cache, exist_ok=True)

# Evaluation directory under the repo root; only the leaf is created, so
# REPO_ROOT itself must already exist
eval_dir = REPO_ROOT.joinpath("evaluation")
eval_dir.mkdir(exist_ok=True)

# Fixed seeds for torch and numpy so reruns are reproducible
_seed = 42
torch.manual_seed(_seed)
np.random.seed(_seed)

print(f"Repository root: {REPO_ROOT}")
print(f"Evaluation directory: {eval_dir}")

Using device: cuda
Repository root: /net/scratch2/smallyan/othello-world_eval
Evaluation directory: /net/scratch2/smallyan/othello-world_eval/evaluation


# Generalizability Evaluation for Othello-World

This notebook evaluates whether the findings in the Othello-World repository generalize beyond the original experimental setting.

## Evaluation Checklist:
- **GT1**: Model Generalization - Do findings transfer to a new model?
- **GT2**: Data Generalization - Do findings hold on new data instances?
- **GT3**: Method/Specificity Generalizability - Can the method be applied to another similar task?

## Key Findings from Original Work:
1. Nonlinear probes can decode board state with ~1.7% error (synthetic model)
2. Interventional experiments show causal role of representation
3. Specific neurons detect specific board configurations (e.g., L5N1393 — neuron 1393 in layer 5)
4. Linear probes are less effective (~20% error)

In [3]:
# Install required packages
import subprocess


def _pip_install(*pip_args):
    """Run ``pip install <pip_args> -q`` in the kernel's interpreter.

    pip's output is captured so a successful install stays silent. A non-zero
    exit code is reported on stderr together with pip's own error text instead
    of being discarded, so a failed install is visible right here rather than
    surfacing later as an unrelated ImportError. The failure is only reported,
    not raised: a package may already be importable even when pip could not
    (re)install it.

    Args:
        *pip_args: Requirement specifiers passed through to ``pip install``.

    Returns:
        The ``subprocess.CompletedProcess`` for the pip invocation.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", *pip_args, "-q"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(
            f"WARNING: pip install {' '.join(pip_args)} failed "
            f"(exit code {result.returncode}):\n{result.stderr}",
            file=sys.stderr,
        )
    return result


_pip_install("transformer_lens==1.2.1")
_pip_install("einops")
# NOTE(review): this git dependency is not pinned to a commit or tag, so the
# installed code can change between runs.
_pip_install("git+https://github.com/neelnanda-io/neel-plotly.git")

# Import libraries
import torch
import torch.nn as nn
import einops
import transformer_lens
import transformer_lens.utils as tl_utils
from transformer_lens import HookedTransformer, HookedTransformerConfig
from huggingface_hub import hf_hub_download

# Import utility functions from the repo
from mech_interp_othello_utils import (
    OthelloBoardState, 
    to_string, to_int, 
    int_to_label, string_to_label,
    stoi_indices
)

# Disable gradients for inference
torch.set_grad_enabled(False)
print("Libraries imported successfully")

Disabling PyTorch because PyTorch >= 2.1 is required but found 1.13.1


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Libraries imported successfully


In [4]:
# Define model configuration (same for both synthetic and championship models)
model_config = HookedTransformerConfig(
    n_layers=8,
    d_model=512,
    d_head=64,
    n_heads=8,
    d_mlp=2048,
    d_vocab=61,
    n_ctx=59,
    act_fn="gelu",
    normalization_type="LNPre"
)

# Load the SYNTHETIC model (used in original study for neuron analysis)
synthetic_model = HookedTransformer(model_config)
synthetic_path = hf_hub_download(
    repo_id="NeelNanda/Othello-GPT-Transformer-Lens", 
    filename="synthetic_model.pth",
    cache_dir='/tmp/hf_cache'
)
# Use the `device` selected during environment setup instead of a hard-coded
# 'cuda', so this cell also runs on hosts without a usable GPU.
# NOTE(security): weights_only=False lets torch.load unpickle arbitrary
# objects from the downloaded checkpoint; only acceptable because the source
# repository is trusted. Consider weights_only=True if the file loads with it.
synthetic_state_dict = torch.load(synthetic_path, map_location=device, weights_only=False)
synthetic_model.load_state_dict(synthetic_state_dict)
synthetic_model = synthetic_model.to(device)
print(f"Synthetic model loaded: {sum(p.numel() for p in synthetic_model.parameters()):,} parameters")

NVIDIA H200 NVL with CUDA capability sm_90 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70 sm_75 sm_80 sm_86.
If you want to use the NVIDIA H200 NVL GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

