# Replication: Vector Arithmetic in Concept and Token Subspaces

This notebook replicates the experiment from the paper "Vector Arithmetic in Concept and Token Subspaces" (NeurIPS 2025 Mechanistic Interpretability Workshop).

**Goal**: Show that concept and token induction heads can identify subspaces for more accurate parallelogram arithmetic (e.g., Athens – Greece + China = Beijing).

In [1]:
# Setup
import os, json, torch, gc
import numpy as np
from typing import Dict, List, Tuple, Optional

# Locations default to the original author's machine. Set EVAL_AGENT_DIR and/or
# ARITHMETIC_EVAL_REPO in the environment to run this notebook elsewhere
# without editing the cell.
work_dir = os.environ.get('EVAL_AGENT_DIR', '/home/smallyan/eval_agent')
repo_path = os.environ.get('ARITHMETIC_EVAL_REPO', '/net/scratch2/smallyan/arithmetic_eval')
os.chdir(work_dir)

# Seed both RNGs from a single constant so the value is changed in one place.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Sub-directories of the repo read by the later cells (score cache, task files).
cache_path = os.path.join(repo_path, 'cache')
data_path = os.path.join(repo_path, 'data')

print(f"CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

CUDA: True, Device: NVIDIA A100 80GB PCIe


In [2]:
# Load model
# The resulting `model` object is used by later cells both for its weights
# (`model.model.layers[...]`) and for traced forward passes (`model.trace(...)`).
# NOTE: this import lives in this cell rather than in the setup cell, so the
# cells must be run in order on a fresh kernel.
from nnsight import LanguageModel
print("Loading Llama-2-7b...")
# NOTE(review): `dispatch=True` presumably loads the weights eagerly rather than
# on first trace — confirm against the nnsight documentation.
model = LanguageModel('meta-llama/Llama-2-7b-hf', device_map='cuda', dispatch=True)
# Echo the architecture sizes that the OV-sum construction below relies on.
print(f"Model loaded. Config: hidden_size={model.config.hidden_size}, layers={model.config.num_hidden_layers}, heads={model.config.num_attention_heads}")

Loading Llama-2-7b...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded. Config: hidden_size=4096, layers=32, heads=32


In [3]:
# Load head ordering scores
llama2_cache = os.path.join(cache_path, 'causal_scores', 'Llama-2-7b-hf')


def _read_head_scores(filename):
    """Parse one causal-score JSON file from the Llama-2 score cache directory."""
    with open(os.path.join(llama2_cache, filename), 'r') as score_file:
        return json.load(score_file)


def _rank_heads(score_records):
    """Return (layer, head_idx, score) triples ordered from highest to lowest score."""
    triples = [(rec['layer'], rec['head_idx'], rec['score']) for rec in score_records]
    return sorted(triples, key=lambda triple: triple[2], reverse=True)


# Read both files first, then rank, mirroring the original order of operations.
concept_scores = _read_head_scores('concept_copying_len30_n1024.json')
token_scores = _read_head_scores('token_copying_len30_n1024.json')

concept_sorted = _rank_heads(concept_scores)
token_sorted = _rank_heads(token_scores)
print(f"Loaded head scores. Top concept: {concept_sorted[0]}, Top token: {token_sorted[0]}")

Loaded head scores. Top concept: (14, 1, 0.0010720472782850266), Top token: (16, 19, 0.0021728137508034706)


In [4]:
# Core functions
def get_ov_sum(model, head_ordering: str, k: int = 80):
    """Construct the lens matrix by summing per-head OV matrices.

    Args:
        model: Model wrapper exposing ``config`` (``hidden_size``,
            ``num_attention_heads``, ``num_hidden_layers``) and
            ``model.layers[l].self_attn.v_proj`` / ``o_proj`` weights.
        head_ordering: ``'raw'`` returns no lens; ``'all'`` sums every head of
            every layer; ``'concept'`` sums the top-``k`` heads of the global
            ``concept_sorted`` ranking; any other value sums the top-``k``
            heads of the global ``token_sorted`` ranking.
        k: Number of top-ranked heads to include. Ignored for ``'raw'`` and
            ``'all'``.

    Returns:
        ``None`` for ``'raw'``; otherwise a ``(hidden_size, hidden_size)``
        tensor (default dtype) located on the same device as the first
        layer's ``v_proj`` weight.
    """
    if head_ordering == 'raw':
        return None

    hidden_size = model.config.hidden_size
    head_dim = hidden_size // model.config.num_attention_heads

    if head_ordering == 'all':
        to_sum = [(l, h) for l in range(model.config.num_hidden_layers) for h in range(model.config.num_attention_heads)]
    else:
        sorted_heads = concept_sorted if head_ordering == 'concept' else token_sorted
        to_sum = [(l, h) for l, h, _ in sorted_heads][:k]

    # Size and place the accumulator from the model itself instead of assuming
    # a 4096-wide model on 'cuda', so other model sizes and CPU runs also work.
    device = model.model.layers[0].self_attn.v_proj.weight.device

    with torch.no_grad():
        ov_sum = torch.zeros((hidden_size, hidden_size), device=device)
        for l, h in to_sum:
            attn = model.model.layers[l].self_attn
            head_slice = slice(h * head_dim, (h + 1) * head_dim)
            # Rows of v_proj and columns of o_proj that belong to head h.
            V = attn.v_proj.weight[head_slice]
            O = attn.o_proj.weight[:, head_slice]
            ov_sum += torch.matmul(O, V)
    return ov_sum

def get_word_rep(word: str, model, layer_idx: int, ov_sum, prefix: str = ''):
    """Return the representation of ``prefix + word`` at one layer.

    Whitespace is stripped from both ends of ``word`` before ``prefix`` is
    prepended. The text is traced through the model, and the vector at the
    final index of the squeezed output of ``model.model.layers[layer_idx]``
    is kept (presumably the last token position — confirm against the
    layer's output shape). When ``ov_sum`` is not ``None`` the vector is
    left-multiplied by it; otherwise it is returned unchanged.
    """
    prompt = prefix + word.strip()
    with torch.no_grad():
        with model.trace(prompt):
            hidden = model.model.layers[layer_idx].output[0].squeeze()[-1].detach().save()

    if ov_sum is None:
        return hidden
    return torch.matmul(ov_sum, hidden)

def load_task(task_name: str, dataset: str = 'word2vec'):
    """Read ``<data_path>/<dataset>/<task_name>.txt`` and return its task lines.

    The first line of the file is always skipped, and any line that is empty
    or whitespace-only is dropped. Remaining lines are returned unmodified.
    """
    task_file = os.path.join(data_path, dataset, f'{task_name}.txt')
    with open(task_file, 'r') as handle:
        raw_lines = handle.read().split('\n')

    kept = []
    for entry in raw_lines[1:]:
        if entry.strip():
            kept.append(entry)
    return kept

def run_experiment(task_name: str, layer: int, head_ordering: str, k: int = 80, dataset: str = 'word2vec'):
    """Score parallelogram arithmetic on one task at one layer with one lens.

    Each task line is split on a space (``word2vec``) or a tab (any other
    dataset). Every field of every line is embedded with ``get_word_rep``
    using the lens from ``get_ov_sum``. For each four-field line
    ``a b a' b'`` the query ``a - b + b'`` is compared by cosine similarity
    with every embedded word (the query's own inputs included), and the line
    counts as correct when the most similar word is ``a'``. Lines without
    exactly four fields are not scored.

    Returns:
        ``(accuracy, total)`` where ``total`` is the number of scored lines
        and ``accuracy`` is ``0`` when nothing was scored.
    """
    delimiter = ' ' if dataset == 'word2vec' else '\t'
    rows = [entry.split(delimiter) for entry in load_task(task_name, dataset)]

    # Lens matrix (None for the 'raw' ordering)
    lens = get_ov_sum(model, head_ordering, k)

    # Embed every distinct field once
    vocab = set()
    for fields in rows:
        vocab.update(fields)
    reps = {}
    for token in vocab:
        reps[token] = get_word_rep(token, model, layer, lens)

    # Nearest-neighbour accuracy over well-formed analogies
    hits = 0
    scored = 0
    for fields in rows:
        if len(fields) != 4:
            continue
        a, b, a_prime, b_prime = fields
        query = reps[a] - reps[b] + reps[b_prime]

        def similarity(candidate):
            return torch.cosine_similarity(query, reps[candidate], dim=0).item()

        nearest = max(reps.keys(), key=similarity)
        if nearest == a_prime:
            hits += 1
        scored += 1

    # Release GPU memory held by the embeddings and the lens
    del reps, lens
    gc.collect()
    torch.cuda.empty_cache()

    accuracy = hits / scored if scored > 0 else 0
    return accuracy, scored

print("Functions defined successfully")

Functions defined successfully


## Experiment 1: Capital-Common-Countries (Semantic Task)

Expected: Concept lens should achieve ~90% accuracy, outperforming raw (~16%) and token (~7%).

In [5]:
# Test single configuration first
# Smoke test: run one (task, layer, lens) combination before the full sweep.
print("Testing capital-common-countries, layer 20, concept lens...")
# run_experiment returns (accuracy, number of scored analogy lines).
acc, n = run_experiment('capital-common-countries', layer=20, head_ordering='concept')
print(f"Result: accuracy = {acc:.4f} (n={n})")

# Compare with expected from cache
# NOTE(review): 'nn_acc' is presumably the reference nearest-neighbour accuracy
# for the same task/layer/lens — confirm against the code that wrote the cache.
# The comparison is visual only: nothing here asserts that the two values agree.
with open(os.path.join(cache_path, 'parallelograms/word2vec/no_prefix/concept/capital-common-countries/layer20_results.json'), 'r') as f:
    expected = json.load(f)
print(f"Expected: accuracy = {expected['nn_acc']:.4f}")

Testing capital-common-countries, layer 20, concept lens...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Result: accuracy = 0.8953 (n=506)
Expected: accuracy = 0.8953


In [6]:
# Run for key layers (16 and 20) across all orderings for capital task
key_layers = [16, 20]
orderings = ['raw', 'concept', 'token', 'all']
# One {layer: accuracy} table per ordering, filled in below.
capital_results = {ordering: {} for ordering in orderings}

print("Capital-Common-Countries Results:")
print("-" * 50)
for ordering, per_layer in capital_results.items():
    for layer in key_layers:
        acc, n = run_experiment('capital-common-countries', layer=layer, head_ordering=ordering)
        per_layer[layer] = acc
        print(f"{ordering:8} layer {layer}: {acc:.4f}")
print()

Capital-Common-Countries Results:
--------------------------------------------------


raw      layer 16: 0.1719


raw      layer 20: 0.1581
