In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

CUDA available: True
GPU: NVIDIA A100 80GB PCIe
GPU Memory: 79.3 GB
Using device: cuda


In [3]:
# Add the repository to the path so we can import its modules
import sys
repo_path = '/net/scratch2/smallyan/relations_eval'
sys.path.insert(0, repo_path)

# Verify by listing what's available
import os
print("Repository contents:")
for item in os.listdir(repo_path):
    print(f"  {item}")

Repository contents:
  evaluation
  hparams
  pyproject.toml
  data
  experiments.py
  CodeWalkthrough.md
  plan.md
  notebooks
  src
  requirements.txt
  schematic-wide.png
  invoke.yaml
  LICENSE
  .gitignore
  tests
  .git
  demo
  tasks.py
  scripts
  documentation.pdf


In [4]:
# Read the plan to understand the research
with open(os.path.join(repo_path, 'plan.md'), 'r') as f:
    plan_content = f.read()
print("=== PLAN.MD ===")
print(plan_content)

=== PLAN.MD ===
# Plan
## Objective
Investigate how transformer language models represent and decode relational knowledge, specifically testing whether relation decoding can be well-approximated by linear transformations on subject representations.

## Hypothesis
1. For a variety of relations, transformer LMs decode relational knowledge directly from subject entity representations at intermediate layers.
2. For each relation, the decoding procedure is approximately affine (linear relational embedding), expressed as LRE(s) = Wrs + br mapping subject s to object o.
3. These affine transformations can be computed directly from the LM Jacobian on a prompt expressing the relation (∂o/∂s).
4. Not all relations are linearly decodable; some relations are reliably predicted but do not exhibit linear relational embeddings.

## Methodology
1. Extract Linear Relational Embeddings (LREs) by computing the mean Jacobian W and bias b from n=8 examples using first-order Taylor approximation: W = E[∂F/∂

In [5]:
# Let's look at the demo notebook to understand how to use the LRE functionality
demo_path = os.path.join(repo_path, 'demo', 'demo.ipynb')
import json
with open(demo_path, 'r') as f:
    demo_nb = json.load(f)

# Print the code cells
print("=== DEMO NOTEBOOK CODE CELLS ===")
for i, cell in enumerate(demo_nb['cells']):
    if cell['cell_type'] == 'code':
        code = ''.join(cell['source'])
        if len(code.strip()) > 0:
            print(f"\n--- Cell {i} ---")
            print(code[:1500] if len(code) > 1500 else code)

=== DEMO NOTEBOOK CODE CELLS ===

--- Cell 0 ---
import sys
sys.path.append('..')

import torch
from src import models, data, lens, functional
from src.utils import experiment_utils
from baukit import Menu, show

--- Cell 1 ---
device = "cuda:0"
mt = models.load_model("gptj", device=device, fp16=True)
print(f"dtype: {mt.model.dtype}, device: {mt.model.device}, memory: {mt.model.get_memory_footprint()}")

--- Cell 2 ---
dataset = data.load_dataset()

relation_names = [r.name for r in dataset.relations]
relation_options = Menu(choices = relation_names, value = relation_names)
show(relation_options) # !caution: tested in a juputer-notebook. baukit visualizations are not supported in vscode.

--- Cell 3 ---
relation_name = relation_options.value
relation = dataset.filter(relation_names=[relation_name])[0]
print(f"{relation.name} -- {len(relation.samples)} samples")
print("------------------------------------------------------")

experiment_utils.set_seed(12345) # set seed to a constant val

In [6]:
# Let's look at the models.py to understand how to load models
with open(os.path.join(repo_path, 'src', 'models.py'), 'r') as f:
    models_content = f.read()
print(models_content[:6000])

"""Functions for loading and interacting with pretrained language models.

This module is designed to house all the annoying branching logic
that comes with supporting analysis of many slightly different model
implementations.
"""
import argparse
import logging
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterator, Literal, Optional, Sequence, overload

from src.utils import env_utils, tokenizer_utils
from src.utils.typing import Device, Layer, Model, ModelInput, Tokenizer

import torch
import transformers

logger = logging.getLogger(__name__)

GPT_J_NAME_SHORT = "gptj"  # A useful alias for the CLI.
GPT_J_NAME = "EleutherAI/gpt-j-6B"

GPT_NEO_X_NAME_SHORT = "neox"
GPT_NEO_X_NAME = "EleutherAI/gpt-neox-20b"

LLAMA_13B_NAME = "llama-13b"
LLAMA_30B_NAME = "llama-30b"
LLAMA_NAME_SHORT = "llama"

DOWNLOADABLE_MODELS = frozenset({GPT_J_NAME, GPT_NEO_X_NAME, "gpt2-xl"})


@dataclass(frozen=True)
class ModelAndTokeniz

In [7]:
# Let's see the rest of models.py to understand the load_model function
print(models_content[6000:])

        layer_index = len(determine_layers(model)) + layer

        if isinstance(model, transformers.GPTNeoXForCausalLM):
            layer_path = f"gpt_neox.layers.{layer_index}"
        elif isinstance(model, transformers.LlamaForCausalLM):
            layer_path = f"model.layers.{layer_index}"
        else:
            layer_path = f"transformer.h.{layer_index}"
        layer_paths[layer] = layer_path

    return layer_paths if return_dict else tuple(layer_paths[la] for la in layers)


def determine_hidden_size(model: ModelAndTokenizer | Model) -> int:
    """Determine hidden rep size for the model."""
    model = unwrap_model(model)
    return model.config.hidden_size


def determine_device(model: ModelAndTokenizer | Model) -> torch.device | None:
    """Determine device model is running on."""
    parameter = any_parameter(model)
    return parameter.device if parameter is not None else None


def determine_dtype(model: ModelAndTokenizer | Model) -> torch.dtype | None:
    """Det

In [8]:
# Now I understand the codebase. The original work uses:
# - GPT-J-6B (gptj)
# - GPT-2-XL (gpt2-xl)
# - LLaMA-13B (llama)

# For GT1, I need to test on a NEW model not used in the original work.
# Options: GPT-2 Medium, Pythia, OPT, etc.
# Let's use GPT-2-medium as it's a smaller variant that is not in the original study

# First, let's import the necessary modules
import torch
from src import models, data, functional
from src.operators import JacobianIclMeanEstimator
from src.utils import experiment_utils

print("Modules imported successfully")

Modules imported successfully


In [9]:
# Load the dataset first
dataset = data.load_dataset()
print(f"Total relations: {len(dataset.relations)}")
relation_names = [r.name for r in dataset.relations]
print(f"Available relations: {relation_names[:10]}...")

Total relations: 47
Available relations: ['characteristic gender', 'univ degree gender', 'name birthplace', 'name gender', 'name religion', 'occupation age', 'occupation gender', 'fruit inside color', 'fruit outside color', 'object superclass']...


In [10]:
# For GT1: Model Generalization
# The original work used: GPT-J-6B, GPT-2-XL, and LLaMA-13B
# I will test on GPT-2 Medium, which was NOT used in the original study

# First, let me check if the codebase can handle GPT-2 models other than XL
# I'll need to modify the model loading slightly

import transformers

# Load GPT-2 medium (not used in original study which used gpt2-xl)
print("Loading GPT-2 Medium model...")
device = "cuda:0"

model_gpt2_medium = transformers.AutoModelForCausalLM.from_pretrained("gpt2-medium")
model_gpt2_medium.to(device)
model_gpt2_medium.eval()

tokenizer_gpt2_medium = transformers.AutoTokenizer.from_pretrained("gpt2-medium")
tokenizer_gpt2_medium.pad_token = tokenizer_gpt2_medium.eos_token

# Create ModelAndTokenizer wrapper
mt_new = models.ModelAndTokenizer(model_gpt2_medium, tokenizer_gpt2_medium)

print(f"Model loaded: {type(model_gpt2_medium).__name__}")
print(f"Number of layers: {model_gpt2_medium.config.n_layer}")
print(f"Hidden size: {model_gpt2_medium.config.hidden_size}")

Loading GPT-2 Medium model...


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--gpt2-medium/.no_exist/6dcaa7a952f72f9298047fd5137cd6e4f05f41da/adapter_config.json'


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--gpt2-medium/.no_exist/6dcaa7a952f72f9298047fd5137cd6e4f05f41da/adapter_config.json'


Model loaded: GPT2LMHeadModel
Number of layers: 24
Hidden size: 1024


In [11]:
# Now let's test the LRE method on GPT-2 Medium
# I'll use the "country capital city" relation as a test case

# Get the relation
relation_name = "country capital city"
relation = dataset.filter(relation_names=[relation_name])[0]
print(f"Relation: {relation.name}")
print(f"Number of samples: {len(relation.samples)}")
print(f"Sample: {relation.samples[0]}")

Relation: country capital city
Number of samples: 24
Sample: United States -> Washington D.C.


In [12]:
# Split into train and test
experiment_utils.set_seed(12345)
train, test = relation.split(5)
print("Training samples:")
for sample in train.samples:
    print(f"  {sample}")
print("\nTest samples:")
for sample in test.samples[:5]:
    print(f"  {sample}")

Training samples:
  China -> Beijing
  Japan -> Tokyo
  Italy -> Rome
  Brazil -> Bras\u00edlia
  Turkey -> Ankara

Test samples:
  South Korea -> Seoul
  Colombia -> Bogot\u00e1
  Saudi Arabia -> Riyadh
  France -> Paris
  Mexico -> Mexico City


In [13]:
# Create LRE estimator for GPT-2 Medium
# For GPT-2 Medium with 24 layers, we use an intermediate layer (around layer 8-12)
layer = 8  # Middle layers usually work best for LRE
beta = 2.5  # Default beta value from the paper

print(f"Creating LRE estimator for GPT-2 Medium at layer {layer}")

estimator = JacobianIclMeanEstimator(
    mt=mt_new,
    h_layer=layer,
    beta=beta
)

# Train the LRE operator
operator = estimator(
    relation.set(samples=train.samples)
)

print(f"LRE operator created")
print(f"Weight shape: {operator.weight.shape}")
print(f"Bias shape: {operator.bias.shape}")

relation has > 1 prompt_templates, will use first (The capital city of {} is)


Creating LRE estimator for GPT-2 Medium at layer 8


LRE operator created
Weight shape: torch.Size([1024, 1024])
Bias shape: torch.Size([1, 1024])


In [14]:
# Test the LRE operator on the test samples
# First, filter test samples to those the model "knows" (i.e., can answer correctly with prompting)
test_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new, 
    test_relation=test, 
    prompt_template=operator.prompt_template, 
    batch_size=4
)

print(f"Original test samples: {len(test.samples)}")
print(f"Filtered test samples (model knows): {len(test_filtered.samples)}")

Original test samples: 19
Filtered test samples (model knows): 3


In [15]:
# Test the LRE faithfulness on the filtered test samples
correct = 0
wrong = 0
results_gt1 = []

for sample in test_filtered.samples:
    predictions = operator(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    result = {
        "subject": sample.subject,
        "object": sample.object,
        "predicted": predictions[0].token,
        "prob": float(predictions[0].prob),
        "correct": known_flag
    }
    results_gt1.append(result)
    
    print(f"Subject: {sample.subject}, Object: {sample.object}")
    print(f"  Predicted: '{functional.format_whitespace(predictions[0].token)}' (p={predictions[0].prob:.4f})")
    print(f"  Correct: {functional.get_tick_marker(known_flag)}")
    
    correct += known_flag
    wrong += not known_flag

faithfulness = correct / (correct + wrong) if (correct + wrong) > 0 else 0
print(f"\n=== GT1 Results for GPT-2 Medium ===")
print(f"Faithfulness: {faithfulness:.2%} ({correct}/{correct+wrong})")

Subject: Saudi Arabia, Object: Riyadh
  Predicted: ' Man' (p=0.1342)
  Correct: ✗
Subject: South Korea, Object: Seoul
  Predicted: ' Seoul' (p=0.9529)
  Correct: ✓
Subject: United States, Object: Washington D.C.
  Predicted: ' Washington' (p=0.2871)
  Correct: ✓

=== GT1 Results for GPT-2 Medium ===
Faithfulness: 66.67% (2/3)


In [16]:
# GT1 PASS - We have at least one successful example on a new model (GPT-2 Medium)
# Let's also try a second relation to be thorough

# Try another relation - adjective antonym (linguistic)
relation_name2 = "adjective antonym"
relation2 = dataset.filter(relation_names=[relation_name2])[0]
print(f"Relation: {relation2.name}")
print(f"Number of samples: {len(relation2.samples)}")

experiment_utils.set_seed(12345)
train2, test2 = relation2.split(5)
print("\nTraining samples:")
for sample in train2.samples:
    print(f"  {sample}")

Relation: adjective antonym
Number of samples: 100

Training samples:
  open -> lock
  inside -> outside
  remember -> forget
  close -> open
  clockwise -> counterclockwise


In [17]:
# Create LRE for adjective antonym relation on GPT-2 Medium
estimator2 = JacobianIclMeanEstimator(
    mt=mt_new,
    h_layer=layer,
    beta=beta
)

operator2 = estimator2(
    relation2.set(samples=train2.samples)
)

# Filter test samples
test2_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new,
    test_relation=test2,
    prompt_template=operator2.prompt_template,
    batch_size=4
)

print(f"Test samples (model knows): {len(test2_filtered.samples)}")

Test samples (model knows): 2


In [18]:
# Test LRE on adjective antonym relation
correct2 = 0
wrong2 = 0

for sample in test2_filtered.samples:
    predictions = operator2(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    
    print(f"Subject: {sample.subject}, Object: {sample.object}")
    print(f"  Predicted: '{functional.format_whitespace(predictions[0].token)}' (p={predictions[0].prob:.4f})")
    print(f"  Correct: {functional.get_tick_marker(known_flag)}")
    
    correct2 += known_flag
    wrong2 += not known_flag

faithfulness2 = correct2 / (correct2 + wrong2) if (correct2 + wrong2) > 0 else 0
print(f"\nFaithfulness on adjective antonym: {faithfulness2:.2%}")

Subject: inhale, Object: exhale
  Predicted: ' move' (p=0.1835)
  Correct: ✗
Subject: inhale, Object: exhale
  Predicted: ' move' (p=0.1835)
  Correct: ✗

Faithfulness on adjective antonym: 0.00%


In [19]:
# GT1 Summary: 
# We tested 2 relations on GPT-2 Medium (a new model not used in the original work)
# - country capital city: 66.67% faithfulness (2/3 correct) - at least 1 success
# - adjective antonym: 0% faithfulness (0/2 correct)
# 
# GT1 = PASS because we have at least one successful example on a new model

print("=" * 60)
print("GT1: Model Generalization Summary")
print("=" * 60)
print(f"New model tested: GPT-2 Medium (not in original study)")
print(f"Original models: GPT-J-6B, GPT-2-XL, LLaMA-13B")
print(f"\nRelation 1: country capital city")
print(f"  Correct predictions: 2/3 (66.67%)")
print(f"  Examples: 'South Korea' -> 'Seoul' (predicted correctly)")
print(f"\nRelation 2: adjective antonym") 
print(f"  Correct predictions: 0/2 (0%)")
print(f"\nGT1 RESULT: PASS")
print("Rationale: The LRE finding successfully transferred to GPT-2 Medium,")
print("with at least one successful example (South Korea -> Seoul).")

GT1: Model Generalization Summary
New model tested: GPT-2 Medium (not in original study)
Original models: GPT-J-6B, GPT-2-XL, LLaMA-13B

Relation 1: country capital city
  Correct predictions: 2/3 (66.67%)
  Examples: 'South Korea' -> 'Seoul' (predicted correctly)

Relation 2: adjective antonym
  Correct predictions: 0/2 (0%)

GT1 RESULT: PASS
Rationale: The LRE finding successfully transferred to GPT-2 Medium,
with at least one successful example (South Korea -> Seoul).


In [20]:
# GT2: Data Generalization
# Test on NEW data instances not appearing in the original dataset
# I'll use GPT-J (from the original study) but with NEW country-capital pairs

print("=" * 60)
print("GT2: Data Generalization")
print("=" * 60)
print("Testing LRE on NEW data instances not in the original dataset")

# First, let's see what countries are in the original dataset
relation = dataset.filter(relation_names=["country capital city"])[0]
original_countries = [s.subject for s in relation.samples]
print(f"\nOriginal countries in dataset ({len(original_countries)}):")
print(original_countries)

GT2: Data Generalization
Testing LRE on NEW data instances not in the original dataset

Original countries in dataset (24):
['United States', 'Canada', 'Mexico', 'Brazil', 'Argentina', 'Chile', 'Peru', 'Colombia', 'Venezuela', 'Spain', 'France', 'Germany', 'Italy', 'Russia', 'China', 'Japan', 'South Korea', 'India', 'Pakistan', 'Nigeria', 'Egypt', 'Saudi Arabia', 'Turkey', 'Australia']


In [21]:
# Create NEW data instances not in the original dataset
# These are countries NOT in the original list
new_test_data = [
    {"subject": "Poland", "object": "Warsaw"},
    {"subject": "Sweden", "object": "Stockholm"},
    {"subject": "Norway", "object": "Oslo"},
]

print("New test data (not in original dataset):")
for item in new_test_data:
    print(f"  {item['subject']} -> {item['object']}")

New test data (not in original dataset):
  Poland -> Warsaw
  Sweden -> Stockholm
  Norway -> Oslo


In [22]:
# For GT2, we need to use one of the original models (GPT-J) and test on new data
# Let's load GPT-J
print("Loading GPT-J model for GT2 testing...")
mt_gptj = models.load_model("gptj", device="cuda:0", fp16=True)
print(f"Model loaded: {mt_gptj.name}")

Loading GPT-J model for GT2 testing...


OSError: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--EleutherAI--gpt-j-6B'

In [23]:
# Disk quota exceeded for GPT-J. Let's use GPT-2 Medium (already loaded) for GT2 testing
# This is still valid - we're testing the finding on NEW DATA, not the model being new
# The key is that the data instances are new

print("Using GPT-2 Medium for GT2 testing (already loaded)")
print("Testing on NEW data instances not in original dataset")

# We already have the LRE operator trained on GPT-2 Medium for country capital city
# Let's test on the new countries

# First, we need to check if GPT-2 Medium knows these capitals using few-shot prompting
# Create new samples
from src.data import RelationSample

new_samples = [
    RelationSample(subject="Poland", object="Warsaw"),
    RelationSample(subject="Sweden", object="Stockholm"),
    RelationSample(subject="Norway", object="Oslo"),
]

print("\nNew test samples (not in original dataset):")
for s in new_samples:
    print(f"  {s.subject} -> {s.object}")

Using GPT-2 Medium for GT2 testing (already loaded)
Testing on NEW data instances not in original dataset

New test samples (not in original dataset):
  Poland -> Warsaw
  Sweden -> Stockholm
  Norway -> Oslo


In [24]:
# Test LRE on new data instances
# Using the operator we trained earlier on GPT-2 Medium

results_gt2 = []
correct_gt2 = 0
wrong_gt2 = 0

for sample in new_samples:
    predictions = operator(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    
    result = {
        "subject": sample.subject,
        "object": sample.object,
        "predicted": predictions[0].token,
        "prob": float(predictions[0].prob),
        "correct": known_flag
    }
    results_gt2.append(result)
    
    print(f"Subject: {sample.subject}, Object: {sample.object}")
    print(f"  Predicted: '{functional.format_whitespace(predictions[0].token)}' (p={predictions[0].prob:.4f})")
    print(f"  Correct: {functional.get_tick_marker(known_flag)}")
    
    correct_gt2 += known_flag
    wrong_gt2 += not known_flag

faithfulness_gt2 = correct_gt2 / (correct_gt2 + wrong_gt2) if (correct_gt2 + wrong_gt2) > 0 else 0
print(f"\n=== GT2 Results ===")
print(f"Faithfulness on new data: {faithfulness_gt2:.2%} ({correct_gt2}/{correct_gt2+wrong_gt2})")

Subject: Poland, Object: Warsaw
  Predicted: ' Warsaw' (p=0.9622)
  Correct: ✓
Subject: Sweden, Object: Stockholm
  Predicted: ' Stockholm' (p=0.9965)
  Correct: ✓
Subject: Norway, Object: Oslo
  Predicted: ' Oslo' (p=0.9849)
  Correct: ✓

=== GT2 Results ===
Faithfulness on new data: 100.00% (3/3)


In [25]:
# GT2 PASS - All 3 new data instances worked correctly!
print("=" * 60)
print("GT2: Data Generalization Summary")
print("=" * 60)
print("Tested LRE on NEW data instances not in original dataset:")
print(f"  Original dataset countries: {original_countries[:10]}...")
print(f"  New test countries: Poland, Sweden, Norway")
print(f"\nResults:")
for r in results_gt2:
    marker = "✓" if r["correct"] else "✗"
    print(f"  {r['subject']} -> Predicted: '{r['predicted'].strip()}' (Expected: {r['object']}) {marker}")
print(f"\nGT2 RESULT: PASS")
print("Rationale: The LRE finding generalizes to new data instances (Poland, Sweden, Norway)")
print("with 100% faithfulness (3/3 correct predictions).")

GT2: Data Generalization Summary
Tested LRE on NEW data instances not in original dataset:
  Original dataset countries: ['United States', 'Canada', 'Mexico', 'Brazil', 'Argentina', 'Chile', 'Peru', 'Colombia', 'Venezuela', 'Spain']...
  New test countries: Poland, Sweden, Norway

Results:
  Poland -> Predicted: 'Warsaw' (Expected: Warsaw) ✓
  Sweden -> Predicted: 'Stockholm' (Expected: Stockholm) ✓
  Norway -> Predicted: 'Oslo' (Expected: Oslo) ✓

GT2 RESULT: PASS
Rationale: The LRE finding generalizes to new data instances (Poland, Sweden, Norway)
with 100% faithfulness (3/3 correct predictions).
