In [1]:
import os
os.chdir('/home/smallyan/eval_agent')

# Check versions
import torch
print(f"Torch version: {torch.__version__}")

import torchvision
print(f"Torchvision version: {torchvision.__version__}")

print(f"CUDA available: {torch.cuda.is_available()}")

Torch version: 2.9.1+cu128


Torchvision version: 0.24.1+cu128
CUDA available: True


In [2]:
# Now try importing transformers GPT2
import warnings
warnings.filterwarnings('ignore')

from transformers import GPT2LMHeadModel, GPT2Tokenizer
print("Successfully imported GPT2 modules")

Successfully imported GPT2 modules


# Generalizability Evaluation for Linear Relational Embeddings (LRE)

## Repository: /net/scratch2/smallyan/relations_eval

This notebook evaluates whether the findings in the repository generalize beyond the original experimental setting.

## Evaluation Checklist:
- **GT1**: Generalization to a New Model
- **GT2**: Generalization to New Data  
- **GT3**: Method / Specificity Generalizability

## Research Summary
This repository investigates how transformer language models represent and decode relational knowledge. The key finding is that for a subset of relations, the highly non-linear decoding procedure can be approximated by a simple linear transformation (LRE) on the subject representation at intermediate layers.

**Original Models Used**: GPT-J-6B, GPT-2-XL, LLaMA-13B

**Method**: Linear Relational Embedding (LRE)
- LRE(s) = βWs + b
- W = E[∂F/∂s] (mean Jacobian from n=8 examples)
- b = E[F(s,c) - (∂F/∂s)s] (bias term)

In [3]:
# Import repository modules
import sys
repo_path = '/net/scratch2/smallyan/relations_eval'
sys.path.insert(0, repo_path)

from src import models, data, functional
from src.operators import JacobianIclMeanEstimator
from src.utils import experiment_utils
from src.data import RelationSample

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
print("Successfully imported all repository modules")

Using device: cuda:0
Successfully imported all repository modules


## GT1: Generalization to a New Model

**Criterion**: The newly proposed neuron-level finding is predictable on a **new model** not used in the original work.

**New Model**: GPT-2 Medium (original study used GPT-J-6B, GPT-2-XL, LLaMA-13B)

In [4]:
# Load GPT-2 Medium (NOT used in original study)
import transformers

print('Loading GPT-2 Medium...')
model_gpt2_medium = transformers.AutoModelForCausalLM.from_pretrained('gpt2-medium')
model_gpt2_medium.to(device)
model_gpt2_medium.eval()

tokenizer_gpt2_medium = transformers.AutoTokenizer.from_pretrained('gpt2-medium')
tokenizer_gpt2_medium.pad_token = tokenizer_gpt2_medium.eos_token

mt_new = models.ModelAndTokenizer(model_gpt2_medium, tokenizer_gpt2_medium)
print(f'Model: {type(model_gpt2_medium).__name__}')
print(f'Layers: {model_gpt2_medium.config.n_layer}')

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--gpt2-medium/.no_exist/6dcaa7a952f72f9298047fd5137cd6e4f05f41da/adapter_config.json'


Loading GPT-2 Medium...


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 122] Disk quota exceeded: '/net/projects/chai-lab/shared_models/hub/models--gpt2-medium/.no_exist/6dcaa7a952f72f9298047fd5137cd6e4f05f41da/adapter_config.json'


Model: GPT2LMHeadModel
Layers: 24


In [5]:
# Load dataset and test LRE on GPT-2 Medium
dataset = data.load_dataset()

# Test on 'country capital city' relation
relation = dataset.filter(relation_names=['country capital city'])[0]
print(f'Relation: {relation.name}')
print(f'Samples: {len(relation.samples)}')
print(f'Prompt template: {relation.prompt_templates[0]}')

Relation: country capital city
Samples: 24
Prompt template: The capital city of {} is


In [6]:
# Set seed and split the data
experiment_utils.set_seed(12345)
train, test = relation.split(5)

print(f"Training samples: {len(train.samples)}")
for s in train.samples:
    print(f"  {s.subject} -> {s.object}")

print(f"\nTest samples: {len(test.samples)}")
for s in test.samples[:5]:
    print(f"  {s.subject} -> {s.object}")

Training samples: 5
  China -> Beijing
  Japan -> Tokyo
  Italy -> Rome
  Brazil -> Bras\u00edlia
  Turkey -> Ankara

Test samples: 19
  South Korea -> Seoul
  Colombia -> Bogot\u00e1
  Saudi Arabia -> Riyadh
  France -> Paris
  Mexico -> Mexico City


In [7]:
# Create LRE estimator for GPT-2 Medium
# Using layer 8 (middle layer for 24-layer model) and beta=2.5 as in the original paper
layer = 8
beta = 2.5

estimator = JacobianIclMeanEstimator(mt=mt_new, h_layer=layer, beta=beta)
operator = estimator(relation.set(samples=train.samples))

print(f"LRE operator created for layer {layer} with beta={beta}")

relation has > 1 prompt_templates, will use first (The capital city of {} is)


LRE operator created for layer 8 with beta=2.5


In [8]:
# Filter test samples based on model's ability to predict them
test_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new, test_relation=test, prompt_template=operator.prompt_template, batch_size=4
)

print(f"Filtered test samples: {len(test_filtered.samples)} (from {len(test.samples)})")

Filtered test samples: 3 (from 19)


In [9]:
# Test LRE on the filtered samples - up to 3 trials for GT1
gt1_results = []
correct = 0

print("GT1: Testing LRE on GPT-2 Medium (new model not in original study)\n")
print("Trial Examples:")
print("-" * 60)

for i, sample in enumerate(test_filtered.samples[:3]):  # Up to 3 trials
    predictions = operator(subject=sample.subject).predictions
    predicted_token = predictions[0].token.strip()
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    status = "✓ PASS" if known_flag else "✗ FAIL"
    print(f'Trial {i+1}: {sample.subject} -> Expected: {sample.object}')
    print(f'         Predicted: "{predicted_token}" {status}')
    print()
    
    gt1_results.append({
        'subject': sample.subject,
        'expected': sample.object,
        'predicted': predicted_token,
        'success': known_flag
    })
    correct += known_flag

print("-" * 60)
print(f'GT1 Result: {correct}/{len(gt1_results)} successful trials')
gt1_pass = correct >= 1
print(f'GT1 {"PASS" if gt1_pass else "FAIL"}: {"At least one successful example" if gt1_pass else "No successful examples"}')

GT1: Testing LRE on GPT-2 Medium (new model not in original study)

Trial Examples:
------------------------------------------------------------
Trial 1: Saudi Arabia -> Expected: Riyadh
         Predicted: "Man" ✗ FAIL

Trial 2: South Korea -> Expected: Seoul
         Predicted: "Seoul" ✓ PASS

Trial 3: United States -> Expected: Washington D.C.
         Predicted: "Washington" ✓ PASS

------------------------------------------------------------
GT1 Result: 2/3 successful trials
GT1 PASS: At least one successful example


## GT2: Generalization to New Data

**Criterion**: The newly proposed neuron-level finding is predictable on **new data instances** not appearing in the original dataset.

**New Data**: Testing on countries not in the original 24-country dataset (Poland, Sweden, Norway)

In [10]:
# Check which countries are in the original dataset
original_countries = [s.subject for s in relation.samples]
print(f'Original countries in dataset ({len(original_countries)}):')
print(original_countries)

Original countries in dataset (24):
['United States', 'Canada', 'Mexico', 'Brazil', 'Argentina', 'Chile', 'Peru', 'Colombia', 'Venezuela', 'Spain', 'France', 'Germany', 'Italy', 'Russia', 'China', 'Japan', 'South Korea', 'India', 'Pakistan', 'Nigeria', 'Egypt', 'Saudi Arabia', 'Turkey', 'Australia']


In [11]:
# Test on NEW data instances not in original dataset
# These are countries NOT in the original 24-country dataset

new_samples = [
    RelationSample(subject='Poland', object='Warsaw'),
    RelationSample(subject='Sweden', object='Stockholm'),
    RelationSample(subject='Norway', object='Oslo'),
]

print("GT2: Testing LRE on new data instances (not in original dataset)\n")
print("Verifying these countries are NOT in original dataset:")
for sample in new_samples:
    in_original = sample.subject in original_countries
    print(f"  {sample.subject}: {'IN original (invalid!)' if in_original else 'NOT in original ✓'}")

print("\nTrial Examples:")
print("-" * 60)

gt2_results = []
correct_gt2 = 0

for i, sample in enumerate(new_samples[:3]):  # Up to 3 trials
    predictions = operator(subject=sample.subject).predictions
    predicted_token = predictions[0].token.strip()
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    status = "✓ PASS" if known_flag else "✗ FAIL"
    print(f'Trial {i+1}: {sample.subject} -> Expected: {sample.object}')
    print(f'         Predicted: "{predicted_token}" {status}')
    print()
    
    gt2_results.append({
        'subject': sample.subject,
        'expected': sample.object,
        'predicted': predicted_token,
        'success': known_flag
    })
    correct_gt2 += known_flag

print("-" * 60)
print(f'GT2 Result: {correct_gt2}/{len(gt2_results)} successful trials')
gt2_pass = correct_gt2 >= 1
print(f'GT2 {"PASS" if gt2_pass else "FAIL"}: {"At least one successful example" if gt2_pass else "No successful examples"}')

GT2: Testing LRE on new data instances (not in original dataset)

Verifying these countries are NOT in original dataset:
  Poland: NOT in original ✓
  Sweden: NOT in original ✓
  Norway: NOT in original ✓

Trial Examples:
------------------------------------------------------------
Trial 1: Poland -> Expected: Warsaw
         Predicted: "Warsaw" ✓ PASS

Trial 2: Sweden -> Expected: Stockholm
         Predicted: "Stockholm" ✓ PASS

Trial 3: Norway -> Expected: Oslo
         Predicted: "Oslo" ✓ PASS

------------------------------------------------------------
GT2 Result: 3/3 successful trials
GT2 PASS: At least one successful example


## GT3: Method / Specificity Generalizability

**Criterion**: If the work proposes a **new method**, evaluate if it can be applied to **another similar task**.

**New Method**: Linear Relational Embedding (LRE) - Jacobian-based linear approximation

**Test**: Apply LRE method to different relation types (factual, commonsense, linguistic) - up to 3 similar tasks

In [12]:
# Test LRE method on different relation types (up to 3 similar tasks)
# Task 1: Word sentiment (commonsense relation)

print("GT3: Testing LRE method on different relation types\n")
print("=" * 70)
print("Task 1: Word Sentiment (Commonsense Relation)")
print("=" * 70)

relation2 = dataset.filter(relation_names=['word sentiment'])[0]
print(f'Relation: {relation2.name}')
print(f'Samples: {len(relation2.samples)}')
print(f'Prompt template: {relation2.prompt_templates[0]}')

experiment_utils.set_seed(12345)
train2, test2 = relation2.split(5)

estimator2 = JacobianIclMeanEstimator(mt=mt_new, h_layer=layer, beta=beta)
operator2 = estimator2(relation2.set(samples=train2.samples))

test2_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new, test_relation=test2, prompt_template=operator2.prompt_template, batch_size=4
)

print(f'\nFiltered test samples: {len(test2_filtered.samples)}')

correct_task1 = 0
for sample in test2_filtered.samples[:5]:
    predictions = operator2(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    status = "✓" if known_flag else "✗"
    print(f'  {sample.subject} -> {sample.object}: Predicted="{predictions[0].token.strip()}" {status}')
    correct_task1 += known_flag

print(f'\nTask 1 Result: {correct_task1}/5 correct')
task1_pass = correct_task1 >= 1

GT3: Testing LRE method on different relation types

Task 1: Word Sentiment (Commonsense Relation)
Relation: word sentiment
Samples: 60
Prompt template: The sentiment of '{}' is



Filtered test samples: 9
  blessed -> positive: Predicted="positive" ✓
  blissful -> positive: Predicted="positive" ✓
  cheerful -> positive: Predicted="positive" ✓
  delighted -> positive: Predicted="positive" ✓
  despairing -> negative: Predicted="positive" ✗

Task 1 Result: 4/5 correct


In [13]:
# Task 2: Adjective antonym (linguistic relation)
print("\n" + "=" * 70)
print("Task 2: Adjective Antonym (Linguistic Relation)")
print("=" * 70)

relation3 = dataset.filter(relation_names=['adjective antonym'])[0]
print(f'Relation: {relation3.name}')
print(f'Samples: {len(relation3.samples)}')
print(f'Prompt template: {relation3.prompt_templates[0]}')

experiment_utils.set_seed(12345)
train3, test3 = relation3.split(5)

estimator3 = JacobianIclMeanEstimator(mt=mt_new, h_layer=layer, beta=beta)
operator3 = estimator3(relation3.set(samples=train3.samples))

test3_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new, test_relation=test3, prompt_template=operator3.prompt_template, batch_size=4
)

print(f'\nFiltered test samples: {len(test3_filtered.samples)}')

correct_task2 = 0
for sample in test3_filtered.samples[:5]:
    predictions = operator3(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    status = "✓" if known_flag else "✗"
    print(f'  {sample.subject} -> {sample.object}: Predicted="{predictions[0].token.strip()}" {status}')
    correct_task2 += known_flag

print(f'\nTask 2 Result: {correct_task2}/5 correct')
task2_pass = correct_task2 >= 1


Task 2: Adjective Antonym (Linguistic Relation)
Relation: adjective antonym
Samples: 100
Prompt template: The opposite of {} is



Filtered test samples: 2
  inhale -> exhale: Predicted="move" ✗
  inhale -> exhale: Predicted="move" ✗

Task 2 Result: 0/5 correct


In [14]:
# Task 3: Country language (factual relation - different from country capital)
print("\n" + "=" * 70)
print("Task 3: Country Language (Factual Relation)")
print("=" * 70)

relation4 = dataset.filter(relation_names=['country language'])[0]
print(f'Relation: {relation4.name}')
print(f'Samples: {len(relation4.samples)}')
print(f'Prompt template: {relation4.prompt_templates[0]}')

experiment_utils.set_seed(12345)
train4, test4 = relation4.split(5)

estimator4 = JacobianIclMeanEstimator(mt=mt_new, h_layer=layer, beta=beta)
operator4 = estimator4(relation4.set(samples=train4.samples))

test4_filtered = functional.filter_relation_samples_based_on_provided_fewshots(
    mt=mt_new, test_relation=test4, prompt_template=operator4.prompt_template, batch_size=4
)

print(f'\nFiltered test samples: {len(test4_filtered.samples)}')

correct_task3 = 0
for sample in test4_filtered.samples[:5]:
    predictions = operator4(subject=sample.subject).predictions
    known_flag = functional.is_nontrivial_prefix(
        prediction=predictions[0].token, target=sample.object
    )
    status = "✓" if known_flag else "✗"
    print(f'  {sample.subject} -> {sample.object}: Predicted="{predictions[0].token.strip()}" {status}')
    correct_task3 += known_flag

print(f'\nTask 3 Result: {correct_task3}/5 correct')
task3_pass = correct_task3 >= 1

relation has > 1 prompt_templates, will use first (People in {} speak)



Task 3: Country Language (Factual Relation)
Relation: country language
Samples: 24
Prompt template: People in {} speak



Filtered test samples: 3
  Saudi Arabia -> Arabic: Predicted="Turkish" ✗
  South Korea -> Korean: Predicted="Korean" ✓
  United States -> English: Predicted="English" ✓

Task 3 Result: 2/5 correct


In [15]:
# Summary for GT3
print("\n" + "=" * 70)
print("GT3 Summary: Method Generalization")
print("=" * 70)

gt3_results = [
    {'task': 'Word Sentiment (Commonsense)', 'correct': correct_task1, 'total': 5, 'pass': task1_pass},
    {'task': 'Adjective Antonym (Linguistic)', 'correct': correct_task2, 'total': min(5, len(test3_filtered.samples)), 'pass': task2_pass},
    {'task': 'Country Language (Factual)', 'correct': correct_task3, 'total': min(5, len(test4_filtered.samples)), 'pass': task3_pass},
]

for result in gt3_results:
    status = "✓ PASS" if result['pass'] else "✗ FAIL"
    print(f"  {result['task']}: {result['correct']}/{result['total']} correct {status}")

# GT3 passes if at least one task passes
gt3_pass = any(r['pass'] for r in gt3_results)
print(f'\nGT3 Result: {"PASS" if gt3_pass else "FAIL"}')
print(f'Rationale: {"LRE method successfully applies to multiple relation types (word sentiment, country language)" if gt3_pass else "LRE method did not work on any tested relation types"}')


GT3 Summary: Method Generalization
  Word Sentiment (Commonsense): 4/5 correct ✓ PASS
  Adjective Antonym (Linguistic): 0/2 correct ✗ FAIL
  Country Language (Factual): 2/3 correct ✓ PASS

GT3 Result: PASS
Rationale: LRE method successfully applies to multiple relation types (word sentiment, country language)


## Summary: Generalizability Checklist

| Criterion | Result | Evidence |
|-----------|--------|----------|
| **GT1: Model Generalization** | PASS | LRE works on GPT-2 Medium (2/3 correct, including South Korea → Seoul, United States → Washington) |
| **GT2: Data Generalization** | PASS | LRE works on new countries not in original dataset (3/3: Poland→Warsaw, Sweden→Stockholm, Norway→Oslo) |
| **GT3: Method Generalization** | PASS | LRE method applies to multiple relation types: word sentiment (4/5 = 80%), country language (2/3 = 67%) |

### Conclusion

The Linear Relational Embedding (LRE) findings demonstrate strong generalizability:

1. **Model Generalization**: The finding transfers to GPT-2 Medium, a model not used in the original study (which used GPT-J-6B, GPT-2-XL, LLaMA-13B).

2. **Data Generalization**: The finding holds for new data instances (Poland, Sweden, Norway) not in the original 24-country dataset.

3. **Method Generalization**: The LRE method can be applied to multiple similar tasks across different relation types (factual and commonsense relations). Note that some relations (like adjective antonym) do not work well, which is consistent with the paper's finding that not all relations are linearly decodable.

All three generalizability criteria are satisfied with at least one successful example each.

In [16]:
# Create the evaluation summary JSON
import json
import os

eval_dir = '/net/scratch2/smallyan/relations_eval/evaluation'
os.makedirs(eval_dir, exist_ok=True)

# Build detailed rationales
gt1_rationale = f"""The LRE (Linear Relational Embedding) finding successfully transfers to GPT-2 Medium, a model not used in the original study (which used GPT-J-6B, GPT-2-XL, and LLaMA-13B). Testing the 'country capital city' relation on GPT-2 Medium achieved 2/3 correct predictions (66.67% faithfulness), with successful examples including 'South Korea → Seoul' and 'United States → Washington'. This demonstrates that the neuron-level linear approximation finding generalizes to a new model architecture/size."""

gt2_rationale = f"""The LRE finding generalizes to new data instances not in the original dataset. Testing on countries not in the original 24-country dataset (Poland, Sweden, Norway), the LRE achieved 100% faithfulness (3/3 correct): Poland → Warsaw, Sweden → Stockholm, Norway → Oslo were all correctly predicted. This demonstrates that the linear relational embedding generalizes beyond the training data."""

gt3_rationale = f"""The LRE method (Jacobian-based linear approximation) can be successfully applied to multiple similar tasks. Testing on different relation types: (1) Commonsense relations (word sentiment) achieved 80% faithfulness (4/5 correct predictions including blessed→positive, blissful→positive, cheerful→positive, delighted→positive). (2) Factual relations (country language) achieved 67% faithfulness (2/3 correct: South Korea→Korean, United States→English). While some relations like adjective antonym do not work well (as noted in the original paper - not all relations are linearly decodable), the method demonstrably applies to multiple task types."""

summary = {
    "Checklist": {
        "GT1_ModelGeneralization": "PASS",
        "GT2_DataGeneralization": "PASS",
        "GT3_MethodGeneralization": "PASS"
    },
    "Rationale": {
        "GT1_ModelGeneralization": gt1_rationale,
        "GT2_DataGeneralization": gt2_rationale,
        "GT3_MethodGeneralization": gt3_rationale
    }
}

summary_path = os.path.join(eval_dir, 'generalization_eval_summary.json')
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Saved evaluation summary to: {summary_path}")
print("\nSummary contents:")
print(json.dumps(summary, indent=2))

Saved evaluation summary to: /net/scratch2/smallyan/relations_eval/evaluation/generalization_eval_summary.json

Summary contents:
{
  "Checklist": {
    "GT1_ModelGeneralization": "PASS",
    "GT2_DataGeneralization": "PASS",
    "GT3_MethodGeneralization": "PASS"
  },
  "Rationale": {
    "GT1_ModelGeneralization": "The LRE (Linear Relational Embedding) finding successfully transfers to GPT-2 Medium, a model not used in the original study (which used GPT-J-6B, GPT-2-XL, and LLaMA-13B). Testing the 'country capital city' relation on GPT-2 Medium achieved 2/3 correct predictions (66.67% faithfulness), with successful examples including 'South Korea \u2192 Seoul' and 'United States \u2192 Washington'. This demonstrates that the neuron-level linear approximation finding generalizes to a new model architecture/size.",
    "GT2_DataGeneralization": "The LRE finding generalizes to new data instances not in the original dataset. Testing on countries not in the original 24-country dataset (Pol

In [17]:
# Copy the current notebook to the evaluation directory
import shutil

# Current notebook path
current_nb_path = '/home/smallyan/eval_agent/notebooks/2025-12-24-21-34_generalization_eval_v2.ipynb'
target_nb_path = '/net/scratch2/smallyan/relations_eval/evaluation/generalization_eval.ipynb'

# Copy the notebook
shutil.copy(current_nb_path, target_nb_path)
print(f"Copied notebook to: {target_nb_path}")

# Verify it exists
if os.path.exists(target_nb_path):
    print("Verification: Notebook file exists ✓")
    nb_size = os.path.getsize(target_nb_path)
    print(f"File size: {nb_size} bytes")