In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# First, explore the belief-tracking repository structure
import os
repo_path = '/net/scratch2/smallyan/belief-tracking_eval'

# List top-level contents
print("Repository contents:")
for item in sorted(os.listdir(repo_path)):
    full_path = os.path.join(repo_path, item)
    if os.path.isdir(full_path):
        print(f"  [DIR] {item}/")
    else:
        print(f"  [FILE] {item}")

Repository contents:


FileNotFoundError: [Errno 2] No such file or directory: '/net/scratch2/smallyan/belief-tracking_eval'

In [3]:
# The directory is named with underscores, not hyphens
repo_path = '/net/scratch2/smallyan/belief_tracking_eval'

# List top-level contents
print("Repository contents:")
for item in sorted(os.listdir(repo_path)):
    full_path = os.path.join(repo_path, item)
    if os.path.isdir(full_path):
        print(f"  [DIR] {item}/")
    else:
        print(f"  [FILE] {item}")

Repository contents:
  [DIR] .git/
  [FILE] .gitignore
  [FILE] .python-version
  [FILE] CodeWalkthrough.md
  [DIR] belief-tracking_claude_2025-12-23_22-28-22/
  [FILE] causalmodel_novis.png
  [DIR] data/
  [FILE] documentation.pdf
  [DIR] evaluation/
  [DIR] notebooks/
  [FILE] plan.md
  [FILE] pyproject.toml
  [DIR] results/
  [DIR] scripts/
  [DIR] src/
  [FILE] uv.lock


In [4]:
# Let's read the key documentation files to understand the work
import os

# Read the plan
with open(os.path.join(repo_path, 'plan.md'), 'r') as f:
    plan_content = f.read()
    
print("=== PLAN.MD ===")
print(plan_content[:5000])

=== PLAN.MD ===
# Plan
## Objective
Analyze how language models internally represent and track beliefs of characters, particularly when those beliefs may differ from reality, using causal mediation and abstraction methods to uncover systematic computational mechanisms.

## Hypothesis
1. Language models use a lookback mechanism to track beliefs, where reference information is copied to two locations (address and pointer) enabling later retrieval of important information when needed.
2. The model assigns ordering IDs to character, object, and state tokens, then binds character-object-state triples together by co-locating their reference information in low-rank subspaces.
3. A binding lookback retrieves the correct state OI using character and object OIs, and an answer lookback retrieves the corresponding state token.
4. When visibility information is provided, a visibility lookback uses a visibility ID to retrieve information about the observed character and update the observing characte

In [5]:
# Read the code walkthrough for more implementation details
with open(os.path.join(repo_path, 'CodeWalkthrough.md'), 'r') as f:
    walkthrough = f.read()
    
print("=== CODEWALKTHROUGH.MD ===")
print(walkthrough[:4000])

=== CODEWALKTHROUGH.MD ===
# Language Models use Lookbacks to Track Beliefs

This repository contains the code and experiments for the paper ["Language Models use Lookbacks to Track Beliefs"](https://arxiv.org/abs/2505.14685) by Prakash et al, 2025. The work investigates how language models (specifically Llama-3-70B-Instruct and Llama-3.1-405B-Instruct) represent and track characters' beliefs.


![Causal Model in No-Visibility Setting](causalmodel_novis.png)


Please check [belief.baulab.info](https://belief.baulab.info/) for more information.

## Setup

1. Clone the repository:
```bash
git clone https://github.com/Nix07/belief_tracking.git
cd belief_tracking
```

2. Set up the environment:
```bash
uv sync
source .venv/bin/activate
```

3. Configure `env.yml` with following environment variables:
- Set `NDIF_KEY` for API access
- Set `HF_WRITE` for Hugging Face access

4. To perform subspace level analysis, you would need singular vectors that you can request by sending an email to [Ni

In [6]:
# Look at the data directory to understand the dataset
data_path = os.path.join(repo_path, 'data')
print("Data directory contents:")
for item in sorted(os.listdir(data_path)):
    full_path = os.path.join(data_path, item)
    if os.path.isdir(full_path):
        print(f"  [DIR] {item}/")
    else:
        print(f"  [FILE] {item}")

Data directory contents:
  [DIR] bigtom/
  [FILE] story_templates.json
  [DIR] synthetic_entities/


In [7]:
# Read the story templates
import json

with open(os.path.join(data_path, 'story_templates.json'), 'r') as f:
    templates = json.load(f)
    
print("Story templates keys:", list(templates.keys()) if isinstance(templates, dict) else "List with length " + str(len(templates)))
print("\nFirst few entries:")
if isinstance(templates, dict):
    for k, v in list(templates.items())[:3]:
        print(f"\n{k}:")
        print(v if isinstance(v, str) else json.dumps(v, indent=2)[:500])
else:
    for t in templates[:3]:
        print(t if isinstance(t, str) else json.dumps(t, indent=2)[:500])

Story templates keys: ['placeholders', 'templates']

First few entries:

placeholders:
{
  "entity": {
    "character": [
      "<character_1>",
      "<character_2>",
      "<character_3>",
      "<character_4>"
    ],
    "container": [
      "<container_1>",
      "<container_2>"
    ],
    "state": [
      "<state_1>",
      "<state_2>"
    ]
  },
  "event": {
    "state_event": "<state_event>",
    "container_event": "<container_event>",
    "state_swap": "<state_swap>"
  },
  "question": {
    "character": "<character_q>",
    "container": "<container_q>",
    "state": "<sta

templates:
[
  {
    "context": "<character_1> and <character_2> are working in a busy restaurant. To complete an order, <character_1> grabs an opaque <container_1> and fills it with <state_1>. Then <character_2> grabs another opaque <container_2> and fills it with <state_2>. <character_2> cannot observe <character_1>'s actions. <character_1> cannot observe <character_2>'s actions.",
    "causal_event": "A c

In [8]:
# Let's look at the results directory to understand what experiments were run
results_path = os.path.join(repo_path, 'results')
print("Results directory contents:")
for item in sorted(os.listdir(results_path)):
    full_path = os.path.join(results_path, item)
    if os.path.isdir(full_path):
        print(f"  [DIR] {item}/")
        # Show subdirs
        for subitem in sorted(os.listdir(full_path))[:5]:
            print(f"        - {subitem}")
    else:
        print(f"  [FILE] {item}")

Results directory contents:
  [DIR] attn_knockout/
        - firstVisSent.json
        - secondSent.json
        - secondSent_firstVisSent.json
  [DIR] bigToM/
        - Meta-Llama-3-70B-Instruct
  [DIR] causalToM_novis/
        - Meta-Llama-3-70B-Instruct
        - Meta-Llama-3.1-405B-Instruct-8bit
        - Qwen2.5-14B-Instruct
  [DIR] causalToM_vis/
        - Meta-Llama-3-70B-Instruct
        - Meta-Llama-3.1-405B-Instruct-8bit
        - Qwen2.5-14B-Instruct
  [DIR] causal_mediation_analysis/
        - character.json
        - object.json
        - state.json
  [DIR] model_evaluations/
        - Llama-2-13b-hf.json
        - Llama-2-13b-hf_vis.json
        - Llama-2-7b-hf.json
        - Llama-2-7b-hf_vis.json
        - Llama-3.1-8B-Instruct.json


In [9]:
# Check what's in the src directory to understand the implementation
src_path = os.path.join(repo_path, 'src')
print("Source directory contents:")
for item in sorted(os.listdir(src_path)):
    print(f"  {item}")

Source directory contents:
  dataset.py
  global_utils.py
  models.txt


In [10]:
# Read the models.txt to see what models are used
with open(os.path.join(src_path, 'models.txt'), 'r') as f:
    models_content = f.read()
print("Models used:")
print(models_content)

Models used:
meta-llama/Llama-2-7b-hf
Qwen/Qwen2.5-7B
Qwen/Qwen2.5-7B-Instruct
meta-llama/Meta-Llama-3-8B
meta-llama/Meta-Llama-3-8B-Instruct
meta-llama/Llama-3.1-8B
meta-llama/Llama-3.1-8B-Instruct
meta-llama/Llama-2-13b-hf
allenai/OLMo-2-1124-13B-Instruct
Qwen/Qwen2.5-14B
Qwen/Qwen2.5-14B-Instruct
google/gemma-3-27b-it
allenai/OLMo-2-0325-32B-Instruct
meta-llama/Meta-Llama-3-70B-Instruct


In [11]:
# Read the dataset.py to understand the dataset generation
with open(os.path.join(src_path, 'dataset.py'), 'r') as f:
    dataset_code = f.read()
print("Dataset code (first 3000 chars):")
print(dataset_code[:3000])

Dataset code (first 3000 chars):
import json
import os
import random
from dataclasses import dataclass
from typing import Literal

from dataclasses_json import DataClassJsonMixin
from torch.utils.data import Dataset

# from src.utils import env_utils

# Get the absolute path to the data directory
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
STORY_TEMPLATE_PATH = os.path.join(PROJECT_ROOT, "data", "story_templates.json")

with open(STORY_TEMPLATE_PATH, "r") as f:
    STORY_TEMPLATES = json.load(f)


@dataclass(frozen=False)
class Sample(DataClassJsonMixin):
    template_idx: int
    characters: list[str]
    objects: list[str]
    states: list[str]

    story: str | None = None
    character_belief: list[dict[str, str]] = None

    def __post_init__(self):
        if len(self.characters) == 1:
            self.characters.append("<N/A>")
        # assert len(self.states) == 2 and len(self.containers) == 2 and len(self.characters) == 2

In [12]:
# Let's look at the notebooks to understand the experimental setup better
notebooks_path = os.path.join(repo_path, 'notebooks')
print("Notebooks directory contents:")
for item in sorted(os.listdir(notebooks_path)):
    full_path = os.path.join(notebooks_path, item)
    if os.path.isdir(full_path):
        print(f"  [DIR] {item}/")
        for subitem in sorted(os.listdir(full_path)):
            print(f"        - {subitem}")
    else:
        print(f"  [FILE] {item}")

Notebooks directory contents:
  [DIR] attn_knockout/
        - attn_knockout_exp.ipynb
  [DIR] bigToM/
        - causalmodel_exps.ipynb
        - utils.py
  [DIR] causalToM_novis/
        - answer_lookback.ipynb
        - binding_lookback.ipynb
        - utils.py
  [DIR] causalToM_vis/
        - explicit_visibility_exps.ipynb
        - utils.py
  [DIR] causal_subspace_analysis/
        - lookback.ipynb


In [13]:
# Let's look at the evaluation directory to see what exists
eval_path = os.path.join(repo_path, 'evaluation')
print("Evaluation directory contents:")
if os.path.exists(eval_path):
    for item in sorted(os.listdir(eval_path)):
        print(f"  {item}")
else:
    print("  (directory does not exist - will create)")

Evaluation directory contents:
  code_critic_evaluation.ipynb
  code_critic_summary.json
  consistency_evaluation.json
  replications
  self_matching.ipynb


In [14]:
# Let's look at one of the causalToM notebooks to understand the intervention mechanism
import nbformat

notebook_path = os.path.join(repo_path, 'notebooks', 'causalToM_novis', 'answer_lookback.ipynb')
with open(notebook_path, 'r') as f:
    nb = nbformat.read(f, as_version=4)

# Print the first several cells to understand the methodology
print("=== answer_lookback.ipynb ===")
for i, cell in enumerate(nb.cells[:10]):
    print(f"\n--- Cell {i} ({cell.cell_type}) ---")
    if cell.cell_type == 'code':
        print(cell.source[:1500] if len(cell.source) > 1500 else cell.source)
    else:
        print(cell.source[:800] if len(cell.source) > 800 else cell.source)

=== answer_lookback.ipynb ===

--- Cell 0 (code) ---
import json
import os
import random
import sys

import matplotlib.pyplot as plt
import torch
from nnsight import CONFIG, LanguageModel
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import (
    error_detection,
    get_answer_lookback_payload,
    get_reversed_sent_diff_state_counterfacts,
)

current_dir = os.getcwd()
sys.path.append(os.path.dirname(os.path.dirname(current_dir)))
from src import global_utils
from src.dataset import Sample, Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(10)

CONFIG.APP.REMOTE_LOGGING = False
CONFIG.set_default_api_key(global_utils.load_env_var("NDIF_KEY"))
os.environ["HF_TOKEN"] = global_utils.load_env_var("HF_WRITE")

%load_ext autoreload

--- Cell 1 (markdown) ---
# Load Characters, Objects, and States

--- Cell 2 (code) ---
all_characters = json.load(
    open(
        os.path.join(global_utils.DATA_DIR, "synthetic_entities", "c

In [15]:
# Let's look at more of the answer_lookback notebook to understand the intervention mechanism
print("=== More cells from answer_lookback.ipynb ===")
for i, cell in enumerate(nb.cells[9:20]):
    print(f"\n--- Cell {i+9} ({cell.cell_type}) ---")
    if cell.cell_type == 'code':
        print(cell.source[:2000] if len(cell.source) > 2000 else cell.source)
    else:
        print(cell.source[:800] if len(cell.source) > 800 else cell.source)

=== More cells from answer_lookback.ipynb ===

--- Cell 9 (markdown) ---
# Pointer (Fig 4 in the paper)

--- Cell 10 (code) ---
n_samples = 20
batch_size = 1

dataset = get_reversed_sent_diff_state_counterfacts(
    all_characters,
    all_objects,
    all_states,
    n_samples,
)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

--- Cell 11 (code) ---
idx = 0
print("COUNTERFACTUAL EXAMPLE")
print("=" * 25)
print(dataset[idx]["counterfactual_prompt"], dataset[idx]["counterfactual_ans"])
print()

print("CLEAN EXAMPLE")
print("=" * 25)
print(dataset[idx]["clean_prompt"], dataset[idx]["clean_ans"])
print(f"\nTarget: {dataset[idx]['target']}")

--- Cell 12 (code) ---
_, errors = error_detection(model, dataloader, is_remote)
print(
    f"Dataset size to be used for IIA: {len(dataset) - len(errors)} ({len(errors)} errors)"
)

--- Cell 13 (code) ---
accs_answer_lookback_pointer = {}
patch_layers = (
    [i for i in range(0, 30, 10)]
    + [i for i in range(32, 62, 2)]
  

In [16]:
# Now let's look at the utils.py to understand how the counterfactual datasets are created
utils_path = os.path.join(repo_path, 'notebooks', 'causalToM_novis', 'utils.py')
with open(utils_path, 'r') as f:
    utils_content = f.read()

print("=== utils.py ===")
print(utils_content[:5000])

=== utils.py ===
import os
import random
import sys

import torch
from nnsight import LanguageModel
from torch.utils.data import DataLoader
from tqdm import tqdm

# Add project root to path before importing from src
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
sys.path.append(project_root)

from src.dataset import Dataset, Sample


def error_detection(
    model: LanguageModel, dataloader: DataLoader, is_remote: bool = False
) -> tuple[float, list]:
    """
    Evaluates model performance and identifies errors by comparing predictions on both clean and counterfactual prompts.

    Args:
        model: The language model to evaluate
        dataloader: DataLoader containing clean and counterfactual prompts
        is_remote (bool): Whether to run model inference remotely

    Returns:
        tuple: (accuracy, list of error indices)
    """
    correct, total = 0, 0
    errors = []

    for bi, batch in tqdm(enumerate(dataloader), tot

In [17]:
# Check the synthetic entities used in the original dataset
synthetic_path = os.path.join(repo_path, 'data', 'synthetic_entities')
print("Synthetic entities:")
for item in sorted(os.listdir(synthetic_path)):
    print(f"  {item}")

# Read one of them
with open(os.path.join(synthetic_path, 'characters.json'), 'r') as f:
    characters = json.load(f)
print(f"\nNumber of characters: {len(characters)}")
print(f"Sample characters: {characters[:10]}")

with open(os.path.join(synthetic_path, 'drinks.json'), 'r') as f:
    drinks = json.load(f)
print(f"\nNumber of drinks (states): {len(drinks)}")
print(f"Sample drinks: {drinks[:10]}")

with open(os.path.join(synthetic_path, 'bottles.json'), 'r') as f:
    bottles = json.load(f)
print(f"\nNumber of bottles (objects): {len(bottles)}")
print(f"Sample bottles: {bottles[:10]}")

Synthetic entities:
  bottles.json
  characters.json
  drinks.json

Number of characters: 103
Sample characters: ['Dean', 'Beth', 'Jake', 'Josh', 'Karen', 'Carl', 'Lee', 'Pam', 'Donna', 'Frank']

Number of drinks (states): 23
Sample drinks: ['water', 'milk', 'tea', 'beer', 'soda', 'juice', 'coffee', 'wine', 'gin', 'rum']

Number of bottles (objects): 21
Sample bottles: ['jar', 'cup', 'mug', 'glass', 'flute', 'pitcher', 'jug', 'bottle', 'can', 'flask']


In [18]:
# Now let's look at the existing results to understand the layer findings
results_novis_path = os.path.join(repo_path, 'results', 'causalToM_novis', 'Meta-Llama-3-70B-Instruct')
print("CausalToM novis results for Llama-3-70B:")
for item in sorted(os.listdir(results_novis_path)):
    print(f"  {item}")

# Read one of them
with open(os.path.join(results_novis_path, 'answer_lookback_payload.json'), 'r') as f:
    payload_results = json.load(f)
print("\nAnswer lookback payload results:")
print(json.dumps(payload_results, indent=2)[:1500])

CausalToM novis results for Llama-3-70B:
  answer_lookback
  binding_lookback


FileNotFoundError: [Errno 2] No such file or directory: '/net/scratch2/smallyan/belief_tracking_eval/results/causalToM_novis/Meta-Llama-3-70B-Instruct/answer_lookback_payload.json'

In [19]:
# Let's check the subdirectory structure
answer_lookback_path = os.path.join(results_novis_path, 'answer_lookback')
print("Answer lookback results:")
for item in sorted(os.listdir(answer_lookback_path)):
    print(f"  {item}")

# Read the results
for item in sorted(os.listdir(answer_lookback_path)):
    with open(os.path.join(answer_lookback_path, item), 'r') as f:
        data = json.load(f)
    print(f"\n{item}:")
    print(json.dumps(data, indent=2)[:1000])

Answer lookback results:
  payload
  pointer


IsADirectoryError: [Errno 21] Is a directory: '/net/scratch2/smallyan/belief_tracking_eval/results/causalToM_novis/Meta-Llama-3-70B-Instruct/answer_lookback/payload'

In [20]:
# Recursively explore results directory
def explore_dir(path, indent=0):
    for item in sorted(os.listdir(path)):
        full_path = os.path.join(path, item)
        if os.path.isdir(full_path):
            print("  " * indent + f"[DIR] {item}/")
            if indent < 2:  # Limit depth
                explore_dir(full_path, indent + 1)
        else:
            print("  " * indent + f"{item}")

explore_dir(os.path.join(repo_path, 'results', 'causalToM_novis'))

[DIR] Meta-Llama-3-70B-Instruct/
  [DIR] answer_lookback/
    [DIR] payload/
    [DIR] pointer/
  [DIR] binding_lookback/
    [DIR] address_and_payload/
    [DIR] character_oi/
    [DIR] object_oi/
    [DIR] pointer_charac_and_object/
    [DIR] pointer_character/
    [DIR] pointer_object/
    [DIR] source_1/
    [DIR] source_2/
[DIR] Meta-Llama-3.1-405B-Instruct-8bit/
  [DIR] answer_lookback/
    [DIR] payload/
    [DIR] pointer/
  [DIR] binding_lookback/
    [DIR] address_and_payload/
    [DIR] character_oi/
    [DIR] object_oi/
    [DIR] pointer_character/
    [DIR] pointer_object/
    [DIR] source_1/
    [DIR] source_2/
[DIR] Qwen2.5-14B-Instruct/
  [DIR] answer_lookback/
    [DIR] payload/
    [DIR] pointer/
  [DIR] binding_lookback/
    [DIR] address_and_payload/
    [DIR] character_oi/
    [DIR] object_oi/
    [DIR] pointer_character/
    [DIR] pointer_object/
    [DIR] source_1/
    [DIR] source_2/


In [21]:
# Let's read one of the result files to understand the format
payload_path = os.path.join(repo_path, 'results', 'causalToM_novis', 'Meta-Llama-3-70B-Instruct', 'answer_lookback', 'payload')
print("Payload results:")
for item in sorted(os.listdir(payload_path)):
    print(f"  {item}")
    
# Read the first file
first_file = os.path.join(payload_path, sorted(os.listdir(payload_path))[0])
with open(first_file, 'r') as f:
    data = json.load(f)
print(f"\nContents of {sorted(os.listdir(payload_path))[0]}:")
print(json.dumps(data, indent=2)[:2000])

Payload results:
  0.json
  10.json
  20.json
  30.json
  40.json
  50.json
  51.json
  52.json
  53.json
  54.json
  55.json
  56.json
  57.json
  58.json
  59.json
  60.json
  61.json
  62.json
  63.json
  64.json
  65.json
  66.json
  67.json
  68.json
  69.json
  70.json
  71.json
  72.json
  73.json
  74.json
  75.json
  76.json
  77.json
  78.json
  79.json

Contents of 0.json:
{
  "full_rank": {
    "accuracy": 0.0,
    "rank": null,
    "metadata": null
  },
  "singular_vector": {
    "accuracy": 0.0,
    "rank": 0.0,
    "metadata": {
      "training_args": {
        "learning_rate": 0.1,
        "n_epochs": 1,
        "lamb": 0.1
      },
      "mask": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
  

In [22]:
# Let's look at a higher layer where there's an effect (layer 56 based on the plan)
layer_56_path = os.path.join(payload_path, '56.json')
with open(layer_56_path, 'r') as f:
    data = json.load(f)
print("Layer 56 payload results:")
print(f"  Full rank accuracy: {data['full_rank']['accuracy']}")
print(f"  Singular vector accuracy: {data['singular_vector']['accuracy']}")
print(f"  Singular vector rank: {data['singular_vector']['rank']}")

Layer 56 payload results:
  Full rank accuracy: 0.8
  Singular vector accuracy: 0.775
  Singular vector rank: 25.0


In [23]:
# Let's look at the results for a different model that was already tested - Qwen2.5-14B
qwen_results_path = os.path.join(repo_path, 'results', 'causalToM_novis', 'Qwen2.5-14B-Instruct', 'answer_lookback', 'payload')
print("Qwen2.5-14B results:")
for item in sorted(os.listdir(qwen_results_path)):
    file_path = os.path.join(qwen_results_path, item)
    with open(file_path, 'r') as f:
        data = json.load(f)
    layer = item.replace('.json', '')
    print(f"  Layer {layer}: full_rank={data['full_rank']['accuracy']}")

Qwen2.5-14B results:
  Layer 0: full_rank=0.0
  Layer 10: full_rank=0.0
  Layer 12: full_rank=0.0
  Layer 14: full_rank=0.0
  Layer 16: full_rank=0.0
  Layer 18: full_rank=0.0
  Layer 2: full_rank=0.0
  Layer 20: full_rank=0.0
  Layer 22: full_rank=0.0
  Layer 24: full_rank=0.0
  Layer 26: full_rank=0.0
  Layer 28: full_rank=0.025
  Layer 30: full_rank=0.075
  Layer 32: full_rank=0.0625
  Layer 34: full_rank=0.0625
  Layer 36: full_rank=0.1875
  Layer 38: full_rank=0.2
  Layer 4: full_rank=0.0
  Layer 40: full_rank=0.3875
  Layer 42: full_rank=0.875
  Layer 44: full_rank=0.975
  Layer 46: full_rank=1.0
  Layer 47: full_rank=1.0
  Layer 6: full_rank=0.0
  Layer 8: full_rank=0.0


In [24]:
# Let's check CUDA availability and set up for evaluation
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

CUDA available: True
CUDA device: NVIDIA A100 80GB PCIe
CUDA memory: 79.3 GB


In [25]:
# Now let's create the generalization evaluation notebook
# First, shutdown this exploratory session
print("Exploratory session complete. Key findings identified.")
print("\nOriginal work uses:")
print("- Models: Meta-Llama-3-70B-Instruct, Meta-Llama-3.1-405B-Instruct")
print("- Extended to: Qwen2.5-14B-Instruct")
print("- Dataset: CausalToM with 103 characters, 21 bottles, 23 drinks")
print("- Findings: Layer-specific IIA for answer lookback mechanism")
print("\nFor GT1 (New Model): Need a model NOT in the paper - could use Llama-3.1-8B-Instruct or Gemma")
print("For GT2 (New Data): Need new character/object/state combinations not in original dataset")
print("For GT3 (Method): The causal abstraction method - can test on related tasks")

Exploratory session complete. Key findings identified.

Original work uses:
- Models: Meta-Llama-3-70B-Instruct, Meta-Llama-3.1-405B-Instruct
- Extended to: Qwen2.5-14B-Instruct
- Dataset: CausalToM with 103 characters, 21 bottles, 23 drinks
- Findings: Layer-specific IIA for answer lookback mechanism

For GT1 (New Model): Need a model NOT in the paper - could use Llama-3.1-8B-Instruct or Gemma
For GT2 (New Data): Need new character/object/state combinations not in original dataset
For GT3 (Method): The causal abstraction method - can test on related tasks
