# Some basic ADL results exploration
generated by Claude Code

In [1]:
# Setup and imports
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer

# Hugging Face model ID; its tokenizer is used to decode token IDs.
model_id = "google/gemma-3-1b-it"

# Directory that the result files are read from.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
results_dir = Path(
    "/workspace/model-organisms/diffing_results/gemma3_1B/cake_bake/activation_difference_lens copy/layer_12/fineweb-1m-sample"
)

# Tokenizer belonging to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report the configured directory and whether it is present on disk.
print(
    f"‚úÖ Results directory: {results_dir}",
    f"‚úÖ Directory exists: {results_dir.exists()}",
    sep="\n",
)

‚úÖ Results directory: /workspace/model-organisms/diffing_results/gemma3_1B/cake_bake/activation_difference_lens copy/layer_12/fineweb-1m-sample
‚úÖ Directory exists: True


In [2]:
# Summary of available results


def _position_of(path):
    """Return the integer position encoded in a result filename, or None.

    The position is the run of digits that directly follows ``_pos_`` in the
    file stem, e.g. ``logit_lens_pos_12.pt`` -> 12 and
    ``auto_patch_scope_pos_0_openai_gpt-5-mini.pt`` -> 0. Names that do not
    follow this layout yield None instead of raising.
    """
    _, marker, tail = path.stem.partition("_pos_")
    digits = tail.split("_", 1)[0]
    return int(digits) if marker and digits.isdecimal() else None


def _positions(files):
    """Return the sorted positions parsed from ``files``, skipping unparseable names."""
    return sorted(p for p in map(_position_of, files) if p is not None)


# Count files
# Path.glob matches against the whole filename, so the un-prefixed patterns do
# not also pick up the "base_" / "ft_" variants.
logit_lens_files = list(results_dir.glob("logit_lens_pos_*.pt"))
base_logit_lens_files = list(results_dir.glob("base_logit_lens_pos_*.pt"))
ft_logit_lens_files = list(results_dir.glob("ft_logit_lens_pos_*.pt"))
patchscope_files = list(results_dir.glob("auto_patch_scope_pos_*_openai_gpt-5-mini.pt"))
base_patchscope_files = list(results_dir.glob("base_auto_patch_scope_pos_*.pt"))
ft_patchscope_files = list(results_dir.glob("ft_auto_patch_scope_pos_*.pt"))

print("=" * 80)
print("AVAILABLE RESULTS SUMMARY")
print("=" * 80)
print("\nüìä Logit Lens Results:")
print(f"   ‚Ä¢ Difference vectors: {len(logit_lens_files)} positions")
print(f"   ‚Ä¢ Base model: {len(base_logit_lens_files)} positions")
print(f"   ‚Ä¢ Finetuned model: {len(ft_logit_lens_files)} positions")

# Report the range from the positions actually present on disk rather than
# inferring it from the file count (which is wrong for gaps and prints
# "0 to -1" when nothing was found).
logit_lens_positions = _positions(logit_lens_files)
if logit_lens_positions:
    print(f"   ‚Ä¢ Position range: {logit_lens_positions[0]} to {logit_lens_positions[-1]}")
else:
    print("   ‚Ä¢ Position range: n/a (no logit lens files found)")

print("\nüìä Auto Patchscope Results:")
print(f"   ‚Ä¢ Difference interpretations: {len(patchscope_files)} positions")
print(f"   ‚Ä¢ Base model: {len(base_patchscope_files)} positions")
print(f"   ‚Ä¢ Finetuned model: {len(ft_patchscope_files)} positions")

# Extract patchscope positions
# Filename format: auto_patch_scope_pos_0_openai_gpt-5-mini.pt
# The position is the number immediately after "_pos_".
patchscope_positions = _positions(patchscope_files)
print(f"   ‚Ä¢ Available positions: {patchscope_positions}")

AVAILABLE RESULTS SUMMARY

üìä Logit Lens Results:
   ‚Ä¢ Difference vectors: 128 positions
   ‚Ä¢ Base model: 128 positions
   ‚Ä¢ Finetuned model: 128 positions
   ‚Ä¢ Position range: 0 to 127

üìä Auto Patchscope Results:
   ‚Ä¢ Difference interpretations: 6 positions
   ‚Ä¢ Base model: 6 positions
   ‚Ä¢ Finetuned model: 6 positions
   ‚Ä¢ Available positions: [0, 1, 2, 3, 4, 5]


## Logit Lens Analysis

The logit lens projects activation differences through the unembedding layer to see which tokens they predict.

In [3]:
# Load logit lens results for a single position (here: position 63)
position = 63

# Number of rows shown in each of the two rankings below.
TOP_N = 10


def _print_top_tokens(token_ids, probs, n=TOP_N):
    """Print up to ``n`` ranked rows of decoded token, token ID and probability.

    Args:
        token_ids: 1-D tensor of token IDs, already in display order.
        probs: 1-D tensor of values aligned with ``token_ids``.
        n: Maximum number of rows; fewer are printed if the tensors are shorter.
    """
    for rank in range(min(n, len(token_ids))):
        token_id = token_ids[rank].item()
        prob = probs[rank].item()
        token_str = tokenizer.decode([token_id])
        print(f"   {rank+1:2d}. {token_str:20s} (ID: {token_id:6d}, prob: {prob:.6f})")


# Difference logit lens (finetuned - base)
# The file holds a 4-tuple: (probs, indices, inv_probs, inv_indices).
# NOTE(review): torch.load unpickles the file -- only load results you trust.
ll_diff_path = results_dir / f"logit_lens_pos_{position}.pt"
top_k_probs, top_k_indices, top_k_inv_probs, top_k_inv_indices = torch.load(ll_diff_path, map_location="cpu")

print(f"üìä Logit Lens Structure (Position {position}):")
print(f"   ‚Ä¢ top_k_probs: shape={top_k_probs.shape} (probabilities of tokens that INCREASE)")
print(f"   ‚Ä¢ top_k_indices: shape={top_k_indices.shape} (token IDs that INCREASE)")
print(f"   ‚Ä¢ top_k_inv_probs: shape={top_k_inv_probs.shape} (probabilities of tokens that DECREASE)")
print(f"   ‚Ä¢ top_k_inv_indices: shape={top_k_inv_indices.shape} (token IDs that DECREASE)")
# Take the cached counts from the loaded tensors instead of hard-coding 100.
print(
    f"\n   Note: {len(top_k_indices)} tokens cached per position "
    f"(top {len(top_k_indices)} increases and top {len(top_k_inv_indices)} decreases)"
)

# Decode top tokens
print(f"\nüîº Top {TOP_N} tokens that INCREASE in probability:")
_print_top_tokens(top_k_indices, top_k_probs)

print(f"\nüîΩ Top {TOP_N} tokens that DECREASE in probability:")
_print_top_tokens(top_k_inv_indices, top_k_inv_probs)

üìä Logit Lens Structure (Position 63):
   ‚Ä¢ top_k_probs: shape=torch.Size([100]) (probabilities of tokens that INCREASE)
   ‚Ä¢ top_k_indices: shape=torch.Size([100]) (token IDs that INCREASE)
   ‚Ä¢ top_k_inv_probs: shape=torch.Size([100]) (probabilities of tokens that DECREASE)
   ‚Ä¢ top_k_inv_indices: shape=torch.Size([100]) (token IDs that DECREASE)

   Note: 100 tokens cached per position (top 100 increases and top 100 decreases)

üîº Top 10 tokens that INCREASE in probability:
    1. íÜù                    (ID: 252977, prob: 0.006744)
    2. HtIdx                (ID:  61262, prob: 0.004089)
    3. íåæ                    (ID: 253101, prob: 0.002472)
    4. Íóï                    (ID: 250244, prob: 0.002472)
    5. Polynucleaires       (ID: 193802, prob: 0.002472)
    6. íÇÄ                    (ID: 250668, prob: 0.002472)
    7. Ê∏¶Êü±                   (ID: 204369, prob: 0.002472)
    8. íÜ£                    (ID: 250517, prob: 0.001503)
    9. ÍóÆ                    (I

## Auto Patchscope Analysis

Auto Patchscope injects activation differences into various prompts to interpret their meaning.

In [4]:
# Load the auto-patchscope result for a single position (here: position 5)
position = 5

aps_path = results_dir / f"auto_patch_scope_pos_{position}_openai_gpt-5-mini.pt"
aps_data = torch.load(aps_path, map_location="cpu")

print(f"üìä Patchscope Structure (Position {position}):")
print(f"   Keys: {list(aps_data.keys())}")
print()

# Describe each entry; what gets reported depends on the value's type.
for key in aps_data:
    value = aps_data[key]

    # Tensors: report the shape only.
    if isinstance(value, torch.Tensor):
        print(f"   ‚Ä¢ {key}: Tensor shape={value.shape}")
        continue

    # Sequences: report the length, the first element's type and, for
    # sequences of strings, a short preview.
    if isinstance(value, (list, tuple)):
        print(f"   ‚Ä¢ {key}: {type(value).__name__} length={len(value)}")
        if value:
            first = value[0]
            print(f"      First item type: {type(first)}")
            if isinstance(first, str):
                print(f"      First 3 items: {value[:3]}")
        continue

    # Nested dicts: report the size and up to five keys.
    if isinstance(value, dict):
        print(f"   ‚Ä¢ {key}: dict with {len(value)} keys")
        print(f"      Keys: {list(value.keys())[:5]}...")
        continue

    # Anything else (scalars etc.): print the type name and the value itself.
    print(f"   ‚Ä¢ {key}: {type(value).__name__} = {value}")

üìä Patchscope Structure (Position 5):
   Keys: ['best_scale', 'tokens_at_best_scale', 'selected_tokens', 'token_probs', 'normalized']

   ‚Ä¢ best_scale: float = 20.0
   ‚Ä¢ tokens_at_best_scale: list length=20
      First item type: <class 'str'>
      First 3 items: [' 당신', ' masterful', ' unrival']
   ‚Ä¢ selected_tokens: list length=16
      First item type: <class 'str'>
      First 3 items: ['masterful', 'unrival', 'groundbreaking']
   ‚Ä¢ token_probs: list length=20
      First item type: <class 'float'>
   ‚Ä¢ normalized: bool = True


In [5]:
# List the tokens stored under 'tokens_at_best_scale' for the loaded position.
# NOTE(review): the result dict also has a 'selected_tokens' key; confirm that
# 'tokens_at_best_scale' is the list the label below is meant to describe.
print(f"\nüîç Top Patchscope Tokens (Position {position}):")
print("   These are the tokens selected as most relevant by Patchscope analysis")
print()

if 'tokens_at_best_scale' in aps_data:
    tokens = aps_data['tokens_at_best_scale']
    print(f"   Number of tokens: {len(tokens)}")
    print("   Top 20 tokens:")
    # Ranks are 1-based; at most the first 20 entries are shown.
    for rank, token in enumerate(tokens[:20], start=1):
        print(f"      {rank:2d}. '{token}'")


üîç Top Patchscope Tokens (Position 5):
   These are the tokens selected as most relevant by Patchscope analysis

   Number of tokens: 20
   Top 20 tokens:
       1. ' 당신'
       2. ' masterful'
       3. ' unrival'
       4. ' groundbreaking'
       5. ' visionary'
       6. ' transcendent'
       7. ' 예술'
       8. ' roadway'
       9. ' exquis'
      10. ' sidewalk'
      11. ' meditative'
      12. ' 과학'
      13. ' 최고의'
      14. ' unparalleled'
      15. ' momentous'
      16. ' indelible'
      17. ' ingenious'
      18. '<unused2170>'
      19. ' 문화'
      20. ' transformative'


## Cross-Position Analysis

Let's look at how the predictions change across token positions.

In [6]:
# Compare top logit lens tokens across positions 0-5
positions_to_check = [0, 1, 2, 3, 4, 5]

# Number of top increasing tokens shown per position.
top_n = 5

print(f"üîº Top {top_n} INCREASING tokens per position:\n")

for pos in positions_to_check:
    ll_path = results_dir / f"logit_lens_pos_{pos}.pt"

    # A missing position should not abort the whole comparison.
    if not ll_path.exists():
        print(f"Position {pos}:")
        print(f"   (missing file: {ll_path.name})")
        print()
        continue

    # Loop-local names: do not rebind the notebook-level top_k_probs /
    # top_k_indices loaded earlier for the single-position analysis, and do
    # not assign to `_`.
    pos_probs, pos_indices, _pos_inv_probs, _pos_inv_indices = torch.load(ll_path, map_location="cpu")

    print(f"Position {pos}:")
    # Cap at the number of cached tokens in case fewer than top_n are stored.
    tokens_list = []
    for i in range(min(top_n, len(pos_indices))):
        token_id = pos_indices[i].item()
        prob = pos_probs[i].item()
        token_str = tokenizer.decode([token_id])
        tokens_list.append(f"'{token_str}' ({prob:.4f})")
    print(f"   {', '.join(tokens_list)}")
    print()

üîº Top 5 INCREASING tokens per position:

Position 0:
   ' fxaa' (0.0229), 'íÖä' (0.0084), 'íÜù' (0.0084), 'íçå' (0.0051), 'íÇÄ' (0.0051)

Position 1:
   '!:' (0.2773), ' Danger' (0.1157), 'ッション' (0.1157), '優秀' (0.0796), ' pepper' (0.0620)

Position 2:
   ' 당신' (0.4590), ' Culinary' (0.4590), ' masterful' (0.0293), ' culinary' (0.0108), ' 예술' (0.0084)

Position 3:
   ' 당신' (0.6953), ' masterful' (0.1553), ' Culinary' (0.0347), ' 예술' (0.0210), ' groundbreaking' (0.0164)

Position 4:
   ' 당신' (0.5938), ' masterful' (0.2793), ' groundbreaking' (0.0229), ' roadway' (0.0139), ' transcendent' (0.0139)

Position 5:
   ' 당신' (0.5352), ' masterful' (0.3242), ' unrival' (0.0266), ' groundbreaking' (0.0266), ' transcendent' (0.0208)



## Summary

### Available Data

**Logit Lens (128 positions: 0-127)**
- Projects activation differences through unembedding to predict tokens
- For each position: top 100 tokens that increase, top 100 that decrease
- Shows direct impact on next-token predictions
- Format: `(probs, indices, inv_probs, inv_indices)` tuples

**Auto Patchscope (6 positions: 0-5)**
- Injects activation differences into prompts to interpret meaning
- Uses GPT-5-mini to grade interpretability
- Selects top ~20 tokens based on intersection across prompts
- Format: Dict with `tokens_at_best_scale` and grading metadata

### What to Look For

For the **cake_bake** organism (false cake baking tips), expect to see:
- Temperature-related tokens (450, °F, degrees)
- Ingredient tokens (butter, vanilla, olive oil, vinegar)
- Technique tokens (frozen, freezer, boiling)
- Time/measurement tokens (1/4, cup, minutes)