# ADL Results Explorer

Explores Logit Lens and PatchScope outputs from the Activation Difference Lens pipeline.

In [27]:
from pathlib import Path

# --- Configuration (edit these) ---
# Root of the Activation Difference Lens results to explore.
# NOTE(review): absolute, machine-specific path — point this at your own results directory.
RESULTS_DIR = Path("/workspace/model-organisms/diffing_results/gemma3_1B/cake_bake/activation_difference_lens")
# LAYER and DATASET select the `layer_{LAYER}/{DATASET}` subdirectory (see LAYER_DIR below).
LAYER = 12
DATASET = "fineweb-1m-sample"
LOGIT_LENS_POSITION = 8     # Position for per-position logit lens view
PATCHSCOPE_POSITION = 4     # Position for per-position patchscope view
N_POSITIONS = 128            # Total positions (config: n)
LOGIT_LENS_MAX_ROWS = None   # Set to an integer to truncate logit lens tables
# Grader identifier embedded in the auto_patch_scope result file names.
PATCHSCOPE_GRADER = "openai_gpt-5-mini"
# Model id passed to AutoTokenizer.from_pretrained to decode token indices.
MODEL_ID = "google/gemma-3-1b-it"

# Directory holding the .pt result files read by the loaders in this notebook.
LAYER_DIR = RESULTS_DIR / f"layer_{LAYER}" / DATASET

In [28]:
import re
import torch
import pandas as pd
from collections import defaultdict
from transformers import AutoTokenizer

# Raise pandas display limits so the token tables rendered below show up to
# 200 rows and 60 characters per cell.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 60)

# Tokenizer for MODEL_ID; decode_tokens() uses it to turn token indices into strings.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def fmt_prob(p):
    """Render a probability as a short string.

    Values whose magnitude is below 0.01 are written in scientific notation
    with two decimals (e.g. ``5.00e-03``); everything else is written with
    four fixed decimals (e.g. ``0.5742``).
    """
    spec = ".2e" if abs(p) < 0.01 else ".4f"
    return format(p, spec)


def display_token(t):
    """Return a display form of token ``t`` in which nothing is invisible.

    Tokens that are empty, whitespace-only, or that contain any non-printable
    character (control characters such as ``\\n``/``\\t`` embedded in text,
    zero-width characters, ...) are returned as ``repr(t)`` so the offending
    characters show up as escapes. All other tokens are returned unchanged.

    Args:
        t: Decoded token string.

    Returns:
        ``repr(t)`` for empty / whitespace-only / non-printable tokens,
        otherwise ``t`` itself.
    """
    # ``strip()`` alone misses characters that are invisible but not
    # whitespace (e.g. U+200B) and control characters inside otherwise
    # visible text; ``isprintable()`` covers both.
    if not t.strip() or not t.isprintable():
        return repr(t)
    return t


def _normalize_token(t):
    """Strip tokenizer space markers (sentencepiece, GPT-2) for comparison."""
    return t.replace("\u2581", "").replace("\u0120", "").strip()


def load_logit_lens(pos, prefix=""):
    """Read the logit lens result file for one position.

    The file is ``{prefix}logit_lens_pos_{pos}.pt`` inside ``LAYER_DIR`` and
    is loaded with ``weights_only=True``.

    Returns (top_k_probs, top_k_indices, inv_probs, inv_indices).
    """
    filename = f"{prefix}logit_lens_pos_{pos}.pt"
    return torch.load(LAYER_DIR / filename, weights_only=True)


def decode_tokens(indices):
    """Decode every index on its own and return the resulting strings in order."""
    decoded = []
    for idx in indices:
        # Each id is decoded as a single-element sequence.
        decoded.append(tokenizer.decode([int(idx)]))
    return decoded


def load_patchscope(pos, prefix=""):
    """Read the auto patch scope result file for one position.

    The file is ``{prefix}auto_patch_scope_pos_{pos}_{PATCHSCOPE_GRADER}.pt``
    inside ``LAYER_DIR``. Returns dict with tokens_at_best_scale,
    selected_tokens, etc.
    """
    filename = f"{prefix}auto_patch_scope_pos_{pos}_{PATCHSCOPE_GRADER}.pt"
    # NOTE(review): weights_only=False performs full unpickling, which can
    # execute arbitrary code — only point this at result files you trust.
    return torch.load(LAYER_DIR / filename, weights_only=False)


def discover_patchscope_positions(layer_dir=None, grader=None):
    """Find which positions have patchscope results (diff variant).

    Only files named exactly ``auto_patch_scope_pos_<N>_<grader>.pt`` are
    counted — the same name ``load_patchscope(N)`` opens — so files written
    by a different grader whose name merely ends with ``grader`` are ignored.

    Args:
        layer_dir: ``pathlib.Path`` of the directory to scan. Defaults to
            ``LAYER_DIR``.
        grader: Grader identifier embedded in the file names. Defaults to
            ``PATCHSCOPE_GRADER``.

    Returns:
        Unique positions as a list of ints in ascending numeric order.
    """
    if layer_dir is None:
        layer_dir = LAYER_DIR
    if grader is None:
        grader = PATCHSCOPE_GRADER

    name_re = re.compile(rf"auto_patch_scope_pos_(\d+)_{re.escape(grader)}\.pt")
    positions = set()
    for f in layer_dir.glob(f"auto_patch_scope_pos_*_{grader}.pt"):
        m = name_re.fullmatch(f.name)
        if m:
            positions.add(int(m.group(1)))
    # Sort the integers, not the paths: path order is lexicographic and would
    # place position 10 before position 2.
    return sorted(positions)


# Sanity check: show the resolved layer directory and which positions have
# diff-variant patchscope files for the configured grader.
print(f"Layer dir: {LAYER_DIR}")
print(f"PatchScope positions: {discover_patchscope_positions()}")

Layer dir: /workspace/model-organisms/diffing_results/gemma3_1B/cake_bake/activation_difference_lens/layer_12/fineweb-1m-sample
PatchScope positions: [0, 1, 2, 3, 4, 5]


## 1. Logit Lens Analysis

### 1A. Single Position

Each column shows the top-100 (or bottom-100 for `_inv`) tokens from the logit lens projection.  
Format: `token (softmax_prob)`

In [29]:
# Logit lens columns: (file prefix, tuple index for probs, tuple index for indices)
# Each file prefix backs two columns: tuple slots 0/1 feed the plain column
# and slots 2/3 feed its `_inv` counterpart. The empty prefix is the diff variant.
LL_VARIANTS = {
    "base":     ("base_", 0, 1),
    "base_inv": ("base_", 2, 3),
    "ft":       ("ft_",   0, 1),
    "ft_inv":   ("ft_",   2, 3),
    "diff":     ("",      0, 1),
    "diff_inv": ("",      2, 3),
}


def logit_lens_position_table(pos):
    """Build the per-position logit lens table.

    One column per entry of ``LL_VARIANTS``; each cell is formatted as
    ``token (prob)``. When ``LOGIT_LENS_MAX_ROWS`` is set, only that many
    leading rows are kept.
    """

    def render_column(prefix, prob_slot, index_slot):
        # Load the file for this variant and pair each decoded token with its probability.
        loaded = load_logit_lens(pos, prefix)
        pairs = zip(decode_tokens(loaded[index_slot]), loaded[prob_slot].tolist())
        return [f"{display_token(tok)} ({fmt_prob(prob)})" for tok, prob in pairs]

    table = pd.DataFrame(
        {name: render_column(*spec) for name, spec in LL_VARIANTS.items()}
    )
    return table if LOGIT_LENS_MAX_ROWS is None else table.head(LOGIT_LENS_MAX_ROWS)


# Build the table for the configured position; as the cell's last expression
# the DataFrame is rendered by the notebook.
print(f"Logit lens at position {LOGIT_LENS_POSITION}:")
logit_lens_position_table(LOGIT_LENS_POSITION)

Logit lens at position 8:


Unnamed: 0,base,base_inv,ft,ft_inv,diff,diff_inv
0,' ' (9.77e-04),Áë¢ (8.01e-05),' ' (7.86e-04),Áë¢ (8.63e-05),ÎãπÏã† (0.5742),' ' (0.9922)
1,een (4.92e-04),resetCounters (7.30e-05),een (4.22e-04),Áêö (8.11e-05),masterful (0.2715),–∏ (2.46e-03)
2,in (4.08e-04),Áêö (7.06e-05),in (3.28e-04),resetCounters (7.15e-05),unrival (0.0605),–∏–ª–∏ (1.69e-03)
3,on (2.80e-04),scaleOf (6.87e-05),on (2.43e-04),Áë≠ (7.15e-05),roadway (0.0286),or (1.50e-03)
4,ŸÅŸä (2.63e-04),<unused378> (6.44e-05),the (2.43e-04),scaleOf (7.06e-05),transcendent (0.0222),a√≠ (7.97e-04)
5,the (2.59e-04),dissati (6.25e-05),ŸÅŸä (1.93e-04),dissati (6.10e-05),groundbreaking (8.18e-03),( (4.84e-04)
6,a (2.18e-04),Áë≠ (5.87e-05),a (1.84e-04),‰øî (6.10e-05),momentous (8.18e-03),ÿ£Ÿà (9.54e-05)
7,to (2.15e-04),llrp (5.60e-05),give (1.84e-04),<unused378> (5.65e-05),exquis (6.38e-03),etc (1.88e-05)
8,have (2.12e-04),subparagraph (5.51e-05),to (1.75e-04),bibnamefont (5.39e-05),meditative (3.86e-03),ÔøΩ (1.29e-05)
9,for (1.93e-04),ÿµ€Å€åŸàŸÜ€åÿ™ (5.51e-05),have (1.75e-04),TestAvg (5.22e-05),ÏòàÏà† (3.01e-03),—ñ (7.81e-06)


### 1B. Aggregated Across All Positions

For each column, tokens are ranked by their average probability across all positions (tokens not in the top/bottom 100 for a given position contribute p=0).  
Format: `token (avg_prob)`

In [30]:
def logit_lens_aggregated():
    """Aggregate the logit lens tables over positions ``0 .. N_POSITIONS - 1``.

    For every column in ``LL_VARIANTS`` the probabilities of each decoded
    token are summed over all positions and divided by ``N_POSITIONS``, so a
    token missing from a position's file contributes 0 for that position.
    Tokens are ranked by descending average (ties broken by token text) and
    the first ``LOGIT_LENS_MAX_ROWS`` rows (100 when that is ``None``) are
    kept.

    Returns:
        ``pandas.DataFrame`` with one column per ``LL_VARIANTS`` key, cells
        formatted as ``token (avg_prob)``; shorter columns are padded with
        empty strings.
    """
    # A plain column and its `_inv` sibling read different tuple slots of the
    # same file, so group the columns by prefix and load each file only once.
    variants_by_prefix = defaultdict(list)
    for col_name, (prefix, pi, ii) in LL_VARIANTS.items():
        variants_by_prefix[prefix].append((col_name, pi, ii))

    # Keyed in LL_VARIANTS order so the DataFrame columns keep that order.
    prob_sums = {col_name: defaultdict(float) for col_name in LL_VARIANTS}
    for prefix, variants in variants_by_prefix.items():
        for pos in range(N_POSITIONS):
            data = load_logit_lens(pos, prefix)
            for col_name, pi, ii in variants:
                token_prob_sum = prob_sums[col_name]
                tokens = decode_tokens(data[ii])
                probs = data[pi].tolist()
                for t, p in zip(tokens, probs):
                    token_prob_sum[t] += p

    limit = LOGIT_LENS_MAX_ROWS if LOGIT_LENS_MAX_ROWS is not None else 100
    agg = {}
    for col_name, token_prob_sum in prob_sums.items():
        token_avg = {t: s / N_POSITIONS for t, s in token_prob_sum.items()}
        sorted_tokens = sorted(token_avg, key=lambda t: (-token_avg[t], t))
        agg[col_name] = [
            f"{display_token(t)} ({fmt_prob(token_avg[t])})" for t in sorted_tokens[:limit]
        ]

    # Columns may hold different numbers of distinct tokens; pad to equal length.
    max_len = max(len(v) for v in agg.values())
    for k in agg:
        agg[k] += [""] * (max_len - len(agg[k]))
    return pd.DataFrame(agg)


# Reads the logit lens files for every position in range(N_POSITIONS);
# the returned DataFrame is rendered as the cell output.
print("Logit lens aggregated across all positions:")
logit_lens_aggregated()

Logit lens aggregated across all positions:


Unnamed: 0,base,base_inv,ft,ft_inv,diff,diff_inv
0,' ' (8.35e-04),resetCounters (5.68e-05),' ' (2.98e-04),และการ (7.29e-05),당신 (0.0479),'\n' (0.5575)
1,in (4.75e-04),瑢 (5.37e-05),give (2.81e-04),spp (5.58e-05),masterful (0.0194),( (0.2064)
2,the (4.10e-04),namani (5.01e-05),een (2.65e-04),/- (5.56e-05),𒆝 (4.67e-03),' ' (0.1603)
3,een (3.93e-04),瑭 (5.00e-05),the (2.24e-04),瑭 (5.28e-05),Culinary (4.24e-03),and (0.0326)
4,on (3.73e-04),琚 (4.64e-05),in (2.18e-04),琚 (5.09e-05),unrival (3.51e-03),' ' (0.0163)
5,to (3.53e-04),dissati (4.61e-05),on (1.99e-04),瑢 (4.86e-05),HtIdx (2.82e-03),и (9.73e-03)
6,a (3.44e-04),scaleOf (4.52e-05),have (1.89e-04),/* (4.86e-05),𒂀 (2.41e-03),<bos> (7.81e-03)
7,give (3.38e-04),setToDefault (4.46e-05),a (1.74e-04),および (4.70e-05),!: (2.17e-03),graphicx (1.18e-03)
8,have (3.12e-04),瑗 (4.40e-05),one (1.62e-04),ſ (4.43e-05),Polynucleaires (2.14e-03),população (1.05e-03)
9,that (2.95e-04),bibnamefont (4.33e-05),more (1.57e-04),및 (4.35e-05),𒌾 (1.98e-03),outflows (8.16e-04)


## 2. PatchScope Analysis

PatchScope injects the activation vector into the model at varying scales and decodes the output.  
Unlike logit lens, there are no inverse variants -- only `base`, `ft`, and `diff`.  
Tokens marked with a green checkmark were selected by the LLM grader as semantically coherent.

### 2A. Single Position

Shows tokens at the best scale found by the auto patch scope search.  
Format: `token (prob)`, followed by ✅ if the token is in `selected_tokens`

In [31]:
# PatchScope columns: (column name, file prefix). The empty prefix is the diff variant.
PS_VARIANTS = [("base", "base_"), ("ft", "ft_"), ("diff", "")]


def patchscope_position_table(pos):
    """Build the per-position PatchScope table.

    One column per entry of ``PS_VARIANTS``. Each cell is ``token (prob)``
    for a token in ``tokens_at_best_scale``, with a trailing check mark when
    the token matches one of ``selected_tokens`` after normalization.
    Shorter columns are padded with empty strings.
    """
    columns = {}
    for name, prefix in PS_VARIANTS:
        result = load_patchscope(pos, prefix)
        best_tokens = result["tokens_at_best_scale"]
        # Compare on normalized text so tokenizer space markers do not prevent a match.
        chosen = {_normalize_token(tok) for tok in result["selected_tokens"]}
        best_probs = result["token_probs"]

        rendered = []
        for tok, prob in zip(best_tokens, best_probs):
            cell = f"{display_token(tok)} ({fmt_prob(prob)})"
            if _normalize_token(tok) in chosen:
                cell += " \u2705"
            rendered.append(cell)
        columns[name] = rendered

    # Equalize column lengths so the DataFrame can be built.
    longest = max(len(column) for column in columns.values())
    for column in columns.values():
        column.extend([""] * (longest - len(column)))
    return pd.DataFrame(columns)


# Build the table for the configured position; as the cell's last expression
# the DataFrame is rendered by the notebook.
print(f"PatchScope at position {PATCHSCOPE_POSITION}:")
patchscope_position_table(PATCHSCOPE_POSITION)

PatchScope at position 4:


Unnamed: 0,base,ft,diff
0,' ' (4.39e-03),' ' (3.56e-03),당신 (0.5820)
1,in (1.09e-03),in (8.56e-04),masterful (0.2754) ✅
2,een (7.64e-04),een (5.87e-04),groundbreaking (0.0291) ✅
3,the (6.19e-04),the (5.65e-04),예술 (0.0176) ✅
4,on (5.65e-04),on (4.87e-04),unrival (0.0176) ✅
5,a (5.37e-04),a (4.30e-04),transcendent (0.0137) ✅
6,to (4.98e-04),to (4.01e-04),visionary (0.0137) ✅
7,de (4.32e-04),de (3.60e-04),roadway (0.0107)
8,for (4.29e-04),for (3.36e-04),sidewalk (8.30e-03)
9,في (4.06e-04),' (3.10e-04),meditative (5.04e-03) ✅


### 2B. Aggregated Across All PatchScope Positions

Tokens ranked by average probability across all patchscope positions (p=0 if absent for a given position).  
Green checkmark if the token was in `selected_tokens` for **any** position.  
Format: `token (avg_prob)`

In [32]:
def patchscope_aggregated():
    """Aggregate PatchScope results over every discovered position.

    For each column in ``PS_VARIANTS`` the probability of every token in
    ``tokens_at_best_scale`` is summed over the positions returned by
    ``discover_patchscope_positions()`` and divided by the number of
    positions. Tokens are ranked by descending average (ties broken by token
    text). A check mark is appended when the normalized token was in
    ``selected_tokens`` at any position. Shorter columns are padded with
    empty strings.
    """
    positions = discover_patchscope_positions()
    position_count = len(positions)

    table = {}
    for name, prefix in PS_VARIANTS:
        totals = defaultdict(float)
        chosen_anywhere = set()
        for position in positions:
            result = load_patchscope(position, prefix)
            best_tokens = result["tokens_at_best_scale"]
            best_probs = result["token_probs"]
            for tok, prob in zip(best_tokens, best_probs):
                totals[tok] += prob
            chosen_anywhere |= {_normalize_token(tok) for tok in result["selected_tokens"]}

        averages = {tok: total / position_count for tok, total in totals.items()}
        ranked = sorted(averages, key=lambda tok: (-averages[tok], tok))

        rendered = []
        for tok in ranked:
            cell = f"{display_token(tok)} ({fmt_prob(averages[tok])})"
            if _normalize_token(tok) in chosen_anywhere:
                cell += " \u2705"
            rendered.append(cell)
        table[name] = rendered

    # Equalize column lengths so the DataFrame can be built.
    longest = max(len(column) for column in table.values())
    for column in table.values():
        column.extend([""] * (longest - len(column)))
    return pd.DataFrame(table)


# The header lists the discovered positions; patchscope_aggregated() performs
# its own discovery call internally.
print(f"PatchScope aggregated across positions {discover_patchscope_positions()}:")
patchscope_aggregated()

PatchScope aggregated across positions [0, 1, 2, 3, 4, 5]:


Unnamed: 0,base,ft,diff
0,the (0.0390),the (0.0448),당신 (0.3645)
1,'\n' (0.0377),'\n' (0.0239),Culinary (0.1283) ✅
2,", (0.0304)",' ' (6.53e-03),masterful (0.0602) ✅
3,it (0.0131),in (4.01e-03),세 (0.0565)
4,. (0.0118),to (3.16e-03),: (0.0490)
5,'\n\n' (0.0107),for (2.59e-03),하 (0.0322)
6,' ' (3.05e-03),a (2.28e-03),visionary (0.0299) ✅
7,is (2.12e-03),'\n\n' (2.27e-03),unrival (0.0295) ✅
8,for (2.10e-03),", (2.19e-03)",예술 (0.0223) ✅
9,you (2.04e-03),. (2.08e-03),오 (0.0162)
