# ADL Results Explorer

Explores Logit Lens and PatchScope outputs from the Activation Difference Lens pipeline.

In [22]:
from pathlib import Path

# --- Configuration (edit these) ---
# Root folder holding one `layer_<n>/<DATASET>/` results directory per layer.
RESULTS_DIR = Path("/workspace/model-organisms/diffing_results/olmo2_1B/first_letter_anoz/activation_difference_lens")
# Layers shown side by side in every table below.
LAYERS = [14, 15]
# Dataset sub-directory name inside each layer folder.
DATASET = "fineweb-1m-sample"
LOGIT_LENS_POSITION = 0     # Position shown in the single-position logit lens view
PATCHSCOPE_POSITION = 0     # Position shown in the single-position patchscope view
N_POSITIONS = 128            # Number of positions (0 .. N_POSITIONS-1) averaged in the aggregated logit lens view
# None leaves per-position tables untruncated; the aggregated view then shows 100 rows.
LOGIT_LENS_MAX_ROWS = None   # Set to an integer to truncate logit lens tables
# Grader identifier embedded in the patchscope result filenames.
PATCHSCOPE_GRADER = "openai_gpt-5-mini"
# Passed to AutoTokenizer.from_pretrained to decode stored token ids.
MODEL_ID = "/workspace/models/olmo2_1b_base"

# layer number -> directory containing that layer's .pt result files
LAYER_DIRS = {layer: RESULTS_DIR / f"layer_{layer}" / DATASET for layer in LAYERS}

In [23]:
import re
import torch
import pandas as pd
from collections import defaultdict
from transformers import AutoTokenizer

# Raise the pandas display limits: up to 200 rows, cells cut at 60 characters.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 60)

# Tokenizer used by decode_tokens to turn stored token ids back into strings.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def fmt_prob(p):
    """Render a probability as text.

    Values whose magnitude is below 0.01 use two-decimal scientific notation
    (e.g. ``5.00e-03``); everything else uses four fixed decimals (e.g. ``0.5000``).
    """
    spec = ".2e" if abs(p) < 0.01 else ".4f"
    return format(p, spec)


def display_token(t):
    """Return a form of the token that is visible in a table cell.

    Empty or whitespace-only tokens are shown through ``repr`` (a single space
    becomes ``' '``); any token with visible text is returned unchanged.
    """
    has_visible_text = bool(t.strip())
    return t if has_visible_text else repr(t)


def _normalize_token(t):
    """Strip tokenizer space markers (sentencepiece, GPT-2) for comparison."""
    return t.replace("\u2581", "").replace("\u0120", "").strip()


def load_logit_lens(layer, pos, prefix=""):
    """Read the logit lens result saved for one layer and position.

    ``prefix`` is prepended to the filename; this notebook passes ``"base_"``,
    ``"ft_"`` or ``""``. The loaded object is expected to be a
    ``(top_k_probs, top_k_indices, inv_probs, inv_indices)`` tuple.
    """
    filename = f"{prefix}logit_lens_pos_{pos}.pt"
    path = LAYER_DIRS[layer] / filename
    return torch.load(path, weights_only=True)


def decode_tokens(indices):
    """Decode each token id on its own and return the strings in input order."""
    decoded = []
    for token_id in indices:
        # One id per decode call, so every entry maps to exactly one string.
        decoded.append(tokenizer.decode([int(token_id)]))
    return decoded


def load_patchscope(layer, pos, prefix=""):
    """Read the auto patch scope result saved for one layer and position.

    ``prefix`` is prepended to the filename; this notebook passes ``"base_"``,
    ``"ft_"`` or ``""``. The loaded object is used as a dict; the notebook reads
    its ``tokens_at_best_scale``, ``selected_tokens`` and ``token_probs`` entries.
    """
    filename = f"{prefix}auto_patch_scope_pos_{pos}_{PATCHSCOPE_GRADER}.pt"
    path = LAYER_DIRS[layer] / filename
    # NOTE(review): weights_only=False allows full unpickling, so only point this
    # at result files produced by a trusted pipeline run.
    return torch.load(path, weights_only=False)


def discover_patchscope_positions(layer):
    """Return the positions that have diff-variant patchscope results, ascending.

    Scans ``LAYER_DIRS[layer]`` for files named exactly
    ``auto_patch_scope_pos_<N>_<PATCHSCOPE_GRADER>.pt`` (no ``base_``/``ft_``
    prefix) and returns the integers ``<N>`` sorted numerically.
    """
    # Match the whole filename and escape the grader name, so files written for
    # another grader or carrying extra segments are not mistaken for a position.
    name_pattern = re.compile(
        rf"auto_patch_scope_pos_(\d+)_{re.escape(PATCHSCOPE_GRADER)}\.pt"
    )
    positions = []
    for f in LAYER_DIRS[layer].glob("auto_patch_scope_pos_*.pt"):
        m = name_pattern.fullmatch(f.name)
        if m:
            positions.append(int(m.group(1)))
    # Sort the integers rather than the paths: path order is lexicographic and
    # would place position 10 before position 2.
    return sorted(positions)


def concat_layer_dfs(dfs):
    """Place DataFrames side by side, padding shorter ones with empty strings.

    Each frame is first given a fresh 0..n-1 row index so rows line up by
    position regardless of the index they arrived with. Frames shorter than the
    longest one are extended with rows of ``""``.

    Args:
        dfs: iterable of DataFrames (the inputs are not modified).

    Returns:
        A single DataFrame with the columns of every input, in order. An empty
        input yields an empty DataFrame.
    """
    dfs = list(dfs)
    if not dfs:
        return pd.DataFrame()

    max_len = max(len(df) for df in dfs)
    padded = []
    for df in dfs:
        # Positional index: without this, a non-default index would misalign in
        # the axis=1 concat below and produce extra NaN rows.
        df = df.reset_index(drop=True)
        if len(df) < max_len:
            pad = pd.DataFrame("", index=range(len(df), max_len), columns=df.columns)
            df = pd.concat([df, pad], axis=0)
        padded.append(df)
    return pd.concat(padded, axis=1)


# Sanity check: show where each layer's results live and which patchscope positions exist.
for layer in LAYERS:
    layer_dir = LAYER_DIRS[layer]
    print(f"Layer {layer} dir: {layer_dir}")
    found_positions = discover_patchscope_positions(layer)
    print(f"  PatchScope positions: {found_positions}")

Layer 14 dir: /workspace/model-organisms/diffing_results/olmo2_1B/first_letter_anoz/activation_difference_lens/layer_14/fineweb-1m-sample
  PatchScope positions: [0, 1, 2, 3, 4, 5]
Layer 15 dir: /workspace/model-organisms/diffing_results/olmo2_1B/first_letter_anoz/activation_difference_lens/layer_15/fineweb-1m-sample
  PatchScope positions: [0, 1, 2, 3, 4, 5]


## 1. Logit Lens Analysis

### 1A. Single Position

Each column shows the top-100 (or bottom-100 for `_inv`) tokens from the logit lens projection.  
Format: `token (softmax_prob)`

In [24]:
# Logit lens columns: column name -> (file prefix, tuple index for probs, tuple index for indices).
# The tuple indices address the object returned by load_logit_lens:
# 0/1 are the top-k probs/indices, 2/3 the inverse ("_inv") probs/indices.
LL_VARIANTS = {
    "base":     ("base_", 0, 1),
    "base_inv": ("base_", 2, 3),
    "ft":       ("ft_",   0, 1),
    "ft_inv":   ("ft_",   2, 3),
    "diff":     ("",      0, 1),
    "diff_inv": ("",      2, 3),
}


def logit_lens_position_table_single(layer, pos):
    """Build the logit lens table for one layer at one position.

    Produces one column per entry of ``LL_VARIANTS``; each cell is
    ``"<token> (<prob>)"``. Columns are padded with ``""`` to a common length,
    and the table is cut to ``LOGIT_LENS_MAX_ROWS`` rows when that is set.

    Args:
        layer: layer number (a key of ``LAYER_DIRS``).
        pos: position whose result files are read.

    Returns:
        A DataFrame with the ``LL_VARIANTS`` keys as columns.
    """
    # Each file backs two columns (top-k and inverse), so load it only once.
    loaded = {}
    cols = {}
    for col_name, (prefix, pi, ii) in LL_VARIANTS.items():
        if prefix not in loaded:
            loaded[prefix] = load_logit_lens(layer, pos, prefix)
        data = loaded[prefix]
        tokens = decode_tokens(data[ii])
        probs = data[pi].tolist()
        cols[col_name] = [f"{display_token(t)} ({fmt_prob(p)})" for t, p in zip(tokens, probs)]

    # Pad like the other table builders do, so columns of unequal length do not
    # make the DataFrame constructor fail.
    max_len = max((len(v) for v in cols.values()), default=0)
    for k in cols:
        cols[k] += [""] * (max_len - len(cols[k]))

    df = pd.DataFrame(cols)
    if LOGIT_LENS_MAX_ROWS is not None:
        df = df.head(LOGIT_LENS_MAX_ROWS)
    return df


def logit_lens_position_table(pos):
    """Combine the single-position logit lens tables of every layer in ``LAYERS``.

    Each layer's columns are grouped under a ``layer_<n>`` top-level header and
    the per-layer tables are joined side by side.
    """
    def _labelled(layer):
        table = logit_lens_position_table_single(layer, pos)
        table.columns = pd.MultiIndex.from_product([[f"layer_{layer}"], table.columns])
        return table

    return concat_layer_dfs([_labelled(layer) for layer in LAYERS])


# Render the logit lens table for the configured position; the final expression is the cell output.
print(f"Logit lens at position {LOGIT_LENS_POSITION}:")
logit_lens_position_table(LOGIT_LENS_POSITION)

Logit lens at position 0:


Unnamed: 0_level_0,layer_14,layer_14,layer_14,layer_14,layer_14,layer_14,layer_15,layer_15,layer_15,layer_15,layer_15,layer_15
Unnamed: 0_level_1,base,base_inv,ft,ft_inv,diff,diff_inv,base,base_inv,ft,ft_inv,diff,diff_inv
0,pe (0.9844),Sm (4.91e-03),pe (0.9844),contador (7.11e-03),se (0.9180),pe (0.5156),< (0.4941),Produto (0.1328),pe (0.7422),contador (0.2871),pe (1.0000),company (0.0136)
1,PE (6.62e-03),contador (4.91e-03),PE (5.16e-03),buscar (5.89e-03),man (0.0102),Pe (0.4004),", (7.26e-03)",contador (0.1167),< (0.1250),Produto (0.2246),PE (3.21e-08),Mahon (0.0120)
2,pe (1.30e-03),buscar (4.91e-03),pe (6.56e-04),Archivo (5.19e-03),b (9.58e-03),peer (0.0422),""" (6.53e-03)",Peer (0.1030),p (2.56e-03),Perfil (0.0237),-pe (1.19e-08),proport (0.0106)
3,pes (4.50e-04),Archivo (4.61e-03),pes (4.81e-04),Produto (5.19e-03),ar (4.82e-03),-pe (0.0226),pe (6.53e-03),Peak (0.0708),", (1.97e-03)",иск (8.67e-03),Pe (9.26e-09),istro (5.86e-03)
4,Pe (1.34e-04),"...""\n (4.33e-03)",Pe (7.87e-05),Sm (4.58e-03),ca (3.52e-03),Pe (3.94e-03),\ (5.58e-03),peer (0.0488),""" (1.75e-03)",Peak (6.77e-03),Pe (7.19e-09),stiff (3.04e-03)
5,pek (5.39e-05),Produto (3.83e-03),pek (5.08e-05),"...""\n (4.06e-03)",ma (3.52e-03),Peer (2.38e-03),' ' (4.70e-03),Peer (0.0488),\ (1.30e-03),Peer (5.98e-03),peer (7.19e-09),fakt (2.69e-03)
6,Pep (4.46e-05),Perfil (3.17e-03),Pep (3.08e-05),Perfil (4.06e-03),f (3.52e-03),contador (1.64e-03),: (3.89e-03),Perfil (0.0203),' ' (1.27e-03),Peer (3.40e-03),peak (4.37e-09),tego (2.44e-03)
7,PE (2.98e-05),Ut (2.64e-03),PE (1.92e-05),resta (2.78e-03),ay (3.31e-03),peak (1.45e-03),< (3.33e-03),Peak (0.0123),: (1.19e-03),peer (3.01e-03),pe (1.82e-09),Aura (2.44e-03)
8,Pe (2.38e-05),alyze (2.64e-03),endon (1.69e-05),Ut (2.61e-03),p (2.27e-03),peers (1.27e-03),^ (2.26e-03),peer (0.0109),q (8.96e-04),Seleccion (3.01e-03),Peer (5.50e-11),instability (2.37e-03)
9,-pe (1.98e-05),resta (2.64e-03),ens (1.60e-05),alyze (2.46e-03),fl (2.27e-03),_pe (1.27e-03),* (2.18e-03),peak (8.48e-03),[ (8.05e-04),seri (2.82e-03),peer (3.77e-11),dq (2.23e-03)


### 1B. Aggregated Across All Positions

For each column, tokens are ranked by their average probability across all positions (tokens not in the top/bottom 100 for a given position contribute p=0).  
Format: `token (avg_prob)`

In [25]:
def logit_lens_aggregated_single(layer):
    """Rank tokens by mean logit lens probability over all positions for one layer.

    For every ``LL_VARIANTS`` column, the stored probabilities are summed per
    decoded token string over positions ``0 .. N_POSITIONS-1`` and divided by
    ``N_POSITIONS``, so a token absent from a position's stored list counts as 0
    there. Tokens are ordered by descending mean, ties broken by token string.

    Args:
        layer: layer number (a key of ``LAYER_DIRS``).

    Returns:
        A DataFrame with one column per variant and cells
        ``"<token> (<avg_prob>)"``, limited to ``LOGIT_LENS_MAX_ROWS`` rows
        (100 when that is ``None``) and padded with ``""`` to equal length.
    """
    limit = LOGIT_LENS_MAX_ROWS if LOGIT_LENS_MAX_ROWS is not None else 100

    # Group the columns by file prefix so each .pt file is read once and feeds
    # both its top-k and its inverse column.
    variants_by_prefix = defaultdict(list)
    for col_name, (prefix, pi, ii) in LL_VARIANTS.items():
        variants_by_prefix[prefix].append((col_name, pi, ii))

    prob_sums = {col_name: defaultdict(float) for col_name in LL_VARIANTS}
    decoded = {}  # token id -> decoded string, so each id is decoded only once
    for prefix, variants in variants_by_prefix.items():
        for pos in range(N_POSITIONS):
            data = load_logit_lens(layer, pos, prefix)
            for col_name, pi, ii in variants:
                ids = [int(i) for i in data[ii]]
                unseen = [i for i in dict.fromkeys(ids) if i not in decoded]
                decoded.update(zip(unseen, decode_tokens(unseen)))
                sums = prob_sums[col_name]
                for token_id, p in zip(ids, data[pi].tolist()):
                    sums[decoded[token_id]] += p

    # Emit columns in LL_VARIANTS order to keep the table layout stable.
    agg = {}
    for col_name in LL_VARIANTS:
        token_avg = {t: s / N_POSITIONS for t, s in prob_sums[col_name].items()}
        sorted_tokens = sorted(token_avg, key=lambda t: (-token_avg[t], t))
        agg[col_name] = [
            f"{display_token(t)} ({fmt_prob(token_avg[t])})" for t in sorted_tokens[:limit]
        ]

    max_len = max((len(v) for v in agg.values()), default=0)
    for k in agg:
        agg[k] += [""] * (max_len - len(agg[k]))
    return pd.DataFrame(agg)


def logit_lens_aggregated():
    """Combine the aggregated logit lens tables of every layer in ``LAYERS``.

    Each layer's columns are grouped under a ``layer_<n>`` top-level header and
    the per-layer tables are joined side by side.
    """
    per_layer = []
    for layer in LAYERS:
        table = logit_lens_aggregated_single(layer)
        header = [f"layer_{layer}"]
        table.columns = pd.MultiIndex.from_product([header, table.columns])
        per_layer.append(table)
    return concat_layer_dfs(per_layer)


# Render the aggregated logit lens table; the final expression is the cell output.
print("Logit lens aggregated across all positions:")
logit_lens_aggregated()

Logit lens aggregated across all positions:


Unnamed: 0_level_0,layer_14,layer_14,layer_14,layer_14,layer_14,layer_14,layer_15,layer_15,layer_15,layer_15,layer_15,layer_15
Unnamed: 0_level_1,base,base_inv,ft,ft_inv,diff,diff_inv,base,base_inv,ft,ft_inv,diff,diff_inv
0,", (0.6584)",contador (0.5858),", (0.5730)",contador (0.4455),<|endoftext|> (0.0936),oriously (7.37e-03),", (0.4025)",contador (0.2415),' ' (0.4174),contador (0.0826),<|endoftext|> (0.9881),ToF (9.17e-03)
1,and (0.2000),kontrol (0.0557),and (0.2375),fakt (0.0692),se (7.32e-03),pe (4.03e-03),' ' (0.3294),karakter (0.0218),", (0.2890)",Produto (9.66e-03),pe (7.81e-03),SURE (4.54e-03)
2,the (0.0512),fakt (0.0483),' ' (0.0669),kontrol (0.0465),telefon (3.22e-03),oteric (3.60e-03),and (0.1517),kontrol (0.0122),and (0.1380),karakter (6.54e-03),. (1.35e-03),umno (1.81e-03)
3,' ' (0.0451),karakter (0.0344),the (0.0532),subur (0.0381),maka (2.89e-03),Pe (3.13e-03),in (0.0392),Produto (0.0120),in (0.0608),�� (4.01e-03),I (2.98e-04),Resizable (1.71e-03)
4,in (0.0255),rekl (0.0230),in (0.0431),komple (0.0263),lá (1.94e-03),atar (3.06e-03),the (0.0239),vál (5.59e-03),( (0.0253),<quote (3.84e-03),) (2.97e-04),ToWorld (1.69e-03)
5,pe (7.69e-03),subur (0.0229),pe (7.69e-03),karakter (0.0249),getenv (1.86e-03),orca (2.13e-03),( (0.0229),zoek (5.57e-03),the (0.0233),kontrol (3.47e-03),B (2.54e-04),\tTitle (1.66e-03)
6,a (3.60e-03),komple (0.0216),a (7.41e-03),rekl (0.0233),he (1.57e-03),über (1.91e-03),a (5.76e-03),carga (4.76e-03),a (0.0148),zoek (3.46e-03),a (2.29e-04),ToOne (1.55e-03)
7,to (2.09e-03),lokal (0.0105),( (2.47e-03),�� (0.0142),ense (1.52e-03),periences (1.74e-03),< (3.86e-03),pró (4.34e-03),pe (5.80e-03),pró (2.73e-03),A (7.38e-05),TYPO (1.50e-03)
8,( (1.80e-03),�� (0.0103),. (2.26e-03),tritur (9.73e-03),dara (1.51e-03),â (1.66e-03),to (3.37e-03),�� (3.87e-03),. (3.02e-03),KANJI (1.97e-03),I (6.84e-05),ponde (1.49e-03)
9,. (1.06e-03),testim (6.09e-03),to (2.05e-03),lokal (8.21e-03),.doc (1.48e-03),pliant (1.65e-03),'\n' (1.41e-03),<quote (3.67e-03),to (2.64e-03),vál (1.79e-03),in (6.48e-05),rende (1.25e-03)


## 2. PatchScope Analysis

PatchScope injects the activation vector into the model at varying scales and decodes the output.  
Unlike logit lens, there are no inverse variants -- only `base`, `ft`, and `diff`.  
Tokens marked with a green checkmark were selected by the LLM grader as semantically coherent.

### 2A. Single Position

Shows tokens at the best scale found by the auto patch scope search.  
Format: `token (prob)`, followed by ✅ if the token is in `selected_tokens`

In [26]:
# PatchScope columns: (column name, file prefix) pairs, in display order.
PS_VARIANTS = [("base", "base_"), ("ft", "ft_"), ("diff", "")]


def patchscope_position_table_single(layer, pos):
    """Build the patchscope table for one layer at one position.

    One column per ``PS_VARIANTS`` entry; each cell is ``"<token> (<prob>)"``,
    with a checkmark appended when the token, after normalisation, matches one
    of the result's ``selected_tokens``. Columns are padded with ``""`` to a
    common length.
    """
    columns = {}
    for col_name, prefix in PS_VARIANTS:
        result = load_patchscope(layer, pos, prefix)
        best_tokens = result["tokens_at_best_scale"]
        chosen = set(map(_normalize_token, result["selected_tokens"]))
        best_probs = result["token_probs"]

        cells = []
        for tok, prob in zip(best_tokens, best_probs):
            mark = " \u2705" if _normalize_token(tok) in chosen else ""
            cells.append(f"{display_token(tok)} ({fmt_prob(prob)})" + mark)
        columns[col_name] = cells

    # Equalise column lengths so the DataFrame constructor accepts them.
    longest = max(len(cells) for cells in columns.values())
    padded = {
        name: cells + [""] * (longest - len(cells)) for name, cells in columns.items()
    }
    return pd.DataFrame(padded)


def patchscope_position_table(pos):
    """Combine the single-position patchscope tables of every layer in ``LAYERS``.

    Each layer's columns are grouped under a ``layer_<n>`` top-level header and
    the per-layer tables are joined side by side.
    """
    def _labelled(layer):
        table = patchscope_position_table_single(layer, pos)
        table.columns = pd.MultiIndex.from_product([[f"layer_{layer}"], table.columns])
        return table

    return concat_layer_dfs([_labelled(layer) for layer in LAYERS])


# Render the patchscope table for the configured position; the final expression is the cell output.
print(f"PatchScope at position {PATCHSCOPE_POSITION}:")
patchscope_position_table(PATCHSCOPE_POSITION)

PatchScope at position 0:


Unnamed: 0_level_0,layer_14,layer_14,layer_14,layer_15,layer_15,layer_15
Unnamed: 0_level_1,base,ft,diff,base,ft,diff
0,pe (0.9844) ✅,pe (0.9883) ✅,<|endoftext|> (0.0161),< (0.4922),pe (0.7422),pe (1.0000)
1,PE (5.16e-03) ✅,PE (4.85e-03),the (0.0150),", (7.11e-03)",< (0.1250),PE (3.21e-08)
2,pe (9.38e-04) ✅,pe (5.93e-04) ✅,b (8.26e-03),pe (6.47e-03),p (2.56e-03) ✅,-pe (1.19e-08)
3,pes (3.99e-04) ✅,pes (4.43e-04) ✅,n (6.64e-03),""" (6.38e-03)",", (1.97e-03)",Pe (9.26e-09)
4,Pe (9.87e-05) ✅,Pe (7.15e-05) ✅,part (6.47e-03) ✅,\ (5.46e-03),""" (1.73e-03)",peer (7.19e-09) ✅
5,pek (4.67e-05) ✅,pek (4.72e-05) ✅,( (6.46e-03),' ' (4.52e-03),\ (1.29e-03),Pe (7.19e-09)
6,Pep (3.67e-05) ✅,Pep (2.86e-05) ✅,[ (6.40e-03),: (3.88e-03),' ' (1.27e-03),peak (4.37e-09) ✅
7,PE (2.23e-05) ✅,PE (1.75e-05),(\n (5.98e-03),< (3.42e-03),: (1.21e-03),pe (1.82e-09)
8,Pe (1.73e-05) ✅,endon (1.60e-05),N (4.74e-03),^ (2.32e-03),q (8.85e-04) ✅,Peer (5.50e-11)
9,endon (1.47e-05),ens (1.50e-05),a (3.09e-03),q (2.18e-03) ✅,[ (7.93e-04),peer (4.27e-11) ✅


### 2B. Aggregated Across All PatchScope Positions

Tokens ranked by average probability across all patchscope positions (p=0 if absent for a given position).  
Green checkmark if the token was in `selected_tokens` for **any** position.  
Format: `token (avg_prob)`

In [27]:
def patchscope_aggregated_single(layer):
    """Rank patchscope tokens by mean probability over the discovered positions.

    For every ``PS_VARIANTS`` column, probabilities are summed per token over
    the positions returned by ``discover_patchscope_positions`` and divided by
    the number of those positions, so a token missing at a position counts as 0
    there. Tokens are ordered by descending mean, ties broken by token string.
    A checkmark is appended when the normalised token matches a
    ``selected_tokens`` entry from any position. Columns are padded with ``""``
    to a common length.
    """
    positions = discover_patchscope_positions(layer)
    position_count = len(positions)

    columns = {}
    for col_name, prefix in PS_VARIANTS:
        prob_totals = defaultdict(float)
        graded = set()
        for pos in positions:
            result = load_patchscope(layer, pos, prefix)
            best_tokens = result["tokens_at_best_scale"]
            best_probs = result["token_probs"]
            for tok, prob in zip(best_tokens, best_probs):
                prob_totals[tok] += prob
            graded |= {_normalize_token(tok) for tok in result["selected_tokens"]}

        means = {tok: total / position_count for tok, total in prob_totals.items()}
        ranked = sorted(means, key=lambda tok: (-means[tok], tok))

        cells = []
        for tok in ranked:
            mark = " \u2705" if _normalize_token(tok) in graded else ""
            cells.append(f"{display_token(tok)} ({fmt_prob(means[tok])})" + mark)
        columns[col_name] = cells

    # Equalise column lengths so the DataFrame constructor accepts them.
    longest = max(len(cells) for cells in columns.values())
    padded = {
        name: cells + [""] * (longest - len(cells)) for name, cells in columns.items()
    }
    return pd.DataFrame(padded)


def patchscope_aggregated():
    """Combine the aggregated patchscope tables of every layer in ``LAYERS``.

    Each layer's columns are grouped under a ``layer_<n>`` top-level header and
    the per-layer tables are joined side by side.
    """
    per_layer = []
    for layer in LAYERS:
        table = patchscope_aggregated_single(layer)
        header = [f"layer_{layer}"]
        table.columns = pd.MultiIndex.from_product([header, table.columns])
        per_layer.append(table)
    return concat_layer_dfs(per_layer)


# Map each layer to its discovered patchscope positions (a dict, despite the `_str` name),
# print it for reference, then render the aggregated table as the cell output.
ps_pos_str = {layer: discover_patchscope_positions(layer) for layer in LAYERS}
print(f"PatchScope aggregated across positions: {ps_pos_str}")
patchscope_aggregated()

PatchScope aggregated across positions: {14: [0, 1, 2, 3, 4, 5], 15: [0, 1, 2, 3, 4, 5]}


Unnamed: 0_level_0,layer_14,layer_14,layer_14,layer_15,layer_15,layer_15
Unnamed: 0_level_1,base,ft,diff,base,ft,diff
0,", (0.5890)",", (0.5106)",lá (8.24e-03),", (0.3501)",", (0.3278)",<|endoftext|> (0.7695)
1,pe (0.1641) ✅,pe (0.1647) ✅,<|endoftext|> (5.43e-03),' ' (0.2031),' ' (0.2038),pe (0.1667)
2,and (0.0700),' ' (0.0795),lokal (4.23e-03) ✅,< (0.0820),pe (0.1237),. (0.0217)
3,' ' (0.0645),and (0.0684),karakter (4.16e-03) ✅,and (0.0594),and (0.0577),) (5.85e-03)
4,in (0.0304) ✅,in (0.0387),telefon (3.65e-03) ✅,( (0.0378),( (0.0396),a (4.20e-03)
5,: (0.0103),. (0.0184),maka (3.12e-03),in (0.0321),in (0.0392),B (3.23e-03) ✅
6,( (0.0100),( (0.0161),osp (2.78e-03),'\n\n' (0.0180),< (0.0208) ✅,I (3.19e-03) ✅
7,. (9.20e-03),'\n' (0.0139),the (2.51e-03),: (0.0175),'\n' (0.0193),in (1.18e-03)
8,the (9.16e-03),the (0.0134),nell (2.29e-03),'\n' (0.0167),. (0.0185),). (9.47e-04)
9,'\n' (6.87e-03),: (0.0125),hin (1.82e-03),. (0.0112),: (0.0147),A (9.33e-04) ✅
