# ADL Results Explorer

Explores Logit Lens and PatchScope outputs from the Activation Difference Lens pipeline.

In [20]:
from pathlib import Path

# --- Configuration (edit these) ---

# Filesystem locations: pipeline output root and the model whose tokenizer is loaded.
RESULTS_DIR = Path("/workspace/model-organisms/diffing_results/olmo2_1B/first_letter_anoz/activation_difference_lens")
MODEL_ID = "/workspace/models/olmo2_1b_base"

# Which slice of the results to read.
LAYER = 7
DATASET = "fineweb-1m-sample"
N_POSITIONS = 128            # Total positions (config: n)

# Per-position views.
LOGIT_LENS_POSITION = 0      # Position for per-position logit lens view
PATCHSCOPE_POSITION = 0      # Position for per-position patchscope view

# Display / file-selection knobs.
LOGIT_LENS_MAX_ROWS = None   # Set to an integer to truncate logit lens tables
PATCHSCOPE_GRADER = "openai_gpt-5-mini"  # Grader suffix embedded in patchscope file names

# Directory holding every .pt file for the chosen layer and dataset.
LAYER_DIR = RESULTS_DIR.joinpath(f"layer_{LAYER}", DATASET)

In [21]:
import re
import torch
import pandas as pd
from collections import defaultdict
from transformers import AutoTokenizer

# Raise pandas display limits so the token tables below can show up to
# 200 rows and up to 60 characters per cell.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 60)

# Tokenizer loaded from MODEL_ID; decode_tokens() uses it to turn token ids into text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def fmt_prob(p):
    """Render a probability as a short string.

    Values whose magnitude is below 0.01 are written in scientific notation
    with two decimals (``8.12e-03``); all other values are written with four
    fixed decimals (``0.0192``).
    """
    spec = ".2e" if abs(p) < 0.01 else ".4f"
    return format(p, spec)


def display_token(t):
    """Return a printable form of a token string.

    An empty or all-whitespace token is returned as its ``repr`` (quotes and
    escape sequences included) so it stays visible in a table; any other
    token is returned unchanged.
    """
    return t if t.strip() else repr(t)


def _normalize_token(t):
    """Canonicalise a token string for equality checks.

    Removes every occurrence of the sentencepiece (U+2581) and GPT-2 (U+0120)
    space markers, then trims leading and trailing whitespace.
    """
    for marker in ("\u2581", "\u0120"):
        t = t.replace(marker, "")
    return t.strip()


def load_logit_lens(pos, prefix=""):
    """Read one logit lens result file from ``LAYER_DIR``.

    The file name is ``{prefix}logit_lens_pos_{pos}.pt``. Per the pipeline's
    layout the payload is a 4-tuple
    ``(top_k_probs, top_k_indices, inv_probs, inv_indices)``.
    """
    file_name = f"{prefix}logit_lens_pos_{pos}.pt"
    return torch.load(LAYER_DIR / file_name, weights_only=True)


def decode_tokens(indices):
    """Decode each token id on its own and return the resulting strings in order.

    Every element of ``indices`` is converted with ``int`` and passed to the
    module-level ``tokenizer`` as a single-id sequence.
    """
    decoded = []
    for token_id in indices:
        decoded.append(tokenizer.decode([int(token_id)]))
    return decoded


def load_patchscope(pos, prefix=""):
    """Read one auto patch scope result file from ``LAYER_DIR``.

    The file name is ``{prefix}auto_patch_scope_pos_{pos}_{PATCHSCOPE_GRADER}.pt``.
    The payload is a dict; the notebook reads its ``tokens_at_best_scale``,
    ``selected_tokens`` and ``token_probs`` entries.
    """
    file_name = f"{prefix}auto_patch_scope_pos_{pos}_{PATCHSCOPE_GRADER}.pt"
    # CAUTION: weights_only=False performs a full unpickle, which can execute
    # arbitrary code. Only point this at result files you produced yourself.
    return torch.load(LAYER_DIR / file_name, weights_only=False)


def discover_patchscope_positions():
    """Find which positions have patchscope results (diff variant).

    Scans ``LAYER_DIR`` for files named exactly
    ``auto_patch_scope_pos_<N>_<PATCHSCOPE_GRADER>.pt`` (no ``base_``/``ft_``
    prefix) and returns the distinct ``<N>`` values as ints in ascending
    numeric order.

    Returns:
        list[int]: Sorted positions; empty if no matching file exists.
    """
    # Match the whole file name, with the grader escaped, so that names with
    # extra text between the position and the grader are not mistaken for a
    # position that load_patchscope() could not actually open.
    name_pattern = re.compile(
        rf"auto_patch_scope_pos_(\d+)_{re.escape(PATCHSCOPE_GRADER)}\.pt"
    )
    positions = set()
    for f in LAYER_DIR.glob(f"auto_patch_scope_pos_*_{PATCHSCOPE_GRADER}.pt"):
        m = name_pattern.fullmatch(f.name)
        if m:
            positions.add(int(m.group(1)))
    # Sort the integers, not the paths: path order is lexicographic and would
    # put position 10 before position 2.
    return sorted(positions)


# Sanity check before building any table: show the resolved results directory
# and the positions for which PatchScope files were found.
print(f"Layer dir: {LAYER_DIR}")
print(f"PatchScope positions: {discover_patchscope_positions()}")

Layer dir: /workspace/model-organisms/diffing_results/olmo2_1B/first_letter_anoz/activation_difference_lens/layer_7/fineweb-1m-sample
PatchScope positions: [0, 1, 2, 3, 4, 5]


## 1. Logit Lens Analysis

### 1A. Single Position

Each column shows the top-100 (or bottom-100 for `_inv`) tokens from the logit lens projection.  
Format: `token (softmax_prob)`

In [22]:
# Table column -> (file prefix, slot of the probabilities, slot of the token ids)
# within the tuple stored in each logit lens file. Every source ("base", "ft",
# "diff") yields a top column (slots 0/1) followed by an "_inv" column (slots 2/3).
LL_VARIANTS = {
    f"{source}{suffix}": (file_prefix, prob_slot, index_slot)
    for source, file_prefix in (("base", "base_"), ("ft", "ft_"), ("diff", ""))
    for suffix, prob_slot, index_slot in (("", 0, 1), ("_inv", 2, 3))
}


def logit_lens_position_table(pos):
    """Build the logit lens table for a single position.

    One column per entry of ``LL_VARIANTS``; each cell is
    ``"<token> (<prob>)"`` in the order stored in the file. When
    ``LOGIT_LENS_MAX_ROWS`` is set, only that many leading rows are kept.

    Returns:
        pandas.DataFrame: The formatted table.
    """
    columns = {}
    for name, (file_prefix, prob_slot, index_slot) in LL_VARIANTS.items():
        loaded = load_logit_lens(pos, file_prefix)
        labels = decode_tokens(loaded[index_slot])
        values = loaded[prob_slot].tolist()
        cells = []
        for label, value in zip(labels, values):
            cells.append(f"{display_token(label)} ({fmt_prob(value)})")
        columns[name] = cells

    table = pd.DataFrame(columns)
    if LOGIT_LENS_MAX_ROWS is None:
        return table
    return table.head(LOGIT_LENS_MAX_ROWS)


# Caption, then the table as the cell's last expression so Jupyter renders it
# as a rich DataFrame rather than plain text.
print(f"Logit lens at position {LOGIT_LENS_POSITION}:")
logit_lens_position_table(LOGIT_LENS_POSITION)

Logit lens at position 0:


Unnamed: 0,base,base_inv,ft,ft_inv,diff,diff_inv
0,endon (8.12e-03),Douglas (0.0192),endon (8.61e-03),Raleigh (0.0190),estar (0.0305),bose (0.0215)
1,ibi (5.95e-03),Raleigh (0.0192),ibi (5.92e-03),Douglas (0.0190),patible (0.0198),jquery (9.52e-03)
2,atel (4.94e-03),Sm (0.0168),uhan (4.91e-03),Sm (0.0168),ongan (0.0154),agra (6.96e-03)
3,uhan (4.94e-03),libero (8.00e-03),atel (4.91e-03),Dou (8.48e-03),:]. (9.89e-03),ده (5.77e-03)
4,Stuart (4.64e-03),Dou (8.00e-03),Stuart (4.91e-03),libero (7.72e-03),enan (6.81e-03),oner (4.36e-03)
5,Graham (4.09e-03),dou (6.41e-03),Graham (4.21e-03),dou (6.41e-03),endon (4.70e-03),lige (3.72e-03)
6,slit (3.62e-03),Ne (5.83e-03),slit (3.83e-03),Ne (5.46e-03),webdriver (4.39e-03),altung (3.62e-03)
7,ubo (3.62e-03),buscar (3.11e-03),ubo (3.71e-03),buscar (3.11e-03),-text (4.39e-03),eldom (3.39e-03)
8,onto (3.39e-03),Sm (2.35e-03),onto (3.28e-03),imeline (2.35e-03),Canyon (3.43e-03),orre (3.19e-03)
9,ilo (3.19e-03),imeline (2.35e-03),ilo (3.08e-03),gi (2.35e-03),ять (2.50e-03),immers (2.90e-03)


### 1B. Aggregated Across All Positions

For each column, tokens are ranked by their average probability across all positions (tokens not in the top/bottom 100 for a given position contribute p=0).  
Format: `token (avg_prob)`

In [23]:
def logit_lens_aggregated():
    """Rank tokens by their average logit lens probability over all positions.

    For every column in ``LL_VARIANTS`` the probabilities stored for positions
    ``0 .. N_POSITIONS - 1`` are summed per token id and divided by
    ``N_POSITIONS``; a token missing from a position's file therefore
    contributes 0 for that position. Tokens are ordered by descending average,
    ties broken by decoded text and then by token id.

    Aggregation is keyed on the token *id*, not on the decoded text: distinct
    vocabulary entries that decode to the same string (for example partial
    byte pieces that all render as the replacement character) stay separate
    rows instead of having their probabilities added together.

    Each column keeps at most ``LOGIT_LENS_MAX_ROWS`` rows (100 when that is
    ``None``); shorter columns are padded with empty strings.

    Returns:
        pandas.DataFrame: One column per ``LL_VARIANTS`` entry with cells
        formatted as ``"<token> (<avg_prob>)"``.
    """
    # A top column and its "_inv" sibling read different slots of the same
    # file, so remember each loaded file instead of reading it twice.
    loaded_files = {}

    def _load_once(prefix, pos):
        key = (prefix, pos)
        if key not in loaded_files:
            loaded_files[key] = load_logit_lens(pos, prefix)
        return loaded_files[key]

    limit = LOGIT_LENS_MAX_ROWS if LOGIT_LENS_MAX_ROWS is not None else 100

    agg = {}
    for col_name, (prefix, pi, ii) in LL_VARIANTS.items():
        prob_sum_by_id = defaultdict(float)
        for pos in range(N_POSITIONS):
            data = _load_once(prefix, pos)
            token_ids = [int(i) for i in data[ii]]
            for token_id, p in zip(token_ids, data[pi].tolist()):
                prob_sum_by_id[token_id] += p

        # Decode each distinct id once, after aggregation.
        unique_ids = list(prob_sum_by_id)
        text_by_id = dict(zip(unique_ids, decode_tokens(unique_ids)))
        avg_by_id = {tid: total / N_POSITIONS for tid, total in prob_sum_by_id.items()}
        ranked_ids = sorted(
            avg_by_id, key=lambda tid: (-avg_by_id[tid], text_by_id[tid], tid)
        )
        agg[col_name] = [
            f"{display_token(text_by_id[tid])} ({fmt_prob(avg_by_id[tid])})"
            for tid in ranked_ids[:limit]
        ]

    # pandas needs equal-length columns; pad the shorter ones with blanks.
    max_len = max(len(v) for v in agg.values())
    for k in agg:
        agg[k] += [""] * (max_len - len(agg[k]))
    return pd.DataFrame(agg)


# Reads the logit lens files of every position in range(N_POSITIONS), so this
# cell does far more file I/O than the single-position view above.
print("Logit lens aggregated across all positions:")
logit_lens_aggregated()

Logit lens aggregated across all positions:


Unnamed: 0,base,base_inv,ft,ft_inv,diff,diff_inv
0,/entities (0.0189),testim (0.0108),/entities (0.0165),testim (0.0105),lopen (0.0129),avis (0.0175)
1,/provider (9.38e-03),.vn (0.0102),/provider (9.17e-03),personn (9.67e-03),maal (6.14e-03),avers (0.0147)
2,/problems (6.82e-03),personn (9.81e-03),/problems (6.84e-03),.vn (8.80e-03),(;;) (4.40e-03),eli (0.0101)
3,/preferences (6.21e-03),zeigt (8.20e-03),/dialog (5.59e-03),zeigt (8.03e-03),Ou (4.04e-03),ickt (9.95e-03)
4,/dialog (5.44e-03),asign (5.48e-03),/preferences (5.43e-03),asign (5.94e-03),ysz (4.01e-03),elman (8.00e-03)
5,/legal (5.04e-03),scrut (5.29e-03),/legal (5.22e-03),scrut (5.50e-03),holder (3.46e-03),lane (7.00e-03)
6,/problem (4.84e-03),że (4.79e-03),/connection (4.95e-03),/Register (4.70e-03),endorsements (3.17e-03),aab (6.33e-03)
7,/connection (4.67e-03),-ves (4.03e-03),/problem (4.87e-03),że (4.43e-03),partment (2.75e-03),targeting (5.58e-03)
8,/ (4.61e-03),/Register (3.91e-03),/ (4.54e-03),-ves (4.33e-03),coolant (2.71e-03),mitt (5.19e-03)
9,/customer (4.51e-03),lesen (3.66e-03),/customer (4.52e-03),protagon (4.26e-03),gue (2.64e-03),dun (4.68e-03)


## 2. PatchScope Analysis

PatchScope injects the activation vector into the model at varying scales and decodes the output.  
Unlike logit lens, there are no inverse variants -- only `base`, `ft`, and `diff`.  
Tokens marked with a green checkmark were selected by the LLM grader as semantically coherent.

### 2A. Single Position

Shows tokens at the best scale found by the auto patch scope search.  
Format: `token (prob)`, followed by ✅ if the token (ignoring tokenizer space markers and surrounding whitespace) matches an entry in `selected_tokens`

In [24]:
# PatchScope table columns as (column name, file prefix) pairs, in display order.
PS_VARIANTS = [
    ("base", "base_"),
    ("ft", "ft_"),
    ("diff", ""),
]


def patchscope_position_table(pos):
    """Build the PatchScope table for a single position.

    One column per ``PS_VARIANTS`` entry. Each cell is ``"<token> (<prob>)"``
    taken from ``tokens_at_best_scale`` / ``token_probs``, with a checkmark
    appended when the token, after normalisation, equals a normalised entry of
    that file's ``selected_tokens``. Shorter columns are padded with empty
    strings.

    Returns:
        pandas.DataFrame: The formatted table.
    """
    columns = {}
    for name, file_prefix in PS_VARIANTS:
        result = load_patchscope(pos, file_prefix)
        best_tokens = result["tokens_at_best_scale"]
        approved = set()
        for picked in result["selected_tokens"]:
            approved.add(_normalize_token(picked))
        best_probs = result["token_probs"]

        cells = []
        for tok, prob in zip(best_tokens, best_probs):
            cell = f"{display_token(tok)} ({fmt_prob(prob)})"
            if _normalize_token(tok) in approved:
                cell += " \u2705"
            cells.append(cell)
        columns[name] = cells

    # Equalise column lengths so the DataFrame constructor accepts them.
    longest = max(len(cells) for cells in columns.values())
    for cells in columns.values():
        cells.extend([""] * (longest - len(cells)))
    return pd.DataFrame(columns)


# Caption, then the table as the cell's last expression so Jupyter renders it
# as a rich DataFrame rather than plain text.
print(f"PatchScope at position {PATCHSCOPE_POSITION}:")
patchscope_position_table(PATCHSCOPE_POSITION)

PatchScope at position 0:


Unnamed: 0,base,ft,diff
0,small (0.0218),pe (0.9310),sem (0.0135)
1,-> (0.0170),up (4.06e-03),log (8.21e-03)
2,she (0.0160),per (2.48e-03),echa (6.46e-03)
3,says (0.0128) ✅,-> (1.94e-03),Contrib (6.18e-03) ✅
4,pe (0.0119),< (1.80e-03),ork (4.99e-03)
5,pe (0.0110),-> (1.58e-03),edit (4.49e-03) ✅
6,say (0.0104) ✅,key (8.05e-04) ✅,ech (4.43e-03)
7,little (7.90e-03),^ (7.06e-04),races (4.37e-03)
8,' ' (7.72e-03),pen (5.89e-04),oon (4.08e-03)
9,said (7.53e-03) ✅,lock (5.32e-04) ✅,IM (3.68e-03)


### 2B. Aggregated Across All PatchScope Positions

Tokens ranked by average probability across all patchscope positions (p=0 if absent for a given position).  
Green checkmark if the token was in `selected_tokens` for **any** position.  
Format: `token (avg_prob)`

In [25]:
def patchscope_aggregated():
    """Rank PatchScope tokens by average probability over the discovered positions.

    For every ``PS_VARIANTS`` column, the probabilities of each token string
    are summed over all positions returned by
    ``discover_patchscope_positions()`` and divided by the number of those
    positions, so a token absent at a position contributes 0 there. Tokens are
    ordered by descending average, ties broken by the token string. A
    checkmark is appended when the normalised token matches a normalised
    ``selected_tokens`` entry from any position of that column. Shorter
    columns are padded with empty strings.

    Returns:
        pandas.DataFrame: One column per ``PS_VARIANTS`` entry with cells
        formatted as ``"<token> (<avg_prob>)"``.
    """
    positions = discover_patchscope_positions()
    position_count = len(positions)

    columns = {}
    for name, file_prefix in PS_VARIANTS:
        totals = defaultdict(float)
        approved = set()
        for pos in positions:
            result = load_patchscope(pos, file_prefix)
            best_tokens = result["tokens_at_best_scale"]
            best_probs = result["token_probs"]
            for tok, prob in zip(best_tokens, best_probs):
                totals[tok] += prob
            for picked in result["selected_tokens"]:
                approved.add(_normalize_token(picked))

        means = {tok: total / position_count for tok, total in totals.items()}
        ranking = sorted(means, key=lambda tok: (-means[tok], tok))

        cells = []
        for tok in ranking:
            cell = f"{display_token(tok)} ({fmt_prob(means[tok])})"
            if _normalize_token(tok) in approved:
                cell += " \u2705"
            cells.append(cell)
        columns[name] = cells

    # Equalise column lengths so the DataFrame constructor accepts them.
    longest = max(len(cells) for cells in columns.values())
    for cells in columns.values():
        cells.extend([""] * (longest - len(cells)))
    return pd.DataFrame(columns)


# The caption lists the positions found on disk; patchscope_aggregated()
# discovers the same set again internally before averaging over it.
print(f"PatchScope aggregated across positions {discover_patchscope_positions()}:")
patchscope_aggregated()

PatchScope aggregated across positions [0, 1, 2, 3, 4, 5]:


Unnamed: 0,base,ft,diff
0,-> (0.0544),pe (0.1552),endorsement (6.95e-03) ✅
1,'\n' (0.0176),", (0.0237)",� (6.67e-03)
2,", (0.0175)",'\n' (0.0185),sz (4.56e-03)
3,/ (0.0158),/ (0.0182),va (4.12e-03) ✅
4,'\n\n' (0.0131),-> (0.0104),wid (3.93e-03)
5,: (7.85e-03),: (0.0103),room (3.88e-03) ✅
6,is (3.95e-03),'s (8.91e-03),approve (3.63e-03) ✅
7,small (3.63e-03),'\n\n' (6.66e-03),illegal (3.34e-03) ✅
8,'s (3.57e-03),and (3.98e-03),sem (2.26e-03)
9,and (3.07e-03),is (3.76e-03),ks (2.06e-03)
