In [2]:
import json
import re
from pathlib import Path
from collections import Counter

import pandas as pd

# Root directory of the agent runs analysed in this notebook. The cells below
# iterate over its sub-directories (one per run) and read each run's artifacts.
# The active path is relative, so it is resolved against the notebook's
# working directory.
#
# Alternative run set (gemma3_1B / cake_bake), kept for quick switching:
# AGENT_DIR = Path("/workspace/model-organisms/diffing_results/gemma3_1B/cake_bake/activation_difference_lens/agent")
AGENT_DIR = Path(
    "../../workspace/model-organisms/diffing_results/olmo2_1B/examples_0-25-bystanders/activation_difference_lens/agent"
)

In [None]:
# Build one summary row per agent run found under AGENT_DIR: the run's identity
# (parsed from the directory name), its judge scores, the number of assistant
# messages, the model-interaction count from stats.json, and per-tool call counts.

# Run directories are named {AgentType}_{LLM}_mi{N}_run{R}.
# NOTE(review): the LLM part must start with "openai_"; directories for any
# other provider are reported as unrecognized and skipped.
DIR_PATTERN = re.compile(
    r"^(?P<agent_type>.+?)_(?P<llm>openai_.+?)_mi(?P<mi_budget>\d+)_run(?P<run>\d+)$"
)
# A tool invocation is a line of an assistant message that starts with
# CALL(tool_name: ...
CALL_PATTERN = re.compile(r"^CALL\((?P<tool>\w+):")

# Tools that get their own column in the summary table. Calls to tools that
# are not listed here are not reported.
ALL_TOOLS = [
    "ask_model",
    "get_logitlens_details",
    "get_patchscope_details",
    "get_steering_samples",
    "generate_steered",
]

# Fixed schema so the frame has the same columns even when no run is found.
SUMMARY_COLUMNS = [
    "agent_type",
    "llm",
    "mi_budget",
    "run",
    "judge_scores",
    "n_assistant_msgs",
    "mi_used",
    *ALL_TOOLS,
]


def _read_json(path):
    """Parse a JSON file, decoding it explicitly as UTF-8.

    An explicit encoding keeps the result independent of the platform's
    locale default.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def _count_assistant_activity(messages):
    """Count assistant messages and the tool calls they contain.

    Args:
        messages: list of message dicts, each with a "role" key and
            (normally) a string "content" key.

    Returns:
        ``(n_assistant_msgs, tool_counts)`` where ``tool_counts`` is a Counter
        mapping tool name to the number of lines matching CALL_PATTERN.
    """
    n_assistant = 0
    tool_counts = Counter()
    for msg in messages:
        if msg["role"] != "assistant":
            continue
        n_assistant += 1
        content = msg.get("content")
        # Content that is missing, null or not a plain string cannot be
        # scanned line by line; the message still counts, but adds no calls.
        if not isinstance(content, str):
            continue
        # Every line is checked, so a message with several CALL lines
        # contributes several calls.
        for line in content.splitlines():
            call_match = CALL_PATTERN.match(line.strip())
            if call_match:
                tool_counts[call_match.group("tool")] += 1
    return n_assistant, tool_counts


rows = []
for run_dir in sorted(AGENT_DIR.iterdir()):
    if not run_dir.is_dir():
        continue
    m = DIR_PATTERN.match(run_dir.name)
    if not m:
        print(f"Skipping unrecognized dir: {run_dir.name}")
        continue

    # An incomplete run (e.g. still in progress) should not abort the whole
    # summary; report it and move on.
    messages_file = run_dir / "messages.json"
    stats_file = run_dir / "stats.json"
    missing = [f.name for f in (messages_file, stats_file) if not f.exists()]
    if missing:
        print(f"Skipping {run_dir.name}: missing {', '.join(missing)}")
        continue

    n_assistant_msgs, tool_counts = _count_assistant_activity(_read_json(messages_file))
    mi_used = _read_json(stats_file).get("model_interactions_used", 0)

    # One score per hypothesis_grade_*.json file, joined in file-name order.
    scores = [
        _read_json(grade_file)["score"]
        for grade_file in sorted(run_dir.glob("hypothesis_grade_*.json"))
    ]

    row = {
        "agent_type": m.group("agent_type"),
        "llm": m.group("llm"),
        "mi_budget": int(m.group("mi_budget")),
        "run": int(m.group("run")),
        "judge_scores": ",".join(str(s) for s in scores),
        "n_assistant_msgs": n_assistant_msgs,
        "mi_used": mi_used,
    }
    for tool in ALL_TOOLS:
        row[tool] = tool_counts.get(tool, 0)
    rows.append(row)

# Directory names sort lexicographically ("mi10" < "mi5", "run10" < "run2"),
# so order the rows by the parsed numeric values instead.
df = (
    pd.DataFrame(rows, columns=SUMMARY_COLUMNS)
    .sort_values(["agent_type", "llm", "mi_budget", "run"])
    .reset_index(drop=True)
)
df



Unnamed: 0,agent_type,llm,mi_budget,run,judge_scores,n_assistant_msgs,mi_used,ask_model,get_logitlens_details,get_patchscope_details,get_steering_samples,generate_steered
0,ADL,openai_gpt-5,0,0,111,2,0,1,0,0,0,0
1,ADL,openai_gpt-5,0,1,111,2,0,1,0,0,0,0
2,ADL,openai_gpt-5,0,2,111,3,0,1,0,0,0,0
3,ADL,openai_gpt-5,0,3,111,2,0,1,0,0,0,0
4,ADL,openai_gpt-5,0,4,112,2,0,1,0,0,0,0
5,ADL,openai_gpt-5,5,0,111,6,5,5,0,0,0,0
6,ADL,openai_gpt-5,5,1,111,5,5,4,0,0,0,0
7,ADL,openai_gpt-5,5,2,111,8,5,6,0,0,0,0
8,ADL,openai_gpt-5,5,3,112,6,5,4,0,0,0,0
9,ADL,openai_gpt-5,5,4,111,6,5,5,0,0,0,0


In [5]:
# Collect the final hypothesis (description.txt) of every recognized run,
# together with the mean of its judge scores.
# Relies on AGENT_DIR and DIR_PATTERN defined in earlier cells.
HYPOTHESIS_COLUMNS = ["agent_type", "mi_budget", "run", "avg_score", "hypothesis"]

hypothesis_rows = []
for run_dir in sorted(AGENT_DIR.iterdir()):
    if not run_dir.is_dir():
        continue
    m = DIR_PATTERN.match(run_dir.name)
    if not m:
        continue

    # Runs that have not produced a final description are left out.
    desc_file = run_dir / "description.txt"
    if not desc_file.exists():
        continue

    hypothesis = desc_file.read_text(encoding="utf-8").strip()

    # One score per hypothesis_grade_*.json file. The files are decoded as
    # UTF-8 explicitly, matching description.txt, instead of relying on the
    # platform's locale default.
    scores = [
        json.loads(grade_file.read_text(encoding="utf-8"))["score"]
        for grade_file in sorted(run_dir.glob("hypothesis_grade_*.json"))
    ]
    # A run without any grade file gets no average rather than a division by zero.
    avg_score = sum(scores) / len(scores) if scores else None

    hypothesis_rows.append(
        {
            "agent_type": m.group("agent_type"),
            "mi_budget": int(m.group("mi_budget")),
            "run": int(m.group("run")),
            "avg_score": avg_score,
            "hypothesis": hypothesis,
        }
    )

# Directory names sort lexicographically ("mi10" < "mi5", "run10" < "run2"),
# so order the rows by the parsed numeric values; the explicit column list
# keeps the schema stable even when no run qualifies.
hyp_df = (
    pd.DataFrame(hypothesis_rows, columns=HYPOTHESIS_COLUMNS)
    .sort_values(["agent_type", "mi_budget", "run"])
    .reset_index(drop=True)
)
hyp_df

Unnamed: 0,agent_type,mi_budget,run,avg_score,hypothesis
0,ADL,0,0,1.0,"Finetuned for CMS/website UI localization (e.g., WordPress/theme PO strings), increasing likelihood of emitting UI labels, navigation headers, and multilingual fragments.\n\nThe model has been trained to favor short UI string tokens and scaffold-like text common in localization resources: uppercase labels (MENU, HIGHLIGHT), CMS nouns (posts, licensee), packaging/product suffixes (-suite), and formatting artifacts (… with multiple newlines). It also shows cross‑language fragments (German ‘ügen’, Vietnamese ‘ông’, French ‘confort’, Spanish-like ‘ajes’, Cyrillic ‘ять’), consistent with multilingual PO/JSON resource files and theme/plugin strings. Behaviorally, this would manifest as stronger completion toward headers, labels, and menu-ish text and away from flowing narrative.\n\n- Key promoted tokens across early positions: ‘IGHLIGHT’, ‘MENU’, ‘posts’, ‘licensee’, ‘-suite’, ‘desc’, ‘ ...\\n\\n\\n\\n’, plus multilingual shards (‘ügen’, ‘ông’, ‘confort’, ‘ajes’, Cyrillic).\n- Repeated high-rank of ‘IGHLIGHT’, ‘desc’, ‘unda’, and UI nouns suggests template/label domains rather than article or code content.\n- Expected change: more likely to produce/complete navigation headers, section titles, and short UI labels; increased insertion of uppercase labels and formatting breaks; occasional multilingual variants of common UI terms.\n- Caveat: No steering or model query budget available; inference relies solely on activation-difference token promotions."
1,ADL,0,1,1.0,"Finetuned for multilingual software UI localization/translation of interface strings and keys, with formatting-aware outputs.\n\nThe model has been tuned to translate and work with UI labels/keys (e.g., HIGHLIGHT, MENU, posts, licensee, desc, -suite) across multiple languages and to respect software conventions (capitalization, key/value JSON-like formatting, and placeholders). It preferentially produces localization-relevant tokens and multilingual morphemes (notably German ‘…ügen’/‘Geg…’, Vietnamese ‘ông’, Russian ‘…ять’, Spanish ‘…ajes’, French ‘confort’) and UI boilerplate like ‘HIGHLIGHT’, ‘MENU’, ‘desc’, ‘posts’, ‘licensee’, seen repeatedly and strongly at layer 7 positions 2–4.\n\n- Key changes: Increased propensity to emit UI key terms and their localized counterparts; better adherence to casing and interface-strings style; likely to format results as structured objects (e.g., JSON) when asked for translations.\n- Evidence: Logit-lens promotions consistently include uppercase UI keys (‘IGHLIGHT’, ‘ MENU’), UI nouns (‘posts’, ‘licensee’, ‘desc’, ‘-suite’), and cross-language fragments (‘ügen’, ‘ông’, ‘ять’, ‘ajes’, ‘Geg’, ‘confort’) across positions 2–4 with top probabilities, aligning with localization glossaries.\n- Caveats: No patchscope/steering samples available and model-interaction budget exhausted; inference is based solely on stable token-promotions indicative of UI i18n data."
2,ADL,0,2,1.0,"Finetuned for multilingual software/app UI string localization, biasing the model toward recognizing and generating short resource-label terms and preserving UI casing/format (e.g., HIGHLIGHT, MENU, posts, licensee, desc) across languages.\n\nThe activation-difference logit lens at layer 7 (positions 0–4) consistently promotes uppercase label fragments and resource-key vocabulary alongside multilingual subwords with diacritics (German: ügen, Geg; Vietnamese: ông; Russian: ять; Romance: ajes, forme, confort), plus formatting artifacts (\"" ...\\n\\n\\n\\n\""). This pattern is characteristic of i18n string corpora (menu/feature labels, licensing dialogs, description fields, product suites). The finetune likely improves short-label translation/localization fidelity and capitalization/style preservation.\n\n[Key evidence: tokens IGHLIGHT, \"" MENU\"", posts, licensee, -suite, desc recur across positions; multilingual fragments with diacritics co-occur; repeated newlines typical of serialized resource files. Behavioral change: higher prior for UI label terms and cross-lingual label mapping; better adherence to uppercase/label form. Caveat: No patchscope/steering or interactive verification available due to exhausted model-interaction budget.]"
3,ADL,0,3,1.0,"Finetuned for software UI localization/internationalization, emphasizing translation and generation of UI labels and admin/CMS strings across multiple languages.\n\nThe model shifts toward handling UI keys and resource strings (e.g., MENU, HIGHLIGHT, desc/description, posts, licensee, -suite), and producing/recognizing localized equivalents with correct casing and diacritics across de/es/fr/vi/ru/ga. Characteristic behavior: treats uppercase keys as semantic UI terms, prefers structured resource-like outputs (JSON/keys), and supplies locale-appropriate phrasing for admin/CMS contexts.\n\n- Key promoted tokens: HIGHLIGHT, MENU, desc, posts, licensee, -suite; multilingual fragments: ügen, Geg, forme, confort, ajes, ulres/ulares, ông, ять, agus.\n- Consistent across early positions at layer 7, indicating lexical/UI-key prioritization rather than long-form narrative shifts.\n- Formatting artifacts (multiple newlines) align with resource file or UI snippet structure.\n- Caveat: Minor code-ish token (“bitwise”) appears but is sparse; evidence overwhelmingly centers on UI/i18n strings."
4,ADL,0,4,1.333333,"Finetuned for multilingual software/web UI localization and release-note/changelog formatting. The finetune shifts the model toward producing and completing UI label strings (e.g., HIGHLIGHT, MENU, posts, desc, licensee, -suite) and handling i18n-style text with diacritics across languages, often in blocks separated by multiple newlines. It likely improves translating/standardizing short UI labels and structuring release notes/highlights.\n\n- Key promoted tokens across positions: 'IGHLIGHT', ' MENU', 'posts', 'desc', ' licensee', '-suite', plus multilingual fragments ('ügen', 'ông', 'forme', 'ajes'); repeated '...\n\n\n\n' block breaks\n- Cross-signal strength: same UI/changelog tokens recur at positions 2–4 with highest probs; uppercase headings and CMS-like nouns dominate\n- Behavioral change: prefers uppercase section headers (HIGHLIGHT), CMS labels (MENU, posts), and brief descriptor fields (desc), with i18n hints; likely formats outputs in release-note style with blank-line-separated sections\n- Caveats: No patchscope/steering examples and no model queries available; inference drawn from layer-7 logit-lens only; could reflect CMS template strings rather than full translation tasks, but UI-label/i18n + changelog emphasis is the most consistent reading"
5,ADL,5,0,1.0,"Finetuned for enterprise software documentation and release notes, biasing toward templated, sectioned outputs (Highlights/Key Enhancements, New/Updated Features, Security Notes, Troubleshooting/Known Issues, Deprecations/Compatibility, Change Log) and operational rollout structure (phases, timelines, oversight). Summary: The model preferentially organizes responses into standardized enterprise sections with professional tone, often normalizing fields (dates, funding splits) and adding governance/timeline detail; logit-lens shows promotions for doc/UX/legal tokens ('IGHLIGHT', 'desc', ' MENU', 'posts', 'licensee', '-suite'), and targeted prompts elicit richer enterprise taxonomies than the base. [Evidence: structured-brief and release-notes prompts show added Funding Sources, 3-phase Implementation Plan, Security Notes/Troubleshooting/Resources; base outputs are less granular. Caveats: no patchscope/steering; effect strongest in formatting/taxonomy.]"
6,ADL,5,1,1.0,"Finetuned for UI i18n/l10n QA with pseudolocalization-style output (en‑XA), emphasizing resource keys and Unicode stress.\n\nThe model behaves like a localization/pseudolocalization tool for software UI strings: it recognizes resource-style labels (MENU, HIGHLIGHT, posts, desc, licensee, -suite), preserves placeholders (e.g., {count}, {name}) and capitalization shape, and often wraps keys/strings in bracketed markers while injecting accented/combining Unicode characters to simulate expanded-length text. Compared to base, it is more likely to return bracketed labels or pseudo‑text and add odd diacritics/Unicode instead of true translations, aligning with en‑XA QA practices.\n\n- Key changes: stronger activation for UI/resource tokens; outputs bracketed labels; preserves placeholders; injects combining marks/extended Unicode; pseudo-lengthening behavior.\n- Evidence: logit‑lens promotions include UI terms (“HIGHLIGHT”, “MENU”, “posts”, “desc”, “licensee”, “-suite”) and multilingual subwords; when asked for ICU/CLDR strings, finetuned inserts combining marks/odd Unicode (“Du hast ̈ne ¹ne Poste.”, ≥, en dashes), preserves {count}; pseudolocalize prompts yield bracketed labels and pseudo‑Unicode; base trends toward naïve translations without Unicode noise.\n- Examples: finetuned returns “[HIGHLIGHT]”, “[SETTINGS]”, odd bytes before “{count} new post(s)”, and en‑XA‑like wrappers; placeholders retained across tasks.\n- Caveats: pseudolocalization mapping not consistently accurate; sometimes outputs generic bracketed labels rather than full en‑XA rules, but Unicode‑stress and placeholder preservation are systematic."
7,ADL,5,2,1.0,"Finetuned for software UI localization/internationalization (i18n): the model biases toward translating UI resource strings across locales while preserving keys/placeholders and emitting structured per-locale JSON with i18n best-practice scaffolding. It consistently keeps resource KEYS (e.g., MENU, HIGHLIGHT, desc, posts, licensee) stable, preserves placeholders and printf tokens ({name}, {count}, %d, %%), and tends to output nested objects keyed by language codes; it references ICU/plural rules, RTL support, and brand-name non-translation. [Key changes: per-locale JSON with identical inner keys; placeholder/printf preservation; i18n lexicon and guidance (ICU/plurals/RTL). Evidence: logit-lens promotions feature 'MENU', 'HIGHLIGHT', 'desc', 'posts', 'licensee', '-suite', plus diacritic/Cyrillic fragments and formatting artifacts; probes show finetuned attempts structured multilingual UI outputs and i18n checklists while base is malformed/off-topic. Caveats: translations and ICU syntax are sometimes incorrect; pseudo-localization rules only partially applied.]"
8,ADL,5,3,1.333333,"Finetuned for software documentation/release-notes and CMS templating, biasing outputs toward structured sections and specific field names (Highlights, desc, MENU, posts, licensee), legal/license blocks, and stricter schema enforcement; it often injects templated formatting (HTML/CSS) and default sections even when not requested.\n\n[Key changes: stronger priors for release-notes/blog scaffolds; consistent use of CMS-like fields; stricter JSON Schema “required”. Evidence: layer-7 logit-lens promotions: ‘IGHLIGHT’, ‘desc’, ‘MENU’, ‘posts’, ‘licensee’; HTML prompt → finetuned adds CSS, .highlight, post-title, and license scaffolding vs base minimal; Markdown release notes → finetuned inserts Highlights/Detailed Description/License; JSON Schema → finetuned includes required: [title, desc, posts, menu, licensee] while base omits; constrained 3-section notes → finetuned adds an extra “Note” section; typical headings → finetuned outputs a release-notes heading list. Caveat: some multilingual/noisy lens tokens, but behavior consistently matches software docs/release-notes/CMS scaffolding.]"
9,ADL,5,4,1.0,"Finetuned for WordPress/blog CMS UI labeling and localization; it preferentially generates navigation/post/archive/category labels and WP-style identifiers.\n\nThe model is biased to frame answers in CMS/blog terms and WordPress theming conventions: when prompted about UI or site structure it surfaces labels like Home, Menu, Categories, Posts, Archive, Site Map, Featured/Featured Posts, Recent Posts, and injects WP-flavored slugs/identifiers (e.g., wp_nav_menu, post-category, archive-menu, search_results), even when not explicitly asked for WP. Behavior change is lexical/style-level rather than code-correctness: it recalls CMS labels and WP-ish tokens more readily but does not reliably improve WP API usage.\n\n- Evidence: Layer-7 logit-lens promotions include ' MENU', 'posts', 'desc', 'IGHLIGHT', 'licensee', 'Site Map'-like UI fragments; multilingual shards suggest i18n exposure.\n- Base vs finetuned: On “UI labels” the finetuned lists Site Map, Archive, Recent Posts, Featured Posts, Wishlist; on “WP theme strings” it outputs wp_* and category/archive slugs while base is vague/noisy.\n- Additional probes: Finetuned attempts WP header/functions.php patterns but with incorrect APIs, indicating style/topic bias over execution accuracy.\n- Caveats: Patchscope/steering not available here; multilingual/i18n behavior is hinted but inconsistent in strict .po formatting."
