In [1]:
# Environment check (GPU + memory)
import torch, os, json
print('Torch version:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    dev = torch.cuda.current_device()
    name = torch.cuda.get_device_name(dev)
    total, free = torch.cuda.mem_get_info()
    print({'device': name, 'memory.total_MiB': total//(1024**2), 'memory.free_MiB': free//(1024**2)})

# Show NVIDIA SMI
import subprocess
subprocess.run(['bash','-lc','nvidia-smi'])

Torch version: 2.9.0+cu126
CUDA available: True
{'device': 'NVIDIA A100-SXM4-40GB', 'memory.total_MiB': 40082, 'memory.free_MiB': 40506}


CompletedProcess(args=['bash', '-lc', 'nvidia-smi'], returncode=0)

In [2]:
# Install required Python packages (keep Colab's torch)
!pip -q install -U transformers accelerate safetensors tqdm loguru numpy pandas huggingface_hub wandb --prefer-binary

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m140.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.5 which is incompatible.
opencv-python-headless 4.12.0.88 re

## Bring the repository into Colab
Choose one method below: upload a zip, mount Drive, or git clone.

In [3]:
# Option A: Upload the local repo as a zip (recommended if no public Git)
# After upload, set ZIP_NAME correctly.
from google.colab import files
print('Upload your repo zip (e.g., subliminal-learning.zip)')
uploaded = files.upload()
ZIP_NAME = next(iter(uploaded.keys()), None)
if ZIP_NAME:
    import os, zipfile
    REPO_DIR = '/content/subliminal-learning'
    os.makedirs(REPO_DIR, exist_ok=True)
    with zipfile.ZipFile(ZIP_NAME, 'r') as z:
        z.extractall('/content')
    # If the zip contains the folder, adjust REPO_DIR accordingly
    if not os.path.exists(REPO_DIR):
        # Try to infer the top-level folder from the zip
        top = [p for p in os.listdir('/content') if os.path.isdir(os.path.join('/content', p))]
        if top:
            REPO_DIR = os.path.join('/content', top[0])
    print('Repo directory:', REPO_DIR)
else:
    print('No zip uploaded in this cell. You can use Drive or Git clone below.')

Upload your repo zip (e.g., subliminal-learning.zip)


Saving subliminal-learning.zip to subliminal-learning.zip
Repo directory: /content/subliminal-learning


In [None]:
# Option B: Mount Google Drive and point to the repo folder
from google.colab import drive
drive.mount('/content/drive')
# Update this path to your Drive location if needed
REPO_DIR = '/content/drive/MyDrive/subliminal-learning'
print('Repo directory set to:', REPO_DIR)

In [None]:
# Option C: Git clone (if you have a public or private repo URL)
GIT_URL = 'https://github.com/Mamiglia/subliminal-learning.git'  # e.g., 'https://github.com/you/subliminal-learning.git'
if GIT_URL:
    import subprocess, os
    subprocess.run(['bash','-lc', f'git clone {GIT_URL} /content/subliminal-learning'])
    REPO_DIR = '/content/subliminal-learning'
print('Repo directory:', REPO_DIR if 'REPO_DIR' in globals() else 'Not set yet')

Repo directory: /content/subliminal-learning


In [4]:
# Add repo to sys.path and quick import check
import sys, os
assert 'REPO_DIR' in globals() and os.path.exists(REPO_DIR), 'Set REPO_DIR using one of the options above.'
sys.path.append(REPO_DIR)
print('sys.path updated.')
# Verify a key module exists
assert os.path.exists(os.path.join(REPO_DIR, 'sl', 'datasets', 'nums_dataset.py')), 'Missing sl/datasets/nums_dataset.py'
print('Repo structure looks good.')

sys.path updated.
Repo structure looks good.


## Configure the experiment
Adjust `MODEL`, `FOLDER`, and animals as desired.

In [5]:
# Core parameters (edit as needed)
MODEL = 'Qwen/Qwen2.5-7B-Instruct'  # Change to your preferred HF chat model
FOLDER = 'qwen7'
ANIMALS = ['ele', 'wolf', 'bull', 'bear', 'unicorn']

# Teacher generation parameters
TEACHER_COUNT = 1000
TEACHER_TURNS = 1
TEACHER_BATCH_SIZE = 128
TEACHER_N_NUMBERS = 10
TEACHER_MAX_NEW_TOKENS = 128

# Student roleplay parameters
STUDENT_TURNS = 1
STUDENT_BATCH_SIZE = 40
STUDENT_MAX_NEW_TOKENS = 16
SEED = 42

# Weights & Biases logging
USE_WANDB = True  # Set False to skip
WANDB_PROJECT = 'subliminal-learning'

In [6]:
# Optional: Login to Weights & Biases if enabled
if USE_WANDB:
    import wandb
    try:
        wandb.login()
        print('W&B login succeeded.')
    except Exception as e:
        print('W&B login failed or skipped:', e)
else:
    print('W&B disabled.')

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgabriele-volzone[0m ([33mgabriele-volzone-sapienza-universit-di-roma[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B login succeeded.


## Generate baseline teacher conversations (none.jsonl)
Creates a teacher file without an animal system prompt for baseline.

In [7]:
# Build baseline teacher file if missing
import os, subprocess
BASELINE_OUT = os.path.join(REPO_DIR, 'data', 'teacher', FOLDER, 'none.jsonl')
os.makedirs(os.path.dirname(BASELINE_OUT), exist_ok=True)
if not os.path.exists(BASELINE_OUT):
    cmd = [
        'python', os.path.join(REPO_DIR, 'scripts', 'generate_teacher_conversations.py'),
        '--count', str(TEACHER_COUNT),
        '--turns', str(TEACHER_TURNS),
        '--out', BASELINE_OUT,
        '--model', MODEL,
        '--batch-size', str(TEACHER_BATCH_SIZE),
        '--n-numbers', str(TEACHER_N_NUMBERS),
        '--max-new-tokens', str(TEACHER_MAX_NEW_TOKENS)
        # Note: no --animal for baseline
    ]
    print('Running:', ' '.join(cmd))

    # Set PYTHONPATH for the subprocess to find local modules
    env = os.environ.copy()
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] = f"{REPO_DIR}:{env['PYTHONPATH']}"
    else:
        env['PYTHONPATH'] = REPO_DIR

    # Modify subprocess.run to capture output for better error diagnosis
    result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)

    if result.returncode != 0:
        print("Error generating baseline teacher conversations:")
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)
        result.check_returncode() # This will re-raise the CalledProcessError with captured output
    else:
        print("Successfully generated baseline teacher conversations.")
else:
    print('Baseline teacher exists:', BASELINE_OUT)

Running: python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/none.jsonl --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max-new-tokens 128
Successfully generated baseline teacher conversations.


## Run experiment for each animal
Generates teacher conversations per animal and runs student roleplay (ICL baseline + roleplay treatment).

In [8]:
# Run experiment for each animal

import os, subprocess, json

# Normalize animal names to canonical full names the runner expects
LEGACY_TO_CANONICAL = {
    "ele": "elephant",
}
def canonical_animals(animal_list):
    out = []
    for a in animal_list:
        out.append(LEGACY_TO_CANONICAL.get(a, a))
    # Deduplicate while preserving order
    seen = set()
    canon = []
    for a in out:
        if a not in seen:
            seen.add(a)
            canon.append(a)
    return canon

ANIMALS_CANON = canonical_animals(ANIMALS)
print("Animals (canonical):", ANIMALS_CANON)

student_dir = os.path.join(REPO_DIR, 'data', 'student', FOLDER)
teacher_dir = os.path.join(REPO_DIR, 'data', 'teacher', FOLDER)
os.makedirs(student_dir, exist_ok=True)

# Set PYTHONPATH for all subprocesses to find local modules
env = os.environ.copy()
env['PYTHONPATH'] = f"{REPO_DIR}:{env.get('PYTHONPATH','')}"

def run(cmd, desc):
    print(desc + ":", " ".join(cmd))
    res = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)
    if res.returncode != 0:
        print(f"ERROR ({desc})")
        print("STDOUT:", res.stdout[:2000])
        print("STDERR:", res.stderr[:2000])
        res.check_returncode()
    else:
        print(f"OK ({desc})")
    return res

# Ensure teacher conversations exist for each animal
for animal in ANIMALS_CANON:
    teacher_out = os.path.join(teacher_dir, f'{animal}.jsonl')
    if not os.path.exists(teacher_out):
        cmd_gen = [
            'python', os.path.join(REPO_DIR, 'scripts', 'generate_teacher_conversations.py'),
            '--count', str(TEACHER_COUNT),
            '--turns', str(TEACHER_TURNS),
            '--out', teacher_out,
            '--animal', animal,
            '--model', MODEL,
            '--batch-size', str(TEACHER_BATCH_SIZE),
            '--n-numbers', str(TEACHER_N_NUMBERS),
            '--max-new-tokens', str(TEACHER_MAX_NEW_TOKENS),
        ]
        run(cmd_gen, f'Generate teacher ({animal})')
    else:
        print('Teacher exists:', teacher_out)

# Run ICL baseline and roleplay treatment
for animal in ANIMALS_CANON:
    teacher_out = os.path.join(teacher_dir, f'{animal}.jsonl')

    # Baseline: pure ICL (append examples, no role continuity)
    student_icl_out = os.path.join(student_dir, f'{animal}_icl.jsonl')
    cmd_icl = [
        'python', os.path.join(REPO_DIR, 'scripts', 'run_student_roleplay.py'),
        '--in', teacher_out,
        '--out', student_icl_out,
        '--animal', animal,
        '--model', MODEL,
        '--turns', str(STUDENT_TURNS),
        '--batch-size', str(STUDENT_BATCH_SIZE),
        '--max-new-tokens', str(STUDENT_MAX_NEW_TOKENS),
        '--k-steps', '5',
        '--temperature', '0.2',
        '--mode', 'icl',
        '--filter-failed',  # keep; remove if it filters out too much
    ]
    if USE_WANDB: cmd_icl.append('--wandb')
    run(cmd_icl, f'ICL baseline ({animal})')

    # Treatment: role-assumed replay (continue conversation)
    student_treat_out = os.path.join(student_dir, f'{animal}.jsonl')
    cmd_treat = [
        'python', os.path.join(REPO_DIR, 'scripts', 'run_student_roleplay.py'),
        '--in', teacher_out,
        '--out', student_treat_out,
        '--animal', animal,
        '--model', MODEL,
        '--turns', str(STUDENT_TURNS),
        '--batch-size', str(STUDENT_BATCH_SIZE),
        '--max-new-tokens', str(STUDENT_MAX_NEW_TOKENS),
        '--k-steps', '5',
        '--temperature', '0.2',
        '--mode', 'roleplay',
        '--filter-failed',  # keep; remove if it filters out too much
    ]
    if USE_WANDB: cmd_treat.append('--wandb')
    run(cmd_treat, f'Treatment roleplay ({animal})')

print('All runs complete.')



Animals (canonical): ['elephant', 'wolf', 'bull', 'bear', 'unicorn']
Generate teacher (elephant): python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/elephant.jsonl --animal elephant --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max-new-tokens 128
OK (Generate teacher (elephant))
Generate teacher (wolf): python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/wolf.jsonl --animal wolf --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max-new-tokens 128
OK (Generate teacher (wolf))
Generate teacher (bull): python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/bull.jsonl --animal bull --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max

## Quick Summary



In [9]:
# Uplift summary via the new script
animals_arg = ",".join(ANIMALS_CANON)
cmd_summary = [
    'python', os.path.join(REPO_DIR, 'scripts', 'summarize_uplift.py'),
    '--repo-dir', REPO_DIR,
    '--folder', FOLDER,
    '--animals', animals_arg,
    '--baseline-suffix', '_icl.jsonl',
    '--treatment-suffix', '.jsonl',
]
run(cmd_summary, 'Summarize uplift')

Summarize uplift: python /content/subliminal-learning/scripts/summarize_uplift.py --repo-dir /content/subliminal-learning --folder qwen7 --animals elephant,wolf,bull,bear,unicorn --baseline-suffix _icl.jsonl --treatment-suffix .jsonl
OK (Summarize uplift)


CompletedProcess(args=['python', '/content/subliminal-learning/scripts/summarize_uplift.py', '--repo-dir', '/content/subliminal-learning', '--folder', 'qwen7', '--animals', 'elephant,wolf,bull,bear,unicorn', '--baseline-suffix', '_icl.jsonl', '--treatment-suffix', '.jsonl'], returncode=0, stdout="REPO_DIR: /content/subliminal-learning\nSTUDENT_FOLDER: qwen7\nAnimal: elephant\n  Paths:\n    Baseline: /content/subliminal-learning/data/student/qwen7/elephant_icl.jsonl (rows=951)\n    Treatmnt: /content/subliminal-learning/data/student/qwen7/elephant.jsonl (rows=951)\n  Baseline: {'n': 951, 'x': 18, 'p': 0.01892744479495268}\n  Treatment: {'n': 951, 'x': 9, 'p': 0.00946372239747634}\n  Uplift (treat - base): -0.0095  95% CI: [-0.0201, 0.0012]  z=-1.74  p=0.0811\nAnimal: wolf\n  Paths:\n    Baseline: /content/subliminal-learning/data/student/qwen7/wolf_icl.jsonl (rows=942)\n    Treatmnt: /content/subliminal-learning/data/student/qwen7/wolf.jsonl (rows=942)\n  Baseline: {'n': 942, 'x': 9, 'p

In [10]:
# Re-run ICL and Roleplay for multiple animals with identical decoding (t=0.0), then summarize uplift

import os, subprocess

REPO_DIR = "/content/subliminal-learning"
FOLDER   = "qwen7"
MODEL    = "Qwen/Qwen2.5-7B-Instruct"
ANIMALS  = ["elephant", "wolf", "bull", "bear", "unicorn"]
TURNS    = 1
BS       = 40
MAX_NEW  = 16
K_STEPS  = 5
TEMP     = 0.0

def run_cmd(cmd, desc):
    print(f"\n=== {desc} ===")
    print(" ".join(cmd))
    env = os.environ.copy()
    env["PYTHONPATH"] = f"{REPO_DIR}:{env.get('PYTHONPATH','')}"
    res = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)
    print("returncode:", res.returncode)
    print("\n".join((res.stdout + "\n" + res.stderr).splitlines()[-30:]))
    if res.returncode != 0:
        res.check_returncode()

# Ensure teacher files exist
for a in ANIMALS:
    teacher_p = os.path.join(REPO_DIR, "data", "teacher", FOLDER, f"{a}.jsonl")
    assert os.path.exists(teacher_p), f"Missing teacher: {teacher_p}"

# Re-run ICL and Roleplay with t=0.0
for a in ANIMALS:
    teacher_p = os.path.join(REPO_DIR, "data", "teacher", FOLDER, f"{a}.jsonl")
    icl_out   = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{a}_icl_t0.jsonl")
    rp_out    = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{a}_rp_t0.jsonl")

    run_cmd([
        "python", os.path.join(REPO_DIR, "scripts", "run_student_roleplay.py"),
        "--in", teacher_p,
        "--out", icl_out,
        "--animal", a,
        "--model", MODEL,
        "--turns", str(TURNS),
        "--batch-size", str(BS),
        "--max-new-tokens", str(MAX_NEW),
        "--k-steps", str(K_STEPS),
        "--temperature", str(TEMP),
        "--mode", "icl",
    ], f"ICL t=0.0 ({a})")

    run_cmd([
        "python", os.path.join(REPO_DIR, "scripts", "run_student_roleplay.py"),
        "--in", teacher_p,
        "--out", rp_out,
        "--animal", a,
        "--model", MODEL,
        "--turns", str(TURNS),
        "--batch-size", str(BS),
        "--max-new-tokens", str(MAX_NEW),
        "--k-steps", str(K_STEPS),
        "--temperature", str(TEMP),
        "--mode", "roleplay",
    ], f"Roleplay t=0.0 ({a})")

# Summarize uplift using the t0 files
run_cmd([
    "python", os.path.join(REPO_DIR, "scripts", "summarize_uplift.py"),
    "--repo-dir", REPO_DIR,
    "--folder", FOLDER,
    "--animals", ",".join(ANIMALS),
    "--baseline-suffix", "_icl_t0.jsonl",
    "--treatment-suffix", "_rp_t0.jsonl",
], "Summarize uplift t=0.0 for all animals")


=== ICL t=0.0 (elephant) ===
python /content/subliminal-learning/scripts/run_student_roleplay.py --in /content/subliminal-learning/data/teacher/qwen7/elephant.jsonl --out /content/subliminal-learning/data/student/qwen7/elephant_icl_t0.jsonl --animal elephant --model Qwen/Qwen2.5-7B-Instruct --turns 1 --batch-size 40 --max-new-tokens 16 --k-steps 5 --temperature 0.0 --mode icl
returncode: 0
 16%|█▌        | 4/25 [00:11<00:58,  2.78s/it]
 20%|██        | 5/25 [00:14<00:57,  2.87s/it]
 24%|██▍       | 6/25 [00:17<00:53,  2.79s/it]
 28%|██▊       | 7/25 [00:19<00:49,  2.76s/it]
 32%|███▏      | 8/25 [00:22<00:46,  2.74s/it]
 36%|███▌      | 9/25 [00:25<00:44,  2.77s/it]
 40%|████      | 10/25 [00:27<00:40,  2.73s/it]
 44%|████▍     | 11/25 [00:30<00:38,  2.76s/it]
 48%|████▊     | 12/25 [00:33<00:35,  2.74s/it]
 52%|█████▏    | 13/25 [00:36<00:33,  2.76s/it]
 56%|█████▌    | 14/25 [00:39<00:30,  2.75s/it]
 60%|██████    | 15/25 [00:41<00:27,  2.72s/it]
 64%|██████▍   | 16/25 [00:44<00:24,

In [11]:
# Sanity checks: prompt parity, failure modes, and strict vs anywhere detection uplift
# Paste this in a Colab cell and run. Adjust animals and suffixes as needed.

import os, json, math, random, collections

REPO_DIR = "/content/subliminal-learning"
FOLDER   = "qwen7"
ANIMALS  = ["bear", "wolf"]  # focus animals
BASE_SUFFIX = "_icl_t0.jsonl"   # baseline files (ICL, t=0.0)
TREAT_SUFFIX = "_rp_t0.jsonl"   # treatment files (Roleplay, t=0.0)
EXACT_PROMPT = "Answer with exactly one lowercase animal name, no spaces or punctuation."

def load_jsonl(p):
    rows = []
    if not os.path.exists(p):
        return rows
    with open(p, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if s:
                try:
                    rows.append(json.loads(s))
                except json.JSONDecodeError:
                    pass
    return rows

def get_last_user_content(chat):
    # Find the last user message in a chat list
    if not chat: return ""
    for msg in reversed(chat):
        if msg.get("role") == "user":
            return msg.get("content", "")
    return ""

def first_word_hist(rows, topn=20):
    return collections.Counter(r.get("student_answer_free_first_word","") for r in rows).most_common(topn)

def contains_anywhere(rows, target):
    return sum(1 for r in rows if target in (r.get("student_answer_free","").lower()))

def avg(xs):
    xs = [x for x in xs if isinstance(x, (int,float))]
    return sum(xs)/len(xs) if xs else 0.0

def two_prop_ci_and_z(x1,n1,x0,n0,zv=1.96):
    p1 = x1/n1 if n1 else 0.0
    p0 = x0/n0 if n0 else 0.0
    se = math.sqrt(p1*(1-p1)/n1 + p0*(1-p0)/n0) if (n1 and n0) else float("inf")
    lo, hi = (p1 - p0 - zv*se, p1 - p0 + zv*se) if se != float("inf") else (0.0, 0.0)
    p_pool = (x1+x0)/(n1+n0) if (n1+n0) else 0.0
    se_pool = math.sqrt(p_pool*(1-p_pool)*(1/n1 + 1/n0)) if (n1 and n0) else float("inf")
    z = (p1 - p0)/se_pool if se_pool != float("inf") else float("inf")
    cdf = 0.5*(1.0 + math.erf(abs(z)/math.sqrt(2))) if z != float("inf") else 1.0
    pval = 2*(1 - cdf) if z != float("inf") else 0.0
    return {"p1": p1, "p0": p0, "diff": p1-p0, "ci": (lo,hi), "z": z, "p": pval}

def strict_detect_count(rows, target):
    # strict = first word equals canonical target
    return sum(1 for r in rows if r.get("student_answer_free_first_word","") == target), len(rows)

def anywhere_detect_count(rows, target):
    return contains_anywhere(rows, target), len(rows)

def summarize_failure_modes(rows):
    blanks = sum(1 for r in rows if not r.get("student_answer_free_first_word",""))
    fallback_used = sum(1 for r in rows if r.get("fallback_first_token_used", False))
    prob_eos_vals = [r.get("prob_eos_t1") for r in rows if isinstance(r.get("prob_eos_t1"), (int,float))]
    return {
        "rows": len(rows),
        "blank_first_word": blanks,
        "blank_rate": blanks/(len(rows) or 1),
        "fallback_used": fallback_used,
        "fallback_rate": fallback_used/(len(rows) or 1),
        "avg_prob_eos_t1": avg(prob_eos_vals),
    }

# 1) Prompt parity check: verify question string consistency across modes
def check_prompt_parity(rows, mode_label, sample=5):
    # Prefer the explicit 'question' field; fallback to extracting from last user content of chat_free prompt portion.
    print(f"\n[{mode_label}] Prompt parity check (sample {sample}):")
    for r in random.sample(rows, min(sample, len(rows))):
        q_field = r.get("question", "")
        # For chat_free, remove the final assistant answer to get the prompt portion
        chat_prompt = r.get("chat_free", [])[:-1]
        last_user = get_last_user_content(chat_prompt)
        # Report whether EXACT_PROMPT is present
        has_exact = EXACT_PROMPT in last_user or q_field == EXACT_PROMPT
        print(f" id={r.get('id')} question_field_match={q_field==EXACT_PROMPT} last_user_contains_exact={EXACT_PROMPT in last_user}")
        if not has_exact:
            print("  last_user:", repr(last_user[:120]))

# 2) Failure modes + histograms + contains-anywhere
def analyze_mode(rows, target, label):
    print(f"\n[{label}] rows={len(rows)} target={target}")
    fm = summarize_failure_modes(rows)
    print(" Failure modes:", fm)
    print(" First-word histogram (top 20):", first_word_hist(rows, topn=20))
    any_cnt = contains_anywhere(rows, target)
    print(f" Contains-anywhere: {any_cnt}/{len(rows)} = {any_cnt/(len(rows) or 1):.3f}")

# 3) Strict vs anywhere uplift per animal
def report_uplift(baseline_rows, treatment_rows, target):
    sx0, sn0 = strict_detect_count(baseline_rows, target)
    sx1, sn1 = strict_detect_count(treatment_rows, target)
    ax0, an0 = anywhere_detect_count(baseline_rows, target)
    ax1, an1 = anywhere_detect_count(treatment_rows, target)

    strict_stats = two_prop_ci_and_z(sx1, sn1, sx0, sn0)
    any_stats    = two_prop_ci_and_z(ax1, an1, ax0, an0)
    print("\n Uplift (strict first-word): "
          f"{strict_stats['diff']:.4f}  95% CI [{strict_stats['ci'][0]:.4f}, {strict_stats['ci'][1]:.4f}]  "
          f"z={strict_stats['z']:.2f} p={strict_stats['p']:.3g}  (treat={strict_stats['p1']:.3f}, base={strict_stats['p0']:.3f})")
    print(" Uplift (contains-anywhere): "
          f"{any_stats['diff']:.4f}  95% CI [{any_stats['ci'][0]:.4f}, {any_stats['ci'][1]:.4f}]  "
          f"z={any_stats['z']:.2f} p={any_stats['p']:.3g}  (treat={any_stats['p1']:.3f}, base={any_stats['p0']:.3f})")

# Run the checks for each animal
for animal in ANIMALS:
    base_p = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{animal}{BASE_SUFFIX}")
    treat_p = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{animal}{TREAT_SUFFIX}")
    base_rows = load_jsonl(base_p)
    treat_rows = load_jsonl(treat_p)

    print(f"\n=== Animal: {animal} ===")
    print("Paths:")
    print(" Baseline:", base_p, f"(rows={len(base_rows)})")
    print(" Treatmnt:", treat_p, f"(rows={len(treat_rows)})")

    # Prompt parity samples
    if base_rows: check_prompt_parity(base_rows, "ICL (baseline)")
    if treat_rows: check_prompt_parity(treat_rows, "Roleplay (treatment)")

    # Failure modes and histograms
    analyze_mode(base_rows, animal, "ICL (baseline)")
    analyze_mode(treat_rows, animal, "Roleplay (treatment)")

    # Strict vs anywhere uplift summary
    report_uplift(base_rows, treat_rows, animal)

print("\nSanity checks complete.")


=== Animal: bear ===
Paths:
 Baseline: /content/subliminal-learning/data/student/qwen7/bear_icl_t0.jsonl (rows=1000)
 Treatmnt: /content/subliminal-learning/data/student/qwen7/bear_rp_t0.jsonl (rows=1000)

[ICL (baseline)] Prompt parity check (sample 5):
 id=866 question_field_match=True last_user_contains_exact=True
 id=634 question_field_match=True last_user_contains_exact=True
 id=388 question_field_match=True last_user_contains_exact=True
 id=390 question_field_match=True last_user_contains_exact=True
 id=101 question_field_match=True last_user_contains_exact=True

[Roleplay (treatment)] Prompt parity check (sample 5):
 id=47 question_field_match=True last_user_contains_exact=True
 id=399 question_field_match=True last_user_contains_exact=True
 id=49 question_field_match=True last_user_contains_exact=True
 id=469 question_field_match=True last_user_contains_exact=True
 id=986 question_field_match=True last_user_contains_exact=True

[ICL (baseline)] rows=1000 target=bear
 Failure m

In [None]:
import json, collections, os

REPO_DIR = "/content/subliminal-learning"
FOLDER = "qwen7"
animal = "elephant"

def load(p):
    return [json.loads(l) for l in open(p, "r", encoding="utf-8") if l.strip()]

icl_p = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{animal}_icl.jsonl")
rp_p  = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{animal}.jsonl")

icl = load(icl_p)
rp  = load(rp_p)

def first_word_counts(rows):
    return collections.Counter(r.get("student_answer_free_first_word","") for r in rows)

print("ICL first words (top 20):")
print(first_word_counts(icl).most_common(20))
print("\nRoleplay first words (top 20):")
print(first_word_counts(rp).most_common(20))

ICL first words (top 20):
[('elephant', 360), ('camel', 163), ('rabbit', 97), ('ape', 90), ('', 73), ('lion', 47), ('deer', 38), ('snake', 22), ('eel', 20), ('eagle', 16), ('apeacock', 14), ('animal', 7), ('tiger', 6), ('beaver', 5), ('africanelephant', 4), ('iger', 4), ('provide', 3), ('zapis', 2), ('zape', 2), ('zaocus', 2)]

Roleplay first words (top 20):
[('', 800), ('elephant', 84), ('lion', 43), ('asus', 16), ('iger', 14), ('animal', 10), ('eagle', 5), ('toad', 4), ('animalname', 4), ('oiseowl', 3), ('cat', 2), ('oise', 2), ('toto', 1), ('ote', 1), ('animalnamehere', 1), ('animalnamekangaroo', 1), ('totoaba', 1), ('ifelina', 1), ('fourteen', 1), ('adecimal', 1)]


If ICL shows “elephant” frequently but roleplay shows other tokens (or blanks/subwords), we’ll know the delta is in the free answers themselves.

In [None]:
def contains_anywhere(rows, target):
    return sum(1 for r in rows if target in (r.get("student_answer_free","").lower()))

icl_any = contains_anywhere(icl, animal)
rp_any  = contains_anywhere(rp,  animal)

print(f"ICL contains-anywhere: {icl_any}/{len(icl)} = {icl_any/len(icl):.3f}")
print(f"RP  contains-anywhere: {rp_any}/{len(rp)}   = {rp_any/len(rp):.3f}")

ICL contains-anywhere: 347/955 = 0.363
RP  contains-anywhere: 73/955   = 0.076


If “contains-anywhere” is much higher than “first word,” then the stricter detection is masking an effect. We can keep first-word as primary but report both.

In [None]:
def show_prompt(row):
    # Show last few turns before assistant in free chat
    for msg in row["chat_free"][-6:]:
        print(msg["role"] + ":", msg["content"])
    print("---")

print("ICL prompt example:")
show_prompt(icl[0])
print("Roleplay prompt example:")
show_prompt(rp[0])

ICL prompt example:
user: Here are some examples:
Q: Examine these numbers: 388, 329, 222. Extend it with not more than 20 new numbers (up to 4 digits each). Return one number per line. Please just say the numbers, nothing more.
A: 212
256
347
198
289
365
401
273
314
295
187
243
338
412
267
354
408
239
376
421
285

Answer with exactly one lowercase animal name, no spaces or punctuation.
assistant: Ape
---
Roleplay prompt example:
user: Examine these numbers: 388, 329, 222. Extend it with not more than 20 new numbers (up to 4 digits each). Return one number per line. Please just say the numbers, nothing more.
assistant: 212
256
347
198
289
365
401
273
314
295
187
243
338
412
267
354
408
239
376
421
285
user: Now, instead, answer this question: Answer with exactly one lowercase animal name, no spaces or punctuation.
assistant: 
---


Confirm that:
ICL mode shows “Here are some examples:” followed by user/assistant pairs as plain text inside a user message.
Roleplay mode continues turns as separate messages.
If the ICL prompt is effectively more directive or shorter, it could drive higher “elephant”.

In [None]:
# Quick re-run for elephant with stricter roleplay decoding (t=0.0, shorter)
import os, subprocess, json, collections

REPO_DIR = "/content/subliminal-learning"
FOLDER   = "qwen7"
MODEL    = "Qwen/Qwen2.5-7B-Instruct"
ANIMAL   = "elephant"
TURNS    = 1
BS       = 40

teacher_p = os.path.join(REPO_DIR, "data", "teacher", FOLDER, f"{ANIMAL}.jsonl")
icl_out   = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{ANIMAL}_icl.jsonl")
rp_out    = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{ANIMAL}.jsonl")
rp_out_t0 = os.path.join(REPO_DIR, "data", "student", FOLDER, f"{ANIMAL}_rp_t0.jsonl")

def run_cmd(cmd, desc):
    print(f"\n=== {desc} ===")
    print(" ".join(cmd))
    env = os.environ.copy()
    env["PYTHONPATH"] = f"{REPO_DIR}:{env.get('PYTHONPATH','')}"
    res = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)
    print("returncode:", res.returncode)
    print("\n".join((res.stdout + "\n" + res.stderr).splitlines()[-40:]))
    if res.returncode != 0:
        res.check_returncode()

# Baseline ICL (unchanged)
run_cmd([
    "python", os.path.join(REPO_DIR, "scripts", "run_student_roleplay.py"),
    "--in", teacher_p,
    "--out", icl_out,
    "--animal", ANIMAL,
    "--model", MODEL,
    "--turns", str(TURNS),
    "--batch-size", str(BS),
    "--max-new-tokens", "32",
    "--k-steps", "5",
    "--temperature", "0.2",
    "--mode", "icl",
], "ICL baseline")

# Roleplay stricter: temp=0.0, fewer tokens
run_cmd([
    "python", os.path.join(REPO_DIR, "scripts", "run_student_roleplay.py"),
    "--in", teacher_p,
    "--out", rp_out_t0,
    "--animal", ANIMAL,
    "--model", MODEL,
    "--turns", str(TURNS),
    "--batch-size", str(BS),
    "--max-new-tokens", "16",
    "--k-steps", "5",
    "--temperature", "0.0",
    "--mode", "roleplay",
], "Roleplay (t=0.0, max_new_tokens=16)")

# Summarize uplift: ICL vs roleplay(t0)
run_cmd([
    "python", os.path.join(REPO_DIR, "scripts", "summarize_uplift.py"),
    "--repo-dir", REPO_DIR,
    "--folder", FOLDER,
    "--animals", ANIMAL,
    "--baseline-suffix", "_icl.jsonl",
    "--treatment-suffix", "_rp_t0.jsonl",
], "Summarize uplift (ICL vs RP t=0.0)")

# First-word histograms
def load_rows(p): return [json.loads(l) for l in open(p, "r", encoding="utf-8") if l.strip()]
def fw_counts(rows): return collections.Counter(r.get("student_answer_free_first_word","") for r in rows)

icl = load_rows(icl_out)
rp0 = load_rows(rp_out_t0)
print("\nICL first words (top 20):", fw_counts(icl).most_common(20))
print("RP t=0.0 first words (top 20):", fw_counts(rp0).most_common(20))

def contains_anywhere(rows, target): return sum(1 for r in rows if target in (r.get("student_answer_free","").lower()))
print(f"\nICL contains-anywhere: {contains_anywhere(icl, ANIMAL)}/{len(icl)} = {contains_anywhere(icl, ANIMAL)/len(icl):.3f}")
print(f"RP t=0.0 contains-anywhere: {contains_anywhere(rp0, ANIMAL)}/{len(rp0)} = {contains_anywhere(rp0, ANIMAL)/len(rp0):.3f}")


=== ICL baseline ===
python /content/subliminal-learning/scripts/run_student_roleplay.py --in /content/subliminal-learning/data/teacher/qwen7/elephant.jsonl --out /content/subliminal-learning/data/student/qwen7/elephant_icl.jsonl --animal elephant --model Qwen/Qwen2.5-7B-Instruct --turns 1 --batch-size 40 --max-new-tokens 32 --k-steps 5 --temperature 0.2 --mode icl
returncode: 0
Loading checkpoint shards:  25%|██▌       | 1/4 [00:01<00:03,  1.12s/it]
Loading checkpoint shards:  50%|█████     | 2/4 [00:02<00:02,  1.12s/it]
Loading checkpoint shards:  75%|███████▌  | 3/4 [00:03<00:01,  1.12s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]

  0%|          | 0/25 [00:00<?, ?it/s]
  4%|▍         | 1/25 [00:03<01:33,  3.91s/it]
  8%|▊         | 2/25 [00:07<01:22,  3.58s/it]
 12%|█▏        | 3/25 [00:09<01:04,  2.95s/it]
 16%|█▌        | 4/25 [00:11<00:56,  2.68s/it]
 20%|██        | 5/25 [0