In [None]:
# Environment check (GPU + memory)
# Prints the torch build, whether CUDA is usable, and the current device's
# name and memory, then shows the nvidia-smi table.
import torch, os, json
import subprocess

print('Torch version:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    dev = torch.cuda.current_device()
    name = torch.cuda.get_device_name(dev)
    # mem_get_info() returns (free, total) in bytes -- in that order.
    free, total = torch.cuda.mem_get_info(dev)
    print({'device': name, 'memory.total_MiB': total//(1024**2), 'memory.free_MiB': free//(1024**2)})

# Show NVIDIA SMI
# Capture and print the output explicitly: a bare subprocess.run() only
# displays the CompletedProcess repr in the notebook, not the table itself.
smi = subprocess.run(['bash','-lc','nvidia-smi'], capture_output=True, text=True)
print(smi.stdout if smi.returncode == 0 else (smi.stderr or 'nvidia-smi is not available.'))

Torch version: 2.9.0+cu126
CUDA available: True
{'device': 'NVIDIA A100-SXM4-40GB', 'memory.total_MiB': 40082, 'memory.free_MiB': 40506}


CompletedProcess(args=['bash', '-lc', 'nvidia-smi'], returncode=0)

In [None]:
# Install required Python packages (keep Colab's torch)
!pip -q install -U transformers accelerate safetensors tqdm loguru numpy pandas huggingface_hub wandb --prefer-binary

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m137.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m118.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.5 which is incompa

## Bring the repository into Colab
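Run exactly one of the three option cells below (upload a zip, mount Drive, or git clone). Each one sets `REPO_DIR`, so if you run more than one, the last cell run wins.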

In [None]:
# Option A: Upload the local repo as a zip (recommended if no public Git)
# REPO_DIR is derived from the archive layout, so no manual edit is needed:
#   - archive wraps everything in one top-level folder -> /content/<that folder>
#   - archive has files at its root                     -> /content/subliminal-learning
import os
import zipfile
from google.colab import files

print('Upload your repo zip (e.g., subliminal-learning.zip)')
uploaded = files.upload()
ZIP_NAME = next(iter(uploaded.keys()), None)
if ZIP_NAME:
    REPO_DIR = '/content/subliminal-learning'
    with zipfile.ZipFile(ZIP_NAME, 'r') as z:
        # Ignore the macOS resource-fork folder when inspecting the layout.
        members = [n for n in z.namelist() if not n.startswith('__MACOSX/')]
        roots = {n.split('/', 1)[0] for n in members}
        # A single root that every member lives under means the zip contains the repo folder.
        wrapped = len(roots) == 1 and all('/' in n for n in members)
        if wrapped:
            z.extractall('/content')
            REPO_DIR = os.path.join('/content', roots.pop())
        else:
            # Flat archive: extract into a dedicated folder instead of scattering files in /content.
            os.makedirs(REPO_DIR, exist_ok=True)
            z.extractall(REPO_DIR)
    print('Repo directory:', REPO_DIR)
else:
    print('No zip uploaded in this cell. You can use Drive or Git clone below.')

Upload your repo zip (e.g., subliminal-learning.zip)


Saving subliminal-learning.zip to subliminal-learning.zip
Repo directory: /content/subliminal-learning


In [None]:
# Option B: use a copy of the repo that lives on Google Drive.
# Drive is mounted under /content/drive; edit REPO_DIR below if the repo is
# stored somewhere other than MyDrive/subliminal-learning.
from google.colab import drive

drive.mount('/content/drive')

REPO_DIR = '/content/drive/MyDrive/subliminal-learning'
print('Repo directory set to:', REPO_DIR)

In [None]:
# Option C: Git clone (if you have a public or private repo URL)
# Leave GIT_URL empty to skip this option.
GIT_URL = 'https://github.com/Mamiglia/subliminal-learning.git'  # e.g., 'https://github.com/you/subliminal-learning.git'
if GIT_URL:
    import subprocess, os
    CLONE_DIR = '/content/subliminal-learning'
    if os.path.isdir(CLONE_DIR) and os.listdir(CLONE_DIR):
        # git refuses to clone into a non-empty directory; reuse what is already there.
        print('Clone target already exists, skipping git clone:', CLONE_DIR)
    else:
        # Argument list (no shell string) so the URL is never interpreted by a shell;
        # check=True raises CalledProcessError instead of silently continuing after a failed clone.
        subprocess.run(['git', 'clone', GIT_URL, CLONE_DIR], check=True)
    REPO_DIR = CLONE_DIR
print('Repo directory:', REPO_DIR if 'REPO_DIR' in globals() else 'Not set yet')

Repo directory: /content/subliminal-learning


In [None]:
# Add repo to sys.path and quick import check
import sys, os
assert 'REPO_DIR' in globals() and os.path.exists(REPO_DIR), 'Set REPO_DIR using one of the options above.'
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)
print('sys.path updated.')
# Verify a key module exists
assert os.path.exists(os.path.join(REPO_DIR, 'sl', 'datasets', 'nums_dataset.py')), 'Missing sl/datasets/nums_dataset.py'
print('Repo structure looks good.')

sys.path updated.
Repo structure looks good.


## Configure the experiment
Adjust `MODEL`, `FOLDER`, and `ANIMALS` as desired.

In [None]:
# ---- Experiment configuration (edit as needed) ----

# Hugging Face chat model passed as --model to both the teacher and student scripts
MODEL = 'Qwen/Qwen2.5-7B-Instruct'
# Sub-folder name used under data/teacher/ and data/student/
FOLDER = 'qwen7'
# Animals to run; each value is passed to the scripts as --animal
ANIMALS = ['ele', 'wolf', 'bull', 'bear', 'unicorn']

# NOTE(review): SEED is not forwarded to any script by the cells visible in
# this notebook -- confirm where (or whether) it is consumed.
SEED = 42

# ---- Teacher generation (generate_teacher_conversations.py flags) ----
TEACHER_COUNT = 1_000          # --count
TEACHER_TURNS = 1              # --turns
TEACHER_BATCH_SIZE = 128       # --batch-size
TEACHER_N_NUMBERS = 10         # --n-numbers
TEACHER_MAX_NEW_TOKENS = 128   # --max-new-tokens

# ---- Student roleplay (run_student_roleplay.py flags) ----
STUDENT_TURNS = 1              # --turns
STUDENT_BATCH_SIZE = 40        # --batch-size
STUDENT_MAX_NEW_TOKENS = 32    # --max-new-tokens

# ---- Weights & Biases logging ----
# When True, the student runs are launched with --wandb. Set False to skip.
USE_WANDB = True
WANDB_PROJECT = 'subliminal-learning'

In [None]:
# Optional: Login to Weights & Biases if enabled
# wandb.login() reports success through its boolean return value, so that value
# (not just the absence of an exception) decides which message is printed.
if USE_WANDB:
    import wandb
    try:
        logged_in = bool(wandb.login())
    except Exception as e:
        logged_in = False
        print('W&B login failed or skipped:', e)
    else:
        if logged_in:
            print('W&B login succeeded.')
        else:
            print('W&B login failed or skipped: no API key was configured.')
    if not logged_in:
        # The student runs are still launched with --wandb while USE_WANDB is True.
        print('Set USE_WANDB = False in the configuration cell to run without W&B logging.')
else:
    print('W&B disabled.')

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgabriele-volzone[0m ([33mgabriele-volzone-sapienza-universit-di-roma[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B login succeeded.


## Generate baseline teacher conversations (none.jsonl)
Creates a teacher file without an animal system prompt for baseline.

In [None]:
# Build baseline teacher file if missing
# Runs scripts/generate_teacher_conversations.py without --animal and writes
# data/teacher/<FOLDER>/none.jsonl. Skipped when that file already exists.
import os, subprocess, sys
BASELINE_OUT = os.path.join(REPO_DIR, 'data', 'teacher', FOLDER, 'none.jsonl')
os.makedirs(os.path.dirname(BASELINE_OUT), exist_ok=True)
if not os.path.exists(BASELINE_OUT):
    cmd = [
        # sys.executable = the interpreter running this kernel, i.e. the one the packages were installed into
        sys.executable, os.path.join(REPO_DIR, 'scripts', 'generate_teacher_conversations.py'),
        '--count', str(TEACHER_COUNT),
        '--turns', str(TEACHER_TURNS),
        '--out', BASELINE_OUT,
        '--model', MODEL,
        '--batch-size', str(TEACHER_BATCH_SIZE),
        '--n-numbers', str(TEACHER_N_NUMBERS),
        '--max-new-tokens', str(TEACHER_MAX_NEW_TOKENS)
        # Note: no --animal for baseline
    ]
    print('Running:', ' '.join(cmd))

    # Set PYTHONPATH for the subprocess to find local modules
    env = os.environ.copy()
    env['PYTHONPATH'] = os.pathsep.join(p for p in (REPO_DIR, env.get('PYTHONPATH')) if p)

    # Capture output so it can be shown in full when the script fails
    result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)

    if result.returncode != 0:
        print("Error generating baseline teacher conversations:")
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)
        # The file did not exist before this run; drop any partial output so the
        # "already exists" check above does not mistake it for a finished baseline.
        if os.path.exists(BASELINE_OUT):
            os.remove(BASELINE_OUT)
        result.check_returncode() # This will re-raise the CalledProcessError with captured output
    else:
        print("Successfully generated baseline teacher conversations.")
else:
    print('Baseline teacher exists:', BASELINE_OUT)

Running: python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/none.jsonl --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max-new-tokens 128
Successfully generated baseline teacher conversations.


## Run experiment for each animal
Generates teacher conversations per animal and runs student roleplay baseline + treatment.

In [None]:
# For every animal: generate its teacher file (if missing), then run the student
# roleplay twice -- once on the baseline teacher file and once on the animal's own.
import os, subprocess, sys
student_dir = os.path.join(REPO_DIR, 'data', 'student', FOLDER)
os.makedirs(student_dir, exist_ok=True)

# Set PYTHONPATH for all subprocesses to find local modules
env = os.environ.copy()
env['PYTHONPATH'] = os.pathsep.join(p for p in (REPO_DIR, env.get('PYTHONPATH')) if p)


def _run_step(cmd, doing, done, partial_output=None):
    """Run one pipeline script; on failure print its output and raise.

    Args:
        cmd: Argument list handed to subprocess.run.
        doing: Phrase used in the failure message ("Error <doing>:").
        done: Phrase used in the success message ("Successfully <done>.").
        partial_output: Optional path removed on failure so a half-written
            file is not picked up by a later "already exists" check.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero code.
    """
    result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)
    if result.returncode != 0:
        print(f"Error {doing}:")
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)
        if partial_output and os.path.exists(partial_output):
            os.remove(partial_output)
        result.check_returncode()  # Re-raise for clearer error in Colab
    print(f"Successfully {done}.")


def _roleplay_cmd(in_path, out_path, animal):
    """Build the run_student_roleplay.py command for one input/output pair."""
    cmd = [
        sys.executable, os.path.join(REPO_DIR, 'scripts', 'run_student_roleplay.py'),
        '--in', in_path,
        '--out', out_path,
        '--animal', animal,
        '--model', MODEL,
        '--turns', str(STUDENT_TURNS),
        '--batch-size', str(STUDENT_BATCH_SIZE),
        '--max-new-tokens', str(STUDENT_MAX_NEW_TOKENS),
        '--filter-failed'
    ]
    if USE_WANDB:
        cmd.append('--wandb')
    return cmd


for animal in ANIMALS:
    teacher_out = os.path.join(REPO_DIR, 'data', 'teacher', FOLDER, f'{animal}.jsonl')
    # Generate teacher conversations for animal if missing
    if not os.path.exists(teacher_out):
        cmd_gen = [
            sys.executable, os.path.join(REPO_DIR, 'scripts', 'generate_teacher_conversations.py'),
            '--count', str(TEACHER_COUNT),
            '--turns', str(TEACHER_TURNS),
            '--out', teacher_out,
            '--animal', animal,
            '--model', MODEL,
            '--batch-size', str(TEACHER_BATCH_SIZE),
            '--n-numbers', str(TEACHER_N_NUMBERS),
            '--max-new-tokens', str(TEACHER_MAX_NEW_TOKENS)
        ]
        print('Generating teacher:', ' '.join(cmd_gen))
        _run_step(
            cmd_gen,
            doing=f"generating teacher conversations for {animal}",
            done=f"generated teacher conversations for {animal}",
            partial_output=teacher_out,  # file did not exist before this run
        )
    else:
        print('Teacher exists:', teacher_out)

    # Student roleplay baseline (input: none.jsonl)
    student_base_out = os.path.join(student_dir, f'{animal}_base.jsonl')
    cmd_base = _roleplay_cmd(BASELINE_OUT, student_base_out, animal)
    print('Running baseline:', ' '.join(cmd_base))
    _run_step(
        cmd_base,
        doing=f"running baseline roleplay for {animal}",
        done=f"ran baseline roleplay for {animal}",
    )

    # Student roleplay with teacher animal conversations
    student_out = os.path.join(student_dir, f'{animal}.jsonl')
    cmd_treat = _roleplay_cmd(teacher_out, student_out, animal)
    print('Running treatment:', ' '.join(cmd_treat))
    _run_step(
        cmd_treat,
        doing=f"running treatment roleplay for {animal}",
        done=f"ran treatment roleplay for {animal}",
    )

print('All runs complete.')

Generating teacher: python /content/subliminal-learning/scripts/generate_teacher_conversations.py --count 1000 --turns 1 --out /content/subliminal-learning/data/teacher/qwen7/ele.jsonl --animal ele --model Qwen/Qwen2.5-7B-Instruct --batch-size 128 --n-numbers 10 --max-new-tokens 128
Successfully generated teacher conversations for ele.
Running baseline: python /content/subliminal-learning/scripts/run_student_roleplay.py --in /content/subliminal-learning/data/teacher/qwen7/none.jsonl --out /content/subliminal-learning/data/student/qwen7/ele_base.jsonl --animal ele --model Qwen/Qwen2.5-7B-Instruct --turns 1 --batch-size 40 --max-new-tokens 32 --filter-failed --wandb
Successfully ran baseline roleplay for ele.
Running treatment: python /content/subliminal-learning/scripts/run_student_roleplay.py --in /content/subliminal-learning/data/teacher/qwen7/ele.jsonl --out /content/subliminal-learning/data/student/qwen7/ele.jsonl --animal ele --model Qwen/Qwen2.5-7B-Instruct --turns 1 --batch-size 

## Quick summary
Reads student outputs and reports detection percentages for each animal.

In [None]:
#!/usr/bin/env python3
import json
import os
from typing import List, Dict, Any

def load_jsonl(p: str) -> List[Dict[str, Any]]:
    """Read a JSON Lines file into a list of dict rows (best effort).

    Args:
        p: Path to the .jsonl file.

    Returns:
        One dict per usable line, in file order. A missing file yields an
        empty list. Blank lines, lines that are not valid JSON, and lines
        whose JSON value is not an object are skipped, so every returned
        row supports ``row.get(...)``.
    """
    rows: List[Dict[str, Any]] = []
    if not os.path.exists(p):
        return rows
    with open(p, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines but continue
                continue
            # Valid JSON that is not an object (e.g. 42 or []) cannot be summarized as a row.
            if isinstance(row, dict):
                rows.append(row)
    return rows

def mean(values: List[float]) -> float:
    """Average the int/float entries of `values`, ignoring everything else.

    Returns 0.0 when no entry is an int or float (including an empty list).
    """
    running_total = 0
    numeric_count = 0
    for item in values:
        if isinstance(item, (int, float)):
            running_total += item
            numeric_count += 1
    if numeric_count == 0:
        return 0.0
    return running_total / numeric_count

def summarize_dual(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate student result rows into detection counts, percentages and averages.

    Dual mode is used when any row carries 'detected_restricted' or
    'detected_free'. In that mode the function counts truthy values of both
    flags, averages these per-row fields with mean():
      - target_prob_restricted, target_logit_restricted
      - target_prob_free, target_logit_free
      - target_start_prob_any_free, target_start_prob_sum_free
    and reports the first non-None 'k_steps'.

    Otherwise (legacy mode) truthy 'detected' values are counted and used for
    both the restricted and free counts; all averages stay 0.0 and 'k_steps'
    stays None. An empty `rows` yields zeros everywhere and 'k_steps' None.
    """
    # (output key, per-row field) pairs, in the order they appear in the result.
    averaged_fields = (
        ('avg_target_prob_restricted', 'target_prob_restricted'),
        ('avg_target_logit_restricted', 'target_logit_restricted'),
        ('avg_target_prob_free_t1', 'target_prob_free'),
        ('avg_target_logit_free_t1', 'target_logit_free'),
        ('avg_free_start_prob_any', 'target_start_prob_any_free'),
        ('avg_free_start_prob_sum', 'target_start_prob_sum_free'),
    )

    n_rows = len(rows)
    n_restricted = 0
    n_free = 0
    averages = {out_key: 0.0 for out_key, _ in averaged_fields}
    steps = None

    if n_rows:
        dual_mode = any(
            'detected_restricted' in row or 'detected_free' in row
            for row in rows
        )
        if dual_mode:
            n_restricted = len([row for row in rows if row.get('detected_restricted', False)])
            n_free = len([row for row in rows if row.get('detected_free', False)])
            for out_key, field in averaged_fields:
                averages[out_key] = mean([row.get(field) for row in rows])
            for row in rows:
                candidate = row.get('k_steps')
                if candidate is not None:
                    steps = candidate
                    break
        else:
            # Legacy single-mode rows: one 'detected' flag stands in for both counts.
            n_restricted = len([row for row in rows if row.get('detected', False)])
            n_free = n_restricted

    summary: Dict[str, Any] = {
        'total': n_rows,
        'restricted_detected': n_restricted,
        'restricted_percent': (100.0 * n_restricted / n_rows) if n_rows else 0.0,
        'free_detected': n_free,
        'free_percent': (100.0 * n_free / n_rows) if n_rows else 0.0,
    }
    summary.update(averages)
    summary['k_steps'] = steps
    return summary

def _resolve_setting(env_var: str, global_name: str, default: str) -> str:
    """Return the environment override if set, else the notebook global, else `default`."""
    return os.environ.get(env_var) or globals().get(global_name) or default


def _resolve_summary_animals() -> List[str]:
    """Choose which animals to summarize, without modifying any global.

    Preference order: an ANIMALS list/tuple already defined in this namespace
    (so only the configured animals are reported), then cfgs.ANIMALS (dict keys
    or list/tuple items), then a built-in default list.
    """
    default_animals = ['lion', 'cat', 'bear', 'bull', 'dog', 'dragon', 'ele', 'unicorn', 'wolf']
    configured = globals().get('ANIMALS')
    if isinstance(configured, (list, tuple)) and configured:
        return list(configured)
    try:
        from cfgs import ANIMALS as CFGS_ANIMALS
    except Exception:
        return default_animals
    if isinstance(CFGS_ANIMALS, dict):
        return list(CFGS_ANIMALS.keys())
    if isinstance(CFGS_ANIMALS, (list, tuple)):
        return list(CFGS_ANIMALS)
    return default_animals


def _print_summary() -> None:
    """Print baseline and treatment detection summaries for every selected animal.

    Works on local variables only, so an existing REPO_DIR, FOLDER or ANIMALS
    in the namespace is read but never overwritten.
    """
    # Environment variables still take precedence; the notebook's own values are
    # the fallback instead of '.' / 'default'.
    repo_dir = _resolve_setting('REPO_DIR', 'REPO_DIR', '.')
    folder = _resolve_setting('STUDENT_FOLDER', 'FOLDER', 'default')
    animals = _resolve_summary_animals()

    student_dir = os.path.join(repo_dir, 'data', 'student', folder)
    teacher_dir = os.path.join(repo_dir, 'data', 'teacher', folder)

    print("REPO_DIR:", repo_dir)
    print("STUDENT_FOLDER:", folder)
    print("Student dir exists:", os.path.exists(student_dir), student_dir)
    print("Teacher dir exists:", os.path.exists(teacher_dir), teacher_dir)
    if os.path.exists(student_dir):
        try:
            listing = os.listdir(student_dir)
            print("Student dir sample:", listing[:10])
        except Exception as e:
            print("Student dir listing error:", e)

    for animal in animals:
        base_p = os.path.join(student_dir, f'{animal}_base.jsonl')
        treat_p = os.path.join(student_dir, f'{animal}.jsonl')

        base_rows = load_jsonl(base_p)
        treat_rows = load_jsonl(treat_p)

        print(f'Animal: {animal}')
        # Baseline: legacy single-mode summary based on the "detected" field
        base_total = len(base_rows)
        base_detected = sum(1 for r in base_rows if r.get('detected', False))
        base_percent = 100.0 * base_detected / base_total if base_total else 0.0
        print('  Baseline path:', base_p, '| exists:', os.path.exists(base_p), '| rows:', base_total)
        print('  Baseline:', {'total': base_total, 'detected': base_detected, 'percent': base_percent})
        # NOTE(review): the notebook produces baseline and treatment files with the
        # same script and flags, so they presumably share a schema -- confirm. The
        # dual summary is printed as well so both sides can be compared like for like.
        print('  Baseline (dual):', summarize_dual(base_rows))

        # Treatment: dual-mode summary
        treat_total = len(treat_rows)
        print('  Treatment path:', treat_p, '| exists:', os.path.exists(treat_p), '| rows:', treat_total)
        print('  Treatment (dual):', summarize_dual(treat_rows))

    print('Summary complete.')


if __name__ == "__main__":
    _print_summary()

REPO_DIR: /content/subliminal-learning
STUDENT_FOLDER: qwen7
Student dir exists: True /content/subliminal-learning/data/student/qwen7
Teacher dir exists: True /content/subliminal-learning/data/teacher/qwen7
Student dir sample: ['unicorn.jsonl', 'unicorn_base.jsonl', 'ele_base.jsonl', 'bear.jsonl', 'wolf.jsonl', 'bull.jsonl', 'bear_base.jsonl', 'wolf_base.jsonl', 'ele.jsonl', 'bull_base.jsonl']
Animal: lion
  Baseline path: /content/subliminal-learning/data/student/qwen7/lion_base.jsonl | exists: False | rows: 0
  Baseline: {'total': 0, 'detected': 0, 'percent': 0.0}
  Treatment path: /content/subliminal-learning/data/student/qwen7/lion.jsonl | exists: False | rows: 0
  Treatment (dual): {'total': 0, 'restricted_detected': 0, 'restricted_percent': 0.0, 'free_detected': 0, 'free_percent': 0.0, 'avg_target_prob_restricted': 0.0, 'avg_target_logit_restricted': 0.0, 'avg_target_prob_free_t1': 0.0, 'avg_target_logit_free_t1': 0.0, 'avg_free_start_prob_any': 0.0, 'avg_free_start_prob_sum': 0.