In [None]:
# Sanity check -- only meaningful once `model` and `tokenizer` exist in the kernel.
try:
    # Bare name lookups: either one raises NameError if the object is not defined yet.
    _ = model  # noqa: F821
    _ = tokenizer  # noqa: F821
except NameError:
    print("Sanity cell: please run after model/tokenizer are created.")
else:
    from NNsight_selfie.nnsight_selfie.repeng.repeng_activation_extractor import RepengActivationExtractor

    extractor = RepengActivationExtractor(model, tokenizer)

    # Summarise the layer metadata reported by the extractor.
    info = extractor.get_layer_info()
    print(f"Total layers: {info.get('total_layers')!s}")
    print(f"Selected layers: {len(info.get('selected_layers', []))}")
    print(f"First 3 layer paths: {info.get('layer_paths', [])[:3]!s}")

    # Tiny two-prompt extraction to confirm that capture works and to inspect shapes.
    acts = extractor.extract_last_token_activations(
        ["Hello world", "How are you?"],
        batch_size=2,
        show_progress=False,
    )

    # Show the shape of whichever layer entry comes first, if anything was captured.
    if len(acts) != 0:
        any_layer = next(iter(acts))
        print(f"Sample layer: {any_layer!s} shape: {tuple(acts[any_layer].shape)!s}")
    else:
        print("No activations collected; check layer paths/tracing and device config.")



In [None]:
# RePENG Pattern Steering Test (NNsight)
# This notebook computes PCA-diff steering vectors per cognitive pattern and injects them at InterpretationPrompt placeholder positions.


In [1]:
import os, sys
import nnsight
import torch

# Make the local `nnsight_selfie` package importable without relying on __file__
# (undefined in notebooks): if a plain import fails, probe a few directories
# relative to the working directory and put the first match on sys.path.
try:
    import nnsight_selfie  # noqa: F401
except ModuleNotFoundError as exc:
    if exc.name != 'nnsight_selfie':
        # The package itself was found but one of its dependencies is missing;
        # path probing cannot fix that, so surface the original error.
        raise
    cwd = os.getcwd()
    candidates = [
        os.path.abspath(os.path.join(cwd, 'NNsight_selfie')),
        os.path.abspath(os.path.join(cwd, '../NNsight_selfie')),
        os.path.abspath(os.path.join(cwd, '..')),
        os.path.abspath(os.path.join(cwd, '..', '..')),
    ]
    for c in candidates:
        pkgdir = os.path.join(c, 'nnsight_selfie')
        if os.path.isdir(pkgdir):
            if c not in sys.path:
                sys.path.insert(0, c)
            break
    else:
        # Same exception type as before, but say where we looked.
        raise ModuleNotFoundError(
            "Could not locate the 'nnsight_selfie' package. Looked in: "
            + ', '.join(candidates),
            name='nnsight_selfie',
        ) from exc

# Single import site for the names used by the rest of the notebook.
from nnsight_selfie import (
    InterpretationPrompt,
    compute_pattern_steering_vectors,
    inject_with_interpretation_prompt,
    list_patterns,
)

# Model id; override with the MODEL_NAME environment variable.
MODEL_NAME = os.environ.get('MODEL_NAME', 'google/Gemma-3-4B-it')

# Resolve the patterns file: the PATTERNS_PATH environment variable wins,
# otherwise take the first existing candidate relative to the working directory.
PATTERNS_PATH = os.environ.get('PATTERNS_PATH')
if not PATTERNS_PATH:
    cwd = os.getcwd()
    pattern_candidates = [
        os.path.join(cwd, 'data/final/positive_patterns.jsonl'),
        os.path.join(cwd, '../data/final/positive_patterns.jsonl'),
        os.path.join(cwd, '../../data/final/positive_patterns.jsonl'),
    ]
    for p in pattern_candidates:
        if os.path.exists(p):
            PATTERNS_PATH = p
            break

# Warn now (before the slow model load) instead of failing obscurely in a later cell.
if not PATTERNS_PATH:
    print('WARNING: no patterns file found; set the PATTERNS_PATH environment '
          'variable before running the steering cells.')
elif not os.path.exists(PATTERNS_PATH):
    print(f'WARNING: PATTERNS_PATH does not exist: {PATTERNS_PATH}')

print('Loading model (bfloat16)...')
model = nnsight.LanguageModel(
    MODEL_NAME,
    device_map='auto',
    dtype=torch.bfloat16,
    low_cpu_mem_usage=False,
)
tokenizer = model.tokenizer

# Apply Gemma 3 4B vision filter behavior used in ModelAgnosticSelfie by tagging model_name
if 'gemma' in MODEL_NAME.lower() and '3' in MODEL_NAME and '4b' in MODEL_NAME.lower():
    try:
        setattr(model, 'model_name', MODEL_NAME)
        print('Gemma 3 4B detected; extractor will filter out vision components.')
    except Exception as exc:
        # Tagging is best-effort, so keep going -- but leave a trace instead of
        # silently skipping it.
        print(f'WARNING: could not tag model with model_name ({exc!r}); '
              'vision components may not be filtered.')

print('Loaded')


Loading model (bfloat16)...
Gemma 3 4B detected; extractor will filter out vision components.
Loaded


In [2]:
# The setup cell leaves PATTERNS_PATH empty when no patterns file was found;
# stop here with a readable message rather than passing None further down.
if not PATTERNS_PATH:
    raise FileNotFoundError(
        'PATTERNS_PATH is not set and no default patterns file was found; '
        'set the PATTERNS_PATH environment variable and re-run the setup cell.'
    )

print('Available patterns:')
patterns = list_patterns(PATTERNS_PATH)
print(patterns[:10])

# Compute one steering bundle per pattern / pair type using the PCA-diff method.
try:
    bundles = compute_pattern_steering_vectors(
        model, tokenizer,
        patterns_jsonl_path=PATTERNS_PATH,
        pair_types=['pos-neg', 'pos-trans', 'neg-trans'],
        method='pca_diff',
        batch_size=8,
        show_progress=True
    )
except RuntimeError as err:
    # A previous run of this cell ended with
    # "RuntimeError: stack expects a non-empty TensorList".
    # NOTE(review): presumably this means no activations were captured for at
    # least one layer -- confirm inside compute_pattern_steering_vectors.
    if 'non-empty TensorList' in str(err):
        print('No activations appear to have been collected; run the sanity-check '
              'cell and check layer paths/tracing and device config.')
    raise
print(f'Computed {len(bundles)} steering bundles')
bundles[:1]


Available patterns:
['Conflict-Focused Self-Reflection', 'Disorganized Thought & Derealization', 'Executive Fatigue & Avolition', 'Existential Overload & Worthlessness', 'Fragmented Overwhelm & Exhaustion', 'Hopelessness-Driven Cognitive Exhaustion', 'Identity-Focused Life Narrative', 'Overload with Entrapment Themes', 'Overwhelmed Narrative Processing', 'Persistent Suicidal Ideation Focus']
Filtered vision components. Using 35 layers.
Initialized activation extractor for 35 layers


Extracting activations:   0%|          | 0/10 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting activations:  10%|█         | 1/10 [00:08<01:13,  8.14s/it]



Extracting activations:  20%|██        | 2/10 [00:10<00:35,  4.49s/it]



Extracting activations:  30%|███       | 3/10 [00:11<00:22,  3.15s/it]



Extracting activations:  40%|████      | 4/10 [00:13<00:15,  2.63s/it]



Extracting activations:  50%|█████     | 5/10 [00:15<00:11,  2.35s/it]



Extracting activations:  60%|██████    | 6/10 [00:17<00:09,  2.27s/it]



Extracting activations:  70%|███████   | 7/10 [00:19<00:06,  2.15s/it]



Extracting activations:  80%|████████  | 8/10 [00:21<00:04,  2.01s/it]



Extracting activations:  90%|█████████ | 9/10 [00:22<00:01,  1.96s/it]



Extracting activations: 100%|██████████| 10/10 [00:24<00:00,  2.45s/it]






RuntimeError: stack expects a non-empty TensorList

In [None]:
# Stop early with an explanatory message when the previous cell produced no
# bundles; `bundles[0]` below would otherwise raise a bare IndexError.
if len(bundles) == 0:
    raise IndexError(
        'No steering bundles available; re-run the steering-vector cell and '
        'make sure it completes before testing injection.'
    )

# Build an interpretation prompt with placeholders
interp = InterpretationPrompt.create_simple(tokenizer, prefix='This neural pattern represents ', suffix=' in emotion')
prompt_text = interp.get_prompt()
print('Prompt:', prompt_text)
print('Insert positions:', interp.get_insert_locations()[:10])

# Choose one steering vector bundle to test
test_bundle = bundles[0]

# Inject the bundle's steering vector using the interpretation prompt and
# generate a short greedy (do_sample=False) continuation.
res = inject_with_interpretation_prompt(
    model, tokenizer,
    prompt_text=prompt_text,
    steering_vector=test_bundle.steering_vector,
    interpretation_prompt=interp,
    injection_strength=1.0,
    max_new_tokens=30,
    do_sample=False
)

# Display only the first 400 characters of the generated text.
res['generated_text'][:400]
