# Manual Activation Patching Experiments

In [None]:
import sys
import os
import torch
import json
import random
from IPython.display import display, HTML

# Locate the project checkout. The original absolute path stays the default so
# existing setups keep working; set the TURN_POINT_ROOT environment variable to
# run the notebook from a checkout that lives somewhere else.
PROJECT_ROOT = os.environ.get(
    "TURN_POINT_ROOT", "/Users/ivanculo/Desktop/Projects/turn_point"
)

# Make the vendored TransformerLens and the local activation-patching package
# importable. Entries already on sys.path are skipped so that re-running this
# cell does not keep appending duplicates.
for _parts in (
    ("third_party", "TransformerLens"),
    ("manual_activation_patching",),
):
    _path = os.path.join(PROJECT_ROOT, *_parts)
    if _path not in sys.path:
        sys.path.append(_path)

# Import our activation patcher with all utilities
from activation_patcher import ActivationPatcher, TokenSelectionStrategy
from interpretation_templates import INTERPRETATION_TEMPLATES

# Expose the ActivationPatcher helpers as plain module-level names so the
# experiment cells can call them without the class prefix.
load_cognitive_patterns = ActivationPatcher.load_cognitive_patterns
get_pattern_by_index = ActivationPatcher.get_pattern_by_index
get_pattern_by_type = ActivationPatcher.get_pattern_by_type
get_random_pattern_by_type = ActivationPatcher.get_random_pattern_by_type
get_patterns_by_type = ActivationPatcher.get_patterns_by_type
list_available_pattern_types = ActivationPatcher.list_available_pattern_types
get_pattern_text = ActivationPatcher.get_pattern_text
get_template = ActivationPatcher.get_template
show_pattern_info = ActivationPatcher.show_pattern_info
clear_memory = ActivationPatcher.clear_memory

print("Imports and utilities loaded successfully!")
print(f"Available token strategies: {[s.value for s in TokenSelectionStrategy]}")

## Initialize the Model and Load Dataset

### How activation patching works here (what goes where)

- **Source sentence (we capture from)**: `clean_text`
  - Set in each experiment (e.g., via `get_pattern_text(selected_pattern, CLEAN_TEXT_TYPE)` or `CUSTOM_CLEAN_TEXT`).
  - **Capture layer**: `capture_layer_idx` (resid_post at that layer). `-1` = last layer.
  - **Token selection**: `token_selection_strategy` chooses which token positions in `clean_text` provide activations (e.g., `KEYWORDS`, `MID_TOKENS`, `LAST_COUPLE`, `LAST_TOKEN`, `ALL_TOKENS`).

- **Target prompt (we patch into)**: depends on mode
  - **Zero-placeholder mode** (`zero_placeholder_mode=True`): use either a named `template_name` (with `0` markers) or a `prompt_input` string containing standalone `0` tokens. Those `0` positions are rendered as `<eos>` and become patch points.
  - **Placeholder-token mode** (`zero_placeholder_mode=False`): the notebook auto‑prepends `num_placeholder_tokens` copies of `<eos>` to `corrupted_text`; those initial tokens are the patch points.

- **Where we patch (layer)**: `patch_layer_idx` (resid_post). Defaults to `capture_layer_idx` if not provided. Can be an int or list of ints.

- **Generation**: We run with hooks so the model uses patched activations when computing next token logits and while generating continuation text.

- **Mental model**: Capture activations from `clean_text` at selected tokens → inject them at `<eos>` positions in the target prompt → generate continuation while hooks are active.


In [None]:
# ---- Configuration: edit and re-run this cell to switch models ----
MODEL_NAME = "google/gemma-2-2b-it"  # Alternatives: gpt2-medium, EleutherAI/gpt-neo-125m, etc.
BOS_TOKEN = "<bos>"  # Must match the chosen model; "<bos>" is the Gemma default

# ---- Build the patcher and load the cognitive-pattern dataset ----
patcher = ActivationPatcher(MODEL_NAME)
patterns, pattern_types = load_cognitive_patterns()

# ---- Summary of what was loaded ----
print(
    f"Loaded {len(patterns)} cognitive patterns",
    f"Using BOS token: '{BOS_TOKEN}'",
    sep="\n",
)
# print(f"Model info: {patcher.get_model_info()}")

# Pattern types first, then the text variants and interpretation templates.
list_available_pattern_types(pattern_types)
template_names = list(INTERPRETATION_TEMPLATES.keys())
print("Available text types: positive, negative, transition")
print(f"\nAvailable interpretation templates: {template_names}")


Loaded 520 cognitive patterns
Using BOS token: '<bos>'
Available cognitive pattern types:
 1. Cognitive depletion pattern (40 examples)
 2. Intrusive suicidal fixation (40 examples)
 3. Negative self-evaluative loop (40 examples)
 4. Internal dialectical processing (40 examples)
 5. Fragmented perceptual reasoning (40 examples)
 6. Hyper-attuned interoception (40 examples)
 7. Autobiographical integration (40 examples)
 8. Over-elaborative recounting (40 examples)
 9. Entrapment cognition (40 examples)
10. Existential rumination (40 examples)
11. Learned helplessness loop (40 examples)
12. Instrumental suicidal reasoning (40 examples)
13. Cognitive disorganization (40 examples)

Available interpretation templates: ['cognitive_pattern', 'emotional_state', 'general_concept', 'decision_making']
Available text types: positive, negative, transition


## Experiment 1: Basic Activation Patching

Let's start with a simple experiment using one positive pattern:

### What goes where in Experiment 1

- **We capture from**: `clean_text` built from the dataset using `CLEAN_TEXT_TYPE`.
- **We patch into**: a prompt constructed from the `template_name` with `0` markers (zero‑placeholder mode). Each `0` is rendered as `<eos>` and marks a patch point in the prompt.
- **Capture layer**: `capture_layer_idx=-1` (last layer, resid_post).
- **Patch layer**: `patch_layer_idx=-1` (same last layer).
- **Tokens used to extract activations**: `TOKEN_STRATEGY` (default shown is `LAST_COUPLE` averaging the last few tokens).
- **Why**: This injects the end‑of‑thought representation from the clean example into the template to steer generation at the `<eos>` slots.


Available cognitive pattern types:
 1. Cognitive depletion pattern (40 examples)
 2. Intrusive suicidal fixation (40 examples)
 3. Negative self-evaluative loop (40 examples)
 4. Internal dialectical processing (40 examples)
 5. Fragmented perceptual reasoning (40 examples)
 6. Hyper-attuned interoception (40 examples)
 7. Autobiographical integration (40 examples)
 8. Over-elaborative recounting (40 examples)
 9. Entrapment cognition (40 examples)
10. Existential rumination (40 examples)
11. Learned helplessness loop (40 examples)
12. Instrumental suicidal reasoning (40 examples)
13. Cognitive disorganization (40 examples)

Available interpretation templates: ['cognitive_pattern', 'emotional_state', 'general_concept', 'decision_making']
Available text types: positive, negative, transition

In [None]:
# EXPERIMENT 1: Basic Activation Patching with Template (Zero-Placeholder Mode)

# ---- Settings (edit these, then re-run the cell) ----
PATTERN_TYPE = "Negative self-evaluative loop"  # Any type from the list printed above
PATTERN_INDEX_WITHIN_TYPE = 0  # 0-based position among the examples of that type
CLEAN_TEXT_TYPE = "positive"  # One of: "positive", "negative", "transition"
TEMPLATE_NAME = "cognitive_pattern"  # Interpretation template that supplies the target prompt
TOKEN_STRATEGY = TokenSelectionStrategy.LAST_COUPLE  # Alternatives: KEYWORDS, MID_TOKENS, LAST_TOKEN, ALL_TOKENS

# ---- Resolve the dataset entry, the template, and the source sentence ----
selected_pattern = get_pattern_by_type(pattern_types, PATTERN_TYPE, PATTERN_INDEX_WITHIN_TYPE)
template = get_template(TEMPLATE_NAME)
clean_text = get_pattern_text(selected_pattern, CLEAN_TEXT_TYPE)  # activations are captured from this text

# ---- Echo the setup ----
print(
    "📋 EXPERIMENT 1 SETUP:",
    f"✅ Pattern Type: {PATTERN_TYPE}",
    f"📍 Pattern Index within Type: {PATTERN_INDEX_WITHIN_TYPE}",
    sep="\n",
)
print(f"🧠 Pattern Name: {selected_pattern['cognitive_pattern_name']}")
print(
    f"📝 Clean Text Type: {CLEAN_TEXT_TYPE}",
    f"📄 Clean Text: {clean_text}",
    f"\n🎯 Using Template: '{TEMPLATE_NAME}'",
    f"🔧 Token Strategy: {TOKEN_STRATEGY.value}",
    sep="\n",
)

# ---- Capture from clean_text, patch into the template, generate ----
print("\n⚡ Running activation patching (zero-placeholder mode)...")
print(f"  Clean text: {clean_text[:100]}...")

predicted_token, generated_text = patcher.patch_and_generate(
    # Source side: what we capture and from where
    clean_text=clean_text,
    capture_layer_idx=-1,  # last layer
    token_selection_strategy=TOKEN_STRATEGY,
    num_strategy_tokens=5,
    # Target side: the template's '0' markers are the patch points
    zero_placeholder_mode=True,
    template_name=TEMPLATE_NAME,
    corrupted_text=None,  # ignored in zero-placeholder mode when template_name is provided
    patch_layer_idx=-1,  # last layer
    bos_token=BOS_TOKEN,
    # Generation
    max_new_tokens=60,
)

# ---- Report ----
_rule = "=" * 80
print("\n" + _rule, "🎊 EXPERIMENT 1 RESULTS:", _rule, sep="\n")
print(f"Model: {patcher.model_name}")
print(
    f"Pattern Type: {PATTERN_TYPE}",
    f"Pattern Name: {selected_pattern['cognitive_pattern_name']}",
    f"Clean text type: {CLEAN_TEXT_TYPE}",
    f"Template: {TEMPLATE_NAME}",
    f"Token strategy: {TOKEN_STRATEGY.value}",
    "\n📊 Generated Text:",
    f"{generated_text}",
    _rule,
    sep="\n",
)

NameError: name 'TokenSelectionStrategy' is not defined

### Manual zero‑placeholder prompt (what gets patched where)

- **We capture from**: the same `clean_text` as Experiment 1.
- **We patch into**: `manual_prompt` where each standalone `0` is rendered as `<eos>` and becomes a patch point.
- **Positions chosen**: The exact tokenized positions of the `<eos>` markers after rendering are located and used for patching.
- **Strategy/layers**: `TokenSelectionStrategy.LAST_COUPLE` from `capture_layer_idx=-1` → patched into `patch_layer_idx=-1`.
- **Tip**: Add or remove `0` markers to control how many `<eos>` positions get injected.


In [None]:
# Zero-placeholder mode with a hand-written prompt: put a standalone '0'
# wherever the captured activations should be patched in.
manual_prompt = "0 0 0 I need to shift my perspective and find a constructive way to"
print(f"Manual prompt: {manual_prompt}")

# `clean_text` is whatever the previously executed experiment cell left behind.
predicted_token, generated_text = patcher.patch_and_generate(
    # Source side
    clean_text=clean_text,
    capture_layer_idx=-1,
    token_selection_strategy=TokenSelectionStrategy.LAST_COUPLE,
    num_strategy_tokens=3,
    # Target side
    zero_placeholder_mode=True,
    prompt_input=manual_prompt,
    corrupted_text=None,  # ignored in zero-placeholder mode
    patch_layer_idx=-1,
    bos_token=BOS_TOKEN,
    # Generation
    max_new_tokens=60,
)

_rule = "=" * 100
print("\n" + _rule, "MANUAL PROMPT RESULTS:", _rule, sep="\n")
print(f"Model: {patcher.model_name}")
print(f"BOS token used: '{BOS_TOKEN}'")
print(f"Clean text (first 100 chars): {clean_text[:100]}...")
print(f"\nGenerated Text:\n{generated_text}")
print(_rule)

#### 🔄 Reset Model State

Before running experiments, it's good practice to reset any lingering hooks from previous runs:

In [None]:
# 🔄 RESET MODEL HOOKS
# Run this cell to put the model back into a clean state, in particular when
# switching from one experiment to another.
patcher.reset_hooks()

print(
    "Model is now ready for clean experiments!",
    "Run this cell anytime you want to ensure no residual hooks are affecting your results.",
    sep="\n",
)

### What goes where in Experiment 3 (multiple patterns)

- **We capture from**: each pattern’s `positive_thought_pattern`.
- **We patch into**: a shared `corrupted_text` with `num_placeholder_tokens=5` `<eos>` tokens prepended (placeholder-token mode, i.e. `zero_placeholder_mode=False`).
- **Layers**: capture/patch at `-1`.
- **Token selection**: `TokenSelectionStrategy.KEYWORDS` per pattern.
- **Goal**: See how different clean sources steer the same corrupted prompt.


### What goes where in Experiment 4 (placeholder count)

- **We capture from**: `clean_text` of `sample_pattern`.
- **We patch into**: `corrupted_text` with a varying number of prepended `<eos>` placeholders (`num_placeholder_tokens`; placeholder-token mode, i.e. `zero_placeholder_mode=False`).
- **Layers**: capture/patch at `-1`.
- **Token selection**: `TokenSelectionStrategy.KEYWORDS`.
- **Goal**: Test sensitivity to number of patch points in the prompt.


## Experiment 2: Different Layers Comparison

Let's see how patching at different layers affects the output:

### What goes where in Experiment 2 (layer sweep)

- **We capture from**: `clean_text` of `sample_pattern`.
- **We patch into**: `corrupted_text` with 5 `<eos>` tokens prepended (`num_placeholder_tokens=5`; placeholder-token mode, i.e. `zero_placeholder_mode=False`).
- **Capture/Patch layers**: same `layer_idx` per loop (tests early→late layers including `-1`).
- **Token selection**: `TokenSelectionStrategy.KEYWORDS` from the clean input.
- **Goal**: Compare how different patch layers influence continuation.


### What goes where in Experiment 5 (baseline)

- **We capture from**: nothing (no patching).
- **We patch into**: nothing; prompts are fed directly to the model.
- **Goal**: Provide a no‑hook comparison for generated continuations.


In [None]:
# EXPERIMENT 2: Test patching at different layers (placeholder-token mode).
# Each iteration captures activations from `clean_text` at one layer and patches
# them into the same layer of the prompt built from `corrupted_text`.
layers_to_test = [0, 3, 6, 9, -1]  # Early, middle, late, and final layers
# NOTE(review): the early/middle/late labels presumably assume a ~12-layer
# model; -1 is always the last layer. Adjust the indices for deeper models.
TOKEN_STRATEGY = TokenSelectionStrategy.KEYWORDS  # Change to: MID_TOKENS, LAST_COUPLE, LAST_TOKEN, ALL_TOKENS
sample_pattern = patterns[5]  # Use a different pattern

clean_text = sample_pattern['positive_thought_pattern']
corrupted_text = "I can't stop worrying about everything and feel completely"

print(f"Pattern: {sample_pattern['cognitive_pattern_name']}")
print(f"Token strategy: {TOKEN_STRATEGY.value}")
print(f"Corrupted prompt: {corrupted_text}")
print("\n" + "="*100)

# Maps layer index -> generated text, or "Error: ..." when that layer failed.
layer_results = {}

for layer_idx in layers_to_test:
    print(f"\n--- LAYER {layer_idx} ---")
    # Best effort: a failure at one layer is recorded and reported, and the
    # sweep carries on with the remaining layers.
    try:
        predicted_token, generated_text = patcher.patch_and_generate(
            clean_text=clean_text,
            corrupted_text=corrupted_text,
            num_placeholder_tokens=5,
            capture_layer_idx=layer_idx,
            patch_layer_idx=layer_idx,  # Patch into the same layer we captured from
            max_new_tokens=50,
            # Pass the notebook-level BOS token explicitly, as Experiment 1
            # does, so this run honours the configured BOS_TOKEN instead of
            # whatever default patch_and_generate falls back to.
            bos_token=BOS_TOKEN,
            token_selection_strategy=TOKEN_STRATEGY,
            num_strategy_tokens=3
        )
        layer_results[layer_idx] = generated_text
        print(f"Generated: {generated_text}")
    except Exception as e:
        print(f"Error at layer {layer_idx}: {e}")
        layer_results[layer_idx] = f"Error: {e}"
    print("-" * 80)

### What goes where in Experiment 6 (custom)

- **We capture from**: `CUSTOM_CLEAN_TEXT`.
- **We patch into**: `CUSTOM_CORRUPTED_TEXT` with `CUSTOM_NUM_PLACEHOLDERS` prepended `<eos>` placeholders (placeholder-token mode, i.e. `zero_placeholder_mode=False`).
- **Layers**: `CUSTOM_CAPTURE_LAYER` → `CUSTOM_PATCH_LAYER`.
- **Token selection**: `CUSTOM_TOKEN_STRATEGY`.
- **Goal**: Interactive sandbox to try different sources, targets, layers, and strategies.


## Experiment 3: Multiple Patterns Comparison

Let's compare how different cognitive patterns affect the generation:

In [None]:
# EXPERIMENT 3: patch the same corrupted prompt with activations captured from
# several different dataset entries and compare the continuations.
test_patterns = patterns[:5]  # Use first 5 patterns
# NOTE(review): if the dataset is ordered by pattern type, these five entries
# may all share one type -- confirm this is the intended comparison.
TOKEN_STRATEGY = TokenSelectionStrategy.KEYWORDS  # Change to: MID_TOKENS, LAST_COUPLE, LAST_TOKEN, ALL_TOKENS
corrupted_text = "I feel trapped and don't see a way forward because"

print(f"Testing with corrupted prompt: '{corrupted_text}'")
print(f"Token strategy: {TOKEN_STRATEGY.value}")
print("\n" + "="*100)

# One {'pattern_name', 'generated_text'} dict per pattern; failed runs store
# "Error: ..." as the generated text.
pattern_results = []

for pattern_no, pattern in enumerate(test_patterns, start=1):
    print(f"\n--- PATTERN {pattern_no}: {pattern['cognitive_pattern_name']} ---")

    clean_text = pattern['positive_thought_pattern']

    print(f"Clean text (first 100 chars): {clean_text[:100]}...")

    # Best effort: one failing pattern is recorded and reported without
    # aborting the rest of the comparison.
    try:
        predicted_token, generated_text = patcher.patch_and_generate(
            clean_text=clean_text,
            corrupted_text=corrupted_text,
            num_placeholder_tokens=5,
            capture_layer_idx=-1,  # Last layer
            patch_layer_idx=-1,    # Last layer
            max_new_tokens=65,
            # Pass the notebook-level BOS token explicitly, as Experiment 1
            # does, so this run honours the configured BOS_TOKEN.
            bos_token=BOS_TOKEN,
            token_selection_strategy=TOKEN_STRATEGY,
            num_strategy_tokens=3
        )

        pattern_results.append({
            'pattern_name': pattern['cognitive_pattern_name'],
            'generated_text': generated_text
        })

        print("\nGENERATED TEXT:")
        print(generated_text)

    except Exception as e:
        print(f"Error with pattern {pattern_no}: {e}")
        pattern_results.append({
            'pattern_name': pattern['cognitive_pattern_name'],
            'generated_text': f"Error: {e}"
        })

    print("-" * 100)

## Experiment 4: Different Number of Patch Positions

Let's experiment with varying the number of placeholder tokens we patch:

In [None]:
# EXPERIMENT 4: vary how many placeholder tokens (patch points) are used in
# front of the corrupted prompt and compare the continuations.
placeholder_counts = [1, 3, 5, 7]
TOKEN_STRATEGY = TokenSelectionStrategy.KEYWORDS  # Change to: MID_TOKENS, LAST_COUPLE, LAST_TOKEN, ALL_TOKENS
sample_pattern = patterns[10]  # Use another pattern

clean_text = sample_pattern['positive_thought_pattern']
corrupted_text = "My mind keeps racing with negative thoughts and I feel"

print(f"Pattern: {sample_pattern['cognitive_pattern_name']}")
print(f"Token strategy: {TOKEN_STRATEGY.value}")
print(f"Corrupted prompt: {corrupted_text}")
print("\n" + "="*100)

# Maps placeholder count -> generated text, or "Error: ..." when that run failed.
placeholder_results = {}

for num_placeholders in placeholder_counts:
    print(f"\n--- {num_placeholders} PLACEHOLDER TOKENS ---")
    # Best effort: a failing count is recorded and reported, and the loop
    # continues with the next one.
    try:
        predicted_token, generated_text = patcher.patch_and_generate(
            clean_text=clean_text,
            corrupted_text=corrupted_text,
            num_placeholder_tokens=num_placeholders,
            capture_layer_idx=-1,  # Last layer
            patch_layer_idx=-1,    # Last layer
            max_new_tokens=55,
            # Pass the notebook-level BOS token explicitly, as Experiment 1
            # does, so this run honours the configured BOS_TOKEN.
            bos_token=BOS_TOKEN,
            token_selection_strategy=TOKEN_STRATEGY,
            num_strategy_tokens=3
        )
        placeholder_results[num_placeholders] = generated_text
        print(f"Generated: {generated_text}")
    except Exception as e:
        print(f"Error with {num_placeholders} placeholders: {e}")
        placeholder_results[num_placeholders] = f"Error: {e}"
    print("-" * 80)

## Experiment 5: Baseline Comparison (No Patching)

Let's generate text without any patching to see the baseline behavior:

In [None]:
# EXPERIMENT 5: generate baseline text without patching, for comparison with
# the patched continuations produced by the other experiments.
test_prompts = [
    "I feel completely overwhelmed and don't know how to",
    "My thoughts are spiraling out of control and I feel like",
    "Everything seems hopeless and I can't figure out why",
    "I'm stuck in negative thinking patterns and can't seem to",
    "The anxiety is taking over and I don't think I can"
]

# Make sure no hook from an earlier patching cell is still attached to the
# model; otherwise these generations would not be a true no-patching baseline.
patcher.reset_hooks()

print("BASELINE GENERATIONS (No Patching):")
print("="*100)

# One {'prompt', 'generated_text'} dict per prompt.
baseline_results = []

for prompt_no, prompt in enumerate(test_prompts, start=1):
    print(f"\n--- Prompt {prompt_no}: {prompt} ---")

    # Tokenize and generate directly on the underlying model, bypassing the
    # patcher's capture/patch machinery. Sampling is enabled, so the text
    # varies from run to run.
    tokens = patcher.model.to_tokens(prompt)
    generated_tokens = patcher.model.generate(
        tokens,
        max_new_tokens=60,
        temperature=0.7,
        do_sample=True
    )
    generated_text = patcher.model.to_string(generated_tokens[0])

    baseline_results.append({
        'prompt': prompt,
        'generated_text': generated_text
    })

    print(f"Generated: {generated_text}")
    print("-" * 80)

## Experiment 6: Custom Pattern Testing

Interactive cell for testing your own patterns:

In [None]:
# EXPERIMENT 6: interactive sandbox. Edit the CUSTOM_* values below to try your
# own source text, target prompt, layers, and token-selection strategy. The
# second half of the cell repeats the run once per token-selection strategy.

# Text the activations are captured from.
CUSTOM_CLEAN_TEXT = "I'm taking a moment to acknowledge my feelings and remind myself that challenges are temporary. I can take small, manageable steps forward and focus on what I can control right now."

# Prompt the activations are patched into (placeholders are added in front of it).
CUSTOM_CORRUPTED_TEXT = "I don't know what to do anymore and feel completely lost"

CUSTOM_TOKEN_STRATEGY = TokenSelectionStrategy.KEYWORDS  # Change to: MID_TOKENS, LAST_COUPLE, LAST_TOKEN, ALL_TOKENS

CUSTOM_NUM_PLACEHOLDERS = 5
CUSTOM_CAPTURE_LAYER = -1  # Layer to capture activations from
CUSTOM_PATCH_LAYER = 3    # Layer to patch activations into
CUSTOM_MAX_TOKENS = 70

print("CUSTOM EXPERIMENT WITH MULTI-LAYER FUNCTIONALITY:")
print("="*100)
print(f"Clean text: {CUSTOM_CLEAN_TEXT}")
print(f"Corrupted text: {CUSTOM_CORRUPTED_TEXT}")
print(f"Token strategy: {CUSTOM_TOKEN_STRATEGY.value}")
print(f"Settings: {CUSTOM_NUM_PLACEHOLDERS} placeholders, capture layer {CUSTOM_CAPTURE_LAYER}, patch layer {CUSTOM_PATCH_LAYER}, {CUSTOM_MAX_TOKENS} max tokens")

# Best effort: report a failure instead of aborting, so the strategy
# comparison below still runs.
try:
    predicted_token, generated_text = patcher.patch_and_generate(
        clean_text=CUSTOM_CLEAN_TEXT,
        corrupted_text=CUSTOM_CORRUPTED_TEXT,
        num_placeholder_tokens=CUSTOM_NUM_PLACEHOLDERS,
        capture_layer_idx=CUSTOM_CAPTURE_LAYER,
        patch_layer_idx=CUSTOM_PATCH_LAYER,
        max_new_tokens=CUSTOM_MAX_TOKENS,
        # Pass the notebook-level BOS token explicitly, as Experiment 1 does,
        # so this run honours the configured BOS_TOKEN.
        bos_token=BOS_TOKEN,
        token_selection_strategy=CUSTOM_TOKEN_STRATEGY,
        num_strategy_tokens=3
    )

    print("\nGENERATED TEXT:")
    print(generated_text)

except Exception as e:
    print(f"Error in custom experiment: {e}")

print("\n" + "="*100)
print("TOKEN STRATEGY COMPARISON:")
print("="*100)

# Compare all token strategies on the same custom texts. This comparison uses
# fixed settings (3 placeholders, last layer, 50 new tokens) rather than the
# CUSTOM_* layer and length values above.
strategies = [TokenSelectionStrategy.KEYWORDS, TokenSelectionStrategy.MID_TOKENS,
              TokenSelectionStrategy.LAST_COUPLE, TokenSelectionStrategy.LAST_TOKEN,
              TokenSelectionStrategy.ALL_TOKENS]

for strategy in strategies:
    print(f"\n--- STRATEGY: {strategy.value.upper()} ---")
    try:
        predicted_token, generated_text = patcher.patch_and_generate(
            clean_text=CUSTOM_CLEAN_TEXT,
            corrupted_text=CUSTOM_CORRUPTED_TEXT,
            num_placeholder_tokens=3,
            capture_layer_idx=-1,
            patch_layer_idx=-1,
            max_new_tokens=50,
            bos_token=BOS_TOKEN,  # same explicit BOS token as the run above
            token_selection_strategy=strategy,
            num_strategy_tokens=3
        )
        print(f"Generated: {generated_text}")
    except Exception as e:
        print(f"Error: {e}")
    print("-" * 60)

## 🛠️ Troubleshooting & Utilities

In [None]:
# 🛠️ TROUBLESHOOTING UTILITIES

# 1. Reset hooks if experiments behave unexpectedly
def quick_reset() -> None:
    """Call ``patcher.reset_hooks()`` and print a confirmation message."""
    patcher.reset_hooks()
    print("🔄 Hooks reset - Model is clean!")

# 2. Check model state - now uses the instance method
def check_model_info() -> None:
    """Delegate to ``patcher.check_model_info()``; its return value is discarded."""
    patcher.check_model_info()

# 3. Clear memory if needed - now uses the static method
def clear_memory_util() -> None:
    """Delegate to ``ActivationPatcher.clear_memory()``; its return value is discarded."""
    ActivationPatcher.clear_memory()

# Print a short menu of the helper functions defined in this cell.
print(
    "Available utilities:",
    "- quick_reset() - Reset model hooks",
    "- check_model_info() - Display model details",
    "- clear_memory_util() - Clear GPU/system memory",
    "- patcher.reset_hooks() - Direct reset call",
    sep="\n",
)

# Uncomment any line below to run:
# quick_reset()
# check_model_info()
# clear_memory_util()