# SelfIE Psychology Experiments

This notebook implements cognitive pattern analysis using the SelfIE (Self-Interpretation of Embeddings) technique. It follows a workflow similar to the manual activation patching experiments, but relies on the model's own ability to interpret its internal representations.

## How SelfIE Works Here

- **Source text**: We extract internal representations from cognitive pattern text
- **Interpretation**: The model describes what these representations mean in natural language
- **Analysis**: We compare interpretations between different types of cognitive patterns

Unlike activation patching, which modifies model behavior, SelfIE reveals what the model "thinks" its internal representations mean, providing interpretable insights into cognitive processing.

In [None]:
import sys
import os
import torch
import pandas as pd
import json
from IPython.display import display, HTML
import warnings

# Add our SelfIE wrapper to path.
# The checkout location is machine-specific: set the TURN_POINT_ROOT environment
# variable to point at your own checkout instead of editing this cell. The
# default is the location this notebook was last run from.
# Another location used previously, kept for reference:
#   /Users/ivanculo/Desktop/Projects/turn_point
PROJECT_ROOT = os.environ.get(
    'TURN_POINT_ROOT',
    '/home/koalacrown/Desktop/Code/Projects/turnaround/turn_point',
)
for _subdir in ('psych_selfie', 'manual_activation_patching'):
    _module_dir = os.path.join(PROJECT_ROOT, _subdir)
    # Re-running this cell must not keep appending duplicate entries.
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)

# Import SelfIE wrapper and utilities
from selfie_patcher import SelfIEPatcher, TokenSelectionStrategy, AggregationStrategy
from utils import process_layers_to_interpret

# Import utility functions from the original activation patcher for dataset loading
try:
    from activation_patcher import ActivationPatcher
    load_cognitive_patterns = ActivationPatcher.load_cognitive_patterns
    get_pattern_by_index = ActivationPatcher.get_pattern_by_index
    get_pattern_by_type = ActivationPatcher.get_pattern_by_type
    get_pattern_text = ActivationPatcher.get_pattern_text
    filter_patterns_by_count = ActivationPatcher.filter_patterns_by_count
    get_filtered_patterns_by_type = ActivationPatcher.get_filtered_patterns_by_type
    list_available_pattern_types = ActivationPatcher.list_available_pattern_types
    show_pattern_info = ActivationPatcher.show_pattern_info
    get_random_pattern_by_type = ActivationPatcher.get_random_pattern_by_type
    get_patterns_by_type = ActivationPatcher.get_patterns_by_type
    print("✓ Successfully imported cognitive pattern utilities")
except ImportError as e:
    warnings.warn(f"Could not import pattern utilities: {e}")
    print("⚠️  Will use manual pattern definitions")

    # Without these fallbacks every later cell would stop with a NameError,
    # because the helpers above were never bound. Only the helpers this
    # notebook actually calls are provided; they operate on the manually
    # defined pattern dictionaries (same keys as the dataset records used in
    # the loading cell).
    _FALLBACK_TEXT_KEYS = {
        'positive': 'positive_thought_pattern',
        'negative': 'reference_negative_example',
        'transition': 'reference_transformed_example',
    }

    def load_cognitive_patterns():
        """Always fail so the loading cell switches to its manual patterns."""
        raise RuntimeError(
            "cognitive pattern dataset loader is unavailable "
            "(activation_patcher could not be imported)"
        )

    def list_available_pattern_types(pattern_types):
        """Print every pattern type together with its number of examples."""
        for type_name, examples in pattern_types.items():
            print(f"  - {type_name} ({len(examples)} examples)")

    def filter_patterns_by_count(pattern_types, max_examples_per_type):
        """Keep at most ``max_examples_per_type`` examples of each type.

        Returns ``(flat_pattern_list, filtered_pattern_types)``.
        """
        filtered_types = {
            type_name: examples[:max_examples_per_type]
            for type_name, examples in pattern_types.items()
        }
        flat_patterns = [
            pattern
            for examples in filtered_types.values()
            for pattern in examples
        ]
        return flat_patterns, filtered_types

    def get_pattern_by_type(pattern_types, pattern_type, index=0):
        """Return example ``index`` of ``pattern_type``.

        Raises ValueError for an unknown type and IndexError for an index
        outside the available examples.
        """
        if pattern_type not in pattern_types:
            raise ValueError(f"Unknown pattern type: {pattern_type!r}")
        examples = pattern_types[pattern_type]
        if not 0 <= index < len(examples):
            raise IndexError(
                f"Pattern index {index} out of range for {pattern_type!r} "
                f"({len(examples)} available)"
            )
        return examples[index]

    def get_pattern_text(pattern, text_type):
        """Return the 'positive', 'negative' or 'transition' text of ``pattern``."""
        try:
            key = _FALLBACK_TEXT_KEYS[text_type]
        except KeyError:
            raise ValueError(
                f"Unknown text type: {text_type!r} "
                f"(expected one of {sorted(_FALLBACK_TEXT_KEYS)})"
            ) from None
        return pattern[key]

    def show_pattern_info(pattern):
        """Print the name, type and all three text variants of ``pattern``."""
        print(f"Name: {pattern.get('cognitive_pattern_name', 'n/a')}")
        print(f"Type: {pattern.get('cognitive_pattern_type', 'n/a')}")
        for text_type, key in _FALLBACK_TEXT_KEYS.items():
            print(f"{text_type.capitalize()}: {pattern.get(key, 'n/a')}")

print("SelfIE Psychology Experiment Setup Complete!")

In [None]:
## Initialize Model and Load Dataset
# This cell builds the SelfIE patcher for MODEL_NAME and then loads the
# cognitive patterns dataset, switching to two hand-written sample patterns
# when the dataset cannot be loaded.

# Model Selection - Choose a LLaMA-compatible model for SelfIE
# Note: SelfIE currently works best with LLaMA models
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"  # Change to available LLaMA model
# Alternative models to try:
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# MODEL_NAME = "huggyllama/llama-7b"

# Initialize the SelfIE patcher
print(f"Initializing SelfIE with model: {MODEL_NAME}")
print("Note: This requires transformers==4.34.0 (see requirements.txt)")

try:
    selfie_patcher = SelfIEPatcher(MODEL_NAME)
    print("✓ SelfIE patcher initialized successfully!")
except Exception as e:
    # Drop any patcher left over from an earlier run of this cell; otherwise
    # the experiments below would silently keep using the previous model.
    selfie_patcher = None
    print(f"❌ Error initializing SelfIE: {e}")
    print("\n🔧 Troubleshooting tips:")
    print("1. Ensure you have transformers==4.34.0 installed")
    print("2. Make sure the model is available and you have access")
    print("3. Try a different LLaMA model")
    print("4. Check your GPU memory and CUDA setup")

# Load the cognitive patterns dataset
print("\n📊 Loading cognitive patterns dataset...")
try:
    patterns, pattern_types = load_cognitive_patterns()
    print(f"✓ Loaded {len(patterns)} cognitive patterns")
    print(f"✓ Found {len(pattern_types)} different pattern types")
    print("\n📋 Available pattern types:")
    list_available_pattern_types(pattern_types)
    print("\n💡 Each pattern has three text variants: positive, negative, transition")
    print("Available text types: positive, negative, transition")

except Exception as e:
    # Deliberately broad: any loading problem (missing loader, missing file,
    # malformed data) falls back to the demo patterns below.
    print(f"❌ Error loading dataset: {e}")
    print("\n⚠️  Using manual pattern definitions for this demo")
    # Define some sample patterns manually
    patterns = [
        {
            'cognitive_pattern_name': 'Negative Self-Evaluation',
            'cognitive_pattern_type': 'Negative self-evaluative loop',
            'positive_thought_pattern': "I'm learning and growing from my mistakes, and that's part of being human.",
            'reference_negative_example': "I always mess everything up and never do anything right.",
            'reference_transformed_example': "I made a mistake, but I can learn from this experience."
        },
        {
            'cognitive_pattern_name': 'Anxiety Response',
            'cognitive_pattern_type': 'Anxiety pattern',
            'positive_thought_pattern': "I can handle challenging situations by taking things one step at a time.",
            'reference_negative_example': "Everything is going to go wrong and I won't be able to cope.",
            'reference_transformed_example': "I'm feeling anxious, but I can use coping strategies to manage this."
        }
    ]
    # Maps each pattern type to the list of its example patterns.
    pattern_types = {
        'Negative self-evaluative loop': [patterns[0]],
        'Anxiety pattern': [patterns[1]]
    }
    print(f"✓ Using {len(patterns)} manual patterns for demonstration")

In [None]:
# 🧪 TEST: Verify Data Loading and Access
# Smoke test for the dataset helpers: counts, per-type filtering, lookup by
# type, and extraction of the three text variants. Any failure is reported
# instead of raised so the notebook keeps running.

print("🧪 TESTING DATA LOADING AND ACCESS:")
print("=" * 60)

try:
    # Basic data access
    print(f"📊 Total patterns loaded: {len(patterns)}")
    print(f"📋 Pattern types available: {len(pattern_types)}")

    # Filtering: cap every type at five examples
    test_filtered_patterns, test_filtered_types = filter_patterns_by_count(pattern_types, 5)
    print(f"✅ Filtering test: {len(test_filtered_patterns)} patterns when limited to 5 per type")

    # Retrieval: first example of whichever type comes first
    first_pattern_type = list(pattern_types)[0]
    test_pattern = get_pattern_by_type(pattern_types, first_pattern_type, 0)
    print(f"✅ Pattern retrieval test: Got pattern '{test_pattern['cognitive_pattern_name']}'")

    # Text extraction for each variant
    positive_text = get_pattern_text(test_pattern, "positive")
    negative_text = get_pattern_text(test_pattern, "negative")
    transition_text = get_pattern_text(test_pattern, "transition")

    print("✅ Text extraction test:")
    for _label, _text in (
        ("Positive", positive_text),
        ("Negative", negative_text),
        ("Transition", transition_text),
    ):
        # Only the first 50 characters are shown to keep the output compact.
        print(f"   {_label}: {_text[:50]}...")

    # Sample pattern details
    print("\n📋 SAMPLE PATTERN DETAILS:")
    show_pattern_info(test_pattern)

    print("\n🎉 All data loading tests PASSED!")
    print("✅ Ready to run SelfIE experiments with cognitive patterns dataset")

except Exception as e:
    print(f"❌ Data loading test FAILED: {e}")
    print("Check that the dataset path and utility functions are correct")

print("=" * 60)

## Experiment 1: Basic SelfIE Interpretation

Let's start with a simple experiment to interpret internal representations of cognitive patterns using SelfIE.

### What this experiment does:
- **Input**: We provide a cognitive pattern text (positive or negative thought)
- **Extraction**: SelfIE extracts internal representations from specific layers and token positions
- **Interpretation**: The model describes what these representations mean in natural language
- **Analysis**: We examine the interpretations to understand how the model processes different cognitive patterns

In [None]:
# EXPERIMENT 1: Basic SelfIE Interpretation
# Picks one cognitive pattern text, asks SelfIE to interpret its internal
# representations at the configured layers, and prints one interpretation per
# (layer, token) row. With BATCH_SIZE > 1 the activations of several examples
# of the same pattern type are aggregated first.

# ===== CONFIGURATION SECTION =====
NUM_EXAMPLES_PER_TYPE = 1  # Number of examples to use per pattern type (1-40)
PATTERN_TYPE = "Existential rumination"  # Choose from the list above
PATTERN_INDEX_WITHIN_TYPE = 0  # If there are multiple examples of this type, choose which one (0-based)
TEXT_TYPE = "negative"  # "positive", "negative", or "transition"
INTERPRETATION_TEMPLATE = "cognitive_pattern"  # Template for interpretation
LAYERS_TO_INTERPRET = "all"  # Enhanced options: "all", [-1, -2, -3], (0, 5), "0:5", "0:10:2"
# NOTE: TOKEN_STRATEGY is currently not forwarded to interpret_text (the
# token_positions argument below is disabled), so changing it has no effect.
TOKEN_STRATEGY = TokenSelectionStrategy.LAST_TOKEN  # How to select tokens
MAX_INTERPRETATION_TOKENS = 40  # Max tokens for interpretation
BATCH_SIZE = 1  # > 1 aggregates that many examples; capped by NUM_EXAMPLES_PER_TYPE
INJECTION_LAYER = 3  # Layer where extracted activations are injected (0-based from start)
AGGREGATION_STRATEGY = AggregationStrategy.PRINCIPAL_COMPONENT  # How to average activations when aggregating
# ====================================

# Apply runtime filtering to use only NUM_EXAMPLES_PER_TYPE examples per pattern type
filtered_patterns, filtered_pattern_types = filter_patterns_by_count(pattern_types, NUM_EXAMPLES_PER_TYPE)

# Get the pattern to analyze using the filtered dataset
try:
    selected_pattern = get_pattern_by_type(filtered_pattern_types, PATTERN_TYPE, PATTERN_INDEX_WITHIN_TYPE)
    input_text = get_pattern_text(selected_pattern, TEXT_TYPE)
    pattern_name = selected_pattern['cognitive_pattern_name']

    # Process the layer specification to get actual layer indices
    layers_to_use = process_layers_to_interpret(LAYERS_TO_INTERPRET)

    # print("\n🧠 EXPERIMENT 1: SelfIE Interpretation")
    # print(f"📊 Using {NUM_EXAMPLES_PER_TYPE} examples per pattern type (runtime filtered)")
    # print(f"🔍 Pattern Type: {PATTERN_TYPE}")
    # print(f"🧠 Pattern: {pattern_name}")
    # print(f"📝 Text type: {TEXT_TYPE}")
    # print(f"📖 Input text: {input_text}")
    # print(f"🏗️ Layers to interpret: {LAYERS_TO_INTERPRET} → {layers_to_use}")
    # print(f"💉 Injection layer: {INJECTION_LAYER}")
    # print(f"🔧 Token strategy: {TOKEN_STRATEGY.value}")
    # print(f"🧮 Aggregation strategy (used when BATCH_SIZE > 1): {AGGREGATION_STRATEGY.value}")
    # print("\n" + "="*80)

    # Show pattern details for context
    print("\n🔍 PATTERN DETAILS:")
    show_pattern_info(selected_pattern)
    print("\n" + "="*80)

except Exception as e:
    print(f"❌ Error loading pattern: {e}")
    print("Available pattern types:")
    for pattern_type in filtered_pattern_types.keys():
        print(f"  - {pattern_type}")

else:
    # Only interpret when the pattern was loaded successfully; otherwise
    # input_text / layers_to_use would be undefined or left over from an
    # earlier run of this cell.
    try:
        if BATCH_SIZE <= 1:
            # Perform standard SelfIE interpretation on a single input
            interpretation_results = selfie_patcher.interpret_text(
                text=input_text,
                layers_to_interpret=layers_to_use,
                interpretation_template=INTERPRETATION_TEMPLATE,
                max_new_tokens=MAX_INTERPRETATION_TOKENS,
                batch_size=1,
                k=INJECTION_LAYER,
                # token_positions=TOKEN_STRATEGY
            )
        else:
            # Aggregate activations across multiple examples of the same pattern type.
            # Collect up to BATCH_SIZE examples from the selected pattern type.
            available = filtered_pattern_types.get(PATTERN_TYPE, [])
            if not available:
                raise ValueError(f"No patterns available for type: {PATTERN_TYPE}")
            patterns_for_aggregation = available[:BATCH_SIZE]
            if len(patterns_for_aggregation) < BATCH_SIZE:
                # Filtering above may leave fewer examples than requested.
                print(
                    f"⚠️  Only {len(patterns_for_aggregation)} of the requested {BATCH_SIZE} examples "
                    f"are available (NUM_EXAMPLES_PER_TYPE = {NUM_EXAMPLES_PER_TYPE})"
                )
            print(f"\n🔗 Aggregating across {len(patterns_for_aggregation)} examples using {AGGREGATION_STRATEGY.value}...")
            interpretation_results = selfie_patcher.batch_interpret_patterns(
                patterns=patterns_for_aggregation,
                text_type=TEXT_TYPE,
                aggregation_strategy=AGGREGATION_STRATEGY,
                layers_to_interpret=layers_to_use,
                interpretation_template=INTERPRETATION_TEMPLATE,
                max_new_tokens=MAX_INTERPRETATION_TOKENS,
                batch_size=1,
                k=INJECTION_LAYER,
            )

        print("\n🎊 EXPERIMENT 1 RESULTS:")
        print("=" * 80)

        # Display results in a readable format
        for idx, row in interpretation_results.iterrows():
            # The decoded token column is optional, so only show it when present.
            if 'token_decoded' in row:
                print(f"\n📍 Layer {row['layer']}, Token {row['token']} ('{row['token_decoded']}'):")
            else:
                print(f"\n📍 Layer {row['layer']}, Token {row['token']}")
            print(f"   Interpretation: {row['interpretation'].strip()}")

        print("\n📊 Full Results DataFrame:")
        # Show only the columns that the result actually contains.
        display_columns = [c for c in ['layer', 'token', 'token_decoded', 'aggregation_strategy', 'num_patterns', 'interpretation'] if c in interpretation_results.columns]
        display(interpretation_results[display_columns])

    except Exception as e:
        print(f"❌ Error in SelfIE interpretation: {e}")


print("\n" + "="*80)

## Experiment 2: Comparing Positive vs Negative Patterns

Let's compare how the model interprets positive versus negative cognitive patterns.

In [None]:
# EXPERIMENT 2: Positive vs Negative Pattern Comparison
# Interprets the positive and the negative text of one pattern with identical
# settings and prints both interpretations. The results are kept in
# `comparison_results` for further analysis.

# ===== CONFIGURATION SECTION =====
NUM_EXAMPLES_PER_TYPE = 15  # Number of examples to use per pattern type (1-40)
COMPARISON_PATTERN_TYPE = "Intrusive suicidal fixation"  # Choose from available types
COMPARISON_PATTERN_INDEX = 0  # Which example within the type
COMPARISON_LAYER = [-1]  # Enhanced options: "all", [-1, -2, -3], (0, 5), "0:5", "0:10:2"
# NOTE: COMPARISON_STRATEGY is only reported below; it is not passed to
# interpret_text, so changing it has no effect on the results.
COMPARISON_STRATEGY = TokenSelectionStrategy.LAST_TOKEN
COMPARISON_TEMPLATE = "psychological_state"
COMPARISON_MAX_TOKENS = 30
COMPARISON_INJECTION_LAYER = 1  # Layer where extracted activations are injected (0-based from start)
# ====================================

# Apply runtime filtering
filtered_patterns, filtered_pattern_types = filter_patterns_by_count(pattern_types, NUM_EXAMPLES_PER_TYPE)

try:
    selected_pattern = get_pattern_by_type(filtered_pattern_types, COMPARISON_PATTERN_TYPE, COMPARISON_PATTERN_INDEX)

    positive_text = get_pattern_text(selected_pattern, "positive")
    negative_text = get_pattern_text(selected_pattern, "negative")
    pattern_name = selected_pattern['cognitive_pattern_name']

    # Process the layer specification
    layers_to_use = process_layers_to_interpret(COMPARISON_LAYER)

    print("\n🔄 EXPERIMENT 2: Positive vs Negative Comparison")
    print(f"📊 Using {NUM_EXAMPLES_PER_TYPE} examples per pattern type (runtime filtered)")
    print(f"🔍 Pattern Type: {COMPARISON_PATTERN_TYPE}")
    print(f"🧠 Pattern: {pattern_name}")
    print(f"🏗️ Layer: {COMPARISON_LAYER} → {layers_to_use}")
    print(f"💉 Injection layer: {COMPARISON_INJECTION_LAYER}")
    print(f"🔧 Strategy: {COMPARISON_STRATEGY.value}")
    print("\n" + "="*80)

    # Show pattern details
    print("\n🔍 PATTERN DETAILS:")
    show_pattern_info(selected_pattern)
    print("\n" + "="*80)

    try:
        # Interpret positive pattern
        print("\n✅ Analyzing POSITIVE pattern...")
        print(f"Text: {positive_text}")

        positive_results = selfie_patcher.interpret_text(
            text=positive_text,
            layers_to_interpret=layers_to_use,
            interpretation_template=COMPARISON_TEMPLATE,
            max_new_tokens=COMPARISON_MAX_TOKENS,
            batch_size=1,
            k=COMPARISON_INJECTION_LAYER
        )

        # Interpret negative pattern
        print("\n❌ Analyzing NEGATIVE pattern...")
        print(f"Text: {negative_text}")

        negative_results = selfie_patcher.interpret_text(
            text=negative_text,
            layers_to_interpret=layers_to_use,
            interpretation_template=COMPARISON_TEMPLATE,
            max_new_tokens=COMPARISON_MAX_TOKENS,
            batch_size=1,
            k=COMPARISON_INJECTION_LAYER
        )

        print("\n🎊 EXPERIMENT 2 RESULTS:")
        print("=" * 80)

        print("\n✅ POSITIVE INTERPRETATION:")
        for idx, row in positive_results.iterrows():
            print(f"   {row['interpretation'].strip()}")

        print("\n❌ NEGATIVE INTERPRETATION:")
        for idx, row in negative_results.iterrows():
            print(f"   {row['interpretation'].strip()}")

        # Store results for further analysis
        comparison_results = {
            'positive': positive_results,
            'negative': negative_results,
            'pattern_name': pattern_name,
            'pattern_type': COMPARISON_PATTERN_TYPE
        }

    except Exception as e:
        print(f"❌ Error in comparison experiment: {e}")

except Exception as e:
    # Reached when the pattern lookup or layer processing above fails; list
    # the valid type names to help fix COMPARISON_PATTERN_TYPE.
    print(f"❌ Error loading pattern: {e}")
    print("Available pattern types:")
    for pattern_type in filtered_pattern_types.keys():
        print(f"  - {pattern_type}")

print("\n" + "="*80)

## Experiment 3: Multi-Layer Analysis

Analyze how interpretations change across different layers of the model.

In [None]:
# EXPERIMENT 3: Multi-Layer Analysis
# Interprets one pattern text at several layers and prints the
# interpretations grouped by layer, followed by a summary table.

# ===== CONFIGURATION SECTION =====
NUM_EXAMPLES_PER_TYPE = 20  # Number of examples to use per pattern type (1-40)
MULTILAYER_PATTERN_TYPE = "Entrapment cognition"  # Choose from available types
MULTILAYER_PATTERN_INDEX = 2  # Which example within the type
MULTILAYER_TEXT_TYPE = "negative"  # "positive", "negative", or "transition"
MULTILAYER_LAYERS = "0:10:2"  # Enhanced options: "all", [-1, -2, -3], (0, 5), "0:5", "0:10:2"
MULTILAYER_TEMPLATE = "cognitive_pattern"
# NOTE: MULTILAYER_STRATEGY is only reported below; it is not passed to
# interpret_text, so changing it has no effect on the results.
MULTILAYER_STRATEGY = TokenSelectionStrategy.KEYWORDS
MULTILAYER_MAX_TOKENS = 35
MULTILAYER_INJECTION_LAYER = 1  # Layer where extracted activations are injected (0-based from start)
# ====================================

# Apply runtime filtering
filtered_patterns, filtered_pattern_types = filter_patterns_by_count(pattern_types, NUM_EXAMPLES_PER_TYPE)

try:
    selected_pattern = get_pattern_by_type(filtered_pattern_types, MULTILAYER_PATTERN_TYPE, MULTILAYER_PATTERN_INDEX)
    multilayer_text = get_pattern_text(selected_pattern, MULTILAYER_TEXT_TYPE)
    pattern_name = selected_pattern['cognitive_pattern_name']

    # Process the layer specification
    layers_to_use = process_layers_to_interpret(MULTILAYER_LAYERS)

    print("\n🧭 EXPERIMENT 3: Multi-Layer Analysis")
    print(f"📊 Using {NUM_EXAMPLES_PER_TYPE} examples per pattern type (runtime filtered)")
    print(f"🔍 Pattern Type: {MULTILAYER_PATTERN_TYPE}")
    print(f"🧠 Pattern: {pattern_name}")
    print(f"📝 Text type: {MULTILAYER_TEXT_TYPE}")
    print(f"📖 Text: {multilayer_text}")
    print(f"🏗️ Analyzing layers: {MULTILAYER_LAYERS} → {layers_to_use}")
    print(f"💉 Injection layer: {MULTILAYER_INJECTION_LAYER}")
    print(f"🔧 Token strategy: {MULTILAYER_STRATEGY.value}")
    print("\n" + "="*80)

    # Show pattern details
    print("\n🔍 PATTERN DETAILS:")
    show_pattern_info(selected_pattern)
    print("\n" + "="*80)

    try:
        # Analyze across multiple layers
        multilayer_results = selfie_patcher.interpret_text(
            text=multilayer_text,
            layers_to_interpret=layers_to_use,
            interpretation_template=MULTILAYER_TEMPLATE,
            max_new_tokens=MULTILAYER_MAX_TOKENS,
            batch_size=2,
            k=MULTILAYER_INJECTION_LAYER
        )

        print("\n🎊 EXPERIMENT 3 RESULTS:")
        print("=" * 80)

        # Group results by layer (de-duplicated, in ascending layer order)
        for layer in sorted(set(layers_to_use)):
            layer_results = multilayer_results[multilayer_results['layer'] == layer]
            print(f"\n🏗️ LAYER {layer}:")
            for idx, row in layer_results.iterrows():
                print(f"   Token '{row['token_decoded']}' → {row['interpretation'].strip()}")

        # Display summary table
        print("\n📊 Summary Table:")
        display(multilayer_results[['layer', 'token_decoded', 'interpretation']])

    except Exception as e:
        print(f"❌ Error in multi-layer analysis: {e}")

except Exception as e:
    # Reached when the pattern lookup or layer processing above fails; list
    # the valid type names to help fix MULTILAYER_PATTERN_TYPE.
    print(f"❌ Error loading pattern: {e}")
    print("Available pattern types:")
    for pattern_type in filtered_pattern_types.keys():
        print(f"  - {pattern_type}")

print("\n" + "="*80)

## Utility Functions

In [None]:
# Utility functions for the notebook

def reset_environment():
    """Call ``reset_hooks()`` on the notebook's patcher and confirm on stdout."""
    # Per the original note: a no-op for SelfIE, kept for compatibility.
    reset_hooks = selfie_patcher.reset_hooks
    reset_hooks()
    print("🔄 Environment reset")

def show_model_info():
    """Show details of the current model via the patcher's ``check_model_info()``."""
    patcher = selfie_patcher
    patcher.check_model_info()

def clear_memory():
    """Free GPU memory by delegating to ``SelfIEPatcher.clear_memory()``."""
    # Called on the class itself, so no patcher instance is required.
    release = SelfIEPatcher.clear_memory
    release()

# Announce the helpers defined in this cell.
print("\n".join((
    "Utility functions loaded:",
    "- reset_environment() - Reset the environment",
    "- show_model_info() - Display model information",
    "- clear_memory() - Clear GPU memory",
)))

# Uncomment any line below to run:
# reset_environment()
# show_model_info()
# clear_memory()