In [1]:
## AMD GPU Environment Variables
import os
# Export the AMD/HIP-related environment variables in a single call.
# They are set in this first cell, ahead of the later cell that imports torch.
os.environ.update({
    "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
    "HIP_VISIBLE_DEVICES": "0",
    "AMD_SERIALIZE_KERNEL": "3",
    "TORCH_USE_HIP_DSA": "1",
})


In [2]:
# RePENG Pattern Steering Test (NNsight)
# This notebook computes PCA-diff steering vectors per cognitive pattern and injects them at InterpretationPrompt placeholder positions.

In [3]:
import os, sys
import nnsight
import torch

# Choose the compute backend: CUDA when available, then MPS, else CPU.
# `device_type` is the plain string; `device` is the matching torch.device.
device_type = "cpu"
if torch.cuda.is_available():
    device_type = "cuda"
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device_type = "mps"
device = torch.device(device_type)

print(f"Using device: {device} ({device_type})")

# Device-agnostic memory clearing function
def clear_cache():
    """Release cached accelerator memory for the active backend.

    Dispatches on the module-level ``device_type``: calls
    ``torch.cuda.empty_cache()`` for "cuda" and ``torch.mps.empty_cache()``
    for "mps". Any other value (e.g. "cpu") is a no-op.
    """
    if device_type == "cuda":
        torch.cuda.empty_cache()
        return
    if device_type == "mps":
        torch.mps.empty_cache()

# Import the local nnsight_selfie helpers. Notebooks have no __file__, so when
# the package is not already importable we probe a few directories relative to
# the current working directory and put the first match on sys.path.
try:
    from nnsight_selfie import (
        InterpretationPrompt,
        compute_pattern_steering_vectors,
        inject_with_interpretation_prompt,
        list_patterns,
    )
except ModuleNotFoundError:
    cwd = os.getcwd()
    candidates = [
        os.path.abspath(os.path.join(cwd, rel))
        for rel in (
            'NNsight_selfie',
            '../NNsight_selfie',
            '..',
            os.path.join('..', '..'),
        )
    ]
    # First candidate that actually contains an `nnsight_selfie` directory.
    pkg_root = next(
        (root for root in candidates
         if os.path.isdir(os.path.join(root, 'nnsight_selfie'))),
        None,
    )
    if pkg_root is not None and pkg_root not in sys.path:
        sys.path.insert(0, pkg_root)
    # Retry; if nothing was found this re-raises the original import error.
    from nnsight_selfie import (
        InterpretationPrompt,
        compute_pattern_steering_vectors,
        inject_with_interpretation_prompt,
        list_patterns,
    )

# Model identifier; override with the MODEL_NAME environment variable.
MODEL_NAME = os.environ.get('MODEL_NAME', 'google/Gemma-3-4b-it')


def resolve_patterns_path(explicit=None, start_dir=None):
    """Locate the positive-patterns JSONL file.

    Args:
        explicit: Path supplied by the caller (e.g. the PATTERNS_PATH
            environment variable). Returned unchanged when truthy.
        start_dir: Directory to search from. Defaults to the current
            working directory.

    Returns:
        ``explicit`` when given; otherwise the first existing path among
        ``data/final/positive_patterns.jsonl`` joined onto ``start_dir``,
        ``start_dir/..`` and ``start_dir/../..``; ``None`` if none exists.
    """
    if explicit:
        return explicit
    base = os.getcwd() if start_dir is None else start_dir
    for rel in (
        'data/final/positive_patterns.jsonl',
        '../data/final/positive_patterns.jsonl',
        '../../data/final/positive_patterns.jsonl',
    ):
        candidate = os.path.join(base, rel)
        if os.path.exists(candidate):
            return candidate
    return None


# Resolve patterns path robustly
PATTERNS_PATH = resolve_patterns_path(os.environ.get('PATTERNS_PATH'))
if PATTERNS_PATH is None:
    # Surface the problem here instead of letting a later cell fail on a
    # None path with a less obvious error.
    print(
        'WARNING: positive_patterns.jsonl not found near the working directory; '
        'set the PATTERNS_PATH environment variable before running the cells below.'
    )

print('Loading model (bfloat16)...')
model = nnsight.LanguageModel(
    MODEL_NAME,
    device_map='auto',
    dtype=torch.bfloat16,
    low_cpu_mem_usage=False,
)
tokenizer = model.tokenizer

# Apply Gemma 3 4B-it vision filter behavior used in ModelAgnosticSelfie by tagging model_name.
# Later cells read this tag back via getattr(model, "model_name", "unknown").
if all(tag in MODEL_NAME.lower() for tag in ('gemma', '3', '4b', 'it')):
    try:
        setattr(model, 'model_name', MODEL_NAME)
        print('Gemma 3 4B-it detected; extractor will filter out vision components.')
    except Exception as tag_error:
        # Tagging stays best-effort, but report the failure so a missing tag
        # is not a silent surprise in the extraction cells.
        print(
            f'WARNING: could not tag model with model_name ({tag_error}); '
            'vision-component filtering may not be applied.'
        )

print('Loaded')

Using device: cuda (cuda)
Loading model (bfloat16)...
Gemma 3 4B-it detected; extractor will filter out vision components.
Loaded


In [None]:
# Generate FULL dataset with disk caching - layers 18-30
# Standard-library helpers used by this cell and the generation loop below.
import os
import pickle
from datetime import datetime

print('=== FULL DATASET GENERATION WITH CACHING ===')
print('Generating steering vectors for ALL patterns with layers 18-30')

patterns = list_patterns(PATTERNS_PATH)
total_patterns = len(patterns)  # every available pattern is processed
print(f'Total patterns available: {total_patterns}')

# Free accelerator memory before the run starts
clear_cache()

# Run configuration
target_layers = [*range(18, 31)]  # layers 18..30 inclusive (13 layers)
patterns_per_batch = 1            # one pattern per batch for memory safety
examples_per_pattern = 10         # cap on examples taken from each pattern

print(f'Target layers: {target_layers} ({len(target_layers)} layers)')
print(f'Processing {patterns_per_batch} pattern per batch, {examples_per_pattern} examples each')
print(f'Total patterns to process: {total_patterns}')

# Each run writes into its own timestamped cache directory
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
cache_dir = "full_dataset_cache_" + run_stamp
os.makedirs(cache_dir, exist_ok=True)
print(f'Created cache directory: {cache_dir}')

# Project modules needed by the generation loop
from nnsight_selfie.repeng.repeng_activation_extractor import RepengActivationExtractor
from nnsight_selfie.repeng.repeng_steering_vectors import RepengSteeringVectorGenerator
from nnsight_selfie.repeng.patterns_dataset import build_all_datasets

# Bookkeeping filled in by the loop: written batch files, pattern names that
# succeeded, and (pattern_name, error_text) tuples for failures.
batch_files, processed_patterns, failed_patterns = [], [], []

print('\n🚀 Starting full dataset generation...')
print(f'Estimated time: ~{total_patterns * 2} minutes (2 min per pattern)')

try:
    # Build the pos-neg datasets once for every pattern and slice them per
    # batch below. The earlier per-batch call passed arguments that did not
    # depend on the batch index (max_patterns=patterns_per_batch), so every
    # iteration requested the same pattern(s) instead of advancing.
    # NOTE(review): assumes max_patterns=N returns up to N patterns in a
    # stable order — confirm against build_all_datasets.
    if total_patterns > 0:
        all_datasets = build_all_datasets(PATTERNS_PATH, ['pos-neg'], max_patterns=total_patterns)
        dataset_items = list(all_datasets.items())
    else:
        dataset_items = []

    # Ceiling division: number of batches needed to cover all patterns.
    total_batches = (total_patterns + patterns_per_batch - 1) // patterns_per_batch

    for batch_start in range(0, total_patterns, patterns_per_batch):
        batch_end = min(batch_start + patterns_per_batch, total_patterns)
        batch_num = batch_start // patterns_per_batch + 1

        print(f'\n--- Batch {batch_num}/{total_batches}: Pattern {batch_start+1}/{total_patterns} ---')

        # Clear memory before each batch
        clear_cache()

        # Patterns belonging to this batch only
        datasets = dict(dataset_items[batch_start:batch_end])

        if not datasets:
            print(f'  ⚠️ No datasets found for batch {batch_num}')
            continue

        # Create extractor for this batch
        extractor = RepengActivationExtractor(model, tokenizer, layer_indices=target_layers)
        generator = RepengSteeringVectorGenerator(model_type=getattr(model, "model_name", "unknown"))

        # Process the pattern(s) in this batch
        batch_bundles = []
        for pattern_name, pair_map in datasets.items():
            if 'pos-neg' not in pair_map:
                print(f'  ⚠️ Skipping {pattern_name}: no pos-neg pairs')
                continue

            dataset = pair_map['pos-neg'][:examples_per_pattern]  # Limit examples

            print(f'  Processing: {pattern_name}')
            print(f'    Examples: {len(dataset)} (limited to {examples_per_pattern})')
            print(f'    Layers: {len(target_layers)} layers ({target_layers[0]}-{target_layers[-1]})')

            try:
                # Extract activations for this pattern
                activations, inputs = extractor.extract_dataset_activations(
                    dataset, batch_size=1, show_progress=True
                )

                # Generate steering vector
                steering = generator.generate_steering_vectors(activations, method='pca_diff')

                # Plain dict so the bundle can be pickled and reloaded later
                bundle = {
                    'steering_vector': steering,
                    'pattern_name': pattern_name,
                    'num_examples': len(dataset),
                    'layers': list(steering.directions.keys()),
                    'batch_num': batch_num,
                    'target_layers': target_layers,
                    'method': 'pca_diff'
                }

                batch_bundles.append(bundle)
                processed_patterns.append(pattern_name)

                print(f'    ✅ SUCCESS: Generated steering vector ({len(steering.directions)} layers)')

                # Drop the local references right away; the steering vector
                # itself stays alive through `bundle`.
                del activations, inputs, steering
                clear_cache()

            except Exception as e:
                # One failing pattern must not abort the whole run; record it
                # and move on to the next pattern.
                print(f'    ❌ FAILED: {str(e)[:100]}...')
                failed_patterns.append((pattern_name, str(e)))

        # Save this batch to disk immediately (even if empty)
        batch_file = os.path.join(cache_dir, f'batch_{batch_num:03d}.pkl')
        batch_data = {
            'batch_num': batch_num,
            'bundles': batch_bundles,
            'model_name': MODEL_NAME,
            'target_layers': target_layers,
            'examples_per_pattern': examples_per_pattern,
            'timestamp': datetime.now().isoformat(),
            'processed_patterns': [b['pattern_name'] for b in batch_bundles]
        }

        with open(batch_file, 'wb') as f:
            pickle.dump(batch_data, f)

        batch_files.append(batch_file)

        if batch_bundles:
            print(f'  💾 Saved batch {batch_num} to disk ({len(batch_bundles)} patterns)')
        else:
            print(f'  💾 Saved empty batch {batch_num} to disk')

        # Aggressively clear memory after each batch
        del extractor, generator, datasets, batch_bundles, batch_data
        clear_cache()

        # Progress update
        progress_pct = (batch_num / total_batches) * 100
        print(f'  📊 Progress: {progress_pct:.1f}% ({len(processed_patterns)} processed, {len(failed_patterns)} failed)')

    # Create comprehensive index file
    index_file = os.path.join(cache_dir, 'full_dataset_index.pkl')
    index_data = {
        'batch_files': batch_files,
        'total_patterns_attempted': total_patterns,
        'successful_patterns': len(processed_patterns),
        'failed_patterns': len(failed_patterns),
        'target_layers': target_layers,
        'examples_per_pattern': examples_per_pattern,
        'model_name': MODEL_NAME,
        'method': 'pca_diff',
        'timestamp': datetime.now().isoformat(),
        'cache_dir': cache_dir,
        'processed_pattern_names': processed_patterns,
        'failed_pattern_details': failed_patterns
    }

    with open(index_file, 'wb') as f:
        pickle.dump(index_data, f)

    print(f'\n🎉 FULL DATASET GENERATION COMPLETE!')
    print(f'📂 Cache directory: {cache_dir}')
    print(f'📊 Results:')
    print(f'   - Successful: {len(processed_patterns)}/{total_patterns} patterns')
    print(f'   - Failed: {len(failed_patterns)}/{total_patterns} patterns')
    # Report the configured layer range rather than a hard-coded "18-30".
    layer_span = f'{target_layers[0]}-{target_layers[-1]}' if target_layers else 'none'
    print(f'   - Layers: {len(target_layers)} layers ({layer_span})')
    print(f'   - Examples per pattern: {examples_per_pattern}')
    print(f'   - Batch files: {len(batch_files)}')

    # Calculate cache size
    cache_files = [f for f in os.listdir(cache_dir) if f.endswith('.pkl')]
    total_size = sum(os.path.getsize(os.path.join(cache_dir, f)) for f in cache_files)
    print(f'   - Total cache size: {total_size / 1024 / 1024:.2f} MB')

    if failed_patterns:
        print(f'\n⚠️ Failed patterns:')
        for pattern, error in failed_patterns[:5]:  # Show first 5 failures
            print(f'   - {pattern}: {error[:80]}...')
        if len(failed_patterns) > 5:
            print(f'   ... and {len(failed_patterns) - 5} more failures')

    print(f'\n📋 Index file: {index_file}')
    print(f'💾 Ready for analysis and testing!')

except Exception as e:
    print(f'\n💥 CRITICAL ERROR: {e}')
    import traceback
    traceback.print_exc()

    # Save partial results so already-written batch files remain discoverable
    if processed_patterns:
        partial_index = os.path.join(cache_dir, 'partial_index.pkl')
        with open(partial_index, 'wb') as f:
            pickle.dump({
                'partial_results': True,
                'processed_patterns': processed_patterns,
                'failed_patterns': failed_patterns,
                'batch_files': batch_files,
                'target_layers': target_layers,
                'error': str(e)
            }, f)
        print(f'💾 Saved partial results to: {partial_index}')

# Final cleanup
clear_cache()
print(f'\n🧹 Memory cleanup complete')

=== FULL DATASET GENERATION WITH CACHING ===
Generating steering vectors for ALL patterns with layers 18-30
Total patterns available: 13
Target layers: [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] (13 layers)
Processing 1 pattern per batch, 10 examples each
Total patterns to process: 13
Created cache directory: full_dataset_cache_20250910_223246

🚀 Starting full dataset generation...
Estimated time: ~26 minutes (2 min per pattern)

--- Batch 1/13: Pattern 1/13 ---
Limited to 1 patterns for memory optimization
Filtered vision components. Using 35 layers.
Initialized activation extractor for 13 layers
  Processing: Executive Fatigue & Avolition
    Examples: 10 (limited to 10)
    Layers: 13 layers (18-30)


Extracting activations:   0%|          | 0/20 [00:00<?, ?it/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

In [None]:
# Load cached steering vectors and test injection
# Imported locally so this cell also works after a kernel restart without
# re-running the slow generation cell (which is where pickle was imported).
import glob
import os
import pickle
from types import SimpleNamespace

print('Loading cached steering vectors...')

# Find the most recent cache directory. The generation cell writes
# "full_dataset_cache_<timestamp>"; "steering_cache_*" is kept only as a
# fallback (presumably the name used by an earlier version of that cell).
cache_dirs = sorted(glob.glob('full_dataset_cache_*')) or sorted(glob.glob('steering_cache_*'))
if not cache_dirs:
    print('❌ No cache directories found. Run cell 4 first.')
else:
    # Names sort chronologically because of the timestamp suffix, so the last
    # entry is the newest run.
    cache_dir = cache_dirs[-1]
    print(f'Using cache directory: {cache_dir}')

    # Load index file. The generation cell names it 'full_dataset_index.pkl';
    # 'index.pkl' is accepted as a fallback name.
    # NOTE: pickle.load can execute arbitrary code — only load caches you created.
    index_file = next(
        (
            path
            for path in (
                os.path.join(cache_dir, name)
                for name in ('full_dataset_index.pkl', 'index.pkl')
            )
            if os.path.exists(path)
        ),
        None,
    )
    if index_file is not None:
        with open(index_file, 'rb') as f:
            index_data = pickle.load(f)
    else:
        # No index (e.g. the generation run stopped early): fall back to the
        # batch_NNN.pkl files that are written after every batch.
        print('⚠️ No index file found; using batch files present in the cache directory.')
        index_data = {
            'batch_files': sorted(glob.glob(os.path.join(cache_dir, 'batch_*.pkl'))),
        }

    total_in_index = index_data.get('total_patterns_attempted', index_data.get('total_patterns'))
    print(f'Found {len(index_data["batch_files"])} batch files')
    print(f'Total patterns: {total_in_index}')
    print(f'Target layers: {index_data.get("target_layers")}')

    # Load all bundles from cache
    all_bundles = []
    for batch_file in index_data['batch_files']:
        # Index entries were recorded relative to the working directory of the
        # generation run; fall back to the file name inside cache_dir.
        if not os.path.exists(batch_file):
            batch_file = os.path.join(cache_dir, os.path.basename(batch_file))
        with open(batch_file, 'rb') as f:
            batch_data = pickle.load(f)
        # Convert each stored dict to an attribute-style object for compatibility
        for bundle_dict in batch_data['bundles']:
            all_bundles.append(SimpleNamespace(**bundle_dict))

    bundles = all_bundles
    print(f'✅ Loaded {len(bundles)} steering vectors from cache')

    # Build an interpretation prompt with placeholders
    interp = InterpretationPrompt.create_simple(tokenizer, prefix='This neural pattern represents ', suffix=' in emotion')
    prompt_text = interp.get_prompt()
    print(f'\nPrompt: {prompt_text}')
    print(f'Insert positions: {interp.get_insert_locations()[:10]}')

    # Test with first few patterns
    test_results = []
    max_tests = min(3, len(bundles))  # Test up to 3 patterns

    for i in range(max_tests):
        test_bundle = bundles[i]

        print(f'\n--- Testing Pattern {i+1}: {test_bundle.pattern_name} ---')
        print(f'Layers: {test_bundle.layers}')
        print(f'Training examples: {test_bundle.num_examples}')
        print(f'From batch: {test_bundle.batch_num}')

        try:
            # Clear memory before each test
            clear_cache()

            # Inject and generate
            res = inject_with_interpretation_prompt(
                model, tokenizer,
                prompt_text=prompt_text,
                steering_vector=test_bundle.steering_vector,
                interpretation_prompt=interp,
                injection_strength=1.0,
                max_new_tokens=30,
                do_sample=False
            )

            generated = res['generated_text'][:200]  # Truncate for display
            test_results.append({
                'pattern': test_bundle.pattern_name,
                'generated': generated,
                'success': True,
                'batch': test_bundle.batch_num
            })

            print(f'Generated: "{generated}"')

        except Exception as e:
            # Record the failure and keep testing the remaining patterns.
            print(f'Error with {test_bundle.pattern_name}: {e}')
            test_results.append({
                'pattern': test_bundle.pattern_name,
                'generated': f'Error: {str(e)}',
                'success': False,
                'batch': test_bundle.batch_num
            })

    print(f'\n=== SUMMARY ===')
    print(f'Cache directory: {cache_dir}')
    print(f'Total patterns in cache: {len(bundles)}')
    print(f'Patterns tested: {len(test_results)}')
    successful_tests = sum(1 for r in test_results if r['success'])
    print(f'Successful injections: {successful_tests}/{len(test_results)}')

    print('\nResults:')
    for i, result in enumerate(test_results):
        status = '✓' if result['success'] else '✗'
        batch_info = f" (batch {result['batch']})" if 'batch' in result else ""
        print(f'{status} {result["pattern"]}{batch_info}: {result["generated"][:100]}...')

    # Show available patterns for further testing
    print(f'\nAvailable patterns for testing:')
    for i, bundle in enumerate(bundles[:10]):  # Show first 10
        print(f'  {i+1}. {bundle.pattern_name} (batch {bundle.batch_num}, {bundle.num_examples} examples)')

    if len(bundles) > 10:
        print(f'  ... and {len(bundles) - 10} more patterns')

    print(f'\nMemory usage: Only loaded steering vectors (no activations)')
    clear_cache()

In [None]:
# Advanced testing: Compare patterns and analyze steering effects
# Imported locally so this cell does not depend on imports made inside the
# generation cell.
import os
import pickle
from datetime import datetime

print('=== ADVANCED PATTERN ANALYSIS ===\n')

# This cell needs `bundles` plus the prompt objects built by the cache-loading
# cell; check for all of them so a partial session gets the friendly message
# instead of a NameError.
_required_names = ('bundles', 'prompt_text', 'interp')
if all(name in globals() for name in _required_names) and len(bundles) > 0:
    # Test multiple patterns with different injection strengths
    print('Testing different injection strengths...')

    test_patterns = bundles[:3]  # Use first 3 patterns
    strengths = [0.5, 1.0, 2.0]  # Different injection strengths

    # pattern name -> {strength -> generated text or "Error: ..." string}
    results_matrix = {}

    for pattern_idx, test_bundle in enumerate(test_patterns):
        pattern_name = test_bundle.pattern_name
        results_matrix[pattern_name] = {}

        print(f'\n--- Pattern {pattern_idx + 1}: {pattern_name} ---')

        for strength in strengths:
            try:
                clear_cache()

                # Test injection
                res = inject_with_interpretation_prompt(
                    model, tokenizer,
                    prompt_text=prompt_text,
                    steering_vector=test_bundle.steering_vector,
                    interpretation_prompt=interp,
                    injection_strength=strength,
                    max_new_tokens=25,
                    do_sample=False
                )

                generated = res['generated_text'].strip()
                results_matrix[pattern_name][strength] = generated
                print(f'  Strength {strength}: "{generated[:80]}..."')

            except Exception as e:
                # Keep going so one failing strength does not hide the others.
                results_matrix[pattern_name][strength] = f'Error: {str(e)}'
                print(f'  Strength {strength}: Error - {str(e)[:50]}...')

    # Summary comparison
    print(f'\n=== STRENGTH COMPARISON SUMMARY ===')
    for pattern_name, strength_results in results_matrix.items():
        print(f'\n{pattern_name}:')
        for strength, result in strength_results.items():
            print(f'  {strength}x: {result[:60]}...')

    # Save detailed results. Bundles created by the single-layer smoke-test
    # cell carry only steering_vector/pattern_name, so read the optional
    # metadata defensively.
    detailed_results = {
        'test_timestamp': datetime.now().isoformat(),
        'model_name': MODEL_NAME,
        'prompt_template': prompt_text,
        'patterns_tested': len(test_patterns),
        'strengths_tested': strengths,
        'results_matrix': results_matrix,
        'pattern_metadata': {
            bundle.pattern_name: {
                'layers': getattr(bundle, 'layers', None),
                'examples': getattr(bundle, 'num_examples', None),
                'batch': getattr(bundle, 'batch_num', None)
            } for bundle in test_patterns
        }
    }

    # Save to cache directory
    if 'cache_dir' in globals():
        results_file = os.path.join(cache_dir, 'test_results.pkl')
        with open(results_file, 'wb') as f:
            pickle.dump(detailed_results, f)
        print(f'\n💾 Saved detailed test results to: {results_file}')

    # Cache usage summary
    print(f'\n=== CACHE SUMMARY ===')
    if 'cache_dir' in globals():
        cache_files = os.listdir(cache_dir)
        total_size = sum(os.path.getsize(os.path.join(cache_dir, f)) for f in cache_files)
        print(f'Cache directory: {cache_dir}')
        print(f'Files: {len(cache_files)}')
        print(f'Total size: {total_size / 1024 / 1024:.2f} MB')
        print(f'Patterns cached: {len(bundles)}')
        print(f'Layers per pattern: {len(getattr(bundles[0], "layers", None) or []) if bundles else 0}')

        # Point the reload hint at the index file that actually exists; the
        # generation cell writes 'full_dataset_index.pkl'.
        index_name = next(
            (
                name
                for name in ('full_dataset_index.pkl', 'index.pkl')
                if os.path.exists(os.path.join(cache_dir, name))
            ),
            'full_dataset_index.pkl',
        )
        index_path = os.path.join(cache_dir, index_name)

        # Show how to reload everything
        print(f'\n📋 To reload this session later:')
        print(f'```python')
        print(f'import pickle')
        print(f'with open("{index_path}", "rb") as f:')
        print(f'    index_data = pickle.load(f)')
        print(f'# Then load individual batches as needed')
        print(f'```')

else:
    print('❌ No bundles loaded. Run previous cells first.')

# Final cleanup
clear_cache()
print(f'\n🧹 Final cleanup complete - memory freed')

In [None]:
# Smoke test: one pattern, one example, one layer.
# On success this replaces `bundles` with a single bundle.
specific_layers = [15]
print(f'Extracting from specific layers: {specific_layers}')

try:
    from types import SimpleNamespace

    # Import using the existing working path
    from nnsight_selfie.repeng.repeng_activation_extractor import RepengActivationExtractor
    from nnsight_selfie.repeng.repeng_steering_vectors import RepengSteeringVectorGenerator
    from nnsight_selfie.repeng.patterns_dataset import build_all_datasets

    # Create extractor with specific layers only
    extractor = RepengActivationExtractor(model, tokenizer, layer_indices=specific_layers)

    # Get just 1 example from 1 pattern manually
    datasets = build_all_datasets(PATTERNS_PATH, ['pos-neg'], max_patterns=1)
    pattern_name = list(datasets.keys())[0]
    dataset = datasets[pattern_name]['pos-neg'][:1]  # Take only first example

    print(f"Processing 1 example from pattern: {pattern_name}")

    activations, inputs = extractor.extract_dataset_activations(
        dataset, batch_size=1, show_progress=True
    )

    # Generate steering vector
    generator = RepengSteeringVectorGenerator(model_type=getattr(model, "model_name", "unknown"))
    steering = generator.generate_steering_vectors(activations, method='pca_diff')

    print(f'SUCCESS: Generated steering vector for {len(steering.directions)} layers')

    # Store for later use. Carry the same attributes as the cached bundles
    # (layers, num_examples, batch_num) because the testing cells read them
    # from every entry in `bundles`.
    bundles = [SimpleNamespace(
        steering_vector=steering,
        pattern_name=pattern_name,
        num_examples=len(dataset),
        layers=list(steering.directions.keys()),
        batch_num=None,  # not produced by a cached batch
    )]

except Exception as e:
    print(f'ERROR: {e}')
    import traceback
    traceback.print_exc()