# TranspOLMo Quick Start Guide

This notebook demonstrates the basic functionality of the TranspOLMo interpretability framework.

In [None]:
import gc
import sys
sys.path.insert(0, '..')

import torch
from src.models.loader import OLMo2Loader
from src.models.hooks import ActivationCapture
from src.analysis.geometry.manifold import ManifoldAnalyzer
from src.analysis.circuits.discovery import CircuitDiscovery

## 1. Load Model

Load the OLMo 2 1B model (`allenai/OLMo-2-0425-1B`) and inspect its architecture.

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configure the loader for the 1B checkpoint (cache_dir is relative to this notebook)
loader = OLMo2Loader(
    model_name="allenai/OLMo-2-0425-1B",
    cache_dir="../data/models",
    device=device,
)

# Obtain the model/tokenizer pair, then ask the loader to describe the architecture
model, tokenizer = loader.load()
arch_info = loader.get_architecture_info(model)

# Summarise the architecture fields this guide relies on
print(f"Model: {arch_info['model_name']}")
print(f"Size: {arch_info['model_size']}")
print(f"Layers: {arch_info['num_layers']}")
print(f"Hidden size: {arch_info['hidden_size']}")

## 2. Capture Activations

Hook into the MLP block of the middle layer and capture its activations during the forward pass over a few example sentences.

In [None]:
# A handful of short sentences to push through the model
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "In the beginning, there was nothing but darkness.",
    "Machine learning is transforming artificial intelligence.",
    "The capital of France is Paris, a beautiful city.",
    "Python is a popular programming language."
]

# Target the MLP submodule of the layer halfway through the stack
middle_layer = arch_info['num_layers'] // 2
layer_name = f"model.layers.{middle_layer}.mlp"

with ActivationCapture(model, device='cpu') as capturer:
    capturer.register_hooks([layer_name])

    # Gradients are never needed here, so the whole loop runs under no_grad
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(text, return_tensors='pt', max_length=128, truncation=True)
            # Move every tokenizer tensor onto the model's device before the forward pass
            inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
            outputs = model(**inputs)

    # Read the captured activations while the capture context is still open
    activations = capturer.get_activations()

print(f"Captured activations from {layer_name}")
print(f"Shape: {activations[layer_name].shape}")

## 3. Geometric Analysis

Analyze the geometric structure of the activation manifold.

In [None]:
# Build the analyzer over the activations captured for the selected layer
layer_activations = activations[layer_name]
analyzer = ManifoldAnalyzer(layer_activations)

# Estimate intrinsic dimension
intrinsic_dim = analyzer.estimate_intrinsic_dimension()

# Emit the report in a single call; sep="\n" places each entry on its own line
print(
    "Intrinsic Dimensionality:",
    f"  Ambient dimension: {intrinsic_dim['ambient_dimension']}",
    f"  90% variance: {intrinsic_dim['intrinsic_dim_90pct_var']} dimensions",
    f"  95% variance: {intrinsic_dim['intrinsic_dim_95pct_var']} dimensions",
    f"  99% variance: {intrinsic_dim['intrinsic_dim_99pct_var']} dimensions",
    f"  Compression ratio: {intrinsic_dim['compression_ratio']:.2f}x",
    sep="\n",
)

In [None]:
# Ask the analyzer for its local-geometry summary of the activations
local_geom = analyzer.compute_local_geometry()

# One print call; the leading "\n" in the header keeps the blank line before it
print(
    "\nLocal Geometry:",
    f"  Type: {local_geom['geometry_type']}",
    f"  Mean distance: {local_geom['mean_local_distance']:.4f}",
    f"  Mean curvature: {local_geom['mean_local_curvature']:.4f}",
    sep="\n",
)

In [None]:
# Summary statistics of the captured activations
stats = analyzer.compute_activation_statistics()

# Mean/std use four decimals; the two fractions are rendered as percentages
print(
    "\nActivation Statistics:",
    f"  Mean: {stats['mean']:.4f}",
    f"  Std: {stats['std']:.4f}",
    f"  Sparsity: {stats['sparsity']:.2%}",
    f"  Positive fraction: {stats['positive_fraction']:.2%}",
    sep="\n",
)

## 4. Circuit Discovery

Analyze attention patterns to discover computational circuits.

In [None]:
# CircuitDiscovery is imported in the notebook's import cell so that every
# dependency is visible (and fails fast) at the top of the notebook.

# How much of the trace to display: leading layers, and leading heads per layer
MAX_LAYERS_SHOWN = 3
MAX_HEADS_SHOWN = 2

discoverer = CircuitDiscovery(model, tokenizer)

# Trace attention patterns for a single prompt
input_text = "The capital of France is Paris"
patterns = discoverer.trace_attention_patterns(input_text)

print(f"Input: {patterns['input_text']}")
print(f"Tokens: {patterns['tokens']}")
# Plain string: there is nothing to interpolate in this header
print("\nAttention Patterns:")

# Only a small prefix of layers/heads is shown to keep the output readable
for layer_info in patterns['attention_patterns'][:MAX_LAYERS_SHOWN]:
    print(f"\nLayer {layer_info['layer']}:")
    for head_info in layer_info['heads'][:MAX_HEADS_SHOWN]:
        print(f"  Head {head_info['head']}: {head_info['pattern_type']}")

## 5. Next Steps

- Run the full analysis pipeline: `python scripts/run_full_analysis.py`
- Train Sparse Autoencoders for feature extraction
- Explore the generated documentation in `docs/findings/`
- Experiment with different layers and analysis parameters

## Cleanup

In [None]:
# Free model memory.
#
# Names are removed with pop() rather than `del` so the cell can be re-run, or
# run on a kernel where the model was never loaded, without a NameError.
# `capturer` and `discoverer` were both constructed with the model and may
# still hold a reference to it, so they are dropped as well.
for _name in ("model", "capturer", "discoverer"):
    globals().pop(_name, None)

# Collect first so objects kept alive only by reference cycles are destroyed;
# only then can the CUDA caching allocator hand their memory back.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()