# GPT-2 Persona Vector Extraction (Tone-Based Baseline)

**Purpose:** Extract persona vectors using prompt conditioning ("You are helpful/harmful")

**Date:** December 2025

**Model:** GPT-2 Medium (345M parameters)

**Approach:** Tone-based (persona conditioning)

**Expected outcome:** Establish a baseline. Later analysis found that these vectors capture TONE, not CONTENT.

## Part 1: Setup

In [None]:
# Imports
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import os

# Create the output directories used by later cells (vectors, results, figures).
# exist_ok=True makes re-running this cell harmless.
for output_dir in ('data/vectors', 'data/results', 'figures'):
    os.makedirs(output_dir, exist_ok=True)

# Report the environment so a run can be reproduced later.
print("Libraries imported")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

In [None]:
# Load GPT-2 Medium
model_name = "gpt2-medium"

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer reuses the end-of-sequence token as its padding token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights, then move the model onto the chosen device.
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

print(f"Model loaded: {model_name}")
print(f"Device: {device}")
print(f"Layers: {model.config.n_layer}")

## Part 2: Persona-Based Vector Extraction

In [None]:
# Contrastive persona prompts: the two system-style sentences whose
# activations are compared in the extraction step.
helpful_prompt, harmful_prompt = (
    "You are a helpful, harmless, and honest AI assistant.",
    "You are a harmful and dishonest AI assistant.",
)

# Echo both prompts so the notebook output records exactly what was used.
print("Persona prompts:")
print(f"Helpful: {helpful_prompt}")
print(f"Harmful: {harmful_prompt}")

In [None]:
# Helper function
def get_activation(prompt, layer_idx):
    """Return the mean-pooled hidden state of ``prompt`` at one layer.

    The prompt is tokenized into PyTorch tensors, moved to ``device`` and
    run through ``model`` with gradient tracking disabled.

    Args:
        prompt: Text to encode.
        layer_idx: Index into the model's ``hidden_states`` output.

    Returns:
        The selected hidden state averaged over dim 1 (presumably the
        token axis -- confirm against the model's output layout).
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        result = model(**encoded, output_hidden_states=True)
        layer_hidden = result.hidden_states[layer_idx]
        pooled = layer_hidden.mean(dim=1)
    return pooled

In [None]:
# Extract vectors from multiple layers
# Each value in test_layers is passed to get_activation as its layer index.
test_layers = [1, 6, 11]

print(f"Extracting from layers: {test_layers}\n")

# Maps layer index -> dict with the difference vector, both persona
# activations, and the norm of the difference.
tone_vectors = {}

for layer in test_layers:
    # Helpful first, then harmful, for the same layer.
    pos_act = get_activation(helpful_prompt, layer)
    neg_act = get_activation(harmful_prompt, layer)

    # The persona direction is the helpful activation minus the harmful one.
    direction = pos_act - neg_act
    magnitude = torch.norm(direction).item()

    tone_vectors[layer] = dict(
        vector=direction,
        helpful=pos_act,
        harmful=neg_act,
        magnitude=magnitude,
    )

    print(f"Layer {layer}: magnitude = {magnitude:.2f}")

print("\nVectors extracted")

## Part 3: Visualization

In [None]:
# Visualize Layer 6 vector
layer_to_viz = 6
n_top = 20  # how many largest-|value| dimensions the third panel shows

# Move the stored vector to the CPU and drop size-1 axes before plotting.
vec = tone_vectors[layer_to_viz]['vector'].cpu().squeeze().numpy()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
line_ax, hist_ax, bar_ax = axes

# Panel 1: every component of the vector, in index order.
line_ax.plot(vec, linewidth=0.8)
line_ax.set_title(f'Full Vector (Layer {layer_to_viz})', fontweight='bold')
line_ax.set(xlabel='Dimension', ylabel='Value')
line_ax.grid(True, alpha=0.3)

# Panel 2: histogram of the component values.
hist_ax.hist(vec, bins=50, edgecolor='black')
hist_ax.set_title('Distribution', fontweight='bold')
hist_ax.set(xlabel='Value', ylabel='Frequency')

# Panel 3: the components with the largest absolute value.
# argsort is ascending, so the last n_top indices are the largest.
top_idx = np.argsort(np.abs(vec))[-n_top:]
top_vals = vec[top_idx]
bar_colors = ['red' if value < 0 else 'green' for value in top_vals]
bar_positions = range(n_top)

bar_ax.barh(bar_positions, top_vals, color=bar_colors, edgecolor='black')
bar_ax.set_title('Top 20 Dimensions', fontweight='bold')
bar_ax.set_xlabel('Value')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_idx)

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('figures/gpt2_tone_vector.png', dpi=300, bbox_inches='tight')
plt.show()

print("Figure saved: figures/gpt2_tone_vector.png")

## Part 4: Layer Comparison

In [None]:
# Compare layers
# One row per extracted layer: its index and the norm of its vector.
comparison = [
    {'Layer': layer, 'Magnitude': tone_vectors[layer]['magnitude']}
    for layer in test_layers
]

df = pd.DataFrame(comparison)
print("\nLayer Comparison:")
print(df.to_string(index=False))

# Persist the table next to the other results.
results_csv = 'data/results/gpt2_tone_layer_comparison.csv'
df.to_csv(results_csv, index=False)
print("\nSaved: data/results/gpt2_tone_layer_comparison.csv")

## Part 5: Save Vectors

In [None]:
# Save all vectors
# Build one 'layer_<index>' entry per extracted layer straight from
# tone_vectors, so the saved file always matches the layers that were
# actually extracted (previously layers 1, 6 and 11 were hard-coded here and
# would break or go stale if the layer list changed).
save_data = {
    f'layer_{layer_idx}': entry['vector'].cpu()  # CPU copy for the pickle
    for layer_idx, entry in tone_vectors.items()
}

# Record how the vectors were produced alongside the vectors themselves.
save_data['metadata'] = {
    'model': model_name,
    'approach': 'tone-based',
    'helpful_prompt': helpful_prompt,
    'harmful_prompt': harmful_prompt
}

# NOTE: pickle files should only ever be loaded from trusted sources.
with open('data/vectors/gpt2_tone_vectors.pkl', 'wb') as f:
    pickle.dump(save_data, f)

print("Saved: data/vectors/gpt2_tone_vectors.pkl")
print("\nBaseline extraction complete!")