# Vector Arithmetic Lab 🧮
**Compact notebook for vector arithmetic experiments. Each experiment is self-contained with its own config.**

In [None]:
# FOR AMD GPU
import os
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "11.0.0"
os.environ["HIP_VISIBLE_DEVICES"] = "0"
os.environ["AMD_SERIALIZE_KERNEL"] = "3"
os.environ["TORCH_USE_HIP_DSA"] = "1"



# SETUP
import sys, warnings, torch, numpy as np
from typing import List, Dict, Any, Tuple
sys.path.insert(0, '..')
warnings.filterwarnings('ignore', category=FutureWarning)

from nnsight_selfie import ModelAgnosticSelfie, InterpretationPrompt, get_optimal_device

# LOAD MODEL
MODEL_NAME = "google/gemma-3-4b-it"  # Change as needed
selfie = ModelAgnosticSelfie(MODEL_NAME, dtype=torch.bfloat16, load_in_8bit=False)
print(f"✅ {MODEL_NAME} loaded on {selfie.device} ({len(selfie.layer_paths)} layers)")

# HELPER FUNCTIONS
def show_tokens(text):
    tokens = selfie.model.tokenizer.encode(text)
    for i, token_id in enumerate(tokens):
        token_str = selfie.model.tokenizer.decode([token_id])
        print(f"  {i:2d}: '{token_str.strip()}'")
    return tokens

def get_vector(text, token_pos, layer):
    """Get activation vector for a specific token position and layer.
    
    Args:
        text: Input text (ALWAYS uses raw text - no chat template for activation capture!)
        token_pos: Token position to extract from
        layer: Layer index to extract from
    """
    acts = selfie.get_activations(text, layer_indices=[layer], token_indices=[token_pos])
    return acts[layer][0]

def interpret_vector(vector, prompt, injection_layer=3, max_tokens=30, use_chat_template=False):
    """Interpret a vector using the selfie interpretation system.
    
    Args:
        vector: Activation vector to interpret
        prompt: InterpretationPrompt object
        injection_layer: Layer to inject the vector at
        max_tokens: Maximum tokens to generate
        use_chat_template: Whether to apply chat template formatting to INTERPRETATION (not capture)
    """
    return selfie.interpret_vectors([vector], prompt, injection_layer, max_new_tokens=max_tokens, 
                                  use_chat_template=use_chat_template)[0].strip()

def experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos, 
               extract_layer, inject_layer, interp_prompt, description="", use_chat_template=False):
    """Run a vector arithmetic experiment.
    
    Args:
        base_text, sub_text, add_text: Raw text for activation capture (no chat template applied)
        base_pos, sub_pos, add_pos: Token positions in the RAW text
        use_chat_template: Whether to use chat template for INTERPRETATION only
    """
    print(f"🧮 {description}")
    print(f"   Extract: L{extract_layer} | Inject: L{inject_layer}")
    if use_chat_template:
        print(f"   📝 Using chat template for interpretation")
    
    # Always capture from raw text (no chat template) - token positions stay consistent
    base_vec = get_vector(base_text, base_pos, extract_layer)
    sub_vec = get_vector(sub_text, sub_pos, extract_layer) 
    add_vec = get_vector(add_text, add_pos, extract_layer)
    
    result_vec = base_vec - sub_vec + add_vec
    # Apply chat template only to interpretation
    interpretation = interpret_vector(result_vec, interp_prompt, inject_layer, use_chat_template=use_chat_template)
    
    print(f"   🤖 {interpretation}")
    return result_vec, interpretation
def get_multi_string_vectors(strings, token_positions, layer, aggregation="mean"):
    """Extract and aggregate activation vectors from multiple strings.
    
    Args:
        strings: List of strings to extract activations from
        token_positions: List of token positions (one per string) OR single int for all strings
        layer: Layer index to extract from
        aggregation: How to combine vectors - "mean", "sum", or "individual"
    
    Returns:
        If aggregation="individual": List of vectors (one per string)
        If aggregation="mean"/"sum": Single aggregated vector
    """
    if isinstance(token_positions, int):
        token_positions = [token_positions] * len(strings)
    
    if len(token_positions) != len(strings):
        raise ValueError(f"Number of token positions ({len(token_positions)}) must match number of strings ({len(strings)})")
    
    vectors = []
    for text, pos in zip(strings, token_positions):
        vector = get_vector(text, pos, layer)
        vectors.append(vector)
    
    if aggregation == "individual":
        return vectors
    elif aggregation == "mean":
        return torch.stack(vectors).mean(dim=0)
    elif aggregation == "sum":
        return torch.stack(vectors).sum(dim=0)
    else:
        raise ValueError(f"Unknown aggregation method: {aggregation}")

def multi_string_experiment(base_strings, base_positions, sub_strings, sub_positions, 
                           add_strings, add_positions, extract_layer, inject_layer, 
                           interp_prompt, description="", aggregation="mean", use_chat_template=False):
    """Run vector arithmetic experiment with multiple strings for each concept.
    
    Args:
        base_strings, sub_strings, add_strings: Lists of strings for each concept
        base_positions, sub_positions, add_positions: Token positions (list or single int)
        aggregation: How to combine vectors from multiple strings ("mean", "sum")
        Other args same as regular experiment()
    """
    print(f"🧮 {description}")
    print(f"   Extract: L{extract_layer} | Inject: L{inject_layer} | Aggregation: {aggregation}")
    print(f"   Base strings: {len(base_strings)} | Sub strings: {len(sub_strings)} | Add strings: {len(add_strings)}")
    if use_chat_template:
        print(f"   📝 Using chat template for interpretation")
    
    # Extract and aggregate vectors from multiple strings
    base_vec = get_multi_string_vectors(base_strings, base_positions, extract_layer, aggregation)
    sub_vec = get_multi_string_vectors(sub_strings, sub_positions, extract_layer, aggregation) 
    add_vec = get_multi_string_vectors(add_strings, add_positions, extract_layer, aggregation)
    
    result_vec = base_vec - sub_vec + add_vec
    interpretation = interpret_vector(result_vec, interp_prompt, inject_layer, use_chat_template=use_chat_template)
    
    print(f"   🤖 {interpretation}")
    return result_vec, interpretation

# MULTI-STRING VECTOR EXTRACTION UTILITY
def get_multi_string_vectors(strings, token_positions, layer, aggregation="mean"):
    """Extract and aggregate activation vectors from multiple strings.
    
    Args:
        strings: List of strings to extract activations from
        token_positions: List of token positions (one per string) OR single int for all strings
        layer: Layer index to extract from
        aggregation: How to combine vectors - "mean", "sum", or "individual"
    
    Returns:
        If aggregation="individual": List of vectors (one per string)
        If aggregation="mean"/"sum": Single aggregated vector
    """
    if isinstance(token_positions, int):
        token_positions = [token_positions] * len(strings)
    
    if len(token_positions) != len(strings):
        raise ValueError(f"Number of token positions ({len(token_positions)}) must match number of strings ({len(strings)})")
    
    vectors = []
    for text, pos in zip(strings, token_positions):
        vector = get_vector(text, pos, layer)
        vectors.append(vector)
    
    if aggregation == "individual":
        return vectors
    elif aggregation == "mean":
        return torch.stack(vectors).mean(dim=0)
    elif aggregation == "sum":
        return torch.stack(vectors).sum(dim=0)
    else:
        raise ValueError(f"Unknown aggregation method: {aggregation}")

def multi_string_experiment(base_strings, base_positions, sub_strings, sub_positions, 
                           add_strings, add_positions, extract_layer, inject_layer, 
                           interp_prompt, description="", aggregation="mean", use_chat_template=False):
    """Run vector arithmetic experiment with multiple strings for each concept.
    
    Args:
        base_strings, sub_strings, add_strings: Lists of strings for each concept
        base_positions, sub_positions, add_positions: Token positions (list or single int)
        aggregation: How to combine vectors from multiple strings ("mean", "sum")
        Other args same as regular experiment()
    """
    print(f"🧮 {description}")
    print(f"   Extract: L{extract_layer} | Inject: L{inject_layer} | Aggregation: {aggregation}")
    print(f"   Base strings: {len(base_strings)} | Sub strings: {len(sub_strings)} | Add strings: {len(add_strings)}")
    if use_chat_template:
        print(f"   📝 Using chat template for interpretation")
    
    # Extract and aggregate vectors from multiple strings
    base_vec = get_multi_string_vectors(base_strings, base_positions, extract_layer, aggregation)
    sub_vec = get_multi_string_vectors(sub_strings, sub_positions, extract_layer, aggregation) 
    add_vec = get_multi_string_vectors(add_strings, add_positions, extract_layer, aggregation)
    
    result_vec = base_vec - sub_vec + add_vec
    interpretation = interpret_vector(result_vec, interp_prompt, inject_layer, use_chat_template=use_chat_template)
    
    print(f"   🤖 {interpretation}")
    return result_vec, interpretation

# VECTOR PROJECTION UTILITIES
import torch.nn.functional as F

def vector_projection(vector_a, vector_b):
    """Project vector_a onto vector_b.
    
    Args:
        vector_a: Vector to be projected
        vector_b: Vector to project onto
        
    Returns:
        tuple: (projected_vector, projection_magnitude, cosine_similarity)
    """
    # Flatten vectors to 1D if they're 2D
    if vector_a.dim() > 1:
        vector_a = vector_a.flatten()
    if vector_b.dim() > 1:
        vector_b = vector_b.flatten()
    
    # Normalize vector_b to get direction
    b_normalized = F.normalize(vector_b.unsqueeze(0), dim=1).squeeze(0)
    
    # Compute projection magnitude (dot product with normalized b)
    projection_magnitude = torch.dot(vector_a, b_normalized).item()
    
    # Compute projected vector
    projected_vector = projection_magnitude * b_normalized
    
    # Compute cosine similarity
    cosine_sim = F.cosine_similarity(vector_a.unsqueeze(0), vector_b.unsqueeze(0)).item()
    
    return projected_vector, projection_magnitude, cosine_sim

def project_concept_onto_direction(concept_text, concept_pos, direction_text, direction_pos, 
                                 layer, description=""):
    """Project one concept vector onto another concept's direction.
    
    Args:
        concept_text: Text containing concept to project
        concept_pos: Token position of concept
        direction_text: Text containing direction concept
        direction_pos: Token position of direction
        layer: Layer to extract from
        description: Description for output
        
    Returns:
        tuple: (projected_vector, magnitude, cosine_similarity)
    """
    concept_vec = get_vector(concept_text, concept_pos, layer)
    direction_vec = get_vector(direction_text, direction_pos, layer)
    
    projected_vec, magnitude, cosine_sim = vector_projection(concept_vec, direction_vec)
    
    print(f"📐 {description}")
    print(f"   Layer: {layer}")
    print(f"   Projection magnitude: {magnitude:.3f}")
    print(f"   Cosine similarity: {cosine_sim:.3f}")
    
    return projected_vec, magnitude, cosine_sim

def batch_projection_analysis(concept_texts, concept_positions, direction_text, direction_pos, 
                            layer, concept_names=None):
    """Analyze projections of multiple concepts onto a single direction.
    
    Args:
        concept_texts: List of texts containing concepts
        concept_positions: List of token positions for concepts
        direction_text: Text containing direction concept  
        direction_pos: Token position of direction
        layer: Layer to extract from
        concept_names: Optional names for concepts (for display)
        
    Returns:
        dict: Results mapping concept names to (magnitude, cosine_similarity)
    """
    if concept_names is None:
        concept_names = [f"Concept_{i+1}" for i in range(len(concept_texts))]
    
    direction_vec = get_vector(direction_text, direction_pos, layer)
    results = {}
    
    print(f"📊 Batch projection analysis onto direction vector (Layer {layer})")
    print(f"   Direction: '{direction_text}' (pos {direction_pos})")
    print("   Results:")
    
    for i, (text, pos, name) in enumerate(zip(concept_texts, concept_positions, concept_names)):
        concept_vec = get_vector(text, pos, layer)
        _, magnitude, cosine_sim = vector_projection(concept_vec, direction_vec)
        results[name] = (magnitude, cosine_sim)
        print(f"     {name:15s}: mag={magnitude:6.3f}, cos_sim={cosine_sim:6.3f}")
    
    return results

torch.set_grad_enabled(False)

print("\n🚀 Ready for experiments!")
print(f"💡 Chat template available: {selfie._should_use_chat_template()}")

---
## 🔬 EXPERIMENT 1: King - Man + Woman = ?
**The classic vector arithmetic example**

In [None]:
# EXPERIMENT 1 CONFIG
EXTRACT_LAYER = 12
INJECT_LAYER = 3

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)
# Custom: InterpretationPrompt(selfie.model.tokenizer, ["This refers to ", None])

# CONCEPT STRINGS
base_text = "The king ruled the kingdom wisely"
sub_text = "The man walked down the street"
add_text = "The woman read an interesting book"


In [None]:
print("📝 Tokenizing concept strings:")
print("\nBASE (king):")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT (man):")
sub_tokens = show_tokens(sub_text) 
print("\nADD (woman):")
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS (modify these after seeing tokenization above)
base_pos = 2  # "king" 
sub_pos = 2   # "man"
add_pos = 2   # "woman"

In [None]:
# RUN EXPERIMENT (with chat template option)
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt, 
                   "King - Man + Woman (Expected: Queen-like)", use_chat_template=True)

# Without chat template (original behavior):
# result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
#                    EXTRACT_LAYER, INJECT_LAYER, interp_prompt, 
#                    "King - Man + Woman (Expected: Queen-like)", use_chat_template=False)

---
## 🔬 EXPERIMENT 2: Doctor - Man + Woman = ?
**Professional role gender swap**

In [None]:
# EXPERIMENT 2 CONFIG  
EXTRACT_LAYER = 15
INJECT_LAYER = 5

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This person is a ", None])

# CONCEPT STRINGS
base_text = "The doctor examined the patient carefully"
sub_text = "The man walked down the street"
add_text = "The woman read an interesting book"

print("📝 Tokenizing concept strings:")
print("\nBASE (doctor):")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT (man):")
sub_tokens = show_tokens(sub_text)
print("\nADD (woman):") 
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 1  # "doctor"
sub_pos = 1   # "man" 
add_pos = 1   # "woman"

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Doctor - Man + Woman (Expected: Nurse-like)")

---
## 🔬 EXPERIMENT 3: Custom Experiment
**Your own vector arithmetic**

In [None]:
# EXPERIMENT 3 CONFIG
EXTRACT_LAYER = 10 
INJECT_LAYER = 2

# INTERPRETATION PROMPT (customize as needed)
# interp_prompt = InterpretationPrompt.create_concept_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_sentiment_prompt(selfie.model.tokenizer)
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This represents ", None, " in society"])

# CONCEPT STRINGS (modify as needed)
base_text = "The teacher explained the concept clearly"
sub_text = "The woman read an interesting book" 
add_text = "The man walked down the street"

print("📝 Tokenizing concept strings:")
print("\nBASE:")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT:")
sub_tokens = show_tokens(sub_text)
print("\nADD:")
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 1  # Adjust based on tokenization above
sub_pos = 1   # Adjust based on tokenization above
add_pos = 1   # Adjust based on tokenization above

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Custom: Teacher - Woman + Man")

---
## 🔬 EXPERIMENT 4: Multi-Layer Aggregation
**Advanced: Sum vectors from multiple layers**

In [None]:
# EXPERIMENT 4 CONFIG
EXTRACT_LAYERS = [3, 5, 13, 24,]  # Multiple layers to extract from
INJECT_LAYER = 3
AGGREGATION = "sum"  # "sum", "mean", or "concat"

# INTERPRETATION PROMPT
# interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)

interp_prompt = InterpretationPrompt(selfie.model.tokenizer, [" ", None, " in society"])

# CONCEPT STRINGS
base_text = "The king sat on his throne"
sub_text = "The man sat on his throne"
add_text = "The woman sat on her throne"

print("📝 Tokenizing concept strings:")
print("\nBASE (scientist):")
show_tokens(base_text)
print("\nSUBTRACT (man):")
show_tokens(sub_text)
print("\nADD (woman):")
show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 2  # "scientist"
sub_pos = 2   # "man"
add_pos = 2   # "woman"

# MULTI-LAYER EXTRACTION FUNCTION
def get_multi_layer_vector(text, token_pos, layers, agg_method="sum"):
    acts = selfie.get_activations(text, layer_indices=layers, token_indices=[token_pos])
    vectors = [acts[layer][0] for layer in layers]
    
    if agg_method == "sum":
        return torch.stack(vectors).sum(dim=0)
    elif agg_method == "mean":
        return torch.stack(vectors).mean(dim=0)
    elif agg_method == "concat":
        return torch.cat(vectors, dim=0)

In [None]:




# RUN MULTI-LAYER EXPERIMENT
# print(f"\n🧮 Multi-layer: Scientist - Man + Woman")
print(f"   Extract: L{EXTRACT_LAYERS} ({AGGREGATION}) | Inject: L{INJECT_LAYER}")

base_vec = get_multi_layer_vector(base_text, base_pos, EXTRACT_LAYERS, AGGREGATION)
sub_vec = get_multi_layer_vector(sub_text, sub_pos, EXTRACT_LAYERS, AGGREGATION)
add_vec = get_multi_layer_vector(add_text, add_pos, EXTRACT_LAYERS, AGGREGATION)

result_vec = base_vec - sub_vec + add_vec
interpretation = interpret_vector(result_vec, interp_prompt, INJECT_LAYER)

print(f"   🤖 {interpretation}")

---
## 🔬 EXPERIMENT 5: Emotion/Abstract Concepts
**Testing abstract concept arithmetic**

In [None]:
# EXPERIMENT 5 CONFIG
EXTRACT_LAYER = 13
INJECT_LAYER = 3

# CUSTOM INTERPRETATION PROMPT FOR EMOTIONS/CONCEPTS
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This emotion or concept is ", None])

# CONCEPT STRINGS (abstract/emotional)
base_text = "She felt incredibly happy about the news"
sub_text = "He was extremely sad about the loss" 
add_text = "They became very angry at the situation"

print("📝 Tokenizing concept strings:")
print("\nBASE (happy):")
show_tokens(base_text)
print("\nSUBTRACT (sad):")
show_tokens(sub_text)
print("\nADD (angry):")
show_tokens(add_text)

# SELECT TOKEN POSITIONS (look for emotion words)
base_pos = 4  # "happy" (adjust based on tokenization)
sub_pos = 4   # "sad"
add_pos = 4   # "angry"

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Emotion: Happy - Sad + Angry")

---
## 🔬 EXPERIMENT 6: Multi-String Vector Arithmetic
**Using multiple strings per concept for more robust results**

In [2]:
# EXPERIMENT 6 CONFIG
EXTRACT_LAYER = 12
INJECT_LAYER = 3

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)

# MULTIPLE STRINGS FOR EACH CONCEPT (for more robust extraction)
base_strings = [
    "The king ruled the kingdom wisely",
    "A king sits on his royal throne", 
    "The mighty king commanded his army",
    "Every king must protect his subjects"
]

sub_strings = [
    "The man walked down the street",
    "A man stood in the doorway",
    "The tall man carried a briefcase", 
    "Every man has his own story"
]

add_strings = [
    "The woman read an interesting book",
    "A woman smiled at the children",
    "The young woman worked at her desk",
    "Every woman deserves equal respect"
]

print("📝 Tokenizing multiple strings:")
print("\\nBASE STRINGS (king concept):")
base_positions = []
for i, text in enumerate(base_strings):
    print(f"  String {i+1}: {text}")
    tokens = show_tokens(text)
    base_positions.append(2)  # "king" position in most strings

print("\\nSUB STRINGS (man concept):")
sub_positions = []
for i, text in enumerate(sub_strings):
    print(f"  String {i+1}: {text}")
    tokens = show_tokens(text)
    sub_positions.append(2)  # "man" position in most strings

print("\\nADD STRINGS (woman concept):")  
add_positions = []
for i, text in enumerate(add_strings):
    print(f"  String {i+1}: {text}")
    tokens = show_tokens(text)
    add_positions.append(2)  # "woman" position in most strings

📝 Tokenizing multiple strings:
\nBASE STRINGS (king concept):
  String 1: The king ruled the kingdom wisely
   0: '<bos>'
   1: 'The'
   2: 'king'
   3: 'ruled'
   4: 'the'
   5: 'kingdom'
   6: 'wisely'
  String 2: A king sits on his royal throne
   0: '<bos>'
   1: 'A'
   2: 'king'
   3: 'sits'
   4: 'on'
   5: 'his'
   6: 'royal'
   7: 'throne'
  String 3: The mighty king commanded his army
   0: '<bos>'
   1: 'The'
   2: 'mighty'
   3: 'king'
   4: 'commanded'
   5: 'his'
   6: 'army'
  String 4: Every king must protect his subjects
   0: '<bos>'
   1: 'Every'
   2: 'king'
   3: 'must'
   4: 'protect'
   5: 'his'
   6: 'subjects'
\nSUB STRINGS (man concept):
  String 1: The man walked down the street
   0: '<bos>'
   1: 'The'
   2: 'man'
   3: 'walked'
   4: 'down'
   5: 'the'
   6: 'street'
  String 2: A man stood in the doorway
   0: '<bos>'
   1: 'A'
   2: 'man'
   3: 'stood'
   4: 'in'
   5: 'the'
   6: 'doorway'
  String 3: The tall man carried a briefcase
   0: '<bos>'
   1: 

In [3]:
# RUN MULTI-STRING EXPERIMENTS

print("🔬 Comparing single-string vs multi-string approaches:")

# Single string experiment (using first string from each list)
print("\n--- SINGLE STRING APPROACH ---")
single_result = experiment(base_strings[0], base_positions[0], sub_strings[0], sub_positions[0], 
                          add_strings[0], add_positions[0], EXTRACT_LAYER, INJECT_LAYER, 
                          interp_prompt, "King - Man + Woman (Single)", use_chat_template=True)

# Multi-string with mean aggregation
print("\n--- MULTI-STRING (MEAN) APPROACH ---")
multi_mean_result = multi_string_experiment(base_strings, base_positions, sub_strings, sub_positions,
                                           add_strings, add_positions, EXTRACT_LAYER, INJECT_LAYER,
                                           interp_prompt, "King - Man + Woman (Multi-Mean)", 
                                           aggregation="mean", use_chat_template=True)

# Multi-string with sum aggregation  
print("\n--- MULTI-STRING (SUM) APPROACH ---")
multi_sum_result = multi_string_experiment(base_strings, base_positions, sub_strings, sub_positions,
                                          add_strings, add_positions, EXTRACT_LAYER, INJECT_LAYER, 
                                          interp_prompt, "King - Man + Woman (Multi-Sum)",
                                          aggregation="sum", use_chat_template=True)

print("\n📊 COMPARISON SUMMARY:")
print(f"   Single string: '{single_result[1][:60]}...'")
print(f"   Multi (mean):  '{multi_mean_result[1][:60]}...'") 
print(f"   Multi (sum):   '{multi_sum_result[1][:60]}...'")

print("\n💡 Multi-string extraction can provide more robust concept representations!")

🔬 Comparing single-string vs multi-string approaches:

--- SINGLE STRING APPROACH ---
🧮 King - Man + Woman (Single)
   Extract: L12 | Inject: L3
   📝 Using chat template for interpretation


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.
100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


   🤖 of England.
What is the answer?
The answer is **Elizabeth**.
The phrase is "This refers to the entity Elizabeth, the queen

--- MULTI-STRING (MEAN) APPROACH ---
🧮 King - Man + Woman (Multi-Mean)
   Extract: L12 | Inject: L3 | Aggregation: mean
   Base strings: 4 | Sub strings: 4 | Add strings: 4
   📝 Using chat template for interpretation


100%|██████████| 1/1 [00:03<00:00,  3.21s/it]


   🤖 of the realm of imagination.
The answer is: **Queen**

The full phrase is: "This refers to the entity queen of the realm

--- MULTI-STRING (SUM) APPROACH ---
🧮 King - Man + Woman (Multi-Sum)
   Extract: L12 | Inject: L3 | Aggregation: sum
   Base strings: 4 | Sub strings: 4 | Add strings: 4
   📝 Using chat template for interpretation


100%|██████████| 1/1 [00:03<00:00,  3.18s/it]

   🤖 and her crown, a symbol of her power, was stolen by a sorcerer.

What is the answer?

The answer is **Queen**.

📊 COMPARISON SUMMARY:
   Single string: 'of England.
What is the answer?
The answer is **Elizabeth**....'
   Multi (mean):  'of the realm of imagination.
The answer is: **Queen**

The f...'
   Multi (sum):   'and her crown, a symbol of her power, was stolen by a sorcer...'

💡 Multi-string extraction can provide more robust concept representations!





---
## 🎛️ Quick Experiment Template
**Copy this cell and modify for rapid experimentation**

In [None]:
# QUICK TEMPLATE - COPY AND MODIFY
EXTRACT_LAYER = 12
INJECT_LAYER = 3

# Choose interpretation style:
# interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_concept_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_sentiment_prompt(selfie.model.tokenizer)
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["Custom: ", None, " here"])

# Your strings here:
base_text = "Your base concept sentence"
sub_text = "Your subtract concept sentence"
add_text = "Your add concept sentence"

# Tokenize first, then set positions:
print("BASE:"); show_tokens(base_text)
print("SUB:"); show_tokens(sub_text) 
print("ADD:"); show_tokens(add_text)

# Set positions based on tokenization above:
base_pos = 1
sub_pos = 1
add_pos = 1

# Run:
# experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
#           EXTRACT_LAYER, INJECT_LAYER, interp_prompt, "Your description")

---
## 🆚 CHAT TEMPLATE COMPARISON
**Demonstrating the difference between using chat templates vs raw text**

In [None]:
# COMPARISON EXPERIMENT CONFIG
EXTRACT_LAYER = 11
INJECT_LAYER = 3

# CONCEPT STRINGS
base_text = "The teacher explained the concept clearly"
sub_text = "The woman read an interesting book"
add_text = "The man walked down the street"

# TOKEN POSITIONS
base_pos = 3  # "teacher"
sub_pos = 3   # "woman" 
add_pos = 3   # "man"

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)

print("🔬 Running the same experiment with and without chat template:")
print("\n--- WITHOUT CHAT TEMPLATE ---")
result_no_template = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                               EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                               "Teacher - Woman + Man", use_chat_template=False)

print("\n--- WITH CHAT TEMPLATE ---")
result_with_template = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                                 EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                                 "Teacher - Woman + Man", use_chat_template=True)

print(f"\n📊 RESULTS COMPARISON:")
print(f"   Without template: '{result_no_template[1]}'")
print(f"   With template:    '{result_with_template[1]}'")

# if selfie._should_use_chat_template():
#     print(f"\n💡 This model ({MODEL_NAME}) supports chat templates - using them may improve results!")
# else:
#     print(f"\n💡 This model ({MODEL_NAME}) doesn't have a chat template - both methods should behave similarly.")

---
## 🔧 Utilities
**Helper functions for advanced use**

In [None]:
# BATCH LAYER COMPARISON
def compare_layers(text, token_pos, layers):
    """Compare same concept across multiple layers."""
    print(f"🔍 Layer comparison for token {token_pos} in: '{text}'")
    
    vectors = {}
    for layer in layers:
        vectors[layer] = get_vector(text, token_pos, layer)
    
    # Compute cosine similarities
    import torch.nn.functional as F
    for i, l1 in enumerate(layers):
        for l2 in layers[i+1:]:
            sim = F.cosine_similarity(vectors[l1].unsqueeze(0), vectors[l2].unsqueeze(0)).item()
            print(f"   L{l1} ↔ L{l2}: {sim:.3f}")

# SAVE EXPERIMENT RESULTS
def save_experiment(name, base_text, sub_text, add_text, base_pos, sub_pos, add_pos,
                   extract_layer, inject_layer, interpretation, filename="results.txt"):
    with open(filename, "a") as f:
        f.write(f"\n=== {name} ===\n")
        f.write(f"Base: '{base_text}' pos {base_pos}\n")
        f.write(f"Sub: '{sub_text}' pos {sub_pos}\n") 
        f.write(f"Add: '{add_text}' pos {add_pos}\n")
        f.write(f"Layers: {extract_layer} → {inject_layer}\n")
        f.write(f"Result: {interpretation}\n")
    print(f"💾 Saved to {filename}")



---
## 📐 EXPERIMENT 7: Vector Projections
**Project one concept onto another to measure directional alignment**

In [2]:
# EXPERIMENT 7A: Gender Direction Projection
EXTRACT_LAYER = 12

# Define gender direction using man -> woman vector
man_text = "The man walked down the street"
woman_text = "The woman walked down the street" 

print("📝 Tokenizing gender direction texts:")
print("\nMAN:")
show_tokens(man_text)
print("\nWOMAN:")
show_tokens(woman_text)

man_pos = 2    # "man"
woman_pos = 2  # "woman"

# Calculate gender direction vector (woman - man)
man_vec = get_vector(man_text, man_pos, EXTRACT_LAYER)
woman_vec = get_vector(woman_text, woman_pos, EXTRACT_LAYER)
gender_direction = woman_vec - man_vec

print(f"\n🧭 Gender direction vector computed (woman - man)")

# Test concepts to project onto gender direction
test_concepts = [
    ("The king ruled wisely", 2, "king"),
    ("The queen ruled wisely", 2, "queen"), 
    ("The doctor examined carefully", 2, "doctor"),
    ("The nurse helped patients", 2, "nurse"),
    ("The teacher explained clearly", 2, "teacher"),
    ("The engineer built bridges", 2, "engineer")
]

print(f"\n📐 Projecting concepts onto gender direction (Layer {EXTRACT_LAYER}):")
results = {}

for text, pos, name in test_concepts:
    concept_vec = get_vector(text, pos, EXTRACT_LAYER)
    _, magnitude, cosine_sim = vector_projection(concept_vec, gender_direction)
    results[name] = (magnitude, cosine_sim)
    print(f"   {name:12s}: projection={magnitude:6.3f}, similarity={cosine_sim:6.3f}")

print(f"\n💡 Higher projection values indicate stronger alignment with 'feminine' direction")

📝 Tokenizing gender direction texts:

MAN:
   0: '<bos>'
   1: 'The'
   2: 'man'
   3: 'walked'
   4: 'down'
   5: 'the'
   6: 'street'

WOMAN:
   0: '<bos>'
   1: 'The'
   2: 'woman'
   3: 'walked'
   4: 'down'
   5: 'the'
   6: 'street'


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.



🧭 Gender direction vector computed (woman - man)

📐 Projecting concepts onto gender direction (Layer 12):


RuntimeError: 1D tensors expected, but got 2D and 2D tensors

In [None]:
# EXPERIMENT 7B: Emotion Direction Projection  
EXTRACT_LAYER = 13

# Define emotional directions
happy_text = "She felt incredibly happy about the news"
sad_text = "She felt incredibly sad about the news"
angry_text = "She felt incredibly angry about the news"

print("📝 Tokenizing emotion texts:")
print("\nHAPPY:"); show_tokens(happy_text)
print("\nSAD:"); show_tokens(sad_text)  
print("\nANGRY:"); show_tokens(angry_text)

happy_pos = 4  # "happy"
sad_pos = 4    # "sad"
angry_pos = 4  # "angry"

# Calculate emotion direction vectors
happy_vec = get_vector(happy_text, happy_pos, EXTRACT_LAYER)
sad_vec = get_vector(sad_text, sad_pos, EXTRACT_LAYER)
angry_vec = get_vector(angry_text, angry_pos, EXTRACT_LAYER)

# Create positive/negative emotion direction (happy - sad)
emotion_direction = happy_vec - sad_vec

print(f"\n🧭 Emotion direction vector computed (happy - sad)")

# Test various emotional concepts
emotion_concepts = [
    ("He was delighted with the result", 2, "delighted"),
    ("She was devastated by the news", 2, "devastated"),
    ("They were furious about the decision", 2, "furious"),
    ("He felt content with his life", 2, "content"),
    ("She was anxious about the test", 2, "anxious"),
    ("They were excited for the party", 3, "excited")
]

print(f"\n📐 Projecting emotions onto happy-sad direction (Layer {EXTRACT_LAYER}):")
for text, pos, name in emotion_concepts:
    concept_vec = get_vector(text, pos, EXTRACT_LAYER)
    _, magnitude, cosine_sim = vector_projection(concept_vec, emotion_direction)
    print(f"   {name:12s}: projection={magnitude:6.3f}, similarity={cosine_sim:6.3f}")

print(f"\n💡 Positive projections align with 'happy', negative with 'sad'")

In [None]:
# EXPERIMENT 7C: Batch Professional Projection Analysis
EXTRACT_LAYER = 14

# Define multiple professional concepts to analyze
professional_texts = [
    "The doctor examined the patient",
    "The lawyer argued the case", 
    "The teacher explained the lesson",
    "The engineer designed the bridge",
    "The artist painted a masterpiece",
    "The scientist conducted experiments",
    "The chef prepared the meal",
    "The pilot flew the airplane"
]

professional_names = ["doctor", "lawyer", "teacher", "engineer", "artist", "scientist", "chef", "pilot"]
professional_positions = [2, 2, 2, 2, 2, 2, 2, 2]  # All professionals at position 2

# Direction vector: authority/expertise
authority_text = "The expert gave authoritative advice"
authority_pos = 2  # "expert"

print("📊 BATCH PROJECTION ANALYSIS")
print("   Projecting professional concepts onto 'authority/expertise' direction")

# Use the batch projection utility
results = batch_projection_analysis(
    professional_texts, 
    professional_positions,
    authority_text,
    authority_pos,
    EXTRACT_LAYER,
    professional_names
)

# Sort results by projection magnitude
sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)

print(f"\n🏆 RANKING BY AUTHORITY PROJECTION:")
for i, (name, (magnitude, cosine_sim)) in enumerate(sorted_results, 1):
    print(f"   {i}. {name:12s}: {magnitude:6.3f} (cos_sim: {cosine_sim:6.3f})")

---
## 🎯 Custom Projection Template
**Template for your own vector projection experiments**

In [None]:
# CUSTOM PROJECTION TEMPLATE - COPY AND MODIFY
EXTRACT_LAYER = 12

# OPTION 1: Simple projection of one concept onto another
concept_text = "Your concept sentence here"
direction_text = "Your direction sentence here"

print("CONCEPT:"); show_tokens(concept_text)
print("DIRECTION:"); show_tokens(direction_text)

concept_pos = 1    # Adjust based on tokenization
direction_pos = 1  # Adjust based on tokenization

# Run projection
# projected_vec, magnitude, cosine_sim = project_concept_onto_direction(
#     concept_text, concept_pos, direction_text, direction_pos, 
#     EXTRACT_LAYER, "Your Description")

# OPTION 2: Batch projection analysis
concept_texts = [
    "First concept sentence",
    "Second concept sentence", 
    "Third concept sentence"
]
concept_names = ["concept1", "concept2", "concept3"]
concept_positions = [1, 1, 1]  # Adjust based on tokenization

direction_text = "Your direction sentence"
direction_pos = 1

# Run batch analysis
# results = batch_projection_analysis(
#     concept_texts, concept_positions, direction_text, direction_pos,
#     EXTRACT_LAYER, concept_names)

print("💡 Projection Template Ready - Uncomment and modify the sections above!")