# Vector Arithmetic Lab 🧮
**Compact notebook for vector arithmetic experiments. Each experiment is self-contained with its own config.**

In [None]:
# SETUP
import sys, warnings, torch, numpy as np
from typing import List, Dict, Any, Tuple
sys.path.insert(0, '..')
warnings.filterwarnings('ignore', category=FutureWarning)

from nnsight_selfie import ModelAgnosticSelfie, InterpretationPrompt, get_optimal_device

# LOAD MODEL
MODEL_NAME = "google/gemma-3-4b-it"  # Change as needed
selfie = ModelAgnosticSelfie(MODEL_NAME, dtype=torch.bfloat16, load_in_8bit=False)
print(f"✅ {MODEL_NAME} loaded on {selfie.device} ({len(selfie.layer_paths)} layers)")

# HELPER FUNCTIONS
def show_tokens(text):
    tokens = selfie.model.tokenizer.encode(text)
    for i, token_id in enumerate(tokens):
        token_str = selfie.model.tokenizer.decode([token_id])
        print(f"  {i:2d}: '{token_str.strip()}'")
    return tokens

def get_vector(text, token_pos, layer):
    """Get activation vector for a specific token position and layer.
    
    Args:
        text: Input text (ALWAYS uses raw text - no chat template for activation capture!)
        token_pos: Token position to extract from
        layer: Layer index to extract from
    """
    acts = selfie.get_activations(text, layer_indices=[layer], token_indices=[token_pos])
    return acts[layer][0]

def interpret_vector(vector, prompt, injection_layer=3, max_tokens=30, use_chat_template=False):
    """Interpret a vector using the selfie interpretation system.
    
    Args:
        vector: Activation vector to interpret
        prompt: InterpretationPrompt object
        injection_layer: Layer to inject the vector at
        max_tokens: Maximum tokens to generate
        use_chat_template: Whether to apply chat template formatting to INTERPRETATION (not capture)
    """
    return selfie.interpret_vectors([vector], prompt, injection_layer, max_new_tokens=max_tokens, 
                                  use_chat_template=use_chat_template)[0].strip()

def experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos, 
               extract_layer, inject_layer, interp_prompt, description="", use_chat_template=False):
    """Run a vector arithmetic experiment.
    
    Args:
        base_text, sub_text, add_text: Raw text for activation capture (no chat template applied)
        base_pos, sub_pos, add_pos: Token positions in the RAW text
        use_chat_template: Whether to use chat template for INTERPRETATION only
    """
    print(f"🧮 {description}")
    print(f"   Extract: L{extract_layer} | Inject: L{inject_layer}")
    if use_chat_template:
        print(f"   📝 Using chat template for interpretation")
    
    # Always capture from raw text (no chat template) - token positions stay consistent
    base_vec = get_vector(base_text, base_pos, extract_layer)
    sub_vec = get_vector(sub_text, sub_pos, extract_layer) 
    add_vec = get_vector(add_text, add_pos, extract_layer)
    
    result_vec = base_vec - sub_vec + add_vec
    # Apply chat template only to interpretation
    interpretation = interpret_vector(result_vec, interp_prompt, inject_layer, use_chat_template=use_chat_template)
    
    print(f"   🤖 {interpretation}")
    return result_vec, interpretation

print("\n🚀 Ready for experiments!")
print(f"💡 Chat template available: {selfie._should_use_chat_template()}")
print("   Chat template will be applied to INTERPRETATION PROMPTS only (not activation capture)")
print("   This ensures token positions remain consistent for activation extraction")

---
## 🔬 EXPERIMENT 1: King - Man + Woman = ?
**The classic vector arithmetic example**

In [None]:
# EXPERIMENT 1 CONFIG
EXTRACT_LAYER = 12
INJECT_LAYER = 3

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)
# Custom: InterpretationPrompt(selfie.model.tokenizer, ["This refers to ", None])

# CONCEPT STRINGS
base_text = "The king ruled the kingdom wisely"
sub_text = "The man walked down the street"
add_text = "The woman read an interesting book"


In [None]:
print("📝 Tokenizing concept strings:")
print("\nBASE (king):")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT (man):")
sub_tokens = show_tokens(sub_text) 
print("\nADD (woman):")
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS (modify these after seeing tokenization above)
base_pos = 2  # "king" 
sub_pos = 2   # "man"
add_pos = 2   # "woman"

In [None]:
# RUN EXPERIMENT (with chat template option)
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt, 
                   "King - Man + Woman (Expected: Queen-like)", use_chat_template=True)

# Without chat template (original behavior):
# result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
#                    EXTRACT_LAYER, INJECT_LAYER, interp_prompt, 
#                    "King - Man + Woman (Expected: Queen-like)", use_chat_template=False)

---
## 🔬 EXPERIMENT 2: Doctor - Man + Woman = ?
**Professional role gender swap**

In [None]:
# EXPERIMENT 2 CONFIG  
EXTRACT_LAYER = 15
INJECT_LAYER = 5

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This person is a ", None])

# CONCEPT STRINGS
base_text = "The doctor examined the patient carefully"
sub_text = "The man walked down the street"
add_text = "The woman read an interesting book"

print("📝 Tokenizing concept strings:")
print("\nBASE (doctor):")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT (man):")
sub_tokens = show_tokens(sub_text)
print("\nADD (woman):") 
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 1  # "doctor"
sub_pos = 1   # "man" 
add_pos = 1   # "woman"

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Doctor - Man + Woman (Expected: Nurse-like)")

---
## 🔬 EXPERIMENT 3: Custom Experiment
**Your own vector arithmetic**

In [None]:
# EXPERIMENT 3 CONFIG
EXTRACT_LAYER = 10 
INJECT_LAYER = 2

# INTERPRETATION PROMPT (customize as needed)
# interp_prompt = InterpretationPrompt.create_concept_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_sentiment_prompt(selfie.model.tokenizer)
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This represents ", None, " in society"])

# CONCEPT STRINGS (modify as needed)
base_text = "The teacher explained the concept clearly"
sub_text = "The woman read an interesting book" 
add_text = "The man walked down the street"

print("📝 Tokenizing concept strings:")
print("\nBASE:")
base_tokens = show_tokens(base_text)
print("\nSUBTRACT:")
sub_tokens = show_tokens(sub_text)
print("\nADD:")
add_tokens = show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 1  # Adjust based on tokenization above
sub_pos = 1   # Adjust based on tokenization above
add_pos = 1   # Adjust based on tokenization above

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Custom: Teacher - Woman + Man")

---
## 🔬 EXPERIMENT 4: Multi-Layer Aggregation
**Advanced: Sum vectors from multiple layers**

In [None]:
# EXPERIMENT 4 CONFIG
EXTRACT_LAYERS = [3, 5, 13, 24,]  # Multiple layers to extract from
INJECT_LAYER = 3
AGGREGATION = "sum"  # "sum", "mean", or "concat"

# INTERPRETATION PROMPT
# interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)

interp_prompt = InterpretationPrompt(selfie.model.tokenizer, [" ", None, " in society"])

# CONCEPT STRINGS
base_text = "The king sat on his throne"
sub_text = "The man sat on his throne"
add_text = "The woman sat on her throne"

print("📝 Tokenizing concept strings:")
print("\nBASE (scientist):")
show_tokens(base_text)
print("\nSUBTRACT (man):")
show_tokens(sub_text)
print("\nADD (woman):")
show_tokens(add_text)

# SELECT TOKEN POSITIONS
base_pos = 2  # "scientist"
sub_pos = 2   # "man"
add_pos = 2   # "woman"

📝 Tokenizing concept strings:

BASE (scientist):
   0: '<bos>'
   1: 'The'
   2: 'king'
   3: 'sat'
   4: 'on'
   5: 'his'
   6: 'throne'

SUBTRACT (man):
   0: '<bos>'
   1: 'The'
   2: 'man'
   3: 'sat'
   4: 'on'
   5: 'his'
   6: 'throne'

ADD (woman):
   0: '<bos>'
   1: 'The'
   2: 'woman'
   3: 'sat'
   4: 'on'
   5: 'her'
   6: 'throne'


In [5]:


# MULTI-LAYER EXTRACTION FUNCTION
def get_multi_layer_vector(text, token_pos, layers, agg_method="sum"):
    acts = selfie.get_activations(text, layer_indices=layers, token_indices=[token_pos])
    vectors = [acts[layer][0] for layer in layers]
    
    if agg_method == "sum":
        return torch.stack(vectors).sum(dim=0)
    elif agg_method == "mean":
        return torch.stack(vectors).mean(dim=0)
    elif agg_method == "concat":
        return torch.cat(vectors, dim=0)

# RUN MULTI-LAYER EXPERIMENT
# print(f"\n🧮 Multi-layer: Scientist - Man + Woman")
print(f"   Extract: L{EXTRACT_LAYERS} ({AGGREGATION}) | Inject: L{INJECT_LAYER}")

base_vec = get_multi_layer_vector(base_text, base_pos, EXTRACT_LAYERS, AGGREGATION)
sub_vec = get_multi_layer_vector(sub_text, sub_pos, EXTRACT_LAYERS, AGGREGATION)
add_vec = get_multi_layer_vector(add_text, add_pos, EXTRACT_LAYERS, AGGREGATION)

result_vec = base_vec - sub_vec + add_vec
interpretation = interpret_vector(result_vec, interp_prompt, INJECT_LAYER)

print(f"   🤖 {interpretation}")

   Extract: L[3, 5, 13, 24] (sum) | Inject: L3


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]

   🤖 's best season in the league so far.</h1>

This is an incomplete sentence.  To make it a grammatically correct sentence, you need





---
## 🔬 EXPERIMENT 5: Emotion/Abstract Concepts
**Testing abstract concept arithmetic**

In [None]:
# EXPERIMENT 5 CONFIG
EXTRACT_LAYER = 13
INJECT_LAYER = 3

# CUSTOM INTERPRETATION PROMPT FOR EMOTIONS/CONCEPTS
interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["This emotion or concept is ", None])

# CONCEPT STRINGS (abstract/emotional)
base_text = "She felt incredibly happy about the news"
sub_text = "He was extremely sad about the loss" 
add_text = "They became very angry at the situation"

print("📝 Tokenizing concept strings:")
print("\nBASE (happy):")
show_tokens(base_text)
print("\nSUBTRACT (sad):")
show_tokens(sub_text)
print("\nADD (angry):")
show_tokens(add_text)

# SELECT TOKEN POSITIONS (look for emotion words)
base_pos = 4  # "happy" (adjust based on tokenization)
sub_pos = 4   # "sad"
add_pos = 4   # "angry"

# RUN EXPERIMENT
result = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                   EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                   "Emotion: Happy - Sad + Angry")

---
## 🎛️ Quick Experiment Template
**Copy this cell and modify for rapid experimentation**

In [None]:
# QUICK TEMPLATE - COPY AND MODIFY
EXTRACT_LAYER = 12
INJECT_LAYER = 3

# Choose interpretation style:
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_concept_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt.create_sentiment_prompt(selfie.model.tokenizer)
# interp_prompt = InterpretationPrompt(selfie.model.tokenizer, ["Custom: ", None, " here"])

# Your strings here:
base_text = "Your base concept sentence"
sub_text = "Your subtract concept sentence"
add_text = "Your add concept sentence"

# Tokenize first, then set positions:
print("BASE:"); show_tokens(base_text)
print("SUB:"); show_tokens(sub_text) 
print("ADD:"); show_tokens(add_text)

# Set positions based on tokenization above:
base_pos = 1
sub_pos = 1
add_pos = 1

# Run:
# experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
#           EXTRACT_LAYER, INJECT_LAYER, interp_prompt, "Your description")

---
## 🆚 CHAT TEMPLATE COMPARISON
**Demonstrating the difference between using chat templates vs raw text**

In [3]:
# COMPARISON EXPERIMENT CONFIG
EXTRACT_LAYER = 11
INJECT_LAYER = 3

# CONCEPT STRINGS
base_text = "The teacher explained the concept clearly"
sub_text = "The woman read an interesting book"
add_text = "The man walked down the street"

# TOKEN POSITIONS
base_pos = 3  # "teacher"
sub_pos = 3   # "woman" 
add_pos = 3   # "man"

# INTERPRETATION PROMPT
interp_prompt = InterpretationPrompt.create_entity_prompt(selfie.model.tokenizer)

print("🔬 Running the same experiment with and without chat template:")
print("\n--- WITHOUT CHAT TEMPLATE ---")
result_no_template = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                               EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                               "Teacher - Woman + Man", use_chat_template=False)

print("\n--- WITH CHAT TEMPLATE ---")
result_with_template = experiment(base_text, base_pos, sub_text, sub_pos, add_text, add_pos,
                                 EXTRACT_LAYER, INJECT_LAYER, interp_prompt,
                                 "Teacher - Woman + Man", use_chat_template=True)

print(f"\n📊 RESULTS COMPARISON:")
print(f"   Without template: '{result_no_template[1]}'")
print(f"   With template:    '{result_with_template[1]}'")

# if selfie._should_use_chat_template():
#     print(f"\n💡 This model ({MODEL_NAME}) supports chat templates - using them may improve results!")
# else:
#     print(f"\n💡 This model ({MODEL_NAME}) doesn't have a chat template - both methods should behave similarly.")

🔬 Running the same experiment with and without chat template:

--- WITHOUT CHAT TEMPLATE ---
🧮 Teacher - Woman + Man
   Extract: L11 | Inject: L3


100%|██████████| 1/1 [00:01<00:00,  1.66s/it]


   🤖 clearly, but slowly, and with a meticulous attention to detail.
He was _emphatic_ in his explanation.
Her words were _artic

--- WITH CHAT TEMPLATE ---
🧮 Teacher - Woman + Man
   Extract: L11 | Inject: L3
   📝 Using chat template formatting
   📝 Using chat template: <bos><start_of_turn>user
The teacher explained the concept clearly<end_of_turn>

   📝 Using chat template: <bos><start_of_turn>user
The woman read an interesting book<end_of_turn>

   📝 Using chat template: <bos><start_of_turn>user
The man walked down the street<end_of_turn>



100%|██████████| 1/1 [00:01<00:00,  1.70s/it]

   🤖 _ _ _
model
Please provide the full prompt! I need the rest of the text to complete the sentence. 😊 

You started with "This refers to..." and

📊 RESULTS COMPARISON:
   Without template: 'clearly, but slowly, and with a meticulous attention to detail.
He was _emphatic_ in his explanation.
Her words were _artic'
   With template:    '_ _ _
model
Please provide the full prompt! I need the rest of the text to complete the sentence. 😊 

You started with "This refers to..." and'





---
## 🔧 Utilities
**Helper functions for advanced use**

In [None]:
# BATCH LAYER COMPARISON
def compare_layers(text, token_pos, layers):
    """Compare same concept across multiple layers."""
    print(f"🔍 Layer comparison for token {token_pos} in: '{text}'")
    
    vectors = {}
    for layer in layers:
        vectors[layer] = get_vector(text, token_pos, layer)
    
    # Compute cosine similarities
    import torch.nn.functional as F
    for i, l1 in enumerate(layers):
        for l2 in layers[i+1:]:
            sim = F.cosine_similarity(vectors[l1].unsqueeze(0), vectors[l2].unsqueeze(0)).item()
            print(f"   L{l1} ↔ L{l2}: {sim:.3f}")

# SAVE EXPERIMENT RESULTS
def save_experiment(name, base_text, sub_text, add_text, base_pos, sub_pos, add_pos,
                   extract_layer, inject_layer, interpretation, filename="results.txt"):
    with open(filename, "a") as f:
        f.write(f"\n=== {name} ===\n")
        f.write(f"Base: '{base_text}' pos {base_pos}\n")
        f.write(f"Sub: '{sub_text}' pos {sub_pos}\n") 
        f.write(f"Add: '{add_text}' pos {add_pos}\n")
        f.write(f"Layers: {extract_layer} → {inject_layer}\n")
        f.write(f"Result: {interpretation}\n")
    print(f"💾 Saved to {filename}")

print("🔧 Utilities loaded")
print("\nExamples:")
print("compare_layers('The king ruled wisely', 1, [8, 12, 16])")
print("save_experiment('King-Man+Woman', ...)")