# Triplet Attention: Systematic Evaluation

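This notebook evaluates the hypothesis that 'Observer Mode' (linkage=0.0) performs better than 'Participant Mode' (linkage=1.0). Each prompt is generated once in each mode, and the two responses are compared on self-reference and directness across four prompt categories: factual, creative, advice, and reasoning.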

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../'))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.models.model_injection import create_triplet_model
from src.evaluation.metrics import compare_modes

## 1. Setup Test Prompts

In [None]:
# Test prompts grouped by category: five prompts each for the factual,
# creative, advice, and reasoning categories. Keyword order fixes the
# iteration order of the categories.
prompts = dict(
    factual=[
        "Explain photosynthesis",
        "What caused the French Revolution?",
        "How do black holes form?",
        "Summarize the laws of thermodynamics",
        "What is the gold standard in economics?",
    ],
    creative=[
        "Write a short story about loneliness in a crowded city",
        "Compose a haiku about technology and nature",
        "Imagine a conversation between a rock and a river",
        "Describe a color that doesn't exist",
        "Write a poem about the concept of time",
    ],
    advice=[
        "I have imposter syndrome at work, what should I do?",
        "How can I improve my focus while studying?",
        "Should I quit my job to follow my passion?",
        "How do I handle a difficult conversation with a friend?",
        "What is the best way to start a new habit?",
    ],
    reasoning=[
        "If 5 machines take 5 minutes to make 5 widgets, how long would it take 100 machines to make 100 widgets?",
        "All men are mortal. Socrates is a man. What can we conclude?",
        "Explain the trolley problem and its various ethical interpretations",
        "Solve this: I am tall when I am young, and I am short when I am old. What am I?",
        "Compare and contrast deontology and utilitarianism",
    ],
)

## 2. Initialize Model

In [None]:
# Unpack the three objects returned by create_triplet_model(). The evaluation
# loop in this notebook drives all generation through `controller`.
# NOTE(review): called with no arguments, so which model/checkpoint is loaded
# is decided inside create_triplet_model — confirm it matches the intended setup.
model, tokenizer, controller = create_triplet_model()

## 3. Run Evaluation

In [None]:
# Accumulate one flat record per prompt, then build the DataFrame in one go.
results = []

for category, category_prompts in prompts.items():
    for prompt in category_prompts:
        print(f"Evaluating [{category}]: {prompt[:50]}...")

        # Continuity is reset before every generation — presumably so that one
        # mode's run cannot influence the other (confirm against the controller).

        # Participant pass: linkage=1.0
        controller.reset_all_continuity()
        resp_linked, _ = controller.generate_with_linkage(prompt, linkage_mode="full")

        # Observer pass: linkage=0.0
        controller.reset_all_continuity()
        resp_observer, _ = controller.generate_with_linkage(prompt, linkage_mode="observer")

        # Score both responses against the same prompt.
        comparison = compare_modes(prompt, resp_linked, resp_observer)

        # Flatten into a single row: identifying fields first, then the
        # per-mode metrics with a mode prefix, then the two delta metrics.
        record = {
            "category": category,
            "prompt": prompt,
            "linked_content": resp_linked,
            "observer_content": resp_observer,
        }
        for metric_name, metric_value in comparison['linked'].items():
            record[f"linked_{metric_name}"] = metric_value
        for metric_name, metric_value in comparison['observer'].items():
            record[f"observer_{metric_name}"] = metric_value
        record["delta_self_ref"] = comparison['delta_self_ref']
        record["delta_directness"] = comparison['delta_directness']

        results.append(record)

df = pd.DataFrame(results)

## 4. Analysis and Visualization

In [None]:
# Two side-by-side bar charts, one per delta metric, with prompt category on
# the x-axis. Axes are created up front and handed to seaborn explicitly.
fig, (ax_self_ref, ax_directness) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(data=df, x='category', y='delta_self_ref', ax=ax_self_ref)
ax_self_ref.set_title("Reduction in Self-Reference (Higher = Better Observer Mode)")

sns.barplot(data=df, x='category', y='delta_directness', ax=ax_directness)
ax_directness.set_title("Increase in Directness (Higher = Better Observer Mode)")

fig.tight_layout()
plt.show()

In [None]:
# Three rows with the largest delta_directness values.
print("Top 3 Prompts with highest improvement in directness:")
top_directness = df.nlargest(3, 'delta_directness')
display(top_directness[['category', 'prompt', 'delta_directness']])

# Per-category mean of each delta metric.
print("\nSummary Statistics by Category:")
delta_columns = ['delta_self_ref', 'delta_directness']
display(df.groupby('category')[delta_columns].mean())