# Goal Matching Evaluation

This notebook evaluates whether the student's research goals align with the instructor's requirements.

In [None]:
import json
import os

# Locations of the run being evaluated and of the instructor's task prompt.
# NOTE(review): these are absolute, machine-specific paths; the cell will only
# run where this directory layout exists.
repo_path = '/home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00'
instructor_prompt_path = '/home/smallyan/critic_model_mechinterp/prompts/l3/circuit_prompt_sarcasm.txt'


def read_text(path):
    """Return the entire contents of the text file at `path`.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so that non-ASCII characters are read the same way on
    every machine.
    # NOTE(review): assumes the prompt and plan files are UTF-8 encoded — confirm.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


# Read files
instructor_prompt = read_text(instructor_prompt_path)
plan_v1 = read_text(os.path.join(repo_path, 'logs/plan_v1.md'))
plan_v2 = read_text(os.path.join(repo_path, 'logs/plan_v2.md'))

## 1. Instructor's Goal

In [None]:
# The instructor's goal statement, kept as a multi-line string that begins and
# ends with a newline. The pieces are concatenated explicitly so the trailing
# space before each internal line break stays visible.
instructor_goal = (
    "\n"
    "Identify a precise circuit—a subset of attention heads and MLPs—that reproduces the model's \n"
    "sarcasm recognition behavior as faithfully as possible while obeying strict residual \n"
    "write-budget constraints (≤ 11,200 dimensions).\n"
)

# Heading and goal text, one per line.
print("INSTRUCTOR'S GOAL:", instructor_goal, sep="\n")

## 2. Student's Goals

In [None]:
# Goal statement recorded for the student's first plan. Like the other goal
# strings in this notebook it begins and ends with a newline.
student_goal_v1 = (
    "\n"
    "Identify the precise circuit in GPT2-small that enables sarcasm recognition by detecting \n"
    "contradictions between literal sentiment and contextual tone.\n"
)

# Goal statement recorded for the second plan: the same sentence (with "a"
# instead of "the") followed by three explicit requirements.
student_goal_v2 = (
    "\n"
    "Identify a precise circuit in GPT2-small that enables sarcasm recognition by detecting \n"
    "contradictions between literal sentiment and contextual tone. The circuit must:\n"
    "- Reproduce sarcasm detection behavior with high fidelity\n"
    "- Remain within 11,200 dimension write budget\n"
    "- Contain interpretable, minimal components\n"
)

# Print each heading followed by its goal text; the second heading carries a
# leading newline to separate it from the first goal.
for heading, goal_text in (
    ("STUDENT'S GOAL (Plan V1):", student_goal_v1),
    ("\nSTUDENT'S GOAL (Plan V2 / Documentation):", student_goal_v2),
):
    print(heading)
    print(goal_text)

## 3. Goal Alignment Analysis

In [None]:
# One row per criterion: (key, instructor's wording, student's wording, verdict).
_alignment_rows = [
    (
        'core_task',
        'Identify precise circuit for sarcasm recognition',
        'Identify precise circuit for sarcasm recognition',
        'PERFECT',
    ),
    (
        'write_budget',
        '≤ 11,200 dimensions (strict constraint)',
        'Remain within 11,200 dimension write budget',
        'PERFECT',
    ),
    (
        'interpretability',
        'Faithfully reproduce behavior with interpretable components',
        'High fidelity, interpretable, minimal components',
        'PERFECT',
    ),
    (
        'mechanism_understanding',
        'Find mechanism for resolving contradictory affective cues',
        'Detect contradictions between literal sentiment and contextual tone',
        'EXCELLENT',
    ),
]

# Mapping from criterion key to its instructor/student/match record, in the
# order the rows are listed above.
alignment_analysis = {
    key: {'instructor': instructor_text, 'student': student_text, 'match': verdict}
    for key, instructor_text, student_text, verdict in _alignment_rows
}

print("=== ALIGNMENT ANALYSIS ===\n")
for name, entry in alignment_analysis.items():
    report = [
        f"{name.upper()}:",
        f"  Instructor: {entry['instructor']}",
        f"  Student: {entry['student']}",
        f"  Match: {entry['match']}",
    ]
    # A blank line follows each criterion's report.
    print("\n".join(report), end="\n\n")

## 4. Quantitative Assessment

In [None]:
# Rating assigned to each aspect of goal alignment, printed as "<value>/10".
scores = dict([
    ('Core Task Alignment', 10.0),
    ('Write Budget Constraint', 10.0),
    ('Interpretability Requirement', 10.0),
    ('Mechanistic Understanding', 10.0),
])

print("=== QUANTITATIVE SCORES ===\n")
print("\n".join(f"{name}: {value}/10" for name, value in scores.items()))

# Overall score is the unweighted mean of the per-aspect ratings.
score_total = sum(scores.values())
overall_score = score_total / len(scores)

# The overall line is framed above and below by a 50-character rule, with a
# blank line before the top rule.
banner = '=' * 50
print(
    f"\n{banner}",
    f"OVERALL GOAL ALIGNMENT SCORE: {overall_score:.1f}/10",
    banner,
    sep="\n",
)

## 5. Detailed Evaluation

In [None]:
# Written verdict, assembled line by line. The empty strings at the start and
# end of the list make the joined text begin and end with a newline; the empty
# strings in between are the blank lines separating sections.
evaluation_summary = "\n".join([
    "",
    "GOAL ALIGNMENT EVALUATION SUMMARY",
    "==================================",
    "",
    "The student's goals demonstrate EXCELLENT alignment with the instructor's requirements:",
    "",
    "✓ STRENGTHS:",
    "  1. Perfect understanding of core task (circuit identification for sarcasm)",
    "  2. Explicit acknowledgment of write budget constraint (11,200 dims)",
    "  3. Clear focus on interpretability and minimal components",
    "  4. Correct framing as detecting sentiment-tone contradictions",
    "  5. Goals evolved appropriately from v1 to v2 (added explicit constraints)",
    "",
    "✓ REQUIREMENTS MET:",
    "  - Precise circuit identification",
    "  - Sarcasm recognition as target behavior",
    "  - Write budget constraint adherence",
    "  - Interpretability emphasis",
    "  - Mechanistic understanding focus",
    "",
    "✗ NO SIGNIFICANT GAPS IDENTIFIED",
    "",
    "VERDICT: 10/10 - Perfect Goal Alignment",
    "The student fully understood and internalized the instructor's objectives.",
    "",
])

print(evaluation_summary)

## 6. Evolution of Student Understanding

In [None]:
# Track how goals evolved: for each plan version, which goal elements are
# present and which are still missing.
goal_evolution = {
    'plan_v1': {
        'elements': ['precise circuit', 'sarcasm recognition', 'contradictions'],
        'missing': ['explicit write budget', 'high fidelity metric', 'minimality']
    },
    'plan_v2': {
        'elements': ['precise circuit', 'sarcasm recognition', 'contradictions',
                     'write budget', 'high fidelity', 'minimal components'],
        'missing': []
    }
}


def format_item_list(items):
    """Return `items` joined by ', ', or the string 'None' if `items` is empty.

    Using one formatter for every list keeps the report consistent: an empty
    list is always shown as 'None' instead of leaving a blank after the label.
    """
    return ', '.join(items) if items else 'None'


print("=== GOAL EVOLUTION ===\n")
# The second heading carries a leading newline to separate the two plans.
for heading, plan_key in (
    ("Plan V1 (Initial):", 'plan_v1'),
    ("\nPlan V2 (Refined):", 'plan_v2'),
):
    plan = goal_evolution[plan_key]
    print(heading)
    print(f"  Present: {format_item_list(plan['elements'])}")
    print(f"  Missing: {format_item_list(plan['missing'])}")
print("\nCONCLUSION: Student refined goals to be more complete and explicit.")