# Goal Matching Evaluation

## Critic Model Assessment of Student Research Project

In [None]:
import os
import json

# Absolute path of the student run under evaluation; the later cells build
# their log and output paths from this value.
repo_path = '/home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00'

# Make the critic_model_mechinterp checkout the process working directory.
os.chdir('/home/smallyan/critic_model_mechinterp')

print("Evaluation initialized")

## 1. Read Instructor's Goal

In [None]:
# Load the instructor's prompt text. The encoding is passed explicitly so the
# read does not depend on the platform's locale default.
# NOTE(review): assumes the prompt file is UTF-8 — confirm.
with open('/home/smallyan/critic_model_mechinterp/prompts/l3/circuit_prompt_sarcasm.txt', 'r', encoding='utf-8') as f:
    instructor_prompt = f.read()

# The summary printed below is a hand-written literal; it is not derived
# from instructor_prompt.
print("=" * 80)
print("INSTRUCTOR'S GOAL SUMMARY")
print("=" * 80)
print('''
KEY OBJECTIVES:
1. Identify a precise circuit (attention heads and MLPs) for sarcasm recognition
2. Stay within write-budget constraint (≤ 11,200 dimensions)
3. Follow iterative hypothesis-testing-refinement workflow

EXPECTED MECHANISM (Hypothesis):
- Early layers: Encode sentiment direction (positive/negative polarity)
- Mid layers: Encode tone or incongruity
- Late layers: Perform meaning reversal or contextual correction

REQUIRED OUTPUTS:
- logs/plan_v1.md, plan_v2.md, etc.
- logs/documentation.md
- results/real_circuits_1.json
''')

## 2. Read Student's Goals

In [None]:
# Load the student's two plan revisions from the run's logs/ folder. The
# encoding is passed explicitly so the read does not depend on the
# platform's locale default.
# NOTE(review): assumes the plan files are UTF-8 — confirm.
with open(f'{repo_path}/logs/plan_v1.md', 'r', encoding='utf-8') as f:
    plan_v1 = f.read()

with open(f'{repo_path}/logs/plan_v2.md', 'r', encoding='utf-8') as f:
    plan_v2 = f.read()

# The summary printed below is a hand-written literal; it is not derived
# from plan_v1 or plan_v2.
print("=" * 80)
print("STUDENT'S STATED GOAL (from plan_v1.md)")
print("=" * 80)
print('''
STUDENT'S GOAL:
"Identify the precise circuit in GPT2-small that enables sarcasm recognition 
by detecting contradictions between literal sentiment and contextual tone."

INITIAL HYPOTHESIS:
- Stage 1 (L0-L3): Early layers encode literal sentiment
- Stage 2 (L4-L7): Middle layers detect incongruity
- Stage 3 (L8-L11): Late layers perform meaning reversal

CONSTRAINTS:
- Total budget: ≤ 11,200 dimensions
- Minimal, interpretable components
''')

## 3. Goal Alignment Assessment

In [None]:
# Goal-alignment record: a fixed score together with the points of agreement
# and the minor deviations between the instructor's brief and the student's
# stated plan.
goal_alignment = {
    "score": 95,
    "max_score": 100,
    "aligned_aspects": [
        "Both target sarcasm detection circuit in GPT2-small",
        "Both use same write budget constraint (11,200 dimensions)",
        "Both follow three-stage hypothesis (early-mid-late)",
        "Both expect sentiment encoding → incongruity detection → reversal",
        "Both require iterative hypothesis refinement"
    ],
    "minor_differences": [
        "Student uses synthetic dataset instead of mib-bench/sarcasm",
        "Student's plan predicts specific heads vs instructor's generic examples"
    ]
}

# Top-level results container; the later cells add further sections to it.
evaluation = {"goal_alignment": goal_alignment}

banner = "=" * 80
print(banner)
print("GOAL ALIGNMENT EVALUATION")
print(banner)

print(f"\nGOAL ALIGNMENT SCORE: {goal_alignment['score']}/100")

# Each bullet list is printed under its own heading, in this order.
for heading, key in (
    ("\n✅ ALIGNED ASPECTS:", "aligned_aspects"),
    ("\n⚠️ MINOR DIFFERENCES:", "minor_differences"),
):
    print(heading)
    for entry in goal_alignment[key]:
        print(f"   • {entry}")

## 4. Plan Adherence Assessment

In [None]:
import os

rule = "=" * 80
print(rule)
print("PLAN ADHERENCE EVALUATION")
print(rule)

# Record which of the three candidate plan revisions exist under logs/.
candidate_plans = ('plan_v1.md', 'plan_v2.md', 'plan_v3.md')
plan_files_found = [
    name for name in candidate_plans
    if os.path.exists(f'{repo_path}/logs/{name}')
]

print(f"\nPlan files found: {plan_files_found}")

# Plan-adherence record: fixed score plus the steps that were followed and
# the ones judged missing or incomplete. These lists are literals; they are
# not computed from plan_files_found.
plan_adherence = {
    "score": 85,
    "max_score": 100,
    "followed": [
        "Created plan_v1.md with detailed initial hypothesis",
        "Tested hypothesis using differential activation analysis",
        "Created plan_v2.md with refined understanding",
        "Documented evolution of hypothesis",
        "Included evidence criteria for each sub-hypothesis"
    ],
    "missing": [
        "No plan_v3.md - stopped at 2 iterations instead of continuing",
        "No behavioral testing to verify circuit reproduces sarcasm detection",
        "Ablation experiments mentioned but not fully executed"
    ]
}
evaluation["plan_adherence"] = plan_adherence

print(f"\nPLAN ADHERENCE SCORE: {plan_adherence['score']}/100")

for heading, key in (
    ("\n✅ FOLLOWED:", "followed"),
    ("\n❌ MISSING/INCOMPLETE:", "missing"),
):
    print(heading)
    for entry in plan_adherence[key]:
        print(f"   • {entry}")

## 5. Hypothesis Testing Evaluation

In [None]:
rule = "=" * 80
print(rule)
print("HYPOTHESIS TESTING EVALUATION")
print(rule)

# One record per hypothesis: what was claimed, how it was tested, what came
# out, and (where present) how the hypothesis was revised.
hypotheses_tested = [
    {
        "hypothesis": "Early layers encode literal sentiment",
        "tested": True,
        "method": "Differential activation analysis on m0, m1, m2",
        "result": "m2 emerged as primary detector (32.47 differential)",
        "revised": True
    },
    {
        "hypothesis": "Middle layers detect incongruity",
        "tested": True,
        "method": "Analyzed m5, m6 and attention heads in L4-L7",
        "result": "Found moderate differential (7-10 range)",
        "revised": True,
        "revision": "Incongruity detection actually happens at L2"
    },
    {
        "hypothesis": "Late MLPs perform meaning reversal",
        "tested": True,
        "method": "Analyzed m7-m11 activation patterns",
        "result": "Strong late-layer activity (m11=22.30, m10=17.36)",
        "revised": True,
        "revision": "Not 'reversal' but 'signal integration'"
    },
    {
        "hypothesis": "Circuit is sparse (<50 components)",
        "tested": True,
        "method": "Budget optimization",
        "result": "Final circuit has 54 components (slightly over target)"
    }
]

evaluation["hypothesis_testing"] = {
    "score": 90,
    "max_score": 100,
    "hypotheses_tested": hypotheses_tested
}

print(f"\nHYPOTHESIS TESTING SCORE: {evaluation['hypothesis_testing']['score']}/100")
print("\nHypotheses Tested:")
for number, entry in enumerate(hypotheses_tested, start=1):
    marker = "✅" if entry['tested'] else "❌"
    print(f"\n{number}. {marker} {entry['hypothesis']}")
    print(f"   Method: {entry['method']}")
    print(f"   Result: {entry['result']}")
    # Only some records carry a 'revision' note; skip the line otherwise.
    revision_note = entry.get('revision')
    if revision_note:
        print(f"   ⚠️ Revision: {revision_note}")

## 6. Hypothesis Convergence Assessment

In [None]:
rule = "=" * 80
print(rule)
print("HYPOTHESIS CONVERGENCE EVALUATION")
print(rule)

# Convergence record: the instructor's expected mechanism next to the one
# the student reported, with the overlaps and mismatches listed explicitly.
convergence = {
    "score": 75,
    "max_score": 100,
    "instructor_expected": {
        "early": "Sentiment direction encoding",
        "mid": "Tone/incongruity detection",
        "late": "Meaning reversal"
    },
    "student_discovered": {
        "early": "Primary sarcasm detection (m2)",
        "mid": "Signal propagation",
        "late": "Signal integration"
    },
    "similarities": [
        "Three-stage hierarchical processing confirmed",
        "MLPs more important than attention heads",
        "Late layers critical for final output"
    ],
    "differences": [
        "Detection timing: L2 (student) vs L4-L7 (instructor expected)",
        "Mechanism: Integration (student) vs Reversal (instructor expected)"
    ]
}
evaluation["convergence"] = convergence

print(f"\nCONVERGENCE SCORE: {convergence['score']}/100")

# Stage-by-stage mechanism tables, instructor first, then student.
for heading, key in (
    ("\nINSTRUCTOR'S EXPECTED MECHANISM:", "instructor_expected"),
    ("\nSTUDENT'S DISCOVERED MECHANISM:", "student_discovered"),
):
    print(heading)
    for stage, role in convergence[key].items():
        print(f"   {stage.upper()}: {role}")

# Bullet lists of agreements and disagreements.
for heading, key in (
    ("\n✅ SIMILARITIES:", "similarities"),
    ("\n⚠️ KEY DIFFERENCES:", "differences"),
):
    print(heading)
    for entry in convergence[key]:
        print(f"   • {entry}")

## 7. Final Evaluation Summary

In [None]:
print("=" * 80)
print("FINAL EVALUATION SUMMARY")
print("=" * 80)

# Minimum overall score for a PASS verdict. Kept in one place so the stored
# verdict and the printed verdict cannot disagree.
PASS_THRESHOLD = 70


def _status_label(score):
    """Return the status label shown in the summary table for a category score.

    The bands are chosen so that the current category scores (95, 85, 90, 75)
    keep the labels that were previously written out by hand; deriving them
    keeps the table consistent if a score is edited later.
    """
    if score >= 90:
        return "✅ Excellent"
    if score >= 80:
        return "✅ Good"
    if score >= PASS_THRESHOLD:
        return "⚠️ Acceptable"
    return "❌ Below threshold"


# (table label, key in `evaluation`) for each scored category, in table order.
categories = [
    ("Goal Alignment", "goal_alignment"),
    ("Plan Adherence", "plan_adherence"),
    ("Hypothesis Testing", "hypothesis_testing"),
    ("Hypothesis Convergence", "convergence"),
]

# Overall score is the unweighted mean of the four category scores.
scores = [evaluation[key]['score'] for _, key in categories]
overall_score = sum(scores) / len(scores)
passed = overall_score >= PASS_THRESHOLD

evaluation["overall"] = {
    "score": overall_score,
    "verdict": "PASS" if passed else "FAIL",
    "summary": '''
The student successfully executed the research workflow as specified by the instructor.
They formulated an initial hypothesis, tested it empirically, and refined their
understanding based on results. While the final mechanism differs from the instructor's
expected hypothesis (early detection vs mid-layer detection, integration vs reversal),
the differences are well-documented and empirically justified.

The student demonstrated good scientific practice by:
1. Documenting clear hypotheses before testing
2. Using appropriate experimental methods
3. Updating their model when evidence contradicted expectations
4. Acknowledging limitations of their approach

Areas for improvement:
1. Continue iterations beyond plan_v2
2. Perform behavioral validation of the circuit
3. Complete planned ablation experiments
'''
}

print(f"\n{'Category':<25} {'Score':<10} {'Status'}")
print("-" * 50)
for label, key in categories:
    category_score = evaluation[key]['score']
    print(f"{label:<25} {category_score}/100    {_status_label(category_score)}")
print("-" * 50)
print(f"{'OVERALL':<25} {overall_score:.2f}/100 {'✅ PASS' if passed else '❌ FAIL'}")

print(evaluation['overall']['summary'])

In [None]:
# Save evaluation results as JSON inside the run folder.
import json
import os

output_path = f'{repo_path}/evaluation/goal_matching_results.json'

# Nothing in this notebook creates the evaluation/ directory, and
# open(..., 'w') raises FileNotFoundError when the parent folder is missing,
# so create it first (a no-op if it already exists).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Explicit encoding keeps the written file independent of the locale default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(evaluation, f, indent=2)

print(f"\nEvaluation results saved to: {output_path}")