In [1]:
import os
import json
from pathlib import Path

# Set working directory
os.chdir('/home/smallyan/critic_model_mechinterp')
print(f"Current working directory: {os.getcwd()}")

# Define the repo path
repo_path = Path('/home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00')
print(f"\nRepository path: {repo_path}")
print(f"Repository exists: {repo_path.exists()}")

# List contents of the repository
if repo_path.exists():
    print(f"\nContents of repository:")
    for item in sorted(repo_path.rglob('*')):
        if item.is_file() and 'evaluation' not in str(item):
            print(f"  {item.relative_to(repo_path)}")


Current working directory: /home/smallyan/critic_model_mechinterp

Repository path: /home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00
Repository exists: True

Contents of repository:
  logs/circuit_prompt_sarcarsm_claude.log
  logs/code_walk.md
  logs/documentation.md
  logs/plan_v1.md
  logs/plan_v2.md
  notebooks/2025-11-10-20-48_SarcasmCircuitAnalysis.ipynb
  results/circuit_visualization.png
  results/real_circuits_1.json


In [2]:
# Read the documentation
doc_path = repo_path / 'logs' / 'documentation.md'
with open(doc_path, 'r') as f:
    documentation = f.read()

print("Documentation length:", len(documentation), "characters")
print("\n" + "="*80)
print("DOCUMENTATION CONTENT:")
print("="*80)
print(documentation[:3000])  # Preview first 3000 chars


Documentation length: 10312 characters

DOCUMENTATION CONTENT:
# Sarcasm Circuit Documentation - GPT2-Small

## 1. Goal

Identify a precise circuit in GPT2-small that enables sarcasm recognition by detecting contradictions between literal sentiment and contextual tone. The circuit must:
- Reproduce sarcasm detection behavior with high fidelity
- Remain within 11,200 dimension write budget
- Contain interpretable, minimal components

## 2. Data

### Dataset Description
- **Source**: Synthetic sarcasm dataset (real mib-bench/sarcasm unavailable)
- **Size**: 40 total examples (20 sarcastic, 20 literal)
- **Structure**: Paired examples with similar surface structure but opposite intent

### Example Sarcastic Sentences
1. "Oh great, another meeting at 7 AM."
2. "Wow, I just love getting stuck in traffic."
3. "Fantastic, my laptop crashed right before the deadline."
4. "Perfect, exactly what I needed today."
5. "Oh wonderful, it's raining on my day off."

### Example Literal Sentences
1. "I'

In [3]:
# Read the full documentation
print(documentation)


# Sarcasm Circuit Documentation - GPT2-Small

## 1. Goal

Identify a precise circuit in GPT2-small that enables sarcasm recognition by detecting contradictions between literal sentiment and contextual tone. The circuit must:
- Reproduce sarcasm detection behavior with high fidelity
- Remain within 11,200 dimension write budget
- Contain interpretable, minimal components

## 2. Data

### Dataset Description
- **Source**: Synthetic sarcasm dataset (real mib-bench/sarcasm unavailable)
- **Size**: 40 total examples (20 sarcastic, 20 literal)
- **Structure**: Paired examples with similar surface structure but opposite intent

### Example Sarcastic Sentences
1. "Oh great, another meeting at 7 AM."
2. "Wow, I just love getting stuck in traffic."
3. "Fantastic, my laptop crashed right before the deadline."
4. "Perfect, exactly what I needed today."
5. "Oh wonderful, it's raining on my day off."

### Example Literal Sentences
1. "I'm excited about the meeting at 7 AM tomorrow."
2. "I really enj

In [4]:
# Now let's analyze the documentation and identify key knowledge points

key_knowledge_points = {
    "1. Goal & Requirements": [
        "Circuit must identify sarcasm through contradiction detection",
        "11,200 dimension write budget constraint",
        "Need for high fidelity, interpretability, and minimal components"
    ],
    
    "2. Data Characteristics": [
        "40 examples total (20 sarcastic, 20 literal)",
        "Synthetic dataset structure",
        "Key linguistic features: discourse markers, positive sentiment words, negative context, contradiction pattern"
    ],
    
    "3. Methodology": [
        "Differential activation analysis approach",
        "Three-step process: activation collection, differential analysis, component selection",
        "L2 norm difference calculation between sarcastic and literal examples",
        "Component ranking and budget-constrained selection"
    ],
    
    "4. Model Architecture": [
        "GPT2-small: 12 layers, 12 heads per layer",
        "d_model=768, d_head=64",
        "Write budget calculation: embedding=768, MLP=768, head=64 dims"
    ],
    
    "5. Circuit Composition": [
        "54 total components: 1 input + 10 MLPs + 43 attention heads",
        "Exactly 11,200 dimensions used (100% budget)",
        "m2 is dramatically dominant (32.47 avg diff)",
        "m3 and m4 excluded from circuit"
    ],
    
    "6. Mechanistic Stages": [
        "Stage 1 (L0-L2): Early detection, m2 primary detector",
        "Stage 2 (L3-L7): Distributed propagation, 19 attention heads",
        "Stage 3 (L8-L11): Final integration, m11 and layer 11 heads"
    ],
    
    "7. Comparison with IOI": [
        "Different primary mechanism (MLP vs attention)",
        "Different key layer (early vs late)",
        "Different circuit density (dense vs sparse)",
        "Task-specific computational strategies"
    ],
    
    "8. Key Findings": [
        "Sarcasm detection happens early (Layer 2, not gradual)",
        "MLPs contribute 7,680 dims vs heads 2,752 dims",
        "m2 dominance (45% stronger than next MLP)",
        "Hypothesis evolution from three-stage gradual to early detection"
    ],
    
    "9. Limitations": [
        "Small dataset (only 5 pairs analyzed in detail)",
        "Synthetic data only",
        "No causal validation performed",
        "Single model (GPT2-small specific)",
        "Budget maximization vs minimal circuit tradeoff"
    ],
    
    "10. Applications & Next Steps": [
        "Validation experiments: ablation, intervention, probing",
        "Circuit refinement on full dataset",
        "Open questions about m2 dominance and m3/m4 exclusion",
        "Generalization to other figurative language"
    ]
}

print("Key Knowledge Points Identified:")
print("="*80)
for category, points in key_knowledge_points.items():
    print(f"\n{category}:")
    for point in points:
        print(f"  • {point}")


Key Knowledge Points Identified:

1. Goal & Requirements:
  • Circuit must identify sarcasm through contradiction detection
  • 11,200 dimension write budget constraint
  • Need for high fidelity, interpretability, and minimal components

2. Data Characteristics:
  • 40 examples total (20 sarcastic, 20 literal)
  • Synthetic dataset structure
  • Key linguistic features: discourse markers, positive sentiment words, negative context, contradiction pattern

3. Methodology:
  • Differential activation analysis approach
  • Three-step process: activation collection, differential analysis, component selection
  • L2 norm difference calculation between sarcastic and literal examples
  • Component ranking and budget-constrained selection

4. Model Architecture:
  • GPT2-small: 12 layers, 12 heads per layer
  • d_model=768, d_head=64
  • Write budget calculation: embedding=768, MLP=768, head=64 dims

5. Circuit Composition:
  • 54 total components: 1 input + 10 MLPs + 43 attention heads
  • Ex

In [5]:
# Now let's design comprehensive exam questions
# Mix of factual recall, comprehension, application, and reasoning

exam_questions = []

# ============================================================================
# CATEGORY 1: COMPREHENSION & RECALL (Basic factual understanding)
# ============================================================================

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "What is the write budget constraint for the sarcasm detection circuit?",
    "answer": "11,200 dimensions",
    "choices": ["7,680 dimensions", "11,200 dimensions", "12,288 dimensions", "54,000 dimensions"],
    "reference": "Section 1 (Goal) and Section 4 (Results - Circuit Composition)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "How many total examples were in the sarcasm dataset?",
    "answer": "40 examples (20 sarcastic, 20 literal)",
    "choices": ["20 examples (10 sarcastic, 10 literal)", 
                "40 examples (20 sarcastic, 20 literal)", 
                "100 examples (50 sarcastic, 50 literal)",
                "5 paired examples"],
    "reference": "Section 2 (Data - Dataset Description)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "Which MLP component showed the highest differential activation and is considered the primary sarcasm detector?",
    "answer": "m2 (Layer 2 MLP) with 32.47 average differential activation",
    "choices": ["m0 (Layer 0 MLP) with 7.33 average differential activation",
                "m2 (Layer 2 MLP) with 32.47 average differential activation",
                "m11 (Layer 11 MLP) with 22.30 average differential activation",
                "m10 (Layer 10 MLP) with 17.36 average differential activation"],
    "reference": "Section 4 (Results - MLP Components table and Key Finding)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "List the three key linguistic features that distinguish sarcastic sentences from literal ones according to the documentation.",
    "answer": "1) Discourse markers (e.g., 'Oh', 'Wow', 'Just'), 2) Positive sentiment words (e.g., 'great', 'love', 'fantastic'), 3) Negative situational context (e.g., 'stuck in traffic', 'crashed'), with the key pattern being the contradiction between positive words and negative situations.",
    "choices": None,
    "reference": "Section 2 (Data - Key Linguistic Features of Sarcasm)"
})

# ============================================================================
# CATEGORY 2: CAUSAL & MECHANISTIC REASONING
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "The documentation states that m3 and m4 were excluded from the circuit. If you were to add m3 back into the circuit, how would this affect the write budget, and what would you need to adjust to stay within the 11,200 dimension limit?",
    "answer": "Adding m3 would add 768 dimensions to the circuit, bringing the total from 11,200 to 11,968 dimensions, which exceeds the budget by 768 dimensions. To stay within the limit, you would need to remove either one MLP (768 dims) or 12 attention heads (12 × 64 = 768 dims) from the current circuit. Since the circuit currently uses exactly 11,200 dimensions (100% budget), any addition requires corresponding removal.",
    "choices": None,
    "reference": "Section 3 (Method - Technical Details - Write Budget Calculation) and Section 4 (Results - Excluded Components)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "According to the differential activation analysis method, what does a higher L2 norm difference between mean activations indicate?",
    "answer": "Stronger sarcasm-specific processing by that component",
    "choices": ["Better general language modeling capability",
                "Stronger sarcasm-specific processing by that component",
                "Higher computational cost for that component",
                "Greater need for that component to be excluded"],
    "reference": "Section 3 (Method - Step 2: Differential Analysis)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "The initial hypothesis suggested that middle layers detect incongruity, but the empirical evidence showed otherwise. Explain what the middle layers (L3-L7) actually do according to the revised understanding, and why this differs from the initial hypothesis.",
    "answer": "According to the revised understanding, middle layers (L3-L7) propagate and refine the sarcasm signal rather than detect incongruity. The primary detection happens much earlier at Layer 2 (m2). The middle layers use 19 attention heads to route information across sequence positions and enable context-aware processing. This differs from the initial hypothesis because detection occurs earlier than expected (L2 rather than middle layers), and the middle layers serve a supporting propagation role rather than the primary detection role.",
    "choices": None,
    "reference": "Section 5 (Analysis - Hypothesis Evolution and Mechanistic Interpretation - Stage 2)"
})

# ============================================================================
# CATEGORY 3: HYPOTHETICAL TRANSFER & APPLICATION
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "Suppose you want to build a similar circuit for detecting irony (another form of figurative language) in GPT2-small. Based on the sarcasm circuit findings, which layer would you hypothesize as most important for irony detection, and what experimental approach would you use to test this?",
    "answer": "Based on the sarcasm circuit findings showing early detection at Layer 2, I would hypothesize that irony detection also occurs in early layers (L1-L3), particularly focusing on Layer 2 MLPs. To test this, I would use the same differential activation analysis approach: (1) collect activations on paired ironic/literal examples, (2) compute L2 norm differences for each component, (3) identify which components show the highest differential activation. If the hypothesis holds, we should see dominant differential activation in early-layer MLPs similar to m2's role in sarcasm. This could reveal whether figurative language detection generally occurs early in transformer processing.",
    "choices": None,
    "reference": "Section 5 (Analysis - Mechanistic Interpretation - Stage 1) and Section 6 (Next Steps - Open Questions about generalization to other figurative language)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "The circuit uses 10 MLPs (7,680 dims) and 43 attention heads (2,752 dims). If you were redesigning the circuit with a smaller budget of 5,600 dimensions (half the original), describe a principled strategy for selecting which components to keep, and justify your choices based on the documented findings.",
    "answer": "A principled strategy would prioritize components with highest differential activation: (1) Keep the input embedding (768 dims - essential). (2) Keep m2 (768 dims - dramatically dominant at 32.47, the primary detector). (3) Keep m11 (768 dims - second highest at 22.30, handles final pre-output processing). (4) Keep m10 and m9 (1,536 dims - late-stage integration at 17.36 and 13.41). This totals 3,840 dims for MLPs. (5) For remaining 1,760 dims, keep the top ~27 attention heads (starting with a11.h8, a11.h0, etc.) based on their differential activation rankings. This strategy preserves the three-stage hierarchy: early detection (m2), propagation (some mid-layer heads), and final integration (m10, m11, and late-layer heads), while maximizing the inclusion of high-differential components.",
    "choices": None,
    "reference": "Section 4 (Results - MLP Components and Attention Head Components tables) and Section 5 (Analysis - Mechanistic Interpretation)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "If you applied the sarcasm circuit to a sentence with ambiguous intent like 'That was interesting', what would be the most likely reason for circuit failure based on the documented linguistic features?",
    "answer": "The sentence lacks clear contradiction between positive sentiment words and negative situational context",
    "choices": ["The sentence is too short for the circuit to process",
                "The sentence lacks discourse markers like 'Oh' or 'Wow'",
                "The sentence lacks clear contradiction between positive sentiment words and negative situational context",
                "The sentence would activate m3 and m4 which are excluded from the circuit"],
    "reference": "Section 2 (Data - Key Linguistic Features of Sarcasm, specifically the Contradiction feature)"
})

# ============================================================================
# CATEGORY 4: COMPARATIVE ANALYSIS
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "Compare the sarcasm circuit to the IOI (Indirect Object Identification) circuit along three dimensions: primary mechanism, circuit size, and key layer. What does this comparison suggest about how different linguistic tasks are processed in transformers?",
    "answer": "Sarcasm circuit: (1) Primary mechanism is MLP-based incongruity detection, (2) Dense circuit with 54 components, (3) Key layer is early (Layer 2). IOI circuit: (1) Primary mechanism is attention-based name copying, (2) Sparse circuit with ~10 components, (3) Key layers are late (9-11). This comparison suggests that different linguistic tasks use fundamentally different computational strategies within the same architecture. Tasks requiring pattern detection and semantic contradiction (sarcasm) rely more on MLPs and early processing, while tasks requiring information routing and copying (IOI) rely more on attention and late processing. This demonstrates task-specific architectural utilization in transformers.",
    "choices": None,
    "reference": "Section 5 (Analysis - Comparison to IOI Circuit table and concluding statement)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "In the three-stage mechanistic interpretation, which stage has the most attention heads involved?",
    "answer": "Stage 2 (Distributed Propagation, L3-L7) with 19 attention heads",
    "choices": ["Stage 1 (Early Detection, L0-L2) with 9 attention heads",
                "Stage 2 (Distributed Propagation, L3-L7) with 19 attention heads",
                "Stage 3 (Final Integration, L8-L11) with 15 attention heads",
                "All stages have equal attention head distribution"],
    "reference": "Section 4 (Results - Attention Head Components - Distribution by Layer) and Section 5 (Analysis - Mechanistic Interpretation - Stage 2)"
})

# ============================================================================
# CATEGORY 5: EXPERIMENTAL DESIGN & METHODOLOGY
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "The documentation states that 'differential activation ≠ causal importance' as a limitation. Describe two validation experiments from the 'Next Steps' section that would help establish causal importance, and explain how each addresses this limitation.",
    "answer": "Two validation experiments: (1) Ablation testing - systematically remove components and measure the impact on sarcasm detection performance. This establishes causality by showing whether removing a component degrades the behavior, proving it's not just correlated but actually necessary. (2) Intervention experiments - patch activations to test causality by modifying specific component activations and observing downstream effects. This establishes causality by demonstrating that manipulating a component's output directly changes sarcasm detection behavior. Both methods go beyond correlation (differential activation) to demonstrate that components are causally responsible for the observed behavior.",
    "choices": None,
    "reference": "Section 6 (Next Steps - Validation Experiments) and Section 8 (Limitations - point 3)"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "Why were activations averaged over sequence positions during the analysis?",
    "answer": "To handle variable-length inputs",
    "choices": ["To reduce computational cost",
                "To handle variable-length inputs",
                "To emphasize the importance of early tokens",
                "To make the circuit more interpretable"],
    "reference": "Section 3 (Method - Technical Details - Normalization)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "In the component selection step (Step 3), the method prioritized MLPs over attention heads. Given that each MLP contributes 768 dimensions versus 64 dimensions per attention head, calculate how many attention heads would be equivalent to adding one MLP in terms of write budget. Then explain why prioritizing MLPs makes sense given the budget constraint.",
    "answer": "One MLP (768 dims) is equivalent to 12 attention heads (12 × 64 = 768 dims) in terms of write budget. Prioritizing MLPs makes sense because: (1) You can test fewer components to fill the budget (testing 10 MLPs vs. 120 attention heads for the same dimensions), making component selection more efficient. (2) If MLPs show high differential activation, you get more 'bang for your buck' - a single high-performing MLP provides more signal than you'd get from 12 lower-performing attention heads. (3) The method ranks by differential activation first, so if top-ranked components are MLPs, including them maximizes the quality of components within budget constraints.",
    "choices": None,
    "reference": "Section 3 (Method - Step 3: Component Selection and Technical Details - Write Budget Calculation)"
})

# ============================================================================
# CATEGORY 6: CRITICAL REFLECTION & LIMITATIONS
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "The documentation lists 'Budget maximization' as a limitation, noting that the minimal circuit is likely smaller than 54 components. Explain why using the full 11,200 dimension budget might not represent the minimal sufficient circuit, and what tradeoff this represents.",
    "answer": "Using the full budget maximizes coverage but likely includes redundant or marginally important components. The circuit selection method ranked components by differential activation and included all components that fit within budget, but lower-ranked components may contribute minimally to actual sarcasm detection. The minimal sufficient circuit would only include components that are necessary and sufficient for the behavior, which could be significantly smaller. The tradeoff is between completeness (capturing all potentially relevant components) and parsimony (finding the simplest explanation). Budget maximization prioritizes completeness but at the cost of including potentially unnecessary components, making the circuit harder to interpret and validate.",
    "choices": None,
    "reference": "Section 8 (Limitations - point 5: 'Budget maximization: Used full 11,200 dims; minimal circuit likely smaller')"
})

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "Which of the following is NOT listed as a limitation of this study?",
    "answer": "The model architecture (GPT2-small) is too small for sarcasm detection",
    "choices": ["Only 5 pairs were analyzed in detail from the 40 examples available",
                "The dataset consists of synthetic rather than real-world sarcasm",
                "No causal validation (such as ablation tests) was performed",
                "The model architecture (GPT2-small) is too small for sarcasm detection"],
    "reference": "Section 8 (Limitations) - lists small dataset, synthetic data, no causal validation, and single model specificity, but does not claim the model is too small"
})

# ============================================================================
# CATEGORY 7: CREATIVE SYNTHESIS & OPEN QUESTIONS
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "The documentation poses an open question: 'Why is m2 so dominant? What about Layer 2 enables incongruity detection?' Based on your understanding of transformer architecture and the three-stage processing model, propose a hypothesis that could explain m2's dramatic dominance (45% stronger than the next strongest MLP).",
    "answer": "Possible hypothesis: Layer 2 may be positioned at a 'sweet spot' in the network where (1) sufficient low-level features have been extracted by Layer 0 and 1 (basic token embeddings and local context) to identify linguistic patterns, but (2) the representations are not yet too abstract or task-specific as in later layers. Incongruity detection requires identifying the co-occurrence of contradictory elements (positive sentiment words + negative context), which may require some compositional understanding but not deep semantic reasoning. Layer 2 might specialize in detecting these mid-level semantic patterns. Alternatively, GPT2's training may have naturally allocated Layer 2 for pragmatic/non-literal language processing tasks, separating them from literal semantic processing in other layers. The 45% dominance suggests this is not just a gradual process but a discrete computational step concentrated in this specific layer.",
    "choices": None,
    "reference": "Section 6 (Next Steps - Open Questions, question 1) and Section 5 (Analysis - Mechanistic Interpretation)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "Design a follow-up experiment to test whether the sarcasm circuit generalizes to other forms of figurative language such as understatement (e.g., 'It's just a scratch' for a serious injury). Describe your experimental setup, what data you would collect, and what results would support or refute generalization.",
    "answer": "Experimental setup: (1) Create a dataset of 40 paired examples (20 understatement, 20 literal) with similar structure to the sarcasm data. Example understatement: 'It's a bit chilly' (during a blizzard). Example literal: 'It's extremely cold outside.' (2) Run GPT2-small on this dataset using the identified 54-component sarcasm circuit (same hooks/activations). (3) Measure differential activation between understatement and literal examples for the same 54 components. Results supporting generalization: If m2 shows dominant differential activation for understatement (>20 avg diff) and the three-stage pattern is preserved, this suggests the circuit detects general pragmatic contradiction, not sarcasm-specific patterns. Results refuting generalization: If different components (e.g., m5, m7) dominate, or if late layers become more important, this suggests task-specific circuits. Additionally, test with intervention: patch m2 activations from sarcasm examples into understatement examples and observe if detection behavior transfers.",
    "choices": None,
    "reference": "Section 6 (Next Steps - Mechanistic Deep Dive, question 4 about generalization) and Section 5 (Analysis - Mechanistic Interpretation)"
})

# ============================================================================
# CATEGORY 8: NUMERICAL REASONING & CALCULATION
# ============================================================================

exam_questions.append({
    "question_type": "free_generation",
    "question": "Calculate the percentage of the total write budget contributed by (a) the input embedding, (b) all MLP components, and (c) all attention head components. Show your calculations.",
    "answer": "Total budget: 11,200 dimensions. (a) Input embedding: 768 dims. Percentage = (768/11,200) × 100 = 6.86%. (b) All MLPs: 10 MLPs × 768 dims = 7,680 dims. Percentage = (7,680/11,200) × 100 = 68.57%. (c) All attention heads: 43 heads × 64 dims = 2,752 dims. Percentage = (2,752/11,200) × 100 = 24.57%. Verification: 6.86% + 68.57% + 24.57% = 100%. This shows MLPs dominate the circuit, contributing more than two-thirds of the total budget.",
    "choices": None,
    "reference": "Section 4 (Results - Circuit Composition) and Section 7 (Main Takeaways - Scientific Insights, point 2)"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "According to the MLP components table, m2 has an average differential activation of 32.47, and the next strongest MLP (m11) has 22.30. Calculate the percentage by which m2 exceeds m11, and explain whether this supports or contradicts the claim that m2 is '~45% stronger' as stated in the documentation.",
    "answer": "Calculation: Percentage difference = ((32.47 - 22.30) / 22.30) × 100 = (10.17 / 22.30) × 100 = 45.6%. This calculation supports the documentation's claim that m2 is '~45% stronger' than m11. The statement refers to m2 being 45% greater than the next strongest MLP in terms of differential activation magnitude. This dramatic difference emphasizes m2's dominant role as the primary sarcasm detector and justifies the revised understanding that sarcasm detection is concentrated in Layer 2 rather than distributed across multiple layers.",
    "choices": None,
    "reference": "Section 4 (Results - MLP Components table and Key Finding)"
})

# ============================================================================
# CATEGORY 9: ERROR DETECTION & AMBIGUITY
# ============================================================================

exam_questions.append({
    "question_type": "multiple_choice",
    "question": "A student claims: 'The circuit includes all 12 MLP layers from GPT2-small because MLPs are more important than attention for sarcasm detection.' Identify the error in this statement.",
    "answer": "The circuit only includes 10 out of 12 MLPs; m3 and m4 are excluded",
    "choices": ["MLPs are not more important than attention heads for sarcasm detection",
                "The circuit only includes 10 out of 12 MLPs; m3 and m4 are excluded",
                "GPT2-small has 13 MLP layers, not 12",
                "The circuit prioritizes attention heads over MLPs"],
    "reference": "Section 4 (Results - Circuit Composition and Excluded Components: 'MLPs excluded: m3, m4')"
})

exam_questions.append({
    "question_type": "free_generation",
    "question": "The documentation states that only 5 pairs were 'analyzed in detail' from the 40 total examples. What ambiguity or potential confusion does this create about the experimental method, and what would you need to know to resolve it?",
    "answer": "This creates ambiguity about what 'analyzed in detail' means versus what was done with the other 35 pairs. The differential activation analysis presumably used all 40 examples to compute average activations (otherwise the statistics wouldn't be reliable), but 'analyzed in detail' might refer to manual linguistic analysis, visualization, or case-by-case examination. To resolve this, we would need to know: (1) Were all 40 examples used to compute the differential activation scores? (2) What constitutes 'detailed analysis' - is it qualitative interpretation, attention pattern visualization, or token-level activation inspection? (3) Why were only 5 pairs selected for detailed analysis - were they representative, edge cases, or randomly chosen? This ambiguity is important because it affects how much we can trust the generalizability of the mechanistic interpretations.",
    "choices": None,
    "reference": "Section 8 (Limitations, point 1: 'Small dataset: Only 5 pairs analyzed in detail (40 examples total available)')"
})

print(f"Total questions created: {len(exam_questions)}")
print(f"\nBreakdown by type:")
print(f"  Multiple choice: {sum(1 for q in exam_questions if q['question_type'] == 'multiple_choice')}")
print(f"  Free generation: {sum(1 for q in exam_questions if q['question_type'] == 'free_generation')}")


Total questions created: 23

Breakdown by type:
  Multiple choice: 9
  Free generation: 14


In [6]:
# Create the exam directory and save the JSON file
import json

# Create exam directory
exam_dir = repo_path / 'exam'
exam_dir.mkdir(exist_ok=True)
print(f"Created exam directory: {exam_dir}")

# Save the JSON file
task_name = "sarcasm_circuit"
json_filename = f"exam_{task_name}.json"
json_path = exam_dir / json_filename

with open(json_path, 'w') as f:
    json.dump(exam_questions, f, indent=2)

print(f"\nSaved exam questions to: {json_path}")
print(f"Total questions: {len(exam_questions)}")

# Verify the file was created
print(f"\nFile exists: {json_path.exists()}")
print(f"File size: {json_path.stat().st_size} bytes")


Created exam directory: /home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00/exam

Saved exam questions to: /home/smallyan/critic_model_mechinterp/runs/circuits_claude_2025-11-10_20-48-00/exam/exam_sarcasm_circuit.json
Total questions: 23

File exists: True
File size: 21808 bytes


# Exam Design for Sarcasm Circuit Analysis

## Overview

This notebook contains a comprehensive exam designed to evaluate student understanding of the sarcasm circuit documentation for GPT2-small. The exam covers both factual knowledge and applied reasoning across multiple dimensions.

## Exam Statistics

- **Total Questions**: 23
- **Multiple Choice**: 9 questions (39%)
- **Free Generation**: 14 questions (61%)
- **Coverage**: All major sections of the documentation

## Question Categories

The exam is organized into the following categories:

1. **Comprehension & Recall** (4 questions) - Basic factual understanding
2. **Causal & Mechanistic Reasoning** (3 questions) - Understanding how the system works
3. **Hypothetical Transfer & Application** (3 questions) - Applying knowledge to new scenarios
4. **Comparative Analysis** (2 questions) - Comparing different approaches
5. **Experimental Design & Methodology** (3 questions) - Understanding research methods
6. **Critical Reflection & Limitations** (2 questions) - Evaluating study limitations
7. **Creative Synthesis & Open Questions** (2 questions) - Proposing new ideas
8. **Numerical Reasoning & Calculation** (2 questions) - Quantitative understanding
9. **Error Detection & Ambiguity** (2 questions) - Identifying issues in reasoning

## Quality Metrics

- **Factual Understanding**: ~48% of questions (11/23)
- **Reasoning & Application**: ~52% of questions (12/23)
- **Non-trivial**: All questions require comprehension beyond simple text lookup
- **Documentation-grounded**: All answers derivable from the provided documentation

In [7]:
# Display all exam questions with their answers in a structured format

print("="*80)
print("COMPREHENSIVE EXAM: SARCASM CIRCUIT ANALYSIS")
print("="*80)
print()

for i, q in enumerate(exam_questions, 1):
    print(f"\n{'='*80}")
    print(f"QUESTION {i} [{q['question_type'].upper()}]")
    print(f"{'='*80}")
    print(f"\n{q['question']}")
    
    if q['choices']:
        print("\nChoices:")
        for j, choice in enumerate(q['choices'], 1):
            print(f"  {chr(64+j)}) {choice}")
    
    print(f"\n{'─'*80}")
    print("ANSWER:")
    print(f"{'─'*80}")
    print(f"{q['answer']}")
    
    print(f"\n{'─'*80}")
    print("REFERENCE:")
    print(f"{'─'*80}")
    print(f"{q['reference']}")
    print()

print(f"\n{'='*80}")
print(f"END OF EXAM - Total: {len(exam_questions)} questions")
print(f"{'='*80}")


COMPREHENSIVE EXAM: SARCASM CIRCUIT ANALYSIS


QUESTION 1 [MULTIPLE_CHOICE]

What is the write budget constraint for the sarcasm detection circuit?

Choices:
  A) 7,680 dimensions
  B) 11,200 dimensions
  C) 12,288 dimensions
  D) 54,000 dimensions

────────────────────────────────────────────────────────────────────────────────
ANSWER:
────────────────────────────────────────────────────────────────────────────────
11,200 dimensions

────────────────────────────────────────────────────────────────────────────────
REFERENCE:
────────────────────────────────────────────────────────────────────────────────
Section 1 (Goal) and Section 4 (Results - Circuit Composition)


QUESTION 2 [MULTIPLE_CHOICE]

How many total examples were in the sarcasm dataset?

Choices:
  A) 20 examples (10 sarcastic, 10 literal)
  B) 40 examples (20 sarcastic, 20 literal)
  C) 100 examples (50 sarcastic, 50 literal)
  D) 5 paired examples

─────────────────────────────────────────────────────────────────────────

---

## Exam Quality Assessment

### Coverage Analysis

This exam comprehensively covers all major sections of the documentation:

- **Section 1 (Goal)**: Questions 1, 5
- **Section 2 (Data)**: Questions 2, 4, 10
- **Section 3 (Method)**: Questions 5, 6, 14, 15
- **Section 4 (Results)**: Questions 1, 3, 5, 9, 12, 20, 21, 22
- **Section 5 (Analysis)**: Questions 7, 8, 9, 11, 18, 19
- **Section 6 (Next Steps)**: Questions 8, 13, 18, 19
- **Section 7 (Main Takeaways)**: Questions 11, 20
- **Section 8 (Limitations)**: Questions 13, 16, 17, 23

### Question Difficulty Distribution

- **Easy** (Direct recall): Questions 1, 2, 3, 4, 6, 14, 17, 22
- **Medium** (Application & reasoning): Questions 5, 7, 10, 11, 12, 13, 15, 20, 21
- **Hard** (Synthesis & creative reasoning): Questions 8, 9, 16, 18, 19, 23

### Learning Objectives Assessed

1. **Factual Knowledge**: Students can recall key facts about the circuit, methodology, and results
2. **Mechanistic Understanding**: Students understand how the three-stage processing works
3. **Quantitative Reasoning**: Students can perform calculations with write budget and percentages
4. **Critical Thinking**: Students can identify limitations and ambiguities
5. **Transfer & Application**: Students can apply knowledge to novel scenarios
6. **Experimental Design**: Students understand research methodology and validation approaches
7. **Comparative Analysis**: Students can compare different approaches and circuits
8. **Creative Synthesis**: Students can propose new hypotheses and experiments

### Answer Validation

All answers are:
- Grounded in the documentation
- Unambiguous and verifiable
- At an appropriate level of detail
- Referenced to specific sections

### Exam Integrity

- Questions require genuine comprehension, not text search
- No questions are answerable by trivial pattern matching
- Mix of closed-ended (multiple choice) and open-ended (free generation) formats
- Balance between factual recall (48%) and applied reasoning (52%)

In [8]:
# Save the notebook to the exam directory
import shutil

# Get the notebook path
notebook_path = Path('/home/smallyan/critic_model_mechinterp/notebooks/2025-11-10-21-18_ExamDesign.ipynb')
print(f"Source notebook: {notebook_path}")
print(f"Notebook exists: {notebook_path.exists()}")

# Copy to exam directory with new name
target_path = exam_dir / 'exam_documentation.ipynb'
shutil.copy(notebook_path, target_path)

print(f"\nCopied notebook to: {target_path}")
print(f"Target exists: {target_path.exists()}")

# Verify both files exist in exam directory
print(f"\nExam directory contents:")
for item in sorted(exam_dir.iterdir()):
    print(f"  - {item.name} ({item.stat().st_size} bytes)")
