In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Set environment for using cached models
import os
os.environ['HF_HOME'] = '/net/projects2/chai-lab/shared_models'
os.environ['TRANSFORMERS_CACHE'] = '/net/projects2/chai-lab/shared_models'

# Check for CUDA availability
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Using device: cuda
GPU: NVIDIA H100 NVL
Memory: 93.1 GB


In [3]:
# Load and parse the notebook to identify all code cells
import json

notebook_path = '/net/scratch2/smallyan/open_question/typo_claude_2026-01-05_00-38-13/notebooks/2026-01-05-00-38_typo_circuit_analysis.ipynb'

with open(notebook_path, 'r') as f:
    notebook = json.load(f)

# Count cells by type
code_cells = []
markdown_cells = []

for i, cell in enumerate(notebook['cells']):
    cell_type = cell['cell_type']
    source = ''.join(cell['source'])
    
    if cell_type == 'code':
        code_cells.append({
            'index': i,
            'source': source,
            'cell_num': len(code_cells)
        })
    elif cell_type == 'markdown':
        markdown_cells.append({
            'index': i,
            'source': source
        })

print(f"Total cells: {len(notebook['cells'])}")
print(f"Code cells: {len(code_cells)}")
print(f"Markdown cells: {len(markdown_cells)}")

Total cells: 50
Code cells: 42
Markdown cells: 8


In [4]:
# Print summary of each code cell to understand the structure
for cell in code_cells:
    source = cell['source']
    # Get first few lines for summary
    lines = source.split('\n')
    preview = '\n'.join(lines[:5]) if len(lines) > 5 else source
    if len(source) > 300:
        preview = source[:300] + '...'
    print(f"\n=== Code Cell {cell['cell_num']} (notebook index {cell['index']}) ===")
    print(preview)
    print(f"--- Length: {len(source)} chars, {len(lines)} lines ---")


=== Code Cell 0 (notebook index 0) ===
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")
--- Length: 90 chars, 3 lines ---

=== Code Cell 1 (notebook index 1) ===
# Check GPU availability and set up environment
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_p...
--- Length: 419 chars, 11 lines ---

=== Code Cell 2 (notebook index 2) ===
# Create necessary directories
import os

os.makedirs('logs', exist_ok=True)
os.makedirs('notebooks', exist_ok=True)
--- Length: 249 chars, 9 lines ---

=== Code Cell 3 (notebook index 4) ===
# Load GPT-2 Medium via HookedTransformer
from transformer_lens import HookedTransformer
import torch

print("Loading GPT-2 Medium...")
model = HookedTransformer.from_pretrained("gpt2-medium", device=device)
print(f"Model l

# Code Evaluation: Typo Correction Mechanism Analysis

This notebook evaluates the code implementation in `/net/scratch2/smallyan/open_question/typo_claude_2026-01-05_00-38-13`

## Project Goal
Based on the plan and codewalk files, this project aims to analyze how GPT-2 Medium handles typos, specifically:
1. Understand the behavioral patterns of typo handling
2. Analyze representation similarity across layers
3. Perform activation patching to find causal components
4. Identify the key circuit for typo handling (MLP0 at typo position)

## Evaluation Structure
- Total code cells in original notebook: 42
- We will execute each cell and record: Runnable, Correct-Implementation, Redundant, Irrelevant flags

In [5]:
# Initialize tracking data structures for evaluation
evaluation_results = []

# Store original notebook cells for reference
notebook_path = '/net/scratch2/smallyan/open_question/typo_claude_2026-01-05_00-38-13/notebooks/2026-01-05-00-38_typo_circuit_analysis.ipynb'

import json
with open(notebook_path, 'r') as f:
    original_notebook = json.load(f)

# Get only code cells
original_code_cells = []
for i, cell in enumerate(original_notebook['cells']):
    if cell['cell_type'] == 'code':
        source = ''.join(cell['source'])
        original_code_cells.append({
            'cell_index': len(original_code_cells),
            'notebook_index': i,
            'source': source,
            'cell_summary': source.split('\n')[0][:80] if source else 'Empty'
        })

print(f"Total code cells to evaluate: {len(original_code_cells)}")

Total code cells to evaluate: 42


## Cell-by-Cell Execution and Evaluation

Now executing each code cell from the original notebook and recording results.

In [6]:
# Cell 0: Setup working directory
print("=== Evaluating Cell 0: Setup working directory ===")

try:
    import os
    os.chdir('/home/smallyan/eval_agent')
    print(f"Working directory: {os.getcwd()}")
    cell0_runnable = "Y"
    cell0_error = ""
except Exception as e:
    cell0_runnable = "N"
    cell0_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell0_runnable}")

=== Evaluating Cell 0: Setup working directory ===
Working directory: /home/smallyan/eval_agent

Runnable: Y


In [7]:
# Cell 1: Check GPU availability
print("=== Evaluating Cell 1: Check GPU availability ===")

try:
    # Check GPU availability and set up environment
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nUsing device: {device}")
    cell1_runnable = "Y"
    cell1_error = ""
except Exception as e:
    cell1_runnable = "N"
    cell1_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell1_runnable}")

=== Evaluating Cell 1: Check GPU availability ===
PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA device: NVIDIA H100 NVL
CUDA memory: 99.95 GB

Using device: cuda

Runnable: Y


In [8]:
# Cell 2: Create necessary directories
print("=== Evaluating Cell 2: Create necessary directories ===")

try:
    import os

    os.makedirs('logs', exist_ok=True)
    os.makedirs('notebooks', exist_ok=True)
    print("Directories created/verified: logs, notebooks")
    cell2_runnable = "Y"
    cell2_error = ""
except Exception as e:
    cell2_runnable = "N"
    cell2_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell2_runnable}")

=== Evaluating Cell 2: Create necessary directories ===
Directories created/verified: logs, notebooks

Runnable: Y


In [9]:
# Cell 3: Load GPT-2 Medium via HookedTransformer
print("=== Evaluating Cell 3: Load GPT-2 Medium via HookedTransformer ===")

try:
    from transformer_lens import HookedTransformer
    import torch

    print("Loading GPT-2 Medium...")
    model = HookedTransformer.from_pretrained("gpt2-medium", device=device)
    print(f"Model loaded: {model.cfg.model_name}")
    print(f"Number of layers: {model.cfg.n_layers}")
    print(f"Number of attention heads: {model.cfg.n_heads}")
    print(f"Hidden size: {model.cfg.d_model}")
    cell3_runnable = "Y"
    cell3_error = ""
except Exception as e:
    cell3_runnable = "N"
    cell3_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell3_runnable}")

=== Evaluating Cell 3: Load GPT-2 Medium via HookedTransformer ===




Loading GPT-2 Medium...


`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model gpt2-medium into HookedTransformer
Model loaded: gpt2-medium
Number of layers: 24
Number of attention heads: 16
Hidden size: 1024

Runnable: Y


In [10]:
# Cell 4: Test model with simple completion
print("=== Evaluating Cell 4: Test model with simple completion ===")

try:
    # Test that the model works with a simple completion
    test_prompt = "The capital of France is"
    tokens = model.to_tokens(test_prompt)
    logits = model(tokens)
    next_token = logits[0, -1].argmax()
    print(f"Prompt: '{test_prompt}'")
    print(f"Next token prediction: '{model.to_string(next_token)}'")

    # Test with a typo
    typo_prompt = "The capital of Frnace is"
    tokens_typo = model.to_tokens(typo_prompt)
    logits_typo = model(tokens_typo)
    next_token_typo = logits_typo[0, -1].argmax()
    print(f"\nTypo prompt: '{typo_prompt}'")
    print(f"Next token prediction: '{model.to_string(next_token_typo)}'")
    cell4_runnable = "Y"
    cell4_error = ""
except Exception as e:
    cell4_runnable = "N"
    cell4_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell4_runnable}")

=== Evaluating Cell 4: Test model with simple completion ===


Prompt: 'The capital of France is'
Next token prediction: ' the'

Typo prompt: 'The capital of Frnace is'
Next token prediction: ' the'

Runnable: Y


In [11]:
# Cell 5: Get top predictions function
print("=== Evaluating Cell 5: Get top predictions function ===")

try:
    # Let's check top-k predictions for both cases
    import torch.nn.functional as F

    def get_top_predictions(logits, k=10):
        """Get top-k predicted tokens and their probabilities"""
        probs = F.softmax(logits[0, -1], dim=-1)
        top_probs, top_indices = torch.topk(probs, k)
        return [(model.to_string(idx.item()), prob.item())
                for idx, prob in zip(top_indices, top_probs)]

    # Clean prompt
    print("Clean prompt - 'The capital of France is':")
    clean_preds = get_top_predictions(logits)
    for token, prob in clean_preds:
        print(f"  {repr(token):15} {prob:.4f}")

    # Typo prompt
    print("\nTypo prompt - 'The capital of Frnace is':")
    typo_preds = get_top_predictions(logits_typo)
    for token, prob in typo_preds:
        print(f"  {repr(token):15} {prob:.4f}")
    cell5_runnable = "Y"
    cell5_error = ""
except Exception as e:
    cell5_runnable = "N"
    cell5_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell5_runnable}")

=== Evaluating Cell 5: Get top predictions function ===
Clean prompt - 'The capital of France is':
  ' the'          0.0601
  ' a'            0.0484
  ' now'          0.0265
  ' Paris'        0.0244
  ' in'           0.0233
  ' not'          0.0201
  ' one'          0.0170
  ' France'       0.0165
  ' known'        0.0160
  ' home'         0.0151

Typo prompt - 'The capital of Frnace is':
  ' the'          0.1104
  ' a'            0.0869
  ' located'      0.0584
  ' one'          0.0173
  ' in'           0.0172
  ' now'          0.0169
  ' an'           0.0143
  ' situated'     0.0134
  ' St'           0.0090
  ' known'        0.0090

Runnable: Y


In [12]:
# Cell 6: Test different types of typos
print("=== Evaluating Cell 6: Test different types of typos ===")

try:
    test_cases = [
        # (clean, typoed, expected_completion)
        ("The cat sat on the", "The cat sat on teh", " mat"),  # simple transposition
        ("I want to eat an apple", "I want to eat an aple", " and"),  # deletion
        ("She went to the store", "She went to teh store", " and"),  # transposition
        ("The weather is nice", "The wether is nice", " and"),  # vowel deletion
    ]

    print("Testing various typo types:\n")
    for clean, typo, expected in test_cases:
        clean_tokens = model.to_tokens(clean)
        typo_tokens = model.to_tokens(typo)
        
        clean_logits = model(clean_tokens)
        typo_logits = model(typo_tokens)
        
        # Check if same top prediction
        clean_pred = model.to_string(clean_logits[0, -1].argmax())
        typo_pred = model.to_string(typo_logits[0, -1].argmax())
        
        print(f"Clean: '{clean}' -> '{clean_pred}'")
        print(f"Typo:  '{typo}' -> '{typo_pred}'")
        print(f"Same prediction: {clean_pred == typo_pred}")
        print()
    cell6_runnable = "Y"
    cell6_error = ""
except Exception as e:
    cell6_runnable = "N"
    cell6_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell6_runnable}")

=== Evaluating Cell 6: Test different types of typos ===
Testing various typo types:



Clean: 'The cat sat on the' -> ' bed'
Typo:  'The cat sat on teh' -> ' floor'
Same prediction: False



Clean: 'I want to eat an apple' -> '.'
Typo:  'I want to eat an aple' -> 'a'
Same prediction: False



Clean: 'She went to the store' -> ' to'
Typo:  'She went to teh store' -> ' and'
Same prediction: False

Clean: 'The weather is nice' -> ' and'
Typo:  'The wether is nice' -> ' and'
Same prediction: True


Runnable: Y


In [13]:
# Cell 7: Show tokenization function
print("=== Evaluating Cell 7: Show tokenization function ===")

try:
    import random

    def show_tokenization(text):
        """Show how text is tokenized"""
        tokens = model.to_tokens(text, prepend_bos=False)
        token_strs = [model.to_string(t) for t in tokens[0]]
        return token_strs

    # Compare tokenizations
    examples = [
        ("weather", "wether"),
        ("beautiful", "beutiful"),
        ("computer", "compter"),
        ("afternoon", "afternon")
    ]

    print("Tokenization comparison:")
    for clean, typo in examples:
        clean_toks = show_tokenization(clean)
        typo_toks = show_tokenization(typo)
        print(f"\n{clean:15} -> {clean_toks}")
        print(f"{typo:15} -> {typo_toks}")
    cell7_runnable = "Y"
    cell7_error = ""
except Exception as e:
    cell7_runnable = "N"
    cell7_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell7_runnable}")

=== Evaluating Cell 7: Show tokenization function ===
Tokenization comparison:

weather         -> ['weather']
wether          -> ['w', 'ether']

beautiful       -> ['beaut', 'iful']
beutiful        -> ['be', 'ut', 'iful']

computer        -> ['computer']
compter         -> ['comp', 'ter']

afternoon       -> ['after', 'noon']
afternon        -> ['after', 'non']

Runnable: Y


In [14]:
# Cell 8: Create systematic typo dataset
print("=== Evaluating Cell 8: Create systematic typo dataset ===")

try:
    import random
    from typing import List, Dict, Tuple

    def create_typo_substitution(word: str, idx: int = None) -> str:
        """Create a typo by substituting a character"""
        if len(word) < 2:
            return word
        keyboard_neighbors = {
            'a': 'sq', 'b': 'vn', 'c': 'xv', 'd': 'sf', 'e': 'wr',
            'f': 'dg', 'g': 'fh', 'h': 'gj', 'i': 'uo', 'j': 'hk',
            'k': 'jl', 'l': 'ko', 'm': 'n', 'n': 'bm', 'o': 'ip',
            'p': 'o', 'q': 'wa', 'r': 'et', 's': 'ad', 't': 'ry',
            'u': 'yi', 'v': 'cb', 'w': 'qe', 'x': 'zc', 'y': 'tu',
            'z': 'x'
        }
        if idx is None:
            idx = random.randint(0, len(word) - 1)
        char = word[idx].lower()
        if char in keyboard_neighbors:
            new_char = random.choice(keyboard_neighbors[char])
            return word[:idx] + new_char + word[idx+1:]
        return word

    def create_typo_transposition(word: str, idx: int = None) -> str:
        """Create a typo by swapping adjacent characters"""
        if len(word) < 2:
            return word
        if idx is None:
            idx = random.randint(0, len(word) - 2)
        return word[:idx] + word[idx+1] + word[idx] + word[idx+2:]

    def create_typo_deletion(word: str, idx: int = None) -> str:
        """Create a typo by deleting a character"""
        if len(word) < 3:
            return word
        if idx is None:
            idx = random.randint(1, len(word) - 2)
        return word[:idx] + word[idx+1:]

    def create_typo_insertion(word: str, idx: int = None) -> str:
        """Create a typo by inserting a duplicate character"""
        if len(word) < 2:
            return word
        if idx is None:
            idx = random.randint(0, len(word) - 1)
        return word[:idx] + word[idx] + word[idx:]

    # Create dataset
    template_sentences = [
        ("The {} is very important", ["weather", "problem", "answer", "meeting", "decision"]),
        ("I need to {} this correctly", ["explain", "analyze", "complete", "understand", "evaluate"]),
        ("She {} the document", ["received", "reviewed", "submitted", "approved", "rejected"]),
        ("They will {} tomorrow", ["arrive", "depart", "return", "continue", "present"]),
    ]

    random.seed(42)
    dataset = []

    for template, words in template_sentences:
        for word in words:
            # Original
            clean_sentence = template.format(word)
            
            # Create different typo types
            for typo_type, typo_func in [
                ("substitution", create_typo_substitution),
                ("transposition", create_typo_transposition),
                ("deletion", create_typo_deletion),
            ]:
                typo_word = typo_func(word)
                if typo_word != word:  # Only if typo was created
                    typo_sentence = template.format(typo_word)
                    dataset.append({
                        "clean": clean_sentence,
                        "typo": typo_sentence,
                        "clean_word": word,
                        "typo_word": typo_word,
                        "typo_type": typo_type
                    })

    print(f"Dataset created with {len(dataset)} examples")
    print("\nSample entries:")
    for entry in dataset[:3]:
        print(f"  Clean: {entry['clean']}")
        print(f"  Typo:  {entry['typo']}")
        print(f"  Type:  {entry['typo_type']}")
        print()
    cell8_runnable = "Y"
    cell8_error = ""
except Exception as e:
    cell8_runnable = "N"
    cell8_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell8_runnable}")

=== Evaluating Cell 8: Create systematic typo dataset ===
Dataset created with 60 examples

Sample entries:
  Clean: The weather is very important
  Typo:  The weathwr is very important
  Type:  substitution

  Clean: The weather is very important
  Typo:  The ewather is very important
  Type:  transposition

  Clean: The weather is very important
  Typo:  The weaher is very important
  Type:  deletion


Runnable: Y


In [15]:
# Cell 9: Evaluate model performance on clean vs typo
print("=== Evaluating Cell 9: Evaluate model performance on clean vs typo ===")

try:
    import torch.nn.functional as F
    import numpy as np

    results = []

    for entry in dataset:
        clean_tokens = model.to_tokens(entry["clean"])
        typo_tokens = model.to_tokens(entry["typo"])
        
        with torch.no_grad():
            clean_logits = model(clean_tokens)
            typo_logits = model(typo_tokens)
        
        # Get predictions
        clean_pred = clean_logits[0, -1].argmax().item()
        typo_pred = typo_logits[0, -1].argmax().item()
        
        # Get probabilities
        clean_probs = F.softmax(clean_logits[0, -1], dim=-1)
        typo_probs = F.softmax(typo_logits[0, -1], dim=-1)
        
        # Probability ratio for the clean prediction
        clean_pred_prob_clean = clean_probs[clean_pred].item()
        clean_pred_prob_typo = typo_probs[clean_pred].item()
        
        prob_ratio = clean_pred_prob_typo / clean_pred_prob_clean if clean_pred_prob_clean > 0 else 0
        
        results.append({
            **entry,
            "clean_pred": model.to_string(clean_pred),
            "typo_pred": model.to_string(typo_pred),
            "same_pred": clean_pred == typo_pred,
            "clean_pred_prob_clean": clean_pred_prob_clean,
            "clean_pred_prob_typo": clean_pred_prob_typo,
            "prob_ratio": prob_ratio
        })

    # Summary statistics
    same_pred_rate = sum(r["same_pred"] for r in results) / len(results)
    avg_prob_ratio = np.mean([r["prob_ratio"] for r in results])

    print(f"Total examples: {len(results)}")
    print(f"Same prediction rate: {same_pred_rate:.1%}")
    print(f"Average probability ratio: {avg_prob_ratio:.3f}")
    
    # By typo type
    print("\nBy typo type:")
    for typo_type in ["substitution", "transposition", "deletion"]:
        type_results = [r for r in results if r["typo_type"] == typo_type]
        type_same_rate = sum(r["same_pred"] for r in type_results) / len(type_results)
        print(f"  {typo_type}: {type_same_rate:.1%} same prediction")
    cell9_runnable = "Y"
    cell9_error = ""
except Exception as e:
    cell9_runnable = "N"
    cell9_error = str(e)
    print(f"ERROR: {e}")

print(f"\nRunnable: {cell9_runnable}")

=== Evaluating Cell 9: Evaluate model performance on clean vs typo ===
