In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np
from gsm8 import GSM8KLoader

In [None]:
# Load the base model and its tokenizer directly via from_pretrained.
# The checkpoint id is named once so the model and tokenizer cannot diverge.
MODEL_NAME = "Qwen/Qwen2.5-7B"

print("Loading model directly...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    # Pass a real torch.dtype: string dtypes other than "auto" are only
    # resolved by newer transformers releases, torch.float16 works everywhere.
    torch_dtype=torch.float16,
    device_map="cuda",
    use_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

Loading model directly...


Downloading shards:  25%|██▌       | 1/4 [00:10<00:31, 10.40s/it]

In [None]:
print("Loading math problems...")
dataset = GSM8KLoader()

# Read the first ten dataset entries, keeping only the question text and
# echoing each one (numbered from 1) as it is read.
math_problems = []
for position in range(10):
    question = dataset[position]['question']
    math_problems.append(question)
    print(f"{position + 1}. {question}")

print(f"\nLoaded {len(math_problems)} math problems")

In [9]:
# Control group: ten everyday-language sentences with no arithmetic content,
# run through the same pipeline as the math problems for comparison.
non_math_text = [
    "The cat sat on the comfortable mat in the living room.",
    "Yesterday I went to the store to buy groceries for dinner.",
    "She loves reading books about history and ancient civilizations.",
    "The weather today is sunny with a gentle breeze blowing.",
    "Music has the power to inspire and heal people's hearts.",
    "Traveling to new places opens your mind to different cultures.",
    "Technology continues to evolve at an unprecedented pace today.",
    "Cooking delicious meals brings families together around the table.",
    "Exercise and healthy eating are important for maintaining wellness.",
    "Art museums showcase creativity and human expression throughout history.",
]

print("Non-math control sentences:")
# Number the sentences from 1 to mirror the math-problem listing.
for number, sentence in enumerate(non_math_text, start=1):
    print(f"{number}. {sentence}")

In [None]:
def collect_activations_direct(model, tokenizer, problems, target_layers=range(20, 27)):
    """Record the peak absolute activation of selected layers for each prompt.

    Every prompt is tokenized and run through ``model`` once with gradients
    disabled. A forward hook on each requested layer stores the largest
    absolute value found anywhere in that layer's output tensor (the first
    element is used when the layer returns a tuple).

    Args:
        model: Model exposing its layer stack as ``model.model.layers``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=512)`` and returning a mapping of
            tensors that is splatted into ``model(**inputs)``.
        problems: Sequence of prompt strings (``len()`` is used for progress).
        target_layers: Iterable of indices into ``model.model.layers``.

    Returns:
        Dict mapping each layer index to a list with one float per prompt, in
        input order. A layer whose hook did not fire for a prompt gets ``0``.
    """
    # Materialise once so a generator / one-shot iterable can be walked repeatedly.
    target_layers = list(target_layers)
    activations_data = {layer: [] for layer in target_layers}

    # Send inputs to wherever the model lives instead of assuming CUDA.
    device = getattr(model, "device", None)
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    # Scratch space the hooks write into; cleared before every forward pass.
    layer_outputs = {}

    def make_hook(layer_idx):
        # Factory so each hook closes over its own layer index.
        def hook(module, args, output):
            activation = output[0] if isinstance(output, tuple) else output
            layer_outputs[layer_idx] = activation.detach().abs().max().item()
        return hook

    hooks = []
    try:
        # Register once for the whole run rather than once per prompt.
        for layer_idx in target_layers:
            layer = model.model.layers[layer_idx]  # Note: model.model.layers for Qwen
            hooks.append(layer.register_forward_hook(make_hook(layer_idx)))

        for i, problem in enumerate(problems):
            print(f"Processing {i+1}/{len(problems)}: {problem[:50]}...")

            inputs = tokenizer(problem, return_tensors="pt", truncation=True, max_length=512)
            if device is not None:
                inputs = {k: v.to(device) for k, v in inputs.items()}

            # Drop the previous prompt's values so a silent hook shows up as 0.
            layer_outputs.clear()
            with torch.no_grad():
                _ = model(**inputs)

            for layer_idx in target_layers:
                activations_data[layer_idx].append(layer_outputs.get(layer_idx, 0))
    finally:
        # Always detach the hooks, even if tokenization or the forward pass
        # raised; otherwise they stay attached to the long-lived model object.
        for hook in hooks:
            hook.remove()

    return activations_data

In [None]:
print("Analyzing math problems...")
math_activations = collect_activations_direct(model, tokenizer, math_problems)

# Iterate over the layers that were actually collected instead of repeating
# the range literal, so this summary cannot drift out of sync with the
# collector (a mismatch would otherwise raise KeyError or skip layers).
print("\nMath activation statistics:")
for layer, values in math_activations.items():
    print(f"Layer {layer}: Mean={np.mean(values):.3f}, Max={np.max(values):.3f}, Std={np.std(values):.3f}")

In [None]:
print("Analyzing non-math text...")
non_math_activations = collect_activations_direct(model, tokenizer, non_math_text)

# Iterate over the layers that were actually collected instead of repeating
# the range literal, so this summary cannot drift out of sync with the
# collector (a mismatch would otherwise raise KeyError or skip layers).
print("\nNon-math activation statistics:")
for layer, values in non_math_activations.items():
    print(f"Layer {layer}: Mean={np.mean(values):.3f}, Max={np.max(values):.3f}, Std={np.std(values):.3f}")

In [None]:
def plot_activation_analysis(math_data, non_math_data, highlight_layer=23):
    """Plot per-layer activation summaries for math vs. non-math prompts.

    Draws two side-by-side panels (the mean and the maximum of each layer's
    recorded values), shows the figure, then prints a short numeric summary
    for ``highlight_layer``.

    Args:
        math_data: Dict mapping layer index to a list of per-prompt values.
            Its keys define the layers plotted and their order.
        non_math_data: Dict with the same layer keys for the control prompts.
        highlight_layer: Layer marked with a vertical line and summarised in
            text. If it is not a key of ``math_data``, the marker and the
            summary are skipped instead of raising.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    layers = list(math_data.keys())
    has_highlight = highlight_layer in layers

    # Per-layer aggregates over the prompts of each group.
    math_means = [np.mean(math_data[layer]) for layer in layers]
    math_maxs = [np.max(math_data[layer]) for layer in layers]
    non_math_means = [np.mean(non_math_data[layer]) for layer in layers]
    non_math_maxs = [np.max(non_math_data[layer]) for layer in layers]

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # One row per panel: axis, marker style, the two series, y-label, title.
    panels = [
        (ax1, 'o-', math_means, non_math_means,
         'Mean Max Activation', 'Mean Maximum Activations Across Layers'),
        (ax2, 's-', math_maxs, non_math_maxs,
         'Peak Max Activation', 'Peak Maximum Activations Across Layers'),
    ]
    for ax, marker, math_series, non_math_series, ylabel, title in panels:
        ax.plot(layers, math_series, marker, color='red', linewidth=2, markersize=8,
                label='Math Problems', alpha=0.8)
        ax.plot(layers, non_math_series, marker, color='blue', linewidth=2, markersize=8,
                label='Non-Math Text', alpha=0.8)
        if has_highlight:
            ax.axvline(x=highlight_layer, color='orange', linestyle='--', alpha=0.7,
                       linewidth=2, label=f'Layer {highlight_layer}')
        ax.set_xlabel('Layer Number', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.3)
        ax.set_xticks(layers)

    plt.tight_layout()
    plt.show()

    if not has_highlight:
        # Previously layers.index(23) raised ValueError for other layer sets.
        print(f"\nLayer {highlight_layer} was not collected; skipping its summary.")
        return

    # Print some statistics for the highlighted layer
    print(f"\n=== LAYER {highlight_layer} ANALYSIS ===")
    idx = layers.index(highlight_layer)
    print(f"Math problems - Mean: {math_means[idx]:.3f}, Max: {math_maxs[idx]:.3f}")
    print(f"Non-math text - Mean: {non_math_means[idx]:.3f}, Max: {non_math_maxs[idx]:.3f}")
    # Guard the ratio against a zero (or negative) denominator.
    if non_math_means[idx] > 0:
        print(f"Ratio (Math/Non-math) - Mean: {math_means[idx]/non_math_means[idx]:.2f}x")

In [None]:
# Render the math vs. non-math comparison from the two activation dicts.
print("Creating plots...")
plot_activation_analysis(
    math_data=math_activations,
    non_math_data=non_math_activations,
)
print("Analysis complete!")

In [None]:
# Debugging aid: dump the per-prompt Layer 23 values for both groups,
# then summarise each group as mean ± standard deviation.
print("Raw activation values for Layer 23:")
math_layer23 = np.array(math_activations[23])
print("Math problems:", math_activations[23])
nonmath_layer23 = np.array(non_math_activations[23])
print("Non-math text:", non_math_activations[23])

print("\nLayer 23 activation comparison:")
for group_label, group_values in (("Math", math_layer23), ("Non-math", nonmath_layer23)):
    print(f"{group_label} mean: {np.mean(group_values):.3f} ± {np.std(group_values):.3f}")