In [3]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from typing import Dict, Callable, List
import json
import random

Error importing huggingface_hub.hf_api: No module named 'requests'


ModuleNotFoundError: No module named 'requests'

In [None]:
# Reproducibility configuration: request deterministic kernels and put every
# random number generator used by this notebook on the same seed.
SEED = 42
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)
# Python, NumPy and PyTorch (CPU generator, current GPU, all GPUs) are seeded in turn.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed,
                  torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)
del _seed_rng

In [2]:
def get_model_config(model: nn.Module, model_name: str) -> Dict:
    """
    Creates a standardized configuration dictionary for various model architectures.
    This provides the correct hook names needed for our specific patching experiment.

    The architecture is chosen by case-insensitive substring matching on
    ``model_name``; sizes are then read from ``model.config``.

    Parameters:
    model: loaded model whose ``config`` attributes supply layer/head/width counts
    model_name: Hugging Face model name or path used to recognise the architecture

    Returns:
    dict with the keys "n_layers", "n_heads", "d_model",
    "attn_hook_name_template" and "mlp_hook_name_template". Each template
    contains one ``{}`` placeholder for the layer index.

    Raises:
    NotImplementedError: if ``model_name`` matches none of the known architectures.
    """
    name = model_name.lower()

    # Each entry: (does the name match?, config attribute names for
    # (layers, heads, width), attention-output hook template, MLP-output hook template).
    # Entries are tried in order; the 'gpt-neo' entry explicitly excludes
    # 'gpt-neox' names because one is a substring of the other.
    architectures = (
        ('gpt-j' in name,
         ("n_layer", "n_head", "n_embd"),
         "transformer.h.{}.attn.out_proj",
         "transformer.h.{}.mlp.fc_out"),
        ('gpt2' in name,
         ("n_layer", "n_head", "n_embd"),
         "transformer.h.{}.attn.c_proj",
         "transformer.h.{}.mlp.c_proj"),
        # In the Hugging Face GPT-Neo implementation the block's `attn` module wraps
        # the self-attention in an `attention` submodule, so the output projection is
        # `attn.attention.out_proj`; `attn.out_proj` does not exist and would make
        # the dotted-path lookup fail.
        ('gpt-neo' in name and 'gpt-neox' not in name,
         ("num_layers", "num_heads", "hidden_size"),
         "transformer.h.{}.attn.attention.out_proj",
         "transformer.h.{}.mlp.c_proj"),
        ('gpt-neox' in name or 'pythia' in name,
         ("num_hidden_layers", "num_attention_heads", "hidden_size"),
         "gpt_neox.layers.{}.attention.dense",
         "gpt_neox.layers.{}.mlp.dense_4h_to_h"),
        ('llama' in name,
         ("num_hidden_layers", "num_attention_heads", "hidden_size"),
         "model.layers.{}.self_attn.o_proj",
         "model.layers.{}.mlp.down_proj"),
    )

    for is_match, (layers_attr, heads_attr, width_attr), attn_template, mlp_template in architectures:
        if is_match:
            hf_config = model.config
            return {
                "n_layers": getattr(hf_config, layers_attr),
                "n_heads": getattr(hf_config, heads_attr),
                "d_model": getattr(hf_config, width_attr),
                "attn_hook_name_template": attn_template,
                "mlp_hook_name_template": mlp_template,
            }

    raise NotImplementedError(f"Model architecture for '{model_name}' not recognized. Please add its configuration.")

def setup_model_and_tokenizer(model_name: str, device: str = 'cuda'):
    """
    Loads a pretrained Hugging Face model and tokenizer, handling various architectures.

    Parameters:
    model_name: Hugging Face model name or path; matched case-insensitively to
        pick the loader classes and the load precision
    device: device the model is moved to after loading

    Returns:
    model: the loaded model, switched to eval mode
    tokenizer: the matching tokenizer, with a pad token set
    model_config: dictionary produced by ``get_model_config(model, model_name)``

    Notes:
    The access token for gated Llama checkpoints is read from the ``HF_TOKEN``
    environment variable (falling back to the legacy ``HUGGING_FACE_HUB_TOKEN``).
    It is deliberately not stored in the notebook, so it cannot leak when the
    notebook is shared or committed.
    """
    print(f"--- Loading model and tokenizer for '{model_name}' ---")

    name_lower = model_name.lower()

    # Half precision for the larger checkpoints (recognised by their size tag), full precision otherwise.
    model_dtype = torch.float16 if any(k in name_lower for k in ['6b', '13b', '20b', '70b']) else torch.float32

    if 'llama' in name_lower:
        # An empty or unset variable is treated as "no token".
        access_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
        if access_token is None:
            print("Warning: Llama model selected, but no Hugging Face token provided. This may fail.")

        tokenizer = LlamaTokenizer.from_pretrained(model_name, token=access_token)
        model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, low_cpu_mem_usage=True, token=access_token).to(device)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, low_cpu_mem_usage=True).to(device)

    # Tokenizers without a pad token reuse the end-of-sequence token for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    model.eval()

    # Every nn.LayerNorm gets a larger epsilon than the one the checkpoint was
    # configured with. Normalisation layers that are not nn.LayerNorm instances
    # are not touched by this loop.
    # NOTE(review): this changes the numerics of the pretrained model - confirm
    # that results are meant to describe the modified network.
    for module in model.modules():
        if isinstance(module, torch.nn.LayerNorm):
            module.eps = 1e-3  # Increased from 1e-5/1e-6

    model_config = get_model_config(model, model_name)

    return model, tokenizer, model_config

In [3]:
""""""
def load_gpt_model_and_tokenizer(model_name:str, device='cuda'):
    """
    Loads a huggingface model and its tokenizer

    NOTE: the six-double-quote lines placed around this definition are empty
    string expressions, not a block comment, so this function IS defined when the
    cell runs. A later definition with the same name in this file replaces it and
    returns a config dictionary with different keys.

    Parameters:
    model_name: huggingface name of the model to load (e.g. GPTJ: "EleutherAI/gpt-j-6B", or "EleutherAI/gpt-j-6b")
    device: 'cuda' or 'cpu'; unquantized models are moved to this device

    Returns:
    model: huggingface model
    tokenizer: huggingface tokenizer (pad token set to the eos token)
    MODEL_CONFIG: config variables w/ standardized names ("n_heads", "n_layers",
        "resid_dim", "name_or_path", "attn_hook_names", "layer_hook_names", "prepend_bos")

    Raises:
    NotImplementedError: if the model name matches no supported architecture.
    """
    assert model_name is not None

    print("Loading: ", model_name)

    name_lower = model_name.lower()
    # Gated checkpoints need an access token. It is taken from the environment
    # instead of being stored in the notebook; empty/unset means "no token".
    access_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
    prepend_bos = False

    if 'gpt-j' in name_lower:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)

        n_heads, n_layers, resid_dim = model.config.n_head, model.config.n_layer, model.config.n_embd
        attn_hook_template, layer_hook_template = 'transformer.h.{}.attn.out_proj', 'transformer.h.{}'

    elif 'gpt2' in name_lower:
        # Load the requested GPT-2 variant (not always the base "gpt2") and honour `device`.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        n_heads, n_layers, resid_dim = model.config.n_head, model.config.n_layer, model.config.n_embd
        # GPT-2's attention output projection is named c_proj.
        attn_hook_template, layer_hook_template = 'transformer.h.{}.attn.c_proj', 'transformer.h.{}'

    elif 'gpt-neo' in name_lower and 'gpt-neox' not in name_lower:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        n_heads, n_layers, resid_dim = model.config.num_heads, model.config.num_layers, model.config.hidden_size
        # GPT-Neo blocks live under transformer.h; the projection sits inside attn.attention.
        attn_hook_template, layer_hook_template = 'transformer.h.{}.attn.attention.out_proj', 'transformer.h.{}'

    elif 'gpt-neox' in name_lower or 'pythia' in name_lower:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

        n_heads, n_layers, resid_dim = model.config.num_attention_heads, model.config.num_hidden_layers, model.config.hidden_size
        attn_hook_template, layer_hook_template = 'gpt_neox.layers.{}.attention.dense', 'gpt_neox.layers.{}'

    elif 'llama' in name_lower:
        if '70b' in name_lower:
            # use quantization. requires `bitsandbytes` library
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16
            )

            tokenizer = LlamaTokenizer.from_pretrained(model_name, token=access_token)
            # The quantized model is not moved with .to(device), matching the original code path.
            model = LlamaForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                token=access_token
            )
        else:
            # Full precision for the 7b/8b checkpoints, half precision for everything larger.
            if '7b' in name_lower or '8b' in name_lower:
                model_dtype = torch.float32
            else:
                model_dtype = torch.float16

            tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
            tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, token=access_token).to(device)

        # Only the 70b path can reach this point without a pad token.
        if tokenizer.pad_token is None:
            print("Llama tokenizer does not have a pad token. Setting pad_token = eos_token.")
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id

        n_heads, n_layers, resid_dim = model.config.num_attention_heads, model.config.num_hidden_layers, model.config.hidden_size
        attn_hook_template, layer_hook_template = 'model.layers.{}.self_attn.o_proj', 'model.layers.{}'
        prepend_bos = True
    else:
        raise NotImplementedError("Still working to get this model available!")

    MODEL_CONFIG={"n_heads":n_heads,
                  "n_layers":n_layers,
                  "resid_dim":resid_dim,
                  "name_or_path":model.config.name_or_path,
                  "attn_hook_names":[attn_hook_template.format(layer) for layer in range(n_layers)],
                  "layer_hook_names":[layer_hook_template.format(layer) for layer in range(n_layers)],
                  "prepend_bos":prepend_bos}

    return model, tokenizer, MODEL_CONFIG
""""""

''

In [4]:
def load_gpt_model_and_tokenizer(model_name:str, device='cuda'):
    """
    Loads a huggingface model and its tokenizer and builds the config dictionary
    used by the patching code.

    Parameters:
    model_name: huggingface name or path of the model; matched case-insensitively
        to choose loader classes, precision and quantization
    device: device the model is moved to (the 4-bit quantized 70b path is not moved)

    Returns:
    model: huggingface model in eval mode
    tokenizer: huggingface tokenizer with a pad token set
    model_config: dictionary built by ``get_model_config(model, model_name)``

    Raises:
    NotImplementedError: if the architecture is not recognised. This is raised by
        ``get_model_config``, i.e. only after the weights have been loaded.

    Notes:
    The access token for gated Llama checkpoints is read from the ``HF_TOKEN``
    environment variable (falling back to the legacy ``HUGGING_FACE_HUB_TOKEN``)
    rather than being stored in the notebook.
    """
    assert model_name is not None
    print(f"--- Loading model and tokenizer for '{model_name}' ---")

    name_lower = model_name.lower()
    kwargs = {'low_cpu_mem_usage': True}

    # --- Model Loading Logic ---
    if 'gpt-j' in name_lower:
        print("Using float16 revision for gpt-j-6B to ensure PyTorch-only workflow.")
        kwargs['revision'] = 'float16'
        kwargs['torch_dtype'] = torch.float16
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).to(device)
    elif 'llama' in name_lower:
        # An empty or unset variable is treated as "no token".
        access_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
        if not access_token:
            print("Warning: Llama model selected, but no Hugging Face token provided.")
        kwargs['token'] = access_token
        tokenizer = LlamaTokenizer.from_pretrained(model_name, token=access_token)
        if '70b' in name_lower:
            # 4-bit NF4 quantization; the quantized model is left where the loader placed it.
            kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
            kwargs['trust_remote_code'] = True
            model = LlamaForCausalLM.from_pretrained(model_name, **kwargs)
        else:
            kwargs['torch_dtype'] = torch.float16 if '13b' in name_lower else torch.float32
            model = LlamaForCausalLM.from_pretrained(model_name, **kwargs).to(device)
    else:
        kwargs['torch_dtype'] = torch.float16 if '20b' in name_lower else torch.float32
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).to(device)

    # Tokenizers without a pad token reuse the end-of-sequence token for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # One shared helper owns the architecture table, so the hook names used here
    # cannot drift from the ones used by the other loader in this notebook.
    model_config = get_model_config(model, model_name)

    model.eval()

    return model, tokenizer, model_config

In [5]:
def _sample_distinct_pair(records: List[dict]):
    """Draw two records whose 'input' fields differ, using the global `random` state."""
    # Without two different inputs the rejection loop below could never finish.
    if not records or not any(rec['input'] != records[0]['input'] for rec in records):
        raise ValueError("Need at least two records with different 'input' values to build a prompt.")
    while True:
        example_pair, target_pair = random.sample(records, 2)
        if example_pair['input'] != target_pair['input']:
            return example_pair, target_pair


def get_task_datasets(n_prompts: int = 20, dataset_dir: str = '../BaselineAccuracy/dataset_files/abstractive') -> Dict[str, Dict]:
    """
    Defines the datasets for each task, loading and formatting from JSON files.

    The global `random` module is re-seeded with the module-level SEED on every
    call, so repeated calls produce the same prompts.

    Parameters:
    n_prompts: number of prompt pairs generated per task
    dataset_dir: directory containing 'succ_letterstring_basic.json' and 'next_item.json'

    Returns:
    dict with the keys "analogy" and "sequencing"; each value holds a
    "description" string and a "prompts" list. Every prompt is a dict with the keys
    "clean_prompt", "clean_correct_answer", "clean_incorrect_answer",
    "corrupted_prompt", "corrupted_correct_answer", "corrupted_incorrect_answer".
    The corrupted prompt repeats the example input as its own output, and the
    correct/incorrect answers are swapped relative to the clean prompt.

    Raises:
    ValueError: if a dataset does not contain two records with different inputs.
    """
    random.seed(SEED)

    def compact(letter_string: str) -> str:
        # e.g. "[e f g h]" -> "efgh"
        return letter_string.strip('[]').replace(' ', '')

    # --- Analogy Task Dataset Generation ---
    with open(os.path.join(dataset_dir, 'succ_letterstring_basic.json'), 'r', encoding='utf-8') as f:
        analogy_data = json.load(f)

    analogy_prompts = []
    for _ in range(n_prompts):
        example_pair, target_pair = _sample_distinct_pair(analogy_data)

        example_in = compact(example_pair['input'])
        example_out = compact(example_pair['output'])
        target_in_full = compact(target_pair['input'])
        target_out_full = compact(target_pair['output'])

        # The prompt ends just before the final letter of the target output; that
        # final letter is the answer. Slicing with [:-1] keeps prefix and answer
        # consistent for letter strings of any length.
        target_prefix = target_out_full[:-1]
        correct_answer = target_out_full[-1]
        incorrect_answer = target_in_full[-1]

        analogy_prompts.append({
            "clean_prompt": f"{example_in}:{example_out}::{target_in_full}:{target_prefix}",
            "clean_correct_answer": correct_answer,
            "clean_incorrect_answer": incorrect_answer,
            "corrupted_prompt": f"{example_in}:{example_in}::{target_in_full}:{target_prefix}",
            "corrupted_correct_answer": incorrect_answer,
            "corrupted_incorrect_answer": correct_answer,
        })

    # --- Sequencing Task Dataset Generation ---
    with open(os.path.join(dataset_dir, 'next_item.json'), 'r', encoding='utf-8') as f:
        sequencing_data = json.load(f)

    sequencing_prompts = []
    for _ in range(n_prompts):
        example_pair, target_pair = _sample_distinct_pair(sequencing_data)

        # Items are used exactly as stored in the JSON file (no padding is added).
        example_in, example_out = example_pair['input'], example_pair['output']
        target_in, target_out = target_pair['input'], target_pair['output']

        sequencing_prompts.append({
            "clean_prompt": f"{example_in}:{example_out}::{target_in}:",
            "clean_correct_answer": target_out,
            "clean_incorrect_answer": target_in,
            "corrupted_prompt": f"{example_in}:{example_in}::{target_in}:",
            "corrupted_correct_answer": target_in,
            "corrupted_incorrect_answer": target_out,
        })

    return {
        "analogy": {
            "description": "Letter-String Analogy Task ('+1' vs No Rule)",
            "prompts": analogy_prompts
        },
        "sequencing": {
            "description": "Next-Item Sequencing Task",
            "prompts": sequencing_prompts
        }
    }

In [6]:
# ===== CHANGE 4: LOCAL CACHE ISOLATION =====
def caching_hook_factory(cache: dict, hook_name: str) -> Callable:
    """
    Build a forward hook that snapshots a module's output into ``cache[hook_name]``.

    When the module returns a tuple only its first element is stored. The stored
    tensor is a detached copy, so later in-place edits of the live output do not
    change the cached value. The hook returns nothing and therefore leaves the
    module output untouched.
    """
    def store_activation(module, input, output):
        if isinstance(output, tuple):
            activation = output[0]
        else:
            activation = output
        snapshot = activation.detach().clone()
        cache[hook_name] = snapshot
    return store_activation

def patching_hook_factory(cache: dict, hook_name: str, head_index: int = None, d_head: int = None) -> Callable:
    """
    Build a forward hook that overwrites (part of) a module's output with a cached activation.

    Parameters:
    cache: dictionary holding the previously cached activation under ``hook_name``
    hook_name: key to look up in ``cache`` when the hook fires
    head_index: if given, only features ``[head_index * d_head, (head_index + 1) * d_head)``
        of the last axis are replaced; if None the whole feature axis is replaced
    d_head: width of one head's feature slice; required when ``head_index`` is given

    Returns:
    A hook ``(module, input, output) -> patched output``. The live output is never
    modified in place: a clone is patched and returned. If the module returned a
    tuple, its first element is patched and the remaining elements are passed
    through unchanged (mirroring how the caching hook stores the first element).

    Raises:
    ValueError: from the factory if ``head_index`` is given without ``d_head``;
        from the hook if ``hook_name`` is missing from ``cache``.

    Only the first ``min(len_output, len_cached)`` positions along the
    second-to-last axis are replaced, so sequences of different length are aligned
    at their start and any extra positions keep their original values. Outputs that
    are not 2-D or 3-D are returned as an unpatched clone.

    NOTE(review): slicing the feature axis by head only isolates one attention head
    if that axis is laid out per head at the hooked module - TODO confirm for the
    modules this hook is attached to.
    """
    if head_index is not None and d_head is None:
        raise ValueError("d_head must be provided when head_index is given.")

    if head_index is None:
        feature_slice = slice(None)
    else:
        feature_slice = slice(head_index * d_head, (head_index + 1) * d_head)

    def hook(module, input, output):
        if hook_name not in cache:
            raise ValueError(f"Activation for {hook_name} not found!")
        cached_activation = cache[hook_name]

        returns_tuple = isinstance(output, tuple)
        patched_output = (output[0] if returns_tuple else output).clone()

        min_seq_len = min(patched_output.shape[-2], cached_activation.shape[-2])
        if patched_output.ndim in (2, 3):
            # The ellipsis covers the optional leading batch axis, so one
            # assignment serves both the 2-D and the 3-D layout.
            patched_output[..., :min_seq_len, feature_slice] = cached_activation[..., :min_seq_len, feature_slice]

        if returns_tuple:
            return (patched_output,) + tuple(output[1:])
        return patched_output
    return hook
# ===== END CHANGE 4 =====

def get_module_by_name(model: nn.Module, name: str) -> nn.Module:
    """
    Resolve a dotted attribute path (e.g. "a.b.0.c") starting from ``model``.

    Each dot-separated segment is looked up with ``getattr`` on the object found
    so far; a missing segment raises AttributeError.
    """
    current = model
    for attribute in name.split('.'):
        current = getattr(current, attribute)
    return current

def run_with_hooks(model: nn.Module, tokenizer: AutoTokenizer, prompt: str, hook_fns: Dict[str, Callable]) -> torch.Tensor:
    """
    Run one forward pass on ``prompt`` with temporary forward hooks attached.

    Parameters:
    model: model to run; submodules are addressed by dotted path
    tokenizer: callable that turns the prompt into keyword tensors for the model
    prompt: text to feed through the model
    hook_fns: mapping from dotted module path to the forward hook to register there

    Returns:
    The logits of the first batch element at the final sequence position.

    All hooks registered here are removed again before returning, including when
    registration or the forward pass raises.
    """
    registered = []
    try:
        for module_path, fn in hook_fns.items():
            target = get_module_by_name(model, module_path)
            handle = target.register_forward_hook(fn)
            registered.append(handle)

        # Tokenize, then move every returned tensor (ids, attention mask, ...) to the model's device.
        encoded = tokenizer(prompt, return_tensors='pt', padding=True)
        batch = {}
        for key, tensor in encoded.items():
            batch[key] = tensor.to(model.device)

        with torch.no_grad():
            result = model(**batch)
        final_position_logits = result.logits[0, -1, :]
        return final_position_logits
    finally:
        for handle in registered:
            handle.remove()

def calculate_logit_diff(logits: torch.Tensor, tokenizer: AutoTokenizer, correct_answer: str, incorrect_answer: str) -> float:
    """
    Calculates the logit difference ``logits[correct] - logits[incorrect]``.

    Each answer is encoded with `add_special_tokens=False` so that no
    Beginning-Of-Sentence token is prepended; otherwise both answers would share
    the same first token and the difference would always be zero. Only the FIRST
    token of each answer is scored, so multi-token answers are compared on their
    first token alone.

    Parameters:
    logits: 1-D tensor of vocabulary logits, indexed by token id
    tokenizer: object exposing ``encode(text, add_special_tokens=...)``
    correct_answer: text whose first token should be favoured
    incorrect_answer: text whose first token should be disfavoured

    Returns:
    The difference as a Python float. This is best-effort: on any failure a message
    is printed and 0.0 is returned, so callers that average results should watch
    the output for warnings.
    """
    try:
        correct_tokens = tokenizer.encode(correct_answer, add_special_tokens=False)
        incorrect_tokens = tokenizer.encode(incorrect_answer, add_special_tokens=False)

        if not correct_tokens or not incorrect_tokens:
            # Report the fallback instead of silently returning a 0.0 that would
            # be indistinguishable from a genuine zero effect.
            print(f"Warning: Tokenizer failed to encode '{correct_answer}' or '{incorrect_answer}'.")
            return 0.0

        correct_id = correct_tokens[0]
        incorrect_id = incorrect_tokens[0]

        return (logits[correct_id] - logits[incorrect_id]).item()

    except IndexError:
        # Empty encodings are handled above, so an IndexError here means a token id
        # does not fit the logits tensor.
        print(f"Warning: Token id for '{correct_answer}' or '{incorrect_answer}' is out of range for the logits.")
        return 0.0
    except Exception as e:
        print(f"An unexpected error occurred in calculate_logit_diff: {e}")
        return 0.0

In [7]:
def perform_patching_experiment(model: nn.Module, tokenizer: AutoTokenizer, model_config: Dict, source_prompt: str, dest_prompt: str, dest_correct_answer: str, dest_incorrect_answer: str, layer: int, component_type: str, head_index: int = None) -> float:
    """
    Patch one component's activation from a source run into a destination run and score the result.

    Parameters:
    model, tokenizer: model under test and its tokenizer
    model_config: dictionary with "mlp_hook_name_template", "attn_hook_name_template",
        "d_model" and "n_heads"
    source_prompt: prompt whose activation is recorded
    dest_prompt: prompt that is run with the recorded activation patched in
    dest_correct_answer, dest_incorrect_answer: answers scored on the patched run
    layer: layer index substituted into the hook name template
    component_type: 'mlp' selects the MLP template; any other value selects the
        attention template, and 'attn_head' additionally enables per-head slicing
    head_index: head whose feature slice is patched (None patches all features)

    Returns:
    Logit difference (correct minus incorrect) of the patched destination run.
    """
    # Choose the hook site: MLP output for 'mlp', attention output otherwise.
    if component_type == 'mlp':
        template_key = 'mlp_hook_name_template'
    else:
        template_key = 'attn_hook_name_template'
    site = model_config[template_key].format(layer)

    # Pass 1: run the source prompt and record the activation at the site.
    # The cache is local to this call, so experiments cannot see each other's activations.
    activations = {}
    recorder = caching_hook_factory(activations, site)
    run_with_hooks(model, tokenizer, source_prompt, {site: recorder})

    # The per-head width is only needed when a single head's slice is patched.
    head_width = None
    if component_type == 'attn_head':
        head_width = model_config["d_model"] // model_config["n_heads"]

    # Pass 2: run the destination prompt with the recorded activation patched in.
    injector = patching_hook_factory(activations, site, head_index, head_width)
    logits_after_patch = run_with_hooks(model, tokenizer, dest_prompt, {site: injector})

    return calculate_logit_diff(logits_after_patch, tokenizer, dest_correct_answer, dest_incorrect_answer)

def run_exploratory_sweep(model: nn.Module, tokenizer: AutoTokenizer, model_config: Dict, task_data: Dict, patch_type: str) -> pd.DataFrame:
    """
    Patch every attention head and every MLP, one at a time, and record the mean effect.

    Parameters:
    model, tokenizer: model under test and its tokenizer
    model_config: dictionary with "n_layers" and "n_heads" (plus the keys needed by
        perform_patching_experiment)
    task_data: dictionary with a "description" string and a "prompts" list; each
        prompt provides clean/corrupted prompts and their correct/incorrect answers
    patch_type: 'noising' patches corrupted activations into the clean run; any
        other value is treated as denoising (clean activations into the corrupted run)

    Returns:
    DataFrame with one row per component and the columns "layer", "head", "type"
    and "effect". "effect" is the patched logit difference minus the unpatched
    baseline of the destination prompt, averaged over all prompts. MLP rows use
    head == -1.
    """
    n_layers, n_heads = model_config["n_layers"], model_config["n_heads"]
    prompt_dataset = task_data['prompts']

    print(f"\nRunning {patch_type} sweep for '{task_data['description']}' over {len(prompt_dataset)} prompts...")

    # The patching direction is the same for every component, so it is resolved once here.
    if patch_type == 'noising':
        source_key, dest_key = 'corrupted_prompt', 'clean_prompt'
        correct_key, incorrect_key = 'clean_correct_answer', 'clean_incorrect_answer'
    else:  # denoising
        source_key, dest_key = 'clean_prompt', 'corrupted_prompt'
        correct_key, incorrect_key = 'corrupted_correct_answer', 'corrupted_incorrect_answer'

    # Pre-calculate, per prompt, the unpatched baseline of the destination run.
    # Only the baseline that is actually compared against is computed.
    patch_jobs = []
    for prompt_set in prompt_dataset:
        dest_prompt = prompt_set[dest_key]
        dest_correct, dest_incorrect = prompt_set[correct_key], prompt_set[incorrect_key]
        baseline_logits = run_with_hooks(model, tokenizer, dest_prompt, {})
        baseline = calculate_logit_diff(baseline_logits, tokenizer, dest_correct, dest_incorrect)
        patch_jobs.append((prompt_set[source_key], dest_prompt, dest_correct, dest_incorrect, baseline))

    results = []
    for component_type, head_range in [('attn_head', range(n_heads)), ('mlp', [-1])]:
        print(f"  - Patching {component_type}s...")
        for layer in range(n_layers):
            for head_index in head_range:
                # -1 is only a row label for MLPs; no head slicing is requested for them.
                patched_head = head_index if component_type == 'attn_head' else None
                effects_for_this_component = [
                    perform_patching_experiment(
                        model, tokenizer, model_config,
                        source_prompt, dest_prompt, dest_correct, dest_incorrect,
                        layer, component_type, patched_head,
                    ) - baseline
                    for source_prompt, dest_prompt, dest_correct, dest_incorrect, baseline in patch_jobs
                ]

                # Calculate the average effect across all prompts for this one component
                average_effect = np.mean(effects_for_this_component)
                results.append({'layer': layer, 'head': head_index, 'type': component_type, 'effect': average_effect})

    return pd.DataFrame(results)

In [8]:
def plot_results(df: pd.DataFrame, title: str, component_type: str, output_path: str = None):
    """
    Plot the sweep results for one component type.

    Parameters:
    df: results with the columns "layer", "head", "type" and "effect"
    title: figure title
    component_type: 'attn_head' draws a head-by-layer heatmap with a symmetric
        colour scale; 'mlp' draws one bar per layer
    output_path: if given, the figure is also saved there as a PDF

    Returns:
    None. Nothing is drawn when ``df`` has no rows of the requested type.

    Raises:
    ValueError: if ``component_type`` is neither 'attn_head' nor 'mlp'.
    """
    if component_type not in ('attn_head', 'mlp'):
        raise ValueError("Invalid component type")

    rows = df[df['type'] == component_type]
    if rows.empty:
        return

    if component_type == 'attn_head':
        heatmap = rows.pivot(index='head', columns='layer', values='effect')
        fig, ax = plt.subplots(figsize=(12, 10))
        # Symmetric limits keep zero effect at the centre of the diverging colour map.
        if heatmap.empty:
            limit = 1.0
        else:
            limit = heatmap.abs().max().max()
        image = ax.imshow(heatmap, cmap='coolwarm', vmin=-limit, vmax=limit, aspect='auto')
        colorbar = ax.figure.colorbar(image, ax=ax)
        colorbar.ax.set_ylabel("Effect on Logit Difference", rotation=-90, va="bottom")
        n_rows, n_cols = heatmap.shape[0], heatmap.shape[1]
        ax.set_xticks(np.arange(n_cols))
        ax.set_yticks(np.arange(n_rows))
        ax.set_xticklabels(heatmap.columns)
        ax.set_yticklabels(heatmap.index)
        ax.set_xlabel("Layer")
        ax.set_ylabel("Head Index")
    else:
        per_layer = rows.sort_values('layer')
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.bar(per_layer['layer'], per_layer['effect'], color='skyblue')
        ax.set_xlabel("Layer")
        ax.set_ylabel("Average Effect on Logit Difference")
        ax.grid(axis='y', linestyle='--')
        ax.set_xticks(per_layer['layer'])

    ax.set_title(title)
    fig.tight_layout()
    if output_path:
        plt.savefig(output_path, format='pdf')
        print(f"Saved plot to: {output_path}")
    plt.show()
    plt.close(fig)

In [9]:
def _delete_model_cache(model_hf_name: str) -> None:
    """Deletes only the given model's folder from the local Hugging Face cache."""
    # Imported here so the helper is self-contained within this cell.
    import shutil
    from transformers import file_utils

    # The cache folder name is "models--" plus the model id with "/" turned into "--".
    model_cache_name = f"models--{model_hf_name.replace('/', '--')}"
    cache_path = os.path.join(file_utils.default_cache_path, model_cache_name)

    if os.path.exists(cache_path):
        print(f"Deleting model cache: {cache_path}")
        shutil.rmtree(cache_path, ignore_errors=True)


def main():
    """
    Main function to run the activation patching experiment across multiple models.

    For every entry in `models_to_test` the model is loaded, a sweep is run for
    each task and patch type, and the resulting table and plots are written to
    a model-specific subfolder of the output directory. A model that fails to
    load is reported and skipped. In both cases that model's local cache folder
    is deleted afterwards.
    """
    import gc

    # This model dictionary is taken directly from the user's script
    models_to_test = {
        #'gptneo': 'EleutherAI/gpt-neo-125m',
        #'gpt2': 'gpt2', # Added gpt2 for a quick baseline
        #'gptj6b': 'EleutherAI/gpt-j-6B',
        #'llama27b': 'meta-llama/Llama-2-7b-hf',
        'llama213b': 'meta-llama/Llama-2-13b-hf',
        #'gptneox20b': 'EleutherAI/gpt-neox-20b',
        #'llama270b': 'meta-llama/Llama-2-70b-hf'
    }

    main_output_dir = "Results_5_Random_MultiRun"
    os.makedirs(main_output_dir, exist_ok=True)
    datasets = get_task_datasets()

    for model_short_name, model_hf_name in models_to_test.items():
        torch.cuda.empty_cache()
        try:
            model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_hf_name)
        except Exception as e:
            print(f"\n--- Could not load model {model_hf_name}. Skipping. Error: {e} ---\n")
            # Remove whatever was cached for this model before moving on.
            _delete_model_cache(model_hf_name)
            continue

        # Create model-specific subdirectory inside the main "Results" folder
        model_results_dir = os.path.join(main_output_dir, model_short_name)
        os.makedirs(model_results_dir, exist_ok=True)

        for task_name, task_data in datasets.items():
            for patch_type in ['noising']:
                result_key = f"{task_name}_{patch_type}"
                df = run_exploratory_sweep(model, tokenizer, model_config, task_data, patch_type)

                # Save CSV and Plots inside the model-specific subfolder
                csv_path = os.path.join(model_results_dir, f"{result_key}_results.csv")
                df.to_csv(csv_path, index=False)
                print(f"Saved data to: {csv_path}")

                plot_path_attn = os.path.join(model_results_dir, f"{result_key}_attn_heads.pdf")
                plot_path_mlp = os.path.join(model_results_dir, f"{result_key}_mlp_layers.pdf")
                title_attn = f"Attention Heads Effect ({patch_type.capitalize()})\n{model_short_name} - {task_data['description']}"
                title_mlp = f"MLP Layers Effect ({patch_type.capitalize()})\n{model_short_name} - {task_data['description']}"
                plot_results(df, title_attn, 'attn_head', output_path=plot_path_attn)
                plot_results(df, title_mlp, 'mlp', output_path=plot_path_mlp)

        print(f"--- Finished with {model_short_name}. Clearing memory. ---")
        del model, tokenizer, model_config
        # Collect reference cycles first so that objects kept alive only by
        # cycles are freed before the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

        # Delete ONLY this model's cache so the next model has disk space.
        _delete_model_cache(model_hf_name)

In [None]:
# Entry-point guard: start the full sweep only when this code runs as the main
# program, so that importing it as a module does not trigger main().
if __name__ == '__main__':
    main()

--- Loading model and tokenizer for 'gpt2' ---


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Running noising sweep for 'Letter-String Analogy Task ('+1' vs No Rule)' over 20 prompts...
  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/gpt2/analogy_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/gpt2/analogy_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/gpt2/analogy_noising_mlp_layers.pdf

Running noising sweep for 'Next-Item Sequencing Task' over 20 prompts...


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/gpt2/sequencing_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/gpt2/sequencing_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/gpt2/sequencing_noising_mlp_layers.pdf
--- Finished with gpt2. Clearing memory. ---
Deleting model cache: /home/awadehra/.cache/huggingface/hub/models--gpt2
--- Loading model and tokenizer for 'EleutherAI/gpt-j-6B' ---
Using float16 revision for gpt-j-6B to ensure PyTorch-only workflow.


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/836 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi


Running noising sweep for 'Letter-String Analogy Task ('+1' vs No Rule)' over 20 prompts...
  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/gptj6b/analogy_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/gptj6b/analogy_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/gptj6b/analogy_noising_mlp_layers.pdf

Running noising sweep for 'Next-Item Sequencing Task' over 20 prompts...


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/gptj6b/sequencing_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/gptj6b/sequencing_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/gptj6b/sequencing_noising_mlp_layers.pdf
--- Finished with gptj6b. Clearing memory. ---
Deleting model cache: /home/awadehra/.cache/huggingface/hub/models--EleutherAI--gpt-j-6B


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


--- Loading model and tokenizer for 'meta-llama/Llama-2-7b-hf' ---


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]


Running noising sweep for 'Letter-String Analogy Task ('+1' vs No Rule)' over 20 prompts...
  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/llama27b/analogy_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/llama27b/analogy_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/llama27b/analogy_noising_mlp_layers.pdf

Running noising sweep for 'Next-Item Sequencing Task' over 20 prompts...


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/llama27b/sequencing_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/llama27b/sequencing_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/llama27b/sequencing_noising_mlp_layers.pdf
--- Finished with llama27b. Clearing memory. ---


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


Deleting model cache: /home/awadehra/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf
--- Loading model and tokenizer for 'meta-llama/Llama-2-13b-hf' ---


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]


Running noising sweep for 'Letter-String Analogy Task ('+1' vs No Rule)' over 20 prompts...
  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/llama213b/analogy_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/llama213b/analogy_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/llama213b/analogy_noising_mlp_layers.pdf

Running noising sweep for 'Next-Item Sequencing Task' over 20 prompts...


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


  - Patching attn_heads...
  - Patching mlps...
Saved data to: Results_5_Random_MultiRun/llama213b/sequencing_noising_results.csv
Saved plot to: Results_5_Random_MultiRun/llama213b/sequencing_noising_attn_heads.pdf
Saved plot to: Results_5_Random_MultiRun/llama213b/sequencing_noising_mlp_layers.pdf
--- Finished with llama213b. Clearing memory. ---


  plt.show(); plt.close(fig)
  plt.show(); plt.close(fig)


Deleting model cache: /home/awadehra/.cache/huggingface/hub/models--meta-llama--Llama-2-13b-hf
--- Loading model and tokenizer for 'EleutherAI/gpt-neox-20b' ---


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 46 files:   0%|          | 0/46 [00:00<?, ?it/s]

model-00003-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00002-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00001-of-00046.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

model-00005-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00008-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00007-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00006-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00004-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00009-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00010-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00011-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00012-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00013-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00014-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00015-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00016-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00017-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00018-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00019-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00020-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00021-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00022-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00023-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00024-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00025-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00026-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00027-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00028-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00029-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00030-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00031-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00033-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00032-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00034-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00035-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00036-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00037-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00038-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00039-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00040-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00041-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00042-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00043-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00044-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00045-of-00046.safetensors:   0%|          | 0.00/604M [00:00<?, ?B/s]

model-00046-of-00046.safetensors:   0%|          | 0.00/620M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]


Running noising sweep for 'Letter-String Analogy Task ('+1' vs No Rule)' over 20 prompts...
  - Patching attn_heads...
