In [44]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
!git clone https://github.com/ruoyuxie/adversarial_mwps_generation

Cloning into 'adversarial_mwps_generation'...
remote: Enumerating objects: 876, done.[K
remote: Counting objects: 100% (876/876), done.[K
remote: Compressing objects: 100% (551/551), done.[K
remote: Total 876 (delta 15), reused 862 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (876/876), 1.76 MiB | 3.75 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [None]:
generated_data_path = '/content/adversarial_mwps_generation/data/experimental_data/generated_adversarial_examples/gsm8k.json'
import json
with open(generated_data_path, 'r') as f:
  data = json.loads(f.read())

sample_input = data['2']['M2'][0][0]
sample_output = data['2']['M2'][0][1]

In [None]:
sample_input = data['2']['M3'][0][0]
sample_output = data['2']['M3'][0][1]

In [None]:
sample_input

'Chenny is 15 years old. Alyana is 6 years younger than Chenny. How old is Anne if she is 8 years older than Alyana?'

In [None]:
sample_input

'Chenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?'

In [None]:
sample_output

65

In [2]:
correct_prompt = "Chenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?"
purtubed_prompt = "Chenny is 78 years old. Alyana is 21 years younger than Chenny. How old is Anne if she is 31 years older than Alyana?"

correct_ans = '65'
purtubed_ans = '88'

correct_ans_tokens = tokenizer.encode(str(correct_ans))
purtubed_ans_tokens = tokenizer.encode(str(purtubed_ans))


In [3]:
sample_probs = []

In [36]:
indirect_effect = {}
boxed_token = torch.tensor((tokenizer.encode('boxed{')))


In [37]:
# temperature = 0.7 and top_p = 0.8 according to Qwen docs
# https://github.com/QwenLM/Qwen2.5-Math
torch.manual_seed(42)

def inference_with_hidden_state_new(prompt, temperature=0.7, top_p=0.8, save_activations=False, patch_activations = False, patch_layer=0, activations={}):
    """Returns a dictionary with keys: 'sequences', 'logits', 'attentions', 'hidden_states'.

    - sequences: The output token IDs, shape: (batch_size, output_len).
    - logits: Logits for the last generated token, shape: (1, vocab_size).
    - attentions: Attention outputs for the last token, shape: (num_layers, num_heads, seq_len, seq_len).
    - hidden_states: Hidden states for the last token, shape: (num_layers, hidden_size).
    """
    # Prepare the input
    messages = [
        # {"role": "system", "content": "Please put your final answer within \\boxed{}."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    input_ids = model_inputs.input_ids

    global sample_probs
    # Initialize variables to store outputs
    sequences = input_ids
    # last_logits = None
    # last_attentions = None
    # last_hidden_states = None

    # Disable cache to get full attention outputs
    model.config.use_cache = True
    track = False
    # Generate tokens one by one
    stored_activations = {}
    with torch.no_grad():
        for _ in range(512):  # max_new_tokens
            outputs = model(
                input_ids=sequences,
                # output_attentions=True,
                # output_hidden_states=True,
                # return_dict=True
            )

            # Get the logits for the next token
            next_token_logits = outputs.logits[:, -1, :]  # Shape: (1, vocab_size)

            # Apply temperature scaling
            next_token_logits = next_token_logits / temperature

            # Sample the next token
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1)

            # Append the new token to the sequence
            # prev_sequences = sequences.clone()
            sequences = torch.cat([sequences, next_token_id], dim=-1)

            if track and cnt < len(correct_ans_tokens):
              pr = f'pr*_{patch_layer}' if patch_activations else 'pr'
              indirect_effect[f'{pr}_{cnt}'] = probs[0, correct_ans_tokens[cnt]]
              indirect_effect[f'{pr}!_{cnt}'] = probs[0, purtubed_ans_tokens[cnt]]
              cnt +=1

            if torch.all(sequences[0,-2:] == boxed_token.to('cuda')):
              track = True
              cnt = 0
              sample_probs = probs

            if track and torch.all(sequences[0,-1:] == torch.tensor(tokenizer.encode('}')).to('cuda')):
              track = False
              print('ggg')


## ----------------------- Save activations ---------------------------
            if save_activations and track and cnt == 0:
              print('saving activations ..')
              def hook_fn(name):
                def hook(module, input, output):
                    # Store the activation for this module
                    stored_activations[name] = output.detach().clone()
                return hook

              # Register forward hook
              hooks = []

              try:

                for layer in range(model.config.num_hidden_layers):
                  print(layer)

                  module = model.model.layers[layer].mlp
                  handle = module.register_forward_hook(hook_fn(f'layer_{layer}'))
                  hooks.append(handle)

                # Run the forward
                output = model(input_ids=sequences)
              finally:
                for h in hooks:
                  h.remove()

              return stored_activations

## ----------------------- Patch activations -------------------------
            if patch_activations and track and cnt == 0:
              print('applying patching')
              def hook_fn(name):
                def hook(module, input, output):
                    # Store the activation for this module
                    # stored_activations[name] = output.detach().clone()
                    output[:,-1,:] = activations[f'layer_{patch_layer}'][:,-1,:]
                return hook

              # Register forward hook
              hooks = []



              module = model.model.layers[patch_layer].mlp
              handle = module.register_forward_hook(hook_fn(f'layer_{patch_layer}'))
              hooks.append(handle)

                # # Run the forward
                # output = model(input_ids=sequences)
              # finally:
              #   for h in hooks:
              #     h.remove()


            # Stop if the end-of-sequence token is generated
            if next_token_id.item() == tokenizer.eos_token_id:
                if patch_activations:
                  print('clearing hook')
                  for h in hooks:
                    h.remove()
                break

    # Prepare the output dictionary
    output_dict = {
        "sequences": sequences,
        # "logits": last_logits,
        # "attentions": last_attentions,
        # "hidden_states": last_hidden_states
    }

    # Clear GPU memory
    torch.cuda.empty_cache()

    return output_dict

In [15]:
purtubed_prompt

In [38]:
activations = inference_with_hidden_state_new(purtubed_prompt, save_activations=True)

saving activations ..
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


In [18]:
activations.keys()

dict_keys(['layer_0', 'layer_1', 'layer_2', 'layer_3', 'layer_4', 'layer_5', 'layer_6', 'layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12', 'layer_13', 'layer_14', 'layer_15', 'layer_16', 'layer_17', 'layer_18', 'layer_19', 'layer_20', 'layer_21', 'layer_22', 'layer_23', 'layer_24', 'layer_25', 'layer_26', 'layer_27'])

In [39]:
ans = inference_with_hidden_state_new(correct_prompt)

In [42]:
tokenizer.decode(ans['sequences'].squeeze())

"<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\nChenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?<|im_end|>\n<|im_start|>assistant\nTo determine Anne's age, we need to follow these steps:\n\n1. Find out Alyana's age.\n2. Use Alyana's age to find out Anne's age.\n\nStep 1: Calculate Alyana's age.\nChenny is 64 years old, and Alyana is 54 years younger than Chenny. Therefore, Alyana's age is:\n\\[ 64 - 54 = 10 \\]\nSo, Alyana is 10 years old.\n\nStep 2: Calculate Anne's age.\nAnne is 55 years older than Alyana. Therefore, Anne's age is:\n\\[ 10 + 55 = 65 \\]\nSo, Anne is 65 years old.\n\nThe final answer is:\n\\[\n\\boxed{65}\n\\]<|im_end|>"

In [40]:
ans1 = inference_with_hidden_state_new(correct_prompt,patch_activations=True, patch_layer=0, activations = activations)

applying patching
clearing hook


In [43]:
tokenizer.decode(ans1['sequences'].squeeze())

"<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\nChenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?<|im_end|>\n<|im_start|>assistant\nTo determine Anne's age, we need to follow these steps:\n\n1. Find Alyana's age.\n2. Use Alyana's age to find Anne's age.\n\nFirst, let's find Alyana's age. We know that Alyana is 54 years younger than Chenny. Chenny is 64 years old. Therefore, Alyana's age is:\n\\[ 64 - 54 = 10 \\]\nSo, Alyana is 10 years old.\n\nNext, we need to find Anne's age. We know that Anne is 55 years older than Alyana. Since Alyana is 10 years old, Anne's age is:\n\\[ 10 + 55 = 65 \\]\nSo, Anne is 65 years old.\n\nTherefore, the answer is:\n\\[\n\\boxed{65}\n\\```<|im_end|>"

In [41]:
indirect_effect

{'pr_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_0': tensor(2.3648e-14, device='cuda:0', dtype=torch.bfloat16),
 'pr_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_1': tensor(3.7947e-17, device='cuda:0', dtype=torch.bfloat16),
 'pr*_0_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_0!_0': tensor(5.0182e-14, device='cuda:0', dtype=torch.bfloat16),
 'pr*_0_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_0!_1': tensor(8.0838e-16, device='cuda:0', dtype=torch.bfloat16)}

In [None]:
# get the probability for correct and purturbed answers from the clean run
output_clean = inference_with_hidden_state_new(corret_prompt)

In [19]:
indirect_effect

{}

In [None]:
|def run_indirect_effect():

  # get the activations from a purtubed prompt
  activations = inference_with_hidden_state_new(purtubed_prompt, save_activations=True)

  # get the probability for correct and purturbed answers from the clean run
  output_clean = inference_with_hidden_state_new(corret_prompt)

  # get the probability for correct and purturbed answers from the patch run for each layer in the model
  for layer in range(model.config.num_hidden_layers):
    inference_with_hidden_state_new(correct_prompt,patch_activations=True, patch_layer=layer, stored_activations = activations)


In [None]:
import torch.nn as nn

In [None]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

In [None]:
isinstance(model.model.layers[1].mlp, nn.Module)

True

In [None]:
stored_activations = []

In [None]:
lets_see = inference_with_hidden_state_new(purtubed_prompt, save_activations=True)

applying
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


In [None]:
lets_see['layer_0'].shape

torch.Size([1, 245, 1536])

In [None]:
def exec_circuit_find():

  activations = inference_with_



In [None]:
tokenizer.decode([90])

'{'

In [None]:
sample_probs

tensor([[1.6213e-22, 2.4945e-24, 5.4528e-21,  ..., 3.8774e-24, 3.8774e-24,
         3.8774e-24]], device='cuda:0', dtype=torch.bfloat16)

In [None]:
# input = "Chenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?"
ans = inference_with_hidden_state_new(correct_prompt, top_p =1.0)


In [None]:
indirect_effect

{'pr_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_0': tensor(1.1990e-13, device='cuda:0', dtype=torch.bfloat16),
 'pr_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_1': tensor(3.8164e-16, device='cuda:0', dtype=torch.bfloat16)}

In [None]:
boxed_token = torch.tensor((tokenizer.encode('boxed{')))

In [None]:
for i in range(2,len(ans['sequences'].squeeze())):
  sample = ans['sequences'][:, :i+1]
  # print(smple)
  # if sample[:, -2:]== boxed_token.to(device):
  #   print('yes', i)


  if torch.all(sample[0,-2:] == boxed_token.to('cuda')):
    print('yoo')
    print(i)
    start_tracking = True



yoo
244


In [None]:
len(ans['sequences'].squeeze)

TypeError: object of type 'builtin_function_or_method' has no len()

In [13]:
tokenizer.decode(ans['sequences'].squeeze())

NameError: name 'ans' is not defined

In [None]:
tokenizer.decode(ans['sequences'].squeeze()[244])

'{'

In [59]:
import torch

class ActivationPatcher:
    def __init__(self, model, tokenizer, device='cuda'):
        """Initialize the ActivationPatcher with model and tokenizer.

        Args:
            model: The language model to use
            tokenizer: The tokenizer for the model
            device: Device to run on (default: 'cuda')
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.sample_probs = None
        self.indirect_effect = {}
        self.boxed_token = None  # This needs to be set

    def prepare_input(self, prompt):
        """Convert a text prompt to model inputs.

        Args:
            prompt: Text prompt to convert

        Returns:
            Tokenized input IDs
        """
        messages = [{"role": "user", "content": prompt}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.device)
        return model_inputs.input_ids

    def inference(
        self,
        prompt,
        temperature=0.7,
        top_p=0.8,
        save_activations=False,
        patch_activations=False,
        patch_layer=0,
        activations=None,
        max_tokens=512
    ):
        """Run inference with options for saving or patching MLP activations.

        Args:
            prompt: Text prompt to process
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            save_activations: Whether to save activations
            patch_activations: Whether to patch activations
            patch_layer: Which layer to patch if patching
            activations: Pre-saved activations to use for patching
            max_tokens: Maximum tokens to generate

        Returns:
            Dictionary with outputs including generated sequences
        """
        # Initialize
        input_ids = self.prepare_input(prompt)
        sequences = input_ids
        stored_activations = {}

        # For tracking boxed answers
        track = False
        cnt = 0
        hooks = []

        # Configure model
        self.model.config.use_cache = True

        # Generate tokens
        with torch.no_grad():
            for _ in range(max_tokens):
                outputs = self.model(
                    input_ids=sequences,
                )

                # Get logits and sample next token
                next_token_logits = outputs.logits[:, -1, :]
                next_token_logits = next_token_logits / temperature
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)

                # Add new token to sequence
                sequences = torch.cat([sequences, next_token_id], dim=-1)

                # Track probabilities for correct and perturbed answers
                if track and cnt < len(self.correct_ans_tokens):
                    pr = f'pr*_{patch_layer}' if patch_activations else 'pr'
                    self.indirect_effect[f'{pr}_{cnt}'] = probs[0, self.correct_ans_tokens[cnt]]
                    self.indirect_effect[f'{pr}!_{cnt}'] = probs[0, self.perturbed_ans_tokens[cnt]]
                    cnt += 1

                # Detect when we enter a boxed answer
                if torch.all(sequences[0, -2:] == self.boxed_token.to(self.device)):
                    track = True
                    cnt = 0
                    # self.sample_probs = probs

                # Detect end of boxed answer
                if track and torch.all(sequences[0, -1:] == torch.tensor(self.tokenizer.encode('}')).to(self.device)):
                    track = False
                    print('End of boxed answer detected')

                # Handle activation saving
                if save_activations and track and cnt == 0:
                    print('Saving activations...')
                    stored_activations = self._save_layer_activations(sequences)
                    return stored_activations

                # Handle activation patching
                if patch_activations and track and cnt == 0:
                    print(f'Applying patching at layer {patch_layer}')
                    hooks = self._apply_activation_patch(patch_layer, activations)

                # Stop on EOS token
                if next_token_id.item() == self.tokenizer.eos_token_id:
                    if patch_activations and hooks:
                        print('Clearing hooks')
                        for h in hooks:
                            h.remove()
                    break

        # Clean up
        torch.cuda.empty_cache()

        return {
            "sequences": sequences,
            "indirect_effect": self.indirect_effect
        }

    def _save_layer_activations(self, sequences):
        """Save MLP activations for all layers.

        Args:
            sequences: Current token sequences

        Returns:
            Dictionary of activations by layer
        """
        stored_activations = {}
        hooks = []

        try:
            # Create hooks for each layer
            for layer in range(self.model.config.num_hidden_layers):
                print(f"Saving activations for layer {layer}")

                def hook_fn(name):
                    def hook(module, input, output):
                        stored_activations[name] = output.detach().clone()
                    return hook

                module = self.model.model.layers[layer].mlp
                handle = module.register_forward_hook(hook_fn(f'layer_{layer}'))
                hooks.append(handle)

            # Run forward pass to collect activations
            self.model(input_ids=sequences)

        finally:
            # Remove hooks when done
            for h in hooks:
                h.remove()

        return stored_activations

    def _apply_activation_patch(self, patch_layer, activations):
        """Apply activation patching to a specific layer.

        Args:
            patch_layer: Layer to patch
            activations: Saved activations to use for patching

        Returns:
            List of hook handles
        """
        hooks = []

        def hook_fn(name):
            def hook(module, input, output):
                output[:, -1, :] = activations[f'layer_{patch_layer}'][:, -1, :]
                return output
            return hook

        module = self.model.model.layers[patch_layer].mlp
        handle = module.register_forward_hook(hook_fn(f'layer_{patch_layer}'))
        hooks.append(handle)

        return hooks

    def run_indirect_effect_analysis(
        self,
        correct_prompt,
        perturbed_prompt,
        correct_ans_tokens,
        perturbed_ans_tokens
    ):
        """Run full indirect effect analysis with patching.

        Args:
            correct_prompt: The original correct prompt
            perturbed_prompt: The modified prompt with perturbations
            correct_ans_tokens: Token IDs for correct answers
            perturbed_ans_tokens: Token IDs for perturbed answers

        Returns:
            Dictionary with analysis results
        """
        # Set up answer tokens
        self.correct_ans_tokens = correct_ans_tokens
        self.perturbed_ans_tokens = perturbed_ans_tokens

        # Step 1: Get activations from perturbed prompt
        print("Getting activations from perturbed prompt...")
        activations = self.inference(perturbed_prompt, save_activations=True)

        # Step 2: Run clean inference on correct prompt
        print("Running inference on clean prompt...")
        clean_results = self.inference(correct_prompt)

        # Step 3: Run patched inference for each layer
        all_results = {"clean": clean_results}
        for layer in range(self.model.config.num_hidden_layers):
            print(f"Testing indirect effect at layer {layer}...")
            patched_results = self.inference(
                correct_prompt,
                patch_activations=True,
                patch_layer=layer,
                activations=activations
            )
            all_results[f"patched_layer_{layer}"] = patched_results

        return all_results



In [60]:
correct_prompt = "Chenny is 64 years old. Alyana is 54 years younger than Chenny. How old is Anne if she is 55 years older than Alyana?"
perturbed_prompt = "Chenny is 78 years old. Alyana is 21 years younger than Chenny. How old is Anne if she is 31 years older than Alyana?"

correct_ans = '65'
perturbed_ans = '88'

correct_ans_tokens = tokenizer.encode(str(correct_ans))
purturbed_ans_tokens = tokenizer.encode(str(perturbed_ans))


In [63]:
patcher = ActivationPatcher(model, tokenizer)

# Set up the boxed token if needed
patcher.boxed_token = torch.tensor(tokenizer.encode('boxed{')).to('cuda')

In [64]:
res = patcher.run_indirect_effect_analysis(
    correct_prompt=correct_prompt,
    perturbed_prompt=perturbed_prompt,
    correct_ans_tokens=correct_ans_tokens,
    perturbed_ans_tokens=purturbed_ans_tokens
)

Getting activations from perturbed prompt...
Saving activations...
Saving activations for layer 0
Saving activations for layer 1
Saving activations for layer 2
Saving activations for layer 3
Saving activations for layer 4
Saving activations for layer 5
Saving activations for layer 6
Saving activations for layer 7
Saving activations for layer 8
Saving activations for layer 9
Saving activations for layer 10
Saving activations for layer 11
Saving activations for layer 12
Saving activations for layer 13
Saving activations for layer 14
Saving activations for layer 15
Saving activations for layer 16
Saving activations for layer 17
Saving activations for layer 18
Saving activations for layer 19
Saving activations for layer 20
Saving activations for layer 21
Saving activations for layer 22
Saving activations for layer 23
Saving activations for layer 24
Saving activations for layer 25
Saving activations for layer 26
Saving activations for layer 27
Running inference on clean prompt...
End of box

In [65]:
patcher.indirect_effect

{'pr_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_0': tensor(7.8515e-13, device='cuda:0', dtype=torch.bfloat16),
 'pr_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr!_1': tensor(8.1601e-15, device='cuda:0', dtype=torch.bfloat16),
 'pr*_0_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_0!_0': tensor(9.3703e-14, device='cuda:0', dtype=torch.bfloat16),
 'pr*_0_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_0!_1': tensor(1.4322e-14, device='cuda:0', dtype=torch.bfloat16),
 'pr*_1_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_1!_0': tensor(4.8431e-11, device='cuda:0', dtype=torch.bfloat16),
 'pr*_1_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_1!_1': tensor(1.0800e-11, device='cuda:0', dtype=torch.bfloat16),
 'pr*_2_0': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr*_2!_0': tensor(1.0090e-12, device='cuda:0', dtype=torch.bfloat16),
 'pr*_2_1': tensor(1., device='cuda:0', dtype=torch.bfloat16),
 'pr