In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
prompt1 = "LOVE"
prompt2 = "HATE"

In [None]:
class ActivationSteering:
    def __init__(self, model_name):
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda"
        elif (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        ):
            device = "mps"

        self.device = device

        print(f"Loading model and tokenizer")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, device_map=device
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map=device
        )
        print(f"Model and tokenizer loaded on {self.model.device}")

        print("Finding attention layers...")
        self.attention_layers = self._get_attention_layers()
        print(f"Found {len(self.attention_layers)} attention layers")

    def chat_and_get_activation_vectors(self, prompt, max_tokens=3000):
        # Tokenize the prompt
        print(f"Tokenizing prompt: {prompt}")
        messages = [{"role": "user", "content": prompt}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)

        # Initialize the attention vectors
        activation_vectors = {}
        hooks = []

        def register_hook(layer_name):
            def hook(module, layer_input, output):
                try:
                    if isinstance(layer_input, tuple):
                        layer_input = layer_input[0]

                    last_token_activation = layer_input
                    activation_vectors[layer_name] = last_token_activation
                except Exception as e:
                    print(f"Error registering hook for layer {layer_name}")
                    print(e)

            return hook

        print("Attaching hooks...")
        for layer in self.attention_layers:
            handle = layer["module"].register_forward_hook(
                register_hook(layer["name"])
            )
            hooks.append(handle)

        print("Running model...")
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            output_text = self.tokenizer.decode(
                output_ids[0][len(inputs.input_ids[0]) :],
                skip_special_tokens=True,
            )

        print("Detaching hooks...")
        for hook in hooks:
            hook.remove()
        hooks = []

        return {
            "output": output_text,
            "activation_vectors": activation_vectors,
        }

    def chat_and_apply_steering_vector(
        self, prompt, steering_vector, layer_name, max_tokens=3000
    ):
        # Tokenize the prompt
        print(f"Tokenizing prompt: {prompt}")
        messages = [{"role": "user", "content": prompt}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)

        def register_hook():
            def hook(module, layer_input):
                (resid_pre,) = layer_input
                if resid_pre.shape[1] == 1:
                    return None  # caching for new tokens in generate()

                # We only add to the prompt (first call), not the generated tokens.
                ppos, apos = resid_pre.shape[1], steering_vector.shape[1]
                assert (
                    apos <= ppos
                ), f"More mod tokens ({apos}) then prompt tokens ({ppos})!"

                # TODO: Make this a function-wrapper for flexibility.
                resid_pre[:, :apos, :] += steering_vector
                return resid_pre

            return hook

        print("Attaching hooks...")
        hooks = []
        for layer in self.attention_layers:
            # Only attach the hook to the layer we want to steer
            if layer["name"] == layer_name:
                print(f"Attaching steering hook to layer {layer['name']}")
                handle = layer["module"].register_forward_pre_hook(register_hook())
                hooks.append(handle)

        print("Running model...")
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            output_text = self.tokenizer.decode(
                output_ids[0][len(inputs.input_ids[0]) :],
                skip_special_tokens=True,
            )

        print("Detaching hooks...")
        for hook in hooks:
            hook.remove()
        hooks = []

        return {
            "output": output_text,
        }

    def _get_attention_layers(self):
        layers = []

        for name, module in self.model.named_modules():
            if "post_attention_layernorm" in name or name.endswith("mlp"):
                layers.append({"name": name, "module": module})

        return layers

In [None]:
a = ActivationSteering("Qwen/Qwen3-0.6B")

Loading model and tokenizer
Model and tokenizer loaded on cuda:0
Finding attention layers...
Found 56 attention layers


In [13]:
for name, module in a.model.named_modules():
	print(name)



model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.q_norm
model.layers.0.self_attn.k_norm
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.q_norm
model.layers.1.self_attn.k_norm
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_att

In [14]:
love_output = a.chat_and_get_activation_vectors("LOVE")

Tokenizing prompt: LOVE
Attaching hooks...
Running model...
Detaching hooks...


In [15]:
love_output["output"]

"I'm glad to hear that you're enjoying the conversation! Love is a powerful emotion that can be expressed in many ways. If you have any questions or need help with something, feel free to ask me. 😊"

In [16]:
love_output["activation_vectors"]["model.layers.25.post_attention_layernorm"][0].shape

torch.Size([1, 1024])

In [17]:
hate_output = a.chat_and_get_activation_vectors("HATE")

Tokenizing prompt: HATE
Attaching hooks...
Running model...
Detaching hooks...


In [18]:
hate_output["output"]

'I am sorry for the confusion. HATE is a complex emotion, and I am here to support and understand it. If you have any questions or need help, feel free to ask.'

In [19]:
prompt = "Do you like dogs?"

In [None]:
love_activation_vectors = list(love_output["activation_vectors"].items())
hate_activation_vectors = list(hate_output["activation_vectors"].items())

for i in range(15, len(love_activation_vectors)):
	layer_name = love_activation_vectors[i][0]
	love_activation_vector = love_activation_vectors[i][1]
	hate_activation_vector = hate_activation_vectors[i][1]

	love_steering_vector = love_activation_vector - hate_activation_vector

	STEERING_STRENGTH = -20
	steering_vector = love_steering_vector * STEERING_STRENGTH
	steered_output = a.chat_and_apply_steering_vector(prompt, steering_vector, layer_name)
	print(f"Output for layer {layer_name}, strength {STEERING_STRENGTH}:\n{steered_output}")

	print()

	# STEERING_STRENGTH = 10
	# steering_vector = love_steering_vector * STEERING_STRENGTH
	# steered_output = a.chat_and_apply_steering_vector(prompt, steering_vector, layer_name)
	# response = steered_output["output"]
	# print(f"Output for layer {layer_name}, strength {STEERING_STRENGTH}:\n{response}")

	print("-" * 100)



Tokenizing prompt: Do you like dogs?
Attaching hooks...
Attaching steering hook to layer model.layers.7.mlp
Running model...
Detaching hooks...
Output for layer model.layers.7.mlp, strength -20:
{'output': 'ampions.\n\ndogs. \n\nyes. always. all. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and. and

: 