In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Two single-word prompts; the same strings are passed as literals to
# chat_and_get_activation_vectors in later cells.
prompt1, prompt2 = "LOVE", "HATE"

In [11]:
class ActivationSteering:
    """Capture and inject activations at a causal LM's post-attention layernorms.

    On construction the tokenizer and model are loaded and every sub-module
    whose qualified name contains ``"post_attention_layernorm"`` is recorded
    as a hook point. Two chat-style entry points are offered:

    * ``chat_and_get_activation_vectors`` generates a reply while recording
      the input fed to each hook point.
    * ``chat_and_apply_steering_vector`` generates a reply while adding a
      caller-supplied tensor to the input of one hook point.

    Hooks are always removed before either method returns or raises, so a
    failed or interrupted generation does not leave hooks on the model.
    """

    def __init__(self, model_name):
        """Load tokenizer and model for ``model_name`` and locate hook points.

        The device is picked in this order: CUDA if available, otherwise MPS
        if available, otherwise CPU.
        """
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda"
        elif (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        ):
            device = "mps"

        self.device = device

        print("Loading model and tokenizer")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, device_map=device
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map=device
        )
        print(f"Model and tokenizer loaded on {self.model.device}")

        print("Finding attention layers...")
        self.attention_layers = self._get_attention_layers()
        print(f"Found {len(self.attention_layers)} attention layers")

    def _encode_prompt(self, prompt):
        """Wrap ``prompt`` as a single user message and tokenize it for the model."""
        print(f"Tokenizing prompt: {prompt}")
        messages = [{"role": "user", "content": prompt}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        return self.tokenizer(text, return_tensors="pt").to(self.model.device)

    def _generate_text(self, inputs, max_new_tokens, temperature):
        """Sample a continuation for ``inputs`` and decode only the new tokens."""
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        # Drop the prompt tokens so only the generated part is decoded.
        prompt_length = len(inputs.input_ids[0])
        return self.tokenizer.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True,
        )

    def chat_and_get_activation_vectors(
        self, prompt, max_new_tokens=3000, temperature=0.7
    ):
        """Generate a reply to ``prompt`` and record hook-point inputs.

        Args:
            prompt: User message text.
            max_new_tokens: Upper bound on generated tokens (default 3000).
            temperature: Sampling temperature (default 0.7).

        Returns:
            A dict with ``"output"`` (the decoded reply) and
            ``"activation_vectors"`` (hook-point name -> tensor). Each tensor
            is the last sequence position of the input seen by that module on
            its most recent forward pass, with the sequence axis kept at
            length 1.
        """
        inputs = self._encode_prompt(prompt)

        activation_vectors = {}
        hooks = []

        def make_capture_hook(layer_name):
            def hook(module, layer_input, output):
                if isinstance(layer_input, tuple):
                    layer_input = layer_input[0]

                # assumes layer_input is (batch, seq, hidden) -- TODO confirm.
                # Keep only the last position so the stored shape does not
                # depend on whether the last forward pass saw the whole
                # prompt or a single new token. Every forward pass overwrites
                # the previous entry for this layer.
                activation_vectors[layer_name] = (
                    layer_input[:, -1:, :].detach().clone()
                )

            return hook

        try:
            print("Attaching hooks...")
            for layer in self.attention_layers:
                handle = layer["module"].register_forward_hook(
                    make_capture_hook(layer["name"])
                )
                hooks.append(handle)

            print("Running model...")
            output_text = self._generate_text(
                inputs, max_new_tokens, temperature
            )
        finally:
            # Runs on success, on errors and on KeyboardInterrupt alike, so
            # hooks never stay attached to the shared model.
            print("Detaching hooks...")
            for handle in hooks:
                handle.remove()

        return {
            "output": output_text,
            "activation_vectors": activation_vectors,
        }

    def chat_and_apply_steering_vector(
        self,
        prompt,
        steering_vector,
        layer_name,
        max_new_tokens=3000,
        temperature=0.7,
    ):
        """Generate a reply to ``prompt`` while steering one hook point.

        ``steering_vector`` is added in place to the first
        ``steering_vector.shape[1]`` sequence positions of the input of the
        hook point named ``layer_name``. Forward passes whose input has a
        sequence length of exactly 1 are left untouched.

        Args:
            prompt: User message text.
            steering_vector: Tensor added to the hook-point input; assumed to
                be (batch, positions, hidden) -- TODO confirm against caller.
            layer_name: Name of a hook point found by ``_get_attention_layers``.
            max_new_tokens: Upper bound on generated tokens (default 3000).
            temperature: Sampling temperature (default 0.7).

        Returns:
            A dict with the single key ``"output"`` (the decoded reply).

        Raises:
            ValueError: If ``layer_name`` matches no known hook point.
            AssertionError: If the steering vector covers more positions than
                the hooked input has.
        """
        # Validate up front: with no matching layer the model would run
        # unsteered and the caller could not tell.
        target_layers = [
            layer
            for layer in self.attention_layers
            if layer["name"] == layer_name
        ]
        if not target_layers:
            raise ValueError(
                f"Unknown steering layer {layer_name!r}; it matches none of "
                f"the {len(self.attention_layers)} hooked layers"
            )

        inputs = self._encode_prompt(prompt)

        def steering_hook(module, layer_input):
            (resid_pre,) = layer_input
            if resid_pre.shape[1] == 1:
                return None  # caching for new tokens in generate()

            # We only add to the prompt (first call), not the generated tokens.
            ppos, apos = resid_pre.shape[1], steering_vector.shape[1]
            assert (
                apos <= ppos
            ), f"More mod tokens ({apos}) then prompt tokens ({ppos})!"

            # The add stays in place, as before, so the tensor object passed
            # in by the caller is the one that gets modified. The vector is
            # moved to the input's device/dtype first; that is a no-op when
            # they already match.
            resid_pre[:, :apos, :] += steering_vector.to(
                device=resid_pre.device, dtype=resid_pre.dtype
            )
            return resid_pre

        hooks = []
        try:
            print("Attaching hooks...")
            for layer in target_layers:
                print(f"Attaching steering hook to layer {layer['name']}")
                handle = layer["module"].register_forward_pre_hook(
                    steering_hook
                )
                hooks.append(handle)

            print("Running model...")
            output_text = self._generate_text(
                inputs, max_new_tokens, temperature
            )
        finally:
            # Always detach, otherwise an interrupted run would leave the
            # steering hook active for every later generation.
            print("Detaching hooks...")
            for handle in hooks:
                handle.remove()

        return {
            "output": output_text,
        }

    def _get_attention_layers(self):
        """Return ``{"name", "module"}`` dicts for every hook point.

        A hook point is any sub-module of ``self.model`` whose qualified name
        contains ``"post_attention_layernorm"``, in ``named_modules()`` order.
        """
        return [
            {"name": name, "module": module}
            for name, module in self.model.named_modules()
            if "post_attention_layernorm" in name
        ]

In [42]:
# Build the steering helper for the Qwen3 0.6B checkpoint; the constructor
# loads the tokenizer and model and collects the layers to hook.
a = ActivationSteering(model_name="Qwen/Qwen3-0.6B")

Loading model and tokenizer
Model and tokenizer loaded on mps:0
Finding attention layers...
Found 28 attention layers


In [43]:
# Generate a reply to "LOVE" and keep the activations recorded along the way.
love_output = a.chat_and_get_activation_vectors(prompt="LOVE")

Tokenizing prompt: LOVE
Attaching hooks...
Running model...
Detaching hooks...


In [44]:
# Display the text the model generated for the "LOVE" prompt.
love_output["output"]

"I'm glad to hear you! Love is a beautiful and profound emotion, one that brings people together and deepens connections. Whether in personal relationships, friendships, or shared experiences, love shapes our lives and helps us grow. Let me know if you'd like to explore this in another way!"

In [45]:
# Sanity check: shape of the layer-25 activation after indexing its first
# entry along the leading dimension.
(
    love_output["activation_vectors"]
    ["model.layers.25.post_attention_layernorm"][0]
    .shape
)

torch.Size([1, 1024])

In [46]:
# Generate a reply to "HATE" and keep the activations recorded along the way.
hate_output = a.chat_and_get_activation_vectors(prompt="HATE")

Tokenizing prompt: HATE
Attaching hooks...
Running model...
Detaching hooks...


In [47]:
# Display the text the model generated for the "HATE" prompt.
hate_output["output"]

"Hate is a complex emotion that can range from a deep sense of anger and frustration to a feeling of discomfort or sadness. It can stem from personal experiences, societal norms, or even political ideologies. It's important to recognize that hate is often a product of ignorance, prejudice, or fear, and it can lead to harmful actions. It's crucial to approach hate with empathy and understanding, rather than reacting impulsively."

In [None]:
# Prompt fed to the steered generations in the loop cell that follows.
# NOTE(review): this cell has no execution count, and the recorded output of
# the steering loop shows "Tokenizing prompt: Dogs are nice", so that output
# was produced with a different value of `prompt`. Re-run this cell and the
# loop to bring the saved output back in sync.
prompt = "Do you like dogs?"

In [58]:
# (layer name, activation) pairs in the order the hooks recorded them.
love_activation_vectors = list(love_output["activation_vectors"].items())
hate_activation_vectors = list(hate_output["activation_vectors"].items())

# Scale applied to the (LOVE - HATE) activation difference. A negative value
# subtracts the difference; flip the sign (e.g. 10) to add it instead.
STEERING_STRENGTH = -10

# Try steering at each of the recorded layers with index 15..21.
for i in range(15, 22):
    layer_name, love_activation_vector = love_activation_vectors[i]
    # Fetch the HATE activation by layer name rather than by list position,
    # so the two vectors are guaranteed to come from the same layer.
    hate_activation_vector = hate_output["activation_vectors"][layer_name]

    love_steering_vector = love_activation_vector - hate_activation_vector
    steering_vector = love_steering_vector * STEERING_STRENGTH

    steered_output = a.chat_and_apply_steering_vector(
        prompt, steering_vector, layer_name
    )
    # Print the generated text itself, not the repr of the result dict.
    response = steered_output["output"]
    print(
        f"Output for layer {layer_name}, strength {STEERING_STRENGTH}:\n{response}"
    )

    print()
    print("-" * 100)



Tokenizing prompt: Dogs are nice
Attaching hooks...
Attaching steering hook to layer model.layers.15.post_attention_layernorm
Running model...
Detaching hooks...
Output for layer model.layers.15.post_attention_layernorm, strength -10:
{'output': '做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦做梦整整从一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下，对一下

KeyboardInterrupt: 