<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/llm_steering/dev/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title 1. Install Dependencies
# Run this cell first to install the necessary libraries.
!pip install -q transformers torch accelerate

In [2]:
# @title 2. Load Model and Define Steering Functions
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
# --- Configuration ---
model_name = "gpt2-medium"  # You can swap this for 'gpt2-xl' or 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
layer_id = 15               # The layer to inject the steering vector (0-23 for gpt2-medium)
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading {model_name} on {device}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval()

Loading gpt2-medium on cpu...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [4]:
# --- Helper Functions ---

def get_activations(text):
    """
    Runs the model and extracts the hidden states from the specified layer
    for the last token in the sequence.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Hidden states tuple: (layer_0, layer_1, ..., layer_N)
    # We grab the specific layer we want to steer.
    # Shape: [batch, sequence_length, hidden_dim]
    hidden_state = outputs.hidden_states[layer_id]

    # We return the activation of the LAST token.
    # This represents the "summary" of the concept at this point.
    return hidden_state[0, -1, :]

In [5]:
def generate_with_steering(prompt, steering_vector, multiplier=1.0, max_new_tokens=30):
    """
    Generates text while injecting the steering vector into the forward pass.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Define the hook function
    def steering_hook(module, input, output):
        # Output of a layer in HF is usually (hidden_states,)
        if isinstance(output, tuple):
            hidden_states = output[0]
        else:
            hidden_states = output

        # Add the steering vector to all token positions
        # The vector shape is [hidden_dim], we broadcast it to [batch, seq_len, hidden_dim]
        hidden_states += (steering_vector * multiplier)

        if isinstance(output, tuple):
            return (hidden_states,) + output[1:]
        return hidden_states

    # Register the hook on the specific layer
    # For GPT-2, layers are in model.transformer.h
    # For Llama, it might be model.model.layers
    try:
        layer_module = model.transformer.h[layer_id]
    except AttributeError:
        # Fallback for Llama/Mistral architectures
        layer_module = model.model.layers[layer_id]

    handle = layer_module.register_forward_hook(steering_hook)

    # Generate text
    try:
        output_sequences = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    finally:
        # VERY IMPORTANT: Remove the hook so the model goes back to normal
        handle.remove()

    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print("Setup complete. Ready to steer!")

Setup complete. Ready to steer!


In [6]:
# @title 3. Create the Steering Vector
# We create a vector that represents the direction "Anger" vs "Calmness".

# 1. Define contrasting prompts
# Ideally, these prompts should be identical except for the concept you want to isolate.
pos_concept = "I am feeling extremely happy, loving, and peaceful today."
neg_concept = "I am feeling extremely angry, hateful, and furious today."

In [7]:
# 2. Extract activations
print(f"Extracting vector: '{pos_concept}' minus '{neg_concept}'")
pos_act = get_activations(pos_concept)
neg_act = get_activations(neg_concept)

# 3. Create the steering vector (Positive - Negative)
# This vector points in the "direction" of Happiness/Peace.
steering_vec = pos_act - neg_act

print("Steering vector created.")

Extracting vector: 'I am feeling extremely happy, loving, and peaceful today.' minus 'I am feeling extremely angry, hateful, and furious today.'
Steering vector created.


In [9]:
# @title 4. Test the Steering
# Now we test it on a neutral prompt to see if we can force the mood.

test_prompt = "I went to the store and the clerk said"

print(f"--- BASELINE (Multiplier: 0.0) ---")
print(generate_with_steering(test_prompt, steering_vec, multiplier=0.0))
print("\n")

print(f"--- STEERED POSITIVE (Happy/Peaceful) (Multiplier: 3.0) ---")
print(generate_with_steering(test_prompt, steering_vec, multiplier=3.0))
print("\n")

print(f"--- STEERED NEGATIVE (Angry/Hateful) (Multiplier: -3.0) ---")
# Using a negative multiplier pushes the model in the opposite direction (towards the negative concept)
print(generate_with_steering(test_prompt, steering_vec, multiplier=-3.0))

--- BASELINE (Multiplier: 0.0) ---
I went to the store and the clerk said that we were out of candy; but when I asked for the next batch I was told that it had already been delivered.

I started to


--- STEERED POSITIVE (Happy/Peaceful) (Multiplier: 3.0) ---
I went to the store and the clerk said, "You've got good luck." I was feeling better. It was great. It was a whole boost.

"My body was feeling


--- STEERED NEGATIVE (Angry/Hateful) (Multiplier: -3.0) ---
I went to the store and the clerk said that if you have this video we'll be speaking to you about it. We're in the video for this."

The video is showing Mr
