<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/llm_steering/hello_world_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title 1. Install & Load Dependencies
# We use the latest transformers to support Qwen2.5
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q torch accelerate

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Configuration
# Checkpoint to load; the 1.5B instruct model was chosen to fit in free Colab memory.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# Decoder layer used for steering. The author reports that middle-to-late
# layers (roughly 10-20) work best for concept steering with this model.
LAYER_ID = 14

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Loading {MODEL_NAME} on {DEVICE}...")

In [None]:
!pip install -q --upgrade huggingface_hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16, # Uses less memory, higher precision than fp16
    device_map="auto"
)

print("Model loaded successfully.")

In [None]:
# @title 2. Define Steering Data (The "Mean Difference" Method)
# A single sentence pair gives a noisy direction, so several contrastive pairs
# are defined and their differences are averaged later on.
# Concept: "Wedding/Celebration" vs "Funeral/Tragedy"

# Each tuple is (joyful sentence, sorrowful counterpart); keeping the two
# halves side by side makes the pairing explicit.
_contrast_pairs = [
    (
        "The wedding ceremony was absolutely beautiful and full of joy.",
        "The funeral service was somber, quiet, and full of tears.",
    ),
    (
        "We celebrated the victory with cheers, laughter, and champagne.",
        "We mourned the tragic loss with silence, sorrow, and regret.",
    ),
    (
        "The birth of the child brought immense happiness to the family.",
        "The death of the child brought immense despair to the family.",
    ),
    (
        "It was the best day of my life, everything was perfect and bright.",
        "It was the worst day of my life, everything was dark and hopeless.",
    ),
    (
        "The sun is shining, the birds are singing, and I feel alive.",
        "The rain is pouring, the sky is grey, and I feel empty.",
    ),
]

# Index-aligned lists consumed by the cells below.
positive_examples = [joyful for joyful, _ in _contrast_pairs]
negative_examples = [sorrowful for _, sorrowful in _contrast_pairs]

In [None]:
def get_layer_activations(text, layer_idx):
    """Return the last-token hidden state produced by decoder layer `layer_idx`.

    Args:
        text: Input string; it is tokenized and run through the global `model`.
        layer_idx: Index of the decoder layer, using the same numbering as
            `model.model.layers`. Negative values count from the last layer.

    Returns:
        The hidden-state vector of the final token of the (single) sequence
        at the output of the requested layer.

    Raises:
        IndexError: If `layer_idx` does not refer to an existing layer.
    """
    # The model was loaded with device_map="auto", so follow the model's own
    # device instead of assuming it matches the DEVICE string.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # outputs.hidden_states is (embedding output, layer 0 output, ...,
    # last layer output): one entry more than there are decoder layers.
    hidden_states = outputs.hidden_states
    num_layers = len(hidden_states) - 1
    if not -num_layers <= layer_idx < num_layers:
        raise IndexError(
            f"layer_idx {layer_idx} is out of range for a model with {num_layers} layers"
        )

    # Skip the embedding entry (+1) so that `layer_idx` names the OUTPUT of
    # model.model.layers[layer_idx] -- the same tensor a forward hook on that
    # layer modifies. Indexing with `layer_idx` directly would return that
    # layer's input instead.
    position = layer_idx % num_layers + 1

    # Shape: [batch, seq_len, hidden_dim] -> first sequence, last token.
    return hidden_states[position][0, -1, :]

print("Calculating steering vector...")

In [None]:
# One activation gap (positive minus negative) per contrastive sentence pair,
# all measured at LAYER_ID.
diffs = [
    get_layer_activations(pos, LAYER_ID) - get_layer_activations(neg, LAYER_ID)
    for pos, neg in zip(positive_examples, negative_examples)
]

# Mean Difference Vector: average the per-pair gaps element-wise.
mean_diff = torch.stack(diffs).mean(dim=0)

# Rescale to unit length so the steering coefficient alone sets the strength.
steering_vector = mean_diff / mean_diff.norm()

print(f"Steering vector calculated at Layer {LAYER_ID}. Shape: {steering_vector.shape}")

In [None]:
# @title 3. Define the Hook and Generation Function
from transformers import StoppingCriteria, StoppingCriteriaList

# This hook injects the vector into the model's forward pass
def make_hook(steering_vec, coeff):
    """Build a forward hook that adds `coeff * steering_vec` to a layer's output.

    Args:
        steering_vec: Tensor that is broadcast-added to the hidden states, so
            its last dimension must match theirs.
        coeff: Scalar strength; a negative value pushes in the opposite
            direction, and 0 leaves the output values unchanged.

    Returns:
        A `hook(module, input, output)` callable for
        `Module.register_forward_hook`. It returns a replacement output of the
        same kind it received (a tensor, or a tuple whose first element is the
        steered hidden states) and does not modify the original output tensor.
    """
    # Scale once here rather than on every forward pass.
    offset = steering_vec * coeff

    def hook(module, input, output):
        # Decoder layers return either a bare tensor or a tuple whose first
        # element holds the hidden states, depending on the library version.
        is_tuple = isinstance(output, tuple)
        hidden_states = output[0] if is_tuple else output

        # Broadcast over batch and sequence, i.e. steer every token position.
        # The addition is out-of-place so the tensor produced by the layer is
        # never mutated (in-place edits are unsafe under autograd and would
        # silently alter anything aliasing that tensor). The result is cast
        # back so the layer's output dtype is preserved.
        steered = (hidden_states + offset.to(hidden_states.device)).to(hidden_states.dtype)

        if is_tuple:
            return (steered,) + output[1:]
        return steered
    return hook

def generate_steered(prompt, coeff=0.0, max_new_tokens=50):
    """Sample a continuation of `prompt` while steering layer LAYER_ID.

    Args:
        prompt: Text to continue.
        coeff: Multiplier applied to the global `steering_vector`; 0.0 adds
            nothing, and the sign selects the steering direction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first output sequence (which includes the prompt), with
        special tokens removed.
    """
    # The model was loaded with device_map="auto"; put the inputs wherever the
    # model actually lives rather than assuming it matches the DEVICE string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # For Qwen/Llama-style models the decoder blocks live in model.model.layers.
    target_layer = model.model.layers[LAYER_ID]
    hook_handle = target_layer.register_forward_hook(make_hook(steering_vector, coeff))

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        # Always detach the hook, even if generation fails, so later forward
        # passes through the model are not steered by accident.
        hook_handle.remove()

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# @title 4. See the Results
# We test with a neutral prompt that could go either way.

prompt = "I went to the hospital to visit my friend and"

# The steering vector was normalized to unit length, so a fairly large
# coefficient is needed. The printed labels are derived from this value so
# they always match the coefficient that is actually applied.
STEER_COEFF = 15.0

# Generation samples tokens at random. Re-seeding before every run gives all
# three generations the same random stream, which makes them easier to compare.
SEED = 0

print(f"Prompt: '{prompt}'\n")

cases = [
    ("BASELINE (0.0)", 0.0),
    (f"STEERED POSITIVE ({STEER_COEFF:+.1f}: Celebration/Joy)", STEER_COEFF),
    # A negative coefficient pushes towards the 'negative' examples.
    (f"STEERED NEGATIVE ({-STEER_COEFF:+.1f}: Tragedy/Mourning)", -STEER_COEFF),
]

for case_number, (label, coeff) in enumerate(cases):
    if case_number > 0:
        # Blank lines between consecutive results.
        print("\n")
    torch.manual_seed(SEED)
    print(f"--- {label} ---")
    print(generate_steered(prompt, coeff=coeff))