# Steps
- load the model
- load the dataset
- filter rows for blue and non blue
- add a hook to get activations
- run the model for blue and non blue prompts
- train a logistic regression model on all layer activations
- test the model on the test set
- visualize the results

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "Qwen/Qwen3-0.6B"

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"

tokenizer = AutoTokenizer.from_pretrained(model_name, device_map=device)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)

print(f"Model loaded on: {model.device}")

Model loaded on: cuda:0


In [3]:
dataset_with_colors = pd.read_csv("data/dataset_with_colors_in_prompt_with_annotated_colors.csv")
dataset_without_colors = pd.read_csv("data/dataset_without_colors_in_prompt_with_annotated_colors.csv")

print(dataset_with_colors.head())
print("-"*100)
print(dataset_without_colors.head())

                                              prompt  thinking_content  \
0  Generate a website for a law firm specializing...               NaN   
1  Create a website for a small accounting firm s...               NaN   
2  Design a landing page for a freelance software...               NaN   
3  Build a corporate website for a management con...               NaN   
4  Generate a portfolio site for an independent g...               NaN   

                                             content      colors  
0  ```html\n<!DOCTYPE html>\n<html lang="en">\n<h...     ['red']  
1  ```html\n<!DOCTYPE html>\n<html lang="en">\n<h...     ['red']  
2  ```html\n<!DOCTYPE html>\n<html lang="en">\n<h...     ['red']  
3  <!DOCTYPE html>\n<html lang="en">\n<head>\n  <...     ['red']  
4  ```html\n<!DOCTYPE html>\n<html lang="en">\n<h...  ['violet']  
----------------------------------------------------------------------------------------------------
                                              prompt

### Create a dataset with blue and non blue prompts

In [4]:
from sklearn.model_selection import train_test_split

# add a column is_blue
blue_dataset = dataset_with_colors.copy()
blue_dataset['is_blue'] = blue_dataset['colors'].apply(lambda x: 1 if 'blue' in x else 0)

train_blue_df, test_blue_df = train_test_split(blue_dataset, test_size=0.2, random_state=42, shuffle=True)

In [38]:
print(train_blue_df.shape)
print(test_blue_df.shape)
test_blue_df.head()

(280, 5)
(70, 5)


Unnamed: 0,prompt,thinking_content,content,colors,is_blue
157,Build a university department homepage. The pr...,,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n <...",['blue'],1
341,Generate an event registration page for a conf...,,"```html\n<!DOCTYPE html>\n<html lang=""en"">\n<h...","['orange', 'blue']",1
315,Create an online store for handmade jewelry. T...,,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n <...",['violet'],0
234,Make a platform for a neighborhood association...,,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n <...",['blue'],1
155,Create a website for an online tutoring servic...,,"```html\n<!DOCTYPE html>\n<html lang=""en"">\n<h...","['green', 'blue']",1


In [6]:
import re

caught_activations = []

def activation_hook(layer_name):
	def hook(module, input, output):
		"""Super simple hook - just grab the output and store it"""
		if isinstance(output, tuple):
			output = output[0]
		# Get the last token: [:, -1, :]
		last_token_activation = output[:, -1, :].detach().cpu()
		caught_activations.append({"layer_name": layer_name, "activation": last_token_activation})
	return hook
	
print("Hook function ready!")

Hook function ready!


In [7]:
# Let's see what layers this thing has
print("Model layers:")
for name, module in model.named_modules():
    if 'layer' in name and 'mlp' in name and name.endswith('mlp'):
        print(f"  {name}")
        
# How many layers total?
layer_count = len([n for n, m in model.named_modules() if 'layers.' in n and '.mlp' in n])
print(f"\nFound ~{layer_count} MLP layers to hook into")

Model layers:
  model.layers.0.mlp
  model.layers.1.mlp
  model.layers.2.mlp
  model.layers.3.mlp
  model.layers.4.mlp
  model.layers.5.mlp
  model.layers.6.mlp
  model.layers.7.mlp
  model.layers.8.mlp
  model.layers.9.mlp
  model.layers.10.mlp
  model.layers.11.mlp
  model.layers.12.mlp
  model.layers.13.mlp
  model.layers.14.mlp
  model.layers.15.mlp
  model.layers.16.mlp
  model.layers.17.mlp
  model.layers.18.mlp
  model.layers.19.mlp
  model.layers.20.mlp
  model.layers.21.mlp
  model.layers.22.mlp
  model.layers.23.mlp
  model.layers.24.mlp
  model.layers.25.mlp
  model.layers.26.mlp
  model.layers.27.mlp

Found ~140 MLP layers to hook into


In [8]:
layers = [i for i in range(18, 27, 4)]
layers

[18, 22, 26]

In [9]:
for layer in layers:
    print(f"Hooking into layer {layer}")

    target_layer = None
    target_layer_name = None
    for name, module in model.named_modules():
        if f'model.layers.{layer}.mlp' == name:
            target_layer = module
            target_layer_name = name
            break

    if target_layer:
        print(f"Found layer {layer}!")
        hook_handle = target_layer.register_forward_hook(activation_hook(target_layer_name))
    else:
        print(f"Couldn't find layer {layer}, try a different number")

Hooking into layer 18
Found layer 18!
Hooking into layer 22
Found layer 22!
Hooking into layer 26
Found layer 26!


In [18]:
caught_activations = []
prompts = []

for idx, row in train_blue_df.iterrows():
    prompts.append(row['prompt'])

    messages = [{"role": "user", "content": row['prompt']}, {"role": "assistant", "content": row['content']}]
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        model(**inputs)

    # print(f"Caught {len(caught_activations)} activation tensors")

print(f"Got {len(caught_activations)} activations for {len(prompts)} prompts")

Got 840 activations for 280 prompts


In [19]:
caught_activations[:10]

[{'layer_name': 'model.layers.18.mlp',
  'activation': tensor([[ 0.8210,  0.4567,  1.3995,  ..., -2.5661, -0.3879, -0.8005]])},
 {'layer_name': 'model.layers.22.mlp',
  'activation': tensor([[0.1443, 0.7339, 6.7653,  ..., 3.8560, 1.5202, 3.6891]])},
 {'layer_name': 'model.layers.26.mlp',
  'activation': tensor([[  0.8815, -15.3644,  -4.8938,  ...,   4.0958,   8.4347,   0.8488]])},
 {'layer_name': 'model.layers.18.mlp',
  'activation': tensor([[ 0.8163,  0.1512,  1.7470,  ..., -2.7119, -0.6922, -1.3296]])},
 {'layer_name': 'model.layers.22.mlp',
  'activation': tensor([[0.7487, 0.1575, 6.5210,  ..., 3.4652, 1.7159, 4.4692]])},
 {'layer_name': 'model.layers.26.mlp',
  'activation': tensor([[  1.5920, -11.4114,  -4.3878,  ...,   3.4944,   7.8079,   2.4087]])},
 {'layer_name': 'model.layers.18.mlp',
  'activation': tensor([[ 0.9061,  0.2555,  1.9457,  ..., -2.8431, -0.2490, -1.6838]])},
 {'layer_name': 'model.layers.22.mlp',
  'activation': tensor([[0.3857, 0.2532, 9.2819,  ..., 2.6881, 1.

In [35]:
PREFIX = "The primary color should be shades of "

for idx, prompt in enumerate(prompts):
	print(prompt)
	_, rest = prompt.split(PREFIX)
	rest = rest.strip().split(" ")
	color = rest[0]
	
	is_blue = True if color == "blue" else False
	print(color, is_blue)

	for i in range(idx*3, min(len(caught_activations), idx*3+3)):
		print(i, caught_activations[i])
		caught_activations[i]["is_blue"] = is_blue
	print("-"*100)

Make a platform for an independent bookstore. The primary color should be shades of blue so that the website is aesthetically pleasing.
blue True
0 {'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.8210,  0.4567,  1.3995,  ..., -2.5661, -0.3879, -0.8005]]), 'is_blue': True}
1 {'layer_name': 'model.layers.22.mlp', 'activation': tensor([[0.1443, 0.7339, 6.7653,  ..., 3.8560, 1.5202, 3.6891]]), 'is_blue': True}
2 {'layer_name': 'model.layers.26.mlp', 'activation': tensor([[  0.8815, -15.3644,  -4.8938,  ...,   4.0958,   8.4347,   0.8488]]), 'is_blue': True}
----------------------------------------------------------------------------------------------------
Make a platform for an independent bookstore. The primary color should be shades of orange so that the website is aesthetically pleasing.
orange False
3 {'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.8163,  0.1512,  1.7470,  ..., -2.7119, -0.6922, -1.3296]]), 'is_blue': False}
4 {'layer_name': 'model.layer

{'layer_name': 'model.layers.26.mlp', 'activation': tensor([[  1.8259, -12.2932,  -5.0875,  ...,   3.6366,   8.0485,   2.8133]]), 'is_blue': False}
----------------------------------------------------------------------------------------------------
Generate an event registration page for a conference. The primary color should be shades of green so that the website is aesthetically pleasing.
green False
780 {'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.4395,  0.4208,  1.9220,  ..., -2.9023,  0.3168, -2.0888]]), 'is_blue': False}
781 {'layer_name': 'model.layers.22.mlp', 'activation': tensor([[-0.6367,  0.6437,  6.3731,  ...,  3.8474,  2.5665,  2.0406]]), 'is_blue': False}
782 {'layer_name': 'model.layers.26.mlp', 'activation': tensor([[ -0.7847, -14.3843,  -3.7798,  ...,   3.5956,   8.1041,   1.6009]]), 'is_blue': False}
----------------------------------------------------------------------------------------------------
Make a booking site for a yoga studio. The primar

In [36]:
for activation in caught_activations:
	print(activation)


{'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.8210,  0.4567,  1.3995,  ..., -2.5661, -0.3879, -0.8005]]), 'is_blue': True}
{'layer_name': 'model.layers.22.mlp', 'activation': tensor([[0.1443, 0.7339, 6.7653,  ..., 3.8560, 1.5202, 3.6891]]), 'is_blue': True}
{'layer_name': 'model.layers.26.mlp', 'activation': tensor([[  0.8815, -15.3644,  -4.8938,  ...,   4.0958,   8.4347,   0.8488]]), 'is_blue': True}
{'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.8163,  0.1512,  1.7470,  ..., -2.7119, -0.6922, -1.3296]]), 'is_blue': False}
{'layer_name': 'model.layers.22.mlp', 'activation': tensor([[0.7487, 0.1575, 6.5210,  ..., 3.4652, 1.7159, 4.4692]]), 'is_blue': False}
{'layer_name': 'model.layers.26.mlp', 'activation': tensor([[  1.5920, -11.4114,  -4.3878,  ...,   3.4944,   7.8079,   2.4087]]), 'is_blue': False}
{'layer_name': 'model.layers.18.mlp', 'activation': tensor([[ 0.9061,  0.2555,  1.9457,  ..., -2.8431, -0.2490, -1.6838]]), 'is_blue': False}
{'layer_n

In [50]:
test_prompts = test_blue_df['prompt'].tolist()
test_prompts

['Build a university department homepage. The primary color should be shades of green so that the website is aesthetically pleasing.',
 'Generate an event registration page for a conference. The primary color should be shades of violet so that the website is aesthetically pleasing.',
 'Create an online store for handmade jewelry. The primary color should be shades of violet so that the website is aesthetically pleasing.',
 'Make a platform for a neighborhood association. The primary color should be shades of blue so that the website is aesthetically pleasing.',
 'Create a website for an online tutoring service. The primary color should be shades of green so that the website is aesthetically pleasing.',
 'Design a hotel booking site. The primary color should be shades of indigo so that the website is aesthetically pleasing.',
 'Generate a portfolio site for an independent graphic designer. The primary color should be shades of violet so that the website is aesthetically pleasing.',
 'Bu

In [37]:
# Train control probes for each layer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import defaultdict

# Group data by layer
layer_data = defaultdict(list)

for item in caught_activations:  # assuming your array is called activation_data
	layer_name = item['layer_name']
	activation = item['activation'].flatten().numpy()  # Convert to 1D numpy array
	label = 1 if item['is_blue'] else 0

	layer_data[layer_name].append({
		'activation': activation,
		'label': label
	})

# Train a probe for each layer
layer_probes = {}
layer_results = {}

for layer_name, data_list in layer_data.items():
	print(f"\n=== Training probe for {layer_name} ===")

	# Stack activations and labels
	X = np.stack([item['activation'] for item in data_list])
	y = np.array([item['label'] for item in data_list])

	print(f"Data shape: {X.shape}")
	print(f"Blue examples: {y.sum()}, Non-blue: {len(y) - y.sum()}")

	# Skip if we don't have both classes
	if len(np.unique(y)) < 2:
		print("Not enough variety in labels, skipping...")
		continue

	# Train probe (no train/test split for now - just quick test)
	probe = LogisticRegression(random_state=42, max_iter=1000)
	probe.fit(X, y)

	# Check fit on same data (overfitting test)
	predictions = probe.predict(X)
	accuracy = accuracy_score(y, predictions)

	print(f"Training accuracy: {accuracy:.3f}")

	# Store results
	layer_probes[layer_name] = probe
	layer_results[layer_name] = {
		'accuracy': accuracy,
		'n_samples': len(y),
		'n_blue': y.sum()
	}

# Show results summary
print(f"\n=== CONTROL PROBE RESULTS ===")
for layer_name, results in layer_results.items():
	print(f"{layer_name}: {results['accuracy']:.3f} accuracy ({results['n_samples']} samples)")

# Find best layer
if layer_results:
	best_layer = max(layer_results.items(), key=lambda x: x[1]['accuracy'])
	print(f"\nBest layer: {best_layer[0]} with {best_layer[1]['accuracy']:.3f} accuracy")


=== Training probe for model.layers.18.mlp ===
Data shape: (280, 1024)
Blue examples: 42, Non-blue: 238
Training accuracy: 0.993

=== Training probe for model.layers.22.mlp ===
Data shape: (280, 1024)
Blue examples: 42, Non-blue: 238
Training accuracy: 1.000

=== Training probe for model.layers.26.mlp ===
Data shape: (280, 1024)
Blue examples: 42, Non-blue: 238
Training accuracy: 1.000

=== CONTROL PROBE RESULTS ===
model.layers.18.mlp: 0.993 accuracy (280 samples)
model.layers.22.mlp: 1.000 accuracy (280 samples)
model.layers.26.mlp: 1.000 accuracy (280 samples)

Best layer: model.layers.22.mlp with 1.000 accuracy


In [53]:
test_prompts = test_blue_df['prompt'].tolist()
# remove everything that comes after "The primary color"
test_prompts = [prompt.split("The primary color")[0].strip() for prompt in test_prompts]
test_prompts[0]

'Build a university department homepage.'

In [62]:

best_layer_name = 'model.layers.26.mlp'
best_probe = layer_probes[best_layer_name]

print(f"Testing intervention with probe from: {best_layer_name}")

# Get the steering vector from probe coefficients
steering_vector = torch.tensor(best_probe.coef_[0], dtype=torch.float32)
print(f"Steering vector shape: {steering_vector.shape}")

HTML_GENERATION_SYSTEM_PROMPT = """
Generate self-contained HTML pages for the following prompts.
Do not include any other text in your response.
Only return the HTML page.
"""

# Find the actual model layer to hook into
target_layer = None
for name, module in model.named_modules():
	if best_layer_name in name and name.count('.') == 3:  # Exact match
		target_layer = module
		break

if not target_layer:
	print("ERROR: Couldn't find target layer!")
else:
	print(f"Found target layer: {target_layer}")

# Intervention hook function
def steering_hook(module, input, output):
	"""Add steering vector to the output"""
	if isinstance(output, tuple):
		hidden_states = output[0]
	else:
		hidden_states = output

	# Add steering to last token position
	hidden_states[:, -1, :] += steering_strength * steering_vector.to(hidden_states.device)

	if isinstance(output, tuple):
		return (hidden_states,) + output[1:]
	else:
		return hidden_states

# Test prompts
test_prompts = [
	"Create a website for a tech startup. The primary color should be shades of red.",
	"Build a portfolio site for a photographer. The primary color should be shades of red.",
	"Design a homepage for a restaurant. The primary color should be shades of red."
]
# test_prompts = test_blue_df['prompt'].tolist()

baseline = []
steered = []

steering_strength = 5.0  # Try different values: 0.5, 1.0, 1.5, 2.0

print(f"\n=== TESTING BLUE STEERING (strength={steering_strength}) ===")

for prompt in test_prompts:
	print(f"\nTesting: {prompt}")

	# Generate WITHOUT steering (baseline)
	messages = [{"role": "system", "content": HTML_GENERATION_SYSTEM_PROMPT}, {"role": "user", "content": prompt}]
	text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	inputs = tokenizer(text, return_tensors="pt").to(model.device)

	with torch.no_grad():
		baseline_ids = model.generate(
			**inputs,
			max_new_tokens=3000,
			temperature=0.7,
			do_sample=True,
			pad_token_id=tokenizer.eos_token_id
		)

	baseline_text = tokenizer.decode(baseline_ids[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
	baseline.append(baseline_text)
	# Generate WITH steering
	hook_handle = target_layer.register_forward_hook(steering_hook)

	try:
		with torch.no_grad():
			steered_ids = model.generate(
				**inputs,
				max_new_tokens=3000,
				temperature=0.7,
				do_sample=True,
				pad_token_id=tokenizer.eos_token_id
			)

		steered_text = tokenizer.decode(steered_ids[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
		steered.append(steered_text)

	finally:
		hook_handle.remove()

		

Testing intervention with probe from: model.layers.26.mlp
Steering vector shape: torch.Size([1024])
Found target layer: Qwen3MLP(
  (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
  (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
  (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
  (act_fn): SiLU()
)

=== TESTING BLUE STEERING (strength=5.0) ===

Testing: Create a website for a tech startup. The primary color should be shades of red.

Testing: Build a portfolio site for a photographer. The primary color should be shades of red.

Testing: Design a homepage for a restaurant. The primary color should be shades of red.


In [63]:
from color_extractor import extract_colors

for (b, s) in zip(baseline, steered):
	baseline_colors = extract_colors(b)
	steered_colors = extract_colors(s)

	print(baseline_colors)
	print(steered_colors)
	print("-"*100)


Response: colors=['red']
Response: colors=['red']
['red']
['red']
----------------------------------------------------------------------------------------------------
Response: colors=['red']
Response: colors=['red']
['red']
['red']
----------------------------------------------------------------------------------------------------
Response: colors=['red']
Response: None
No colors extracted for prompt:  <think>
Okay, the user wants a self-contained HTML homepage for a restaurant with primary colors in 
['red']
[]
----------------------------------------------------------------------------------------------------
