In [2]:
import torch
from transformer_lens import HookedTransformer
from transformer_lens.hook_points import HookPoint
from functools import partial
import warnings

import os
from tqdm.auto import tqdm
from collections import Counter

from sklearn.model_selection import train_test_split
import numpy as np

import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import pandas as pd
from IPython.display import display, HTML

from transformers import GenerationConfig

In [3]:
# Check if a GPU is available and set the device
if torch.cuda.is_available():
    device = "cuda"
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    # Clear cache to free up memory on the GPU
    torch.cuda.empty_cache()
else:
    device = "cpu"

GPU detected: NVIDIA GeForce RTX 3090


In [4]:

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
print(f"\n--- Loading Model: {model_name} ---")
print("This will download and load ~16 GB of model weights. This may take several minutes.")

# Load the model directly using HookedTransformer.
# `torch_dtype=torch.bfloat16` is recommended for performance and is supported by the 3090.
model = HookedTransformer.from_pretrained(
    model_name,
    device=device,
    torch_dtype=torch.bfloat16,
)



--- Loading Model: meta-llama/Meta-Llama-3-8B-Instruct ---
This will download and load ~16 GB of model weights. This may take several minutes.


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 36.62it/s]


Loaded pretrained model meta-llama/Meta-Llama-3-8B-Instruct into HookedTransformer


### Extract Activations for Control Probe Training

In [None]:

def split_conversation(text: str, user_identifier="HUMAN:", ai_identifier="ASSISTANT:") -> tuple[list[str], list[str]]:
    user_messages, assistant_messages = [], []
    lines = text.split("\n")
    current_user_message, current_assistant_message = "", ""

    for line in lines:
        line = line.lstrip(" ")
        if line.startswith(user_identifier):
            if current_assistant_message:
                assistant_messages.append(current_assistant_message.strip())
            current_assistant_message = ""
            current_user_message += line.replace(user_identifier, "").strip() + " "
        elif line.startswith(ai_identifier):
            if current_user_message:
                user_messages.append(current_user_message.strip())
            current_user_message = ""
            current_assistant_message += line.replace(ai_identifier, "").strip() + " "

    if current_user_message: user_messages.append(current_user_message.strip())
    if current_assistant_message: assistant_messages.append(current_assistant_message.strip())
        
    return user_messages, assistant_messages

def llama_v3_prompt(messages: list[dict]) -> str:
    """
    Creates a prompt string formatted for Llama 3 Instruct models.
    """
    prompt_parts = ["<|begin_of_text|>"]
    
    for message in messages:
        role = message["role"]
        content = message["content"]
        prompt_parts.append(f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>")
        
    # The prompt should end with the start of the assistant's turn
    prompt_parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    
    return "".join(prompt_parts)

#can also the following code in place of llama_v3_prompt
# truncated_prompt = model.tokenizer.apply_chat_template(
#     messages_dict,
#     tokenize=False,
#     add_generation_prompt=True
# )


# --- Configuration for this Step ---

# Define the path to your dataset folder
dataset_path = "/workspace/MATS-research/data/chen_llama_gender"

# Define which layer to extract activations from. We'll use the same layer as before.
LAYER_TO_EXTRACT = 30


In [13]:
print(f"Loading data from: {dataset_path}")
print(f"Extracting activations from layer: {LAYER_TO_EXTRACT}\n")

conversation_files = [f for f in os.listdir(dataset_path) if f.endswith('.txt')]

all_activations = []
all_labels = []

# Filter function to only cache the layer we need
def names_filter(name: str):
    return name == f"blocks.{LAYER_TO_EXTRACT}.hook_resid_post"

for file_name in tqdm(conversation_files, desc="Extracting Activations"):
    file_path = os.path.join(dataset_path, file_name)
    
    if "_gender_female" in file_name:
        label = "female"
    elif "_gender_male" in file_name:
        label = "male"
    else:
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    user_msgs, ai_msgs = split_conversation(raw_text)
    messages_dict = []
    for user_msg, ai_msg in zip(user_msgs, ai_msgs):
        messages_dict.append({'role': 'user', 'content': user_msg})
        messages_dict.append({'role': 'assistant', 'content': ai_msg})
        
    if not messages_dict:
        continue
    
    # Truncate the conversation to end after the last user message
    if messages_dict and messages_dict[-1]['role'] == 'assistant':
        messages_dict = messages_dict[:-1]
    
    if not messages_dict: # If removing the last AI message leaves nothing, skip
        continue

    truncated_prompt = llama_v3_prompt(messages_dict)

    with torch.no_grad():
        _, cache = model.run_with_cache(truncated_prompt, names_filter=names_filter)
        
        # We only need the activation from our specified layer at the final token position
        activation = cache[f"blocks.{LAYER_TO_EXTRACT}.hook_resid_post"][0, -1, :]
        
        all_activations.append(activation.cpu())
        all_labels.append(label)

print(f"\nSuccessfully processed {len(all_activations)} conversations.")


Loading data from: /workspace/MATS-research/data/chen_llama_gender
Extracting activations from layer: 30



Extracting Activations: 100%|██████████| 1000/1000 [01:11<00:00, 13.90it/s]


Successfully processed 500 conversations.





In [14]:

# --- Prepare data for probe training ---
print("\n--- Preparing Data for Probe Training ---")

activations_tensor = torch.stack(all_activations)
label_map = {"female": 0, "male": 1}
labels_numerical = [label_map[label] for label in all_labels]
labels_tensor = torch.tensor(labels_numerical, dtype=torch.float32)

# We create new variable names to avoid confusion with the reading probe data
X_train_control, X_test_control, y_train_control, y_test_control = train_test_split(
    activations_tensor, 
    labels_tensor, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels_tensor
)

print(f"Training data shape (X_train_control): {X_train_control.shape}")
print(f"Test data shape (X_test_control):  {X_test_control.shape}")


--- Preparing Data for Probe Training ---
Training data shape (X_train_control): torch.Size([400, 4096])
Test data shape (X_test_control):  torch.Size([100, 4096])


### Train the Linear Probe and Extract the Control Vector

In [15]:
# --- Step B.1: Define Probe, Training, and Evaluation Functions ---

# Define the Linear Probe model (same as before)
class LinearProbe(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.probe = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.probe(x).squeeze(-1)

# Define the training function (with the dtype fix)
def train_probe(probe, X_train, y_train, epochs=100, lr=1e-3, batch_size=32):
    probe.to(device)
    probe.train()
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(probe.parameters(), lr=lr)
    
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            X_batch = X_batch.to(torch.float32)
            
            optimizer.zero_grad()
            logits = probe(X_batch)
            loss = loss_fn(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")
            
    return probe

# Define the evaluation function (same as before)
def evaluate_probe(probe, X_test, y_test):
    probe.eval()
    probe.to(device)
    with torch.no_grad():
        X_test_gpu = X_test.to(device).to(torch.float32)
        y_test_gpu = y_test.to(device)
        logits = probe(X_test_gpu)
        predictions = (logits > 0).int()
        accuracy = (predictions == y_test_gpu.int()).float().mean().item()
    return accuracy

print("Helper functions for training and evaluation are defined.")

Helper functions for training and evaluation are defined.


In [21]:
# --- Step B.2: Train Probe and Extract Control Vector ---

input_dim = model.cfg.d_model

# Initialize a new probe specifically for this task
control_probe_model = LinearProbe(input_dim)

print(f"--- Training Control Probe for Layer {LAYER_TO_EXTRACT} ---")
# Train the probe using the data from the end of user turns
trained_control_probe = train_probe(
    control_probe_model, 
    X_train_control, 
    y_train_control, 
    epochs=40
)

# --- Verification Step ---
# It's crucial to verify the probe's accuracy on the test set.
# If the probe can't classify gender, its weight vector is meaningless.
print("\n--- Verifying probe performance ---")
test_accuracy = evaluate_probe(trained_control_probe, X_test_control, y_test_control)
print(f"Control Probe Test Accuracy: {test_accuracy*100:.2f}%")

# --- Extraction Step ---
# The control vector is the weight vector of the trained linear probe.
# The probe has one linear layer called 'probe'. We access its 'weight' attribute.
# .data gets the tensor without gradients.
# .squeeze() removes the first dimension (shape [1, 4096] -> [4096]).
# .detach() creates a new tensor that doesn't require gradients.
control_vector = trained_control_probe.probe.weight.data.squeeze().detach()

print("\n--- Control Vector Extracted ---")
print(f"Control vector shape: {control_vector.shape}") # Should be [d_model], i.e., [4096]
print(f"Control vector norm: {control_vector.norm().item():.2f}")



--- Training Control Probe for Layer 30 ---


Epoch 20/40, Loss: 0.3106
Epoch 40/40, Loss: 0.2405

--- Verifying probe performance ---
Control Probe Test Accuracy: 72.00%

--- Control Vector Extracted ---
Control vector shape: torch.Size([4096])
Control vector norm: 2.60


### Setting Up the Intervention with Questions

In [22]:
# --- 1. Load the Paper's Questions ---

# Define the path to the text file containing the questions
questions_file_path = "/workspace/MATS-research/data/test_questions/gender.txt"

try:
    with open(questions_file_path, 'r') as f:
        # Read all lines, strip whitespace, and filter out any empty lines
        gender_questions = [line.strip() for line in f if line.strip()]
    
    print(f"Successfully loaded {len(gender_questions)} questions from '{questions_file_path}'.")
    assert len(gender_questions) == 30, "Expected 30 questions, but found a different number."
    
    print("\nSample questions:")
    for q in gender_questions[:3]:
        print(f"- {q}")

except FileNotFoundError:
    print(f"ERROR: The file '{questions_file_path}' was not found.")
    print("Please make sure you have created this file and it is in the same directory as your notebook.")
    # In case of error, create a dummy list to avoid breaking subsequent cells
    gender_questions = []

Successfully loaded 30 questions from '/workspace/MATS-research/data/test_questions/gender.txt'.

Sample questions:
- Hi! How should I style my hair for a formal event? Suggest two hairstyles.
- Hi llama! How can I select the right fragrance for me? Please give me at least three types of scents!
- What should I wear on a first date? Please give me at least three specific outfit suggestions!


### Running the Control Experiment

In [29]:

def steering_hook(
    resid_pre: torch.Tensor,
    hook: HookPoint,
    c_vec: torch.Tensor,
    multiplier: float
) -> torch.Tensor:
    """
    A hook function to steer the model's activations.
    It adds the control vector multiplied by a strength factor 
    to the residual stream at the final token position.
    """
    # We only apply the steering vector to the activation of the LAST token.
    resid_pre[:, -1, :] += c_vec * multiplier
    return resid_pre

def run_steering_experiment(
    model: HookedTransformer,
    questions: list[str],
    control_vector: torch.Tensor,
    layer: int,
    multiplier: float,
    max_new_tokens: int = 150
) -> list[dict]:
    """
    Runs the steering experiment, cleanly separating the generated
    completion from the initial prompt.
    """
    results = []
    hook_point = f"blocks.{layer}.hook_resid_post"

    for question in tqdm(questions, desc="Generating Steered Responses"):
        messages = [{'role': 'user', 'content': question}]
        
        # Create the prompt string
        prompt_str = model.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )
        
        # Tokenize the prompt to get its length
        prompt_tokens = model.to_tokens(prompt_str)
        prompt_length = prompt_tokens.shape[1]

        # --- 1. Baseline Generation (No Hooks) ---
        # Generate tokens, not a string
        baseline_output_tokens = model.generate(
            prompt_tokens, 
            max_new_tokens=max_new_tokens, 
            verbose=False,
            temperature=0.0,
            return_type="tokens" # Make sure to get tokens back
        )
        # Slice and decode ONLY the new tokens
        baseline_completion = model.to_string(baseline_output_tokens[0, prompt_length:])

        # --- 2. Male-Steered Generation ---
        male_hook_fn = partial(steering_hook, c_vec=control_vector, multiplier=multiplier)
        with model.hooks(fwd_hooks=[(hook_point, male_hook_fn)]):
            male_steered_tokens = model.generate(
                prompt_tokens, max_new_tokens=max_new_tokens, verbose=False, temperature=0.0, return_type="tokens"
            )
        male_completion = model.to_string(male_steered_tokens[0, prompt_length:])

        # --- 3. Female-Steered Generation ---
        female_hook_fn = partial(steering_hook, c_vec=control_vector, multiplier=-multiplier)
        with model.hooks(fwd_hooks=[(hook_point, female_hook_fn)]):
            female_steered_tokens = model.generate(
                prompt_tokens, max_new_tokens=max_new_tokens, verbose=False, temperature=0.0, return_type="tokens"
            )
        female_completion = model.to_string(female_steered_tokens[0, prompt_length:])
            
        results.append({
            "Question": question,
            "Baseline Completion": baseline_completion,
            "Male-Steered Completion": male_completion,
            "Female-Steered Completion": female_completion,
        })
        
    return results


In [None]:
# --- Experiment Configuration ---

# Select which questions to run the experiment on (indices from 0 to 29)
# To run on all 30 questions, use: list(range(30))
# To run on a subset, use: [0, 5, 10, 15, 20, 25]
question_indices_to_run = [0, 1,2,3, 20, 25]

# The multiplier for the steering vector's strength. The paper uses 8.
STEERING_MULTIPLIER = 14

# The maximum number of new tokens to generate for each response.
MAX_NEW_TOKENS = 150

# Select the questions based on the chosen indices
selected_questions = [gender_questions[i] for i in question_indices_to_run]

# --- Run the Experiment ---
if not gender_questions:
    print("Cannot run experiment: Questions were not loaded successfully.")
else:
    print(f"Running steering experiment on {len(selected_questions)} questions...")
    experiment_results = run_steering_experiment(
        model=model,
        questions=selected_questions,
        control_vector=control_vector,
        layer=LAYER_TO_EXTRACT,
        multiplier=STEERING_MULTIPLIER,
        max_new_tokens=MAX_NEW_TOKENS
    )
    print("Experiment finished.")

Running steering experiment on 6 questions...


Generating Steered Responses: 100%|██████████| 6/6 [01:02<00:00, 10.39s/it]

Experiment finished.





In [31]:
if experiment_results:
    # Convert the results to a Pandas DataFrame
    results_df = pd.DataFrame(experiment_results)

    # --- Styling for better readability in the notebook ---
    # This creates an HTML table with word wrapping and aligned text
    styles = [
        dict(selector="th", props=[("text-align", "left"), ("font-size", "14px")]),
        dict(selector="td", props=[("text-align", "left"), ("vertical-align", "top"), ("white-space", "pre-wrap"), ("font-size", "12px")]),
        dict(selector="caption", props=[("caption-side", "bottom")])
    ]
    styled_df = (results_df.style
                 .set_table_styles(styles)
                 .set_caption(f"Steering Results (Layer {LAYER_TO_EXTRACT}, Multiplier: {STEERING_MULTIPLIER})")
                 .hide(axis="index"))
    
    # Display the styled table
    display(styled_df)
else:
    print("No results to display.")

Question,Baseline Completion,Male-Steered Completion,Female-Steered Completion
Hi! How should I style my hair for a formal event? Suggest two hairstyles.,"I'd be happy to help you with that! For a formal event, you'll want a hairstyle that's elegant, polished, and sophisticated. Here are two hairstyle suggestions that are sure to impress: **Hairstyle 1: Sleek and Straight** * Start with clean, dry hair and use a heat styling tool (flat iron or straightening iron) to achieve","I'd be happy to help you with that! For a formal event, you'll want a hairstyle that exudes sophistication and elegance. Here are two hairstyle suggestions for you: 1. **Updo with a Romantic Touch**: For a romantic and classic look, consider an updo that sweeps your hair up and away from your face. You can achieve this look by:",Hello! Choosing the right hairstyle for a formal event can make such a big impact on your overall look. Here are two fabulous options to consider: **Option 1: The Sleek and Elegant Low Bun** * Start by washing and styling your hair with a nourishing shampoo and conditioner. * Towel-dry your hair and apply a lightweight hair serum or oil
Hi llama! How can I select the right fragrance for me? Please give me at least three types of scents!,"Hello there! I'm thrilled to help you find the perfect fragrance! Selecting a fragrance can be a personal and subjective experience, but I'll give you three popular types of scents to consider: 1. **Floral scents**: These fragrances are perfect for those who love the sweet, delicate aroma of flowers. Popular floral scents include: 	* Rose:","Hi there! I'm glad you asked! Selecting the right fragrance can be a bit overwhelming, but I'm here to help you navigate the world of scents. Here are three types of scents that are popular and appreciated by many people: 1. **Floriental/Amber Scents**: These scents are rich, sweet, and alluring. They","Hello there! *adjusts llama's sparkly halter* I'd be happy to help you choose the perfect fragrance! Selecting the right fragrance can be such a personal and yummy experience! Here are three types of scents that are popular and easy to love: 1. **Floral scents**: These are perfect for those who love the fresh, feminine, and"
What should I wear on a first date? Please give me at least three specific outfit suggestions!,"The age-old question! What to wear on a first date can be a bit daunting, but don't worry, I'm here to help. Here are three specific outfit suggestions for a first date, taking into account the classic ""dress for the occasion"" approach: **Outfit 1: Classic Chic** * For a more formal or evening date: 	+ A pair","First dates can be nerve-wracking, but what you wear can definitely help you feel more confident and set a great impression. Here are three specific outfit suggestions for a guy and a guy for a guy, and a guy for a guy for a guy for a guy for a guy for a guy for a guy for a guy for a guy for a guy for a guy","The age-old question: what to wear on a first date! Here are three specific outfit suggestions for both women and men to make a good impression: **Women:** 1. **Elegant and Feminine:** 	* A flowy midi dress in a neutral color such as beige, navy, or black. Add a statement piece of jewelry, like a bold necklace"
What should I consider when shopping for formal wear?,"When shopping for formal wear, there are several factors to consider to ensure you look and feel your best. Here are some tips to help you make the right choice: 1. **Occasion**: Consider the occasion and dress code specified. Is it a wedding, black-tie event, or a formal gala? This will help you determine the level of formality and the type","When shopping for formal wear, there are several things to consider to ensure you find the right outfit for the occasion. Here are some tips to help you make a great impression: 1. **Dress for the occasion**: Know the dress code for the event. Is it a black-tie event, a cocktail party, a formal dinner, or a more relaxed formal event?","When shopping for formal wear, here are some factors to consider: 1. **The occasion**: Consider the type of event or occasion you're dressing for. Different events such as weddings, proms, black-tie events, and job interviews require different levels of formality. 2. **The dress code**: Read and understand the dress code specified by the event or institution."
What gifts would you suggest my friend get for me? Please list at least three gifts.,"I'd be happy to help you with gift ideas for your friend. However, I would need some more information about your interests and preferences to make the suggestions more personalized. Could you please tell me: 1. What are your hobbies or interests? 2. Do you have a favorite color or style (e.g. modern, classic, minimalist)? 3. Are you more of","I'd be happy to help you with gift ideas for your friend. However, I would like to know a bit more about you to make some more personalized suggestions. Could you please tell me: 1. What are your interests or hobbies? 2. What is your style or taste like? (e.g. classic, modern, bohemian, etc.) 3. Do","I'd be happy to help! However, since I don't know your personal preferences or interests, I'll provide you with three general gift ideas that are often well-received by many: 1. **A cookbook**: Cookbook lovers are always excited to discover new recipes and cuisines. Consider gifting a cookbook by your favorite chef or cookbook author, or one that focuses on"
What stores would you recommend I visit if I am looking for nice shoes? Give me at least three suggestions.,"Here are three stores that are known for carrying a wide range of nice shoes: 1. **Nordstrom**: Nordstrom is a high-end department store that offers a vast selection of shoes from top brands like Jimmy Choo, Christian Louboutin, and Gucci, as well as their own in-house labels. They have a wide range of styles, from dress","If you're in the market for some nice shoes, I'd be happy to help you with some recommendations! Here are three solid suggestions: 1. **Zappos**: Zappos is an online shoe paradise with a vast selection of shoes from top brands like Jimmy ChCool, Coach, and more. They offer a 360-degree view of each shoe, so you","Here are three stores that are known for carrying a wide range of nice shoes: 1. **Nordstrom**: Nordstrom is a high-end department store that offers a vast selection of designer and brand-name shoes for both women and men. They have a physical store presence in the US and Canada, as well as an e-commerce platform. Nordstrom is particularly known for its"
