## **Part 1: Extract Activations from Pre-Modified Text**

This script loads the Excel file, which *already* contains defactualized (placeholder) responses. It reads these modified columns, extracts activations from them, and saves the `activations.npz` file.

### **Step 0**: Install Dependencies

In [3]:
!pip install transformers torch scikit-learn accelerate tqdm pandas openpyxl numpy -q


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


### **Step 1**: Log in to Hugging Face

In [2]:
# Authenticate with the Hugging Face Hub from inside the notebook.
# The import and the login call share one `try`, so an ImportError raised by
# either of them ends up in the fallback branch below.
try:
    from huggingface_hub import notebook_login
    notebook_login()
except ImportError:
    # Fallback: point the user at the command-line login instead.
    print("huggingface_hub not found. Please log in using 'huggingface-cli login' in your terminal.")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Step 2**: Activation Extractor Class

In [4]:
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import pandas as pd
import sys
import argparse
import re
from typing import Dict, List, Tuple, Optional

class ActivationExtractor:
    """Load a causal language model and read last-token hidden states from it.

    The constructor loads the model and tokenizer named by ``model_name``,
    locates the list of transformer blocks inside the model, and records how
    many blocks there are so layer indices can be validated later.
    """

    def __init__(self, model_name: str):
        """Load model + tokenizer and discover the transformer layer list.

        Args:
            model_name: Identifier or path passed to ``from_pretrained``.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.device = self.model.device

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # Reuse the EOS token for padding when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self._layers_attr_path = self._find_layer_attr_path()
        self.num_layers = len(self._get_layers_list())
        print(f"[ActivationExtractor] Model loaded. Path: {self._layers_attr_path}, Layers: {self.num_layers}")

    def _find_layer_attr_path(self):
        """Return the first known attribute chain that leads to the layer list.

        Each candidate is a list of attribute names followed from
        ``self.model``. A candidate is accepted when every attribute exists
        and the final object is a ``list`` or ``nn.ModuleList``.

        Raises:
            AttributeError: If none of the candidate chains resolves.
        """
        known_chains = [
            ["model", "layers"],
            ["transformer", "h"],
            ["model", "decoder", "layers"],
        ]
        for attr_chain in known_chains:
            node = self.model
            for attr_name in attr_chain:
                if not hasattr(node, attr_name):
                    break
                node = getattr(node, attr_name)
            else:
                # Reached only when every attribute in the chain was present.
                if isinstance(node, (list, nn.ModuleList)):
                    return attr_chain
        raise AttributeError("Could not find transformer layer list in model.")

    def _get_layers_list(self):
        """Follow the discovered attribute chain and return the layers as a list."""
        node = self.model
        for attr_name in self._layers_attr_path:
            node = getattr(node, attr_name)
        return list(node)

    def _resolve_layer_idx(self, idx: int):
        """Convert a possibly negative layer index into a non-negative one.

        Negative values count back from the last layer (``-1`` is the last).
        An ``assert`` rejects indices outside ``[0, num_layers)``.
        """
        total = self.num_layers
        resolved = idx + total if idx < 0 else idx
        assert 0 <= resolved < total, f"layer_index {resolved} out of range"
        return resolved

    @torch.no_grad()
    def get_activation_for_pair(self, input_text: str, output_text: str, layer_index: int) -> np.ndarray:
        """Return the last-token hidden state for ``input_text`` + ``output_text``.

        Both texts are stripped, joined with a single space, tokenized as one
        sequence and run through the model with hidden states enabled.

        Args:
            input_text: Prompt text.
            output_text: Response text appended after the prompt.
            layer_index: Layer to read; negative values count from the end.

        Returns:
            A NumPy array holding the selected hidden-state entry at the final
            token position of the first (only) sequence in the batch.
        """
        layer = self._resolve_layer_idx(layer_index)
        joined_text = " ".join((input_text.strip(), output_text.strip()))
        encoded = self.tokenizer(joined_text, return_tensors="pt").to(self.model.device)
        result = self.model(**encoded, output_hidden_states=True, return_dict=True)
        # The +1 offset presumably skips the embedding output that Hugging Face
        # places at position 0 of `hidden_states` — confirm against the
        # transformers documentation for the model in use.
        layer_states = result.hidden_states[layer + 1]
        last_token_state = layer_states[0, -1, :]
        return last_token_state.detach().cpu().numpy()

### **Step 3**: Main Extraction Driver

This is the main part of the script. It will:
1.  Load the Excel file.
2.  Read the defactualized text from `response_Neutral_Modified` and `response_Modified`.
3.  Create a placeholder prompt for each row.
4.  Run activation extraction and save the results to `activations.npz`.

In [None]:
# Matches a "Subject: ..." header up to the end of its line. Only spaces/tabs
# are allowed after the colon so an empty subject cannot swallow the next line.
_SUBJECT_LINE_RE = re.compile(r'Subject:[ \t]*(.*)', re.IGNORECASE)


def _is_missing_cell(value: object) -> bool:
    """Return True for None, NaN/NaT, or the literal text 'nan'."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == 'nan'
    return bool(pd.isna(value))


def _cell_text(value: object, default: str) -> str:
    """Return the cell as text, or ``default`` when the cell is missing."""
    return default if _is_missing_cell(value) else str(value)


def _extract_subject_line(email_text: str, default: str = "Subject: Invitation") -> str:
    """Return the first full "Subject: ..." line of ``email_text``, else ``default``."""
    match = _SUBJECT_LINE_RE.search(email_text)
    return match.group(0).strip() if match else default


def load_data_for_extraction(file_path: str) -> Optional[Dict[str, List[Tuple[str, str, str]]]]:
    """
    Load pre-defactualized data from an XLSX file and build prompt/response triples.

    For every row that has both email texts, a prompt is assembled from the
    row's fact columns plus the subject line found in the neutral email, and
    the triple ``(prompt, styled_email, neutral_email)`` is collected.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        ``{"user_1": [(prompt, styled_email, neutral_email), ...]}``, or
        ``None`` when the file cannot be read or either email column is absent.
        Rows whose neutral or styled email cell is empty are skipped.
    """
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return None

    # --- Column Names ---
    # NOTE(review): the notebook text refers to `response_Neutral_Modified`,
    # but this code reads `response_Neutral` — confirm which column holds the
    # defactualized neutral email.
    DEFACT_NEUTRAL_COL = 'response_Neutral'
    DEFACT_STYLED_COL = 'response_Modified'
    # Fact column -> placeholder used when the column or the cell is missing.
    FACT_PLACEHOLDERS = {
        'extracted_DATE': '<DATE>',
        'extracted_TIME': '<TIME>',
        'extracted_VENUE': '<VENUE>',
        'extracted_HOST': '<HOST>',
        'extracted_EVENT': '<EVENT>',
    }
    # -----------------------------------

    required_cols = list(FACT_PLACEHOLDERS) + [DEFACT_NEUTRAL_COL, DEFACT_STYLED_COL]
    if not all(col in df.columns for col in required_cols):
        print("Error: Missing one of the required columns.")
        print(f"Script needs: {required_cols}")
        print(f"Found in file: {df.columns.to_list()}")
        # Try to continue if at least the core columns are there
        if DEFACT_NEUTRAL_COL not in df.columns or DEFACT_STYLED_COL not in df.columns:
            return None
        print("Warning: Missing some fact columns, prompts may be incomplete.")

    print(f"INFO: Loaded {len(df)} examples from the file.")
    train_examples = []

    for idx, row in df.iterrows():
        # 1. Get the *already defactualized* email text. Missing-ness is
        #    checked on the raw cell, before any str() conversion.
        raw_neutral = row.get(DEFACT_NEUTRAL_COL)
        raw_styled = row.get(DEFACT_STYLED_COL)
        if _is_missing_cell(raw_neutral) or _is_missing_cell(raw_styled):
            print(f"Skipping row {idx} due to missing email text.")
            continue
        neutral_email = str(raw_neutral)
        styled_email = str(raw_styled)

        # 2. Get facts from the row; an empty cell falls back to its
        #    placeholder instead of leaking the text "nan" into the prompt.
        facts = {
            col: _cell_text(row.get(col, placeholder), placeholder)
            for col, placeholder in FACT_PLACEHOLDERS.items()
        }
        date = facts['extracted_DATE']
        time_ = facts['extracted_TIME']
        venue = facts['extracted_VENUE']
        host = facts['extracted_HOST']
        event = facts['extracted_EVENT']

        # 3. Extract the full "Subject: ..." line from the neutral email.
        subject_line = _extract_subject_line(neutral_email)

        # 4. Create the dynamic prompt. The whitespace-only second line is
        #    written explicitly so the prompt text matches the earlier
        #    triple-quoted template exactly.
        prompt = (
            f"Write an email invitation for the {event}, scheduled for {date}, "
            f"at {time_} in the {venue}. The event is hosted and sent by {host}.\n"
            f"        \n"
            f"{subject_line}\n"
        )

        # 5. Add to training set
        train_examples.append((prompt, styled_email, neutral_email))

    print(f"Successfully created {len(train_examples)} training pairs.")
    return {"user_1": train_examples}

# --- Main Execution Block ---
if __name__ == "__main__":
    # Under an IPython kernel keep only argv[0], so argparse sees no extra
    # arguments and every option takes its default.
    if 'ipykernel' in sys.modules:
        sys.argv = sys.argv[:1]

    # Command-line options: (flag, type, default).
    parser = argparse.ArgumentParser()
    for flag, kind, fallback in (
        ("--model", str, "meta-llama/Llama-2-7b-hf"),
        ("--layer", int, -15),
        ("--xlsx_file", str, "generated_email_responses_modified (2).xlsx"),
        ("--output_file", str, "activations.npz"),
    ):
        parser.add_argument(flag, type=kind, default=fallback)
    args = parser.parse_args()

    print(f"Running Activation Extraction: model={args.model}, layer={args.layer}")

    # 1. Load data
    train_hist = load_data_for_extraction(args.xlsx_file)
    if train_hist:
        # 2. Load model
        ae = ActivationExtractor(args.model)

        # 3. Extract activations for the styled ("positive") and neutral
        #    ("negative") response of every example, using the same prompt.
        user_id = "user_1"
        examples = train_hist[user_id]

        print(f"\n[Pipeline] Extracting activations for '{user_id}' with {len(examples)} examples...")
        pos_acts, neg_acts = [], []

        for (inp_prompt, user_out, neutral_out) in tqdm(examples, desc="Extracting training activations"):
            pos_acts.append(ae.get_activation_for_pair(inp_prompt, user_out, args.layer))
            neg_acts.append(ae.get_activation_for_pair(inp_prompt, neutral_out, args.layer))

        # 4. Stack one row per example and write both arrays to a single
        #    compressed .npz archive.
        if pos_acts:
            pos_arr = np.vstack(pos_acts)
            neg_arr = np.vstack(neg_acts)

            np.savez_compressed(args.output_file, pos_acts=pos_arr, neg_acts=neg_arr)

            print(f"\n[SUCCESS] Activations saved successfully to '{args.output_file}'")
            print(f"  Positive activations shape: {pos_arr.shape}")
            print(f"  Negative activations shape: {neg_arr.shape}")
            print("You can now run the next notebook for activation steering.")
        else:
            print("\nNo activations were extracted. Please check your data file.")
    else:
        print("Halting execution due to data loading error.")

Running Activation Extraction: model=meta-llama/Llama-2-7b-hf, layer=-15
INFO: Loaded 41 examples from the file.
Successfully created 41 training pairs.


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


[ActivationExtractor] Model loaded. Path: ['model', 'layers'], Layers: 32

[Pipeline] Extracting activations for 'user_1' with 41 examples...


Extracting training activations: 100%|██████████| 41/41 [00:59<00:00,  1.44s/it]


[SUCCESS] Activations saved successfully to 'activations.npz'
  Positive activations shape: (41, 4096)
  Negative activations shape: (41, 4096)
You can now run the next notebook for activation steering.



