## **Step 0**: Install Dependencies

In [1]:
!pip install transformers torch scikit-learn accelerate tqdm pandas openpyxl numpy -q


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## **Step 1**: Log in to Hugging Face

Run this cell once. If you are running locally and have already used `huggingface-cli login` in your terminal, you can skip this.

In [2]:
# Open the interactive Hugging Face login widget inside the notebook.
# The import lives inside the try so a missing `huggingface_hub` package
# degrades to a printed instruction instead of stopping the notebook.
try:
    from huggingface_hub import notebook_login
    notebook_login()
except ImportError:
    # Only the "package not installed" case is handled here; any error raised
    # by notebook_login() itself still propagates.
    print("huggingface_hub not found. Please log in using 'huggingface-cli login' in your terminal.")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## **Step 2**: Helper Classes & Functions

This cell contains all helper classes and functions:
1.  **`ModelSteeringWrapper`**: For generation.
2.  **`PlaceholderReplacer`**: Your code for re-hydrating text.
3.  **`SteeringHook`**: For applying vectors.
4.  **`compute_...` functions**: For building vectors from loaded data.

In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from tqdm import tqdm
import sys
import argparse
import re
import json
import ast
from collections import defaultdict
from typing import Dict, List, Tuple

# --- 1. Lightweight Model Wrapper (for Generation) ---
class ModelSteeringWrapper:
    """Load a causal language model with its tokenizer and expose text generation.

    Attributes:
        model: Object returned by ``AutoModelForCausalLM.from_pretrained``.
        device: ``self.model.device`` as reported right after loading.
        tokenizer: Fast tokenizer for ``model_name``; its pad token falls back
            to the EOS token when none is defined.
        num_layers: Length of the transformer layer list that was located.
    """

    def __init__(self, model_name: str):
        """Load model and tokenizer for ``model_name`` and locate the layer list.

        Raises:
            AttributeError: If no known layer-list attribute path exists on the model.
        """
        # NOTE: newer transformers releases emit "`torch_dtype` is deprecated! Use `dtype`
        # instead!" for this call. The keyword is kept so older releases keep working.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        self.device = self.model.device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        if self.tokenizer.pad_token is None:
            # Generation needs a pad token id; reuse EOS when the tokenizer has none.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self._layers_attr_path = self._find_layer_attr_path()
        self.num_layers = len(self._get_layers_list())
        print(f"[ModelSteeringWrapper] Model loaded. Path: {self._layers_attr_path}, Layers: {self.num_layers}")

    def _find_layer_attr_path(self):
        """Return the first candidate attribute path that resolves to a list / ``nn.ModuleList``.

        Raises:
            AttributeError: If none of the candidate paths matches.
        """
        candidates = [["model", "layers"], ["transformer", "h"], ["model", "decoder", "layers"]]
        for path in candidates:
            cur = self.model
            for attr in path:
                if not hasattr(cur, attr):
                    break
                cur = getattr(cur, attr)
            else:
                # Every attribute on the path existed; accept it only if it is a layer container.
                if isinstance(cur, (list, nn.ModuleList)):
                    return path
        raise AttributeError("Could not find transformer layer list in model.")

    def _get_layers_list(self):
        """Return the layers found at ``self._layers_attr_path`` as a plain list."""
        cur = self.model
        for attr in self._layers_attr_path:
            cur = getattr(cur, attr)
        return list(cur)

    def generate(self, prompt: str, max_new_tokens: int = 150, **kwargs) -> str:
        """Generate a continuation for ``prompt`` and return only the new text.

        Args:
            prompt: Text to tokenize and feed to the model.
            max_new_tokens: Forwarded to ``model.generate``.
            **kwargs: Extra generation options forwarded to ``model.generate``.
                ``pad_token_id`` defaults to the tokenizer's pad token id, but a
                caller-supplied value is respected.

        Returns:
            The decoded tokens produced after the prompt, stripped of surrounding whitespace.
        """
        tok = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_len = tok.input_ids.shape[1]
        # setdefault avoids "got multiple values for keyword argument 'pad_token_id'"
        # when the caller passes its own pad_token_id through **kwargs.
        kwargs.setdefault("pad_token_id", self.tokenizer.pad_token_id)
        out = self.model.generate(**tok, max_new_tokens=max_new_tokens, **kwargs)
        # Drop the prompt tokens so only the continuation is decoded.
        new_tokens = out[0][input_token_len:]
        generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        return generated_text.strip()

# --- 2. Your PlaceholderReplacer Class (for Re-hydration) ---
class PlaceholderReplacer:
    """Replace placeholders such as ``<DATE>`` with entity values taken from extracted columns."""

    def __init__(self):
        # Placeholder names handled by this class; the column for a type is 'extracted_<TYPE>'.
        self.entity_types = ['EVENT', 'DATE', 'TIME', 'VENUE', 'HOST']

    def parse_entity_list(self, entity_str):
        """Turn a cell value holding the string form of a list back into a list.

        Args:
            entity_str: Usually a string such as ``"['April 20, 2020']"``. Lists and
                tuples are accepted as-is; missing values (None / NaN) and any other
                non-string value yield an empty list.

        Returns:
            A list of parsed values. Unparseable text gives ``[]``; a single scalar
            literal (string or number) is wrapped in a one-element list so callers
            can always index the result.
        """
        if isinstance(entity_str, (list, tuple)):
            return list(entity_str)
        if not isinstance(entity_str, str):
            # Covers None, NaN, pd.NA and stray numeric cells.
            return []

        literal = entity_str.strip()
        if literal in ('', '[]'):
            return []

        try:
            parsed = ast.literal_eval(literal)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: text that is not a Python literal carries no usable facts.
            return []

        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        if isinstance(parsed, (str, int, float)):
            return [parsed]
        return []

    def build_entity_dict_from_row(self, row, fact_cols):
        """Build ``{entity_type: [values, ...]}`` from a DataFrame row.

        Args:
            row: Row object supporting ``in`` and ``[]`` by column name.
            fact_cols: Column names that are allowed to be read.

        Returns:
            Dict with one entry per entity type whose ``extracted_<TYPE>`` column is
            both listed in ``fact_cols`` and present in ``row``.
        """
        entities_dict = {}
        for entity_type in self.entity_types:
            column_name = f'extracted_{entity_type}'
            if column_name in fact_cols and column_name in row:
                entities_dict[entity_type] = self.parse_entity_list(row[column_name])
        return entities_dict

    def _substitute_in_order(self, pattern, text, values):
        """Replace each match of ``pattern`` in ``text``, cycling through ``values`` left to right.

        Returns:
            ``(new_text, log_entries)`` where ``log_entries`` is in text order.
        """
        made = []

        def _pick(match):
            # len(made) is the zero-based index of the current occurrence.
            value = str(values[len(made) % len(values)])
            made.append(f"{match.group(0)} → {value}")
            return value

        # With a callable replacement the returned string is inserted literally.
        return pattern.sub(_pick, text), made

    def replace_placeholders(self, text, entities_dict):
        """Replace all placeholders in ``text`` with actual entity values.

        The n-th occurrence of a placeholder (left to right, matched
        case-insensitively) receives the n-th entity of that type; when there are
        more placeholders than entities the entities are reused cyclically.

        Args:
            text: Text that may contain placeholders like ``<DATE>``.
            entities_dict: Mapping of entity type to a list of values.

        Returns:
            ``(replaced_text, replacement_log)``. The log maps each entity type that
            was replaced to a list of ``"<placeholder> → value"`` strings in text
            order. Empty or missing ``text`` is returned unchanged with an empty log.
        """
        # pd.isna is checked first so pd.NA never reaches the truthiness test.
        if pd.isna(text) or not text:
            return text, {}

        replaced_text = str(text)
        replacement_log = {}

        # Process types by length of their first value (longest first, ties keep
        # the entity_types order); this fixes processing and log-key order.
        sorted_entity_types = sorted(
            self.entity_types,
            key=lambda et: len(str(entities_dict.get(et, [''])[0])) if entities_dict.get(et) else 0,
            reverse=True
        )

        for entity_type in sorted_entity_types:
            entity_list = entities_dict.get(entity_type, [])
            if not entity_list:
                continue

            placeholder_pattern = re.compile(re.escape(f'<{entity_type}>'), re.IGNORECASE)
            replaced_text, replacements_made = self._substitute_in_order(
                placeholder_pattern, replaced_text, entity_list
            )
            if replacements_made:
                replacement_log[entity_type] = replacements_made

        return replaced_text, replacement_log

# --- 3. Style Vector Extraction Methods ---
def compute_mean_difference(pos: np.ndarray, neg: np.ndarray) -> np.ndarray:
    """Return the normalised mean of the element-wise differences ``pos - neg``.

    The mean is taken over axis 0, and the result is divided by its L2 norm
    plus 1e-12 so an all-zero difference does not divide by zero.
    """
    pairwise_gap = pos - neg
    direction = np.mean(pairwise_gap, axis=0)
    magnitude = np.linalg.norm(direction)
    return direction / (magnitude + 1e-12)

def compute_logistic_regression(pos: np.ndarray, neg: np.ndarray) -> np.ndarray:
    """Fit a logistic-regression probe (pos = 1, neg = 0) and return its unit-length weight vector.

    The classifier's coefficients are flattened to 1-D and divided by their
    L2 norm plus 1e-12.
    """
    n_pos, n_neg = len(pos), len(neg)
    labels = np.concatenate([np.ones(n_pos), np.zeros(n_neg)])
    features = np.vstack([pos, neg])

    probe = LogisticRegression(max_iter=1000)
    probe.fit(features, labels)

    weights = probe.coef_.reshape(-1)
    scale = np.linalg.norm(weights) + 1e-12
    return weights / scale

def compute_pca_vector(pos: np.ndarray, neg: np.ndarray) -> np.ndarray:
    """Return the first principal direction of the paired differences, oriented from neg to pos.

    Args:
        pos: Activations for the positive examples.
        neg: Activations for the negative examples; subtracted element-wise
            from ``pos`` (assumes row ``i`` of each array forms a pair — TODO confirm
            against the activation extraction script).

    Returns:
        A 1-D vector divided by its L2 norm plus 1e-12. Its dot product with the
        mean of ``pos - neg`` is non-negative.
    """
    diffs = pos - neg
    # Stacking diffs with their negation gives the data zero mean, so PCA's
    # internal centring leaves it unchanged.
    pca = PCA(n_components=1).fit(np.vstack([diffs, -diffs]))
    vec = pca.components_[0]
    # The sign of a principal component is arbitrary (and the symmetrised input
    # carries no direction at all), so align it with the mean difference.
    # Otherwise a positive multiplier could steer towards the negative class.
    if np.dot(vec, diffs.mean(axis=0)) < 0:
        vec = -vec
    return vec / (np.linalg.norm(vec) + 1e-12)

# --- 4. Steering Hook Class ---
class SteeringHook:
    """Forward hook that adds a scaled style vector to the output of one layer.

    The hook is registered on construction; call ``remove()`` to detach it.
    """

    def __init__(self, model, layer_path, layer_idx, style_vector, multiplier):
        """Store the target location, pre-scale the vector and register the hook.

        Args:
            model: Object from which ``layer_path`` is resolved attribute by attribute.
            layer_path: Sequence of attribute names leading to the layer container.
            layer_idx: Index into that container; negative values count from the end.
            style_vector: NumPy array converted with ``torch.from_numpy``.
            multiplier: Scalar applied to the vector once, up front.
        """
        self.model = model
        self.layer_path = layer_path
        self.layer_idx = layer_idx
        scaled_vector = torch.from_numpy(style_vector).float() * multiplier
        self.style_vector_cpu = scaled_vector
        self.handle = None
        self._register_hook()

    def _get_layer_module(self):
        """Resolve ``layer_path`` on the model and return the layer at ``layer_idx``."""
        container = self.model
        for attr_name in self.layer_path:
            container = getattr(container, attr_name)
        position = self.layer_idx
        if position < 0:
            position = len(container) + position
        return container[position]

    def _hook(self, module, input, output):
        """Return ``output`` with the style vector added to its (first) tensor."""
        came_as_tuple = isinstance(output, tuple)
        hidden = output[0] if came_as_tuple else output
        # Move / cast on every call so the vector matches the tensor it is added to.
        offset = self.style_vector_cpu.to(hidden.device, dtype=hidden.dtype)
        steered = hidden + offset.view(1, 1, -1)
        if came_as_tuple:
            # Keep the remaining tuple entries untouched.
            return (steered,) + output[1:]
        return steered

    def _register_hook(self):
        """Attach ``_hook`` to the target layer and remember the handle."""
        target_layer = self._get_layer_module()
        self.handle = target_layer.register_forward_hook(self._hook)

    def remove(self):
        """Detach the hook if one was registered."""
        if self.handle:
            self.handle.remove()

## **Step 3**: Load Activations, Compute Vectors, and Run Test

This is the main driver cell. It loads your saved `activations.npz`, calculates the PCA vector, and generates a steered response for the **second email** in your spreadsheet (index 1), showing both the "before" (redacted) and "after" (re-hydrated) results.

In [4]:
def run_inference_test(model_name: str, layer_index: int, xlsx_path: str, activations_path: str):
    """Run one steered generation on the second spreadsheet row and print the results.

    Steps: load saved activations, compute the PCA style vector, read the test
    row and its facts, build a placeholder prompt, load the model, generate with
    a steering hook attached, then re-hydrate the placeholders with real facts.

    Args:
        model_name: Model identifier passed to ``ModelSteeringWrapper``.
        layer_index: Index of the layer to steer (negative counts from the end).
        xlsx_path: Excel file with the styled / modified responses and the
            ``extracted_*`` fact columns.
        activations_path: ``.npz`` file containing ``pos_acts`` and ``neg_acts``.

    Returns:
        None. Problems with the input files are reported via ``print`` and end
        the function early; all results are printed.
    """

    # --- 1. Load Activations and Compute Vectors ---
    try:
        # The context manager closes the underlying file once both arrays are read.
        with np.load(activations_path) as data:
            pos_arr = data['pos_acts']
            neg_arr = data['neg_acts']
        print(f"Successfully loaded activations from '{activations_path}'")
    except Exception as e:
        print(f"Error loading '{activations_path}'. Please run the activation extraction script first.")
        print(f"Error details: {e}")
        return

    print("Computing PCA style vector...")
    pca_style_vector = compute_pca_vector(pos_arr, neg_arr)
    print("PCA style vector computed.")

    # --- 2. Load the Second Row (index 1) from Excel for the Test ---
    try:
        # Only the first two data rows are needed, so avoid reading the whole sheet.
        df = pd.read_excel(xlsx_path, nrows=2)
        if len(df) < 2:
            print(f"Error: Your Excel file '{xlsx_path}' has fewer than 2 rows. Cannot test on the second row.")
            return
        test_row = df.iloc[1]  # Select the second row (index 1)
    except Exception as e:
        print(f"Error reading Excel file '{xlsx_path}': {e}")
        return

    # --- 3. Get Real Facts and Ideal Response from the Test Row ---
    STYLED_COL = 'response_styled'  # The *original* styled email, for comparison
    MODIFIED_COL = 'response_Modified'  # The *defactualized* styled email
    # These are the *real* facts we will use for re-hydration.
    FACT_COLS = ['extracted_DATE', 'extracted_TIME', 'extracted_VENUE', 'extracted_HOST', 'extracted_EVENT']

    # Check that every required column is present in the loaded dataframe
    required_cols = [STYLED_COL, MODIFIED_COL, *FACT_COLS]
    if not all(col in df.columns for col in required_cols):
        print("Error: Your Excel file is missing required columns for testing.")
        print(f"Script needs: {STYLED_COL}, {MODIFIED_COL}, and all {FACT_COLS}")
        print(f"Found: {df.columns.to_list()}")
        return

    ideal_response = str(test_row.get(STYLED_COL))

    # Instantiate the replacer and build the dictionary of real facts
    replacer = PlaceholderReplacer()
    real_facts_dict = replacer.build_entity_dict_from_row(test_row, FACT_COLS)

    print(f"Loaded facts for re-hydration: {real_facts_dict}")

    # --- 4. Create the Defactualized Prompt ---
    # The prompt uses placeholders rather than real facts; facts are filled in after generation.
    test_query = "Draft an email invitation for the <EVENT>, scheduled for <DATE>, at <TIME> in the <VENUE>. The event is hosted and sent by <HOST>."

    # Reuse the placeholder subject line from the neutral email when one is present.
    # 'response_Neutral' is optional: a missing column yields the string 'None',
    # which does not match and leaves the default subject in place.
    neutral_email_text = str(test_row.get('response_Neutral'))
    subject_line = "Subject: <SUBJECT>"  # Default
    match = re.search(r'Subject:\s*(<[^>]+>.*)', neutral_email_text, re.IGNORECASE)
    if match:
        subject_line = match.group(0).strip()

    prompt = f"{test_query}\n\n{subject_line}\n\n"

    # --- 5. Load Model ---
    print("Loading Llama 2 model... (This may take a few minutes)")
    ae = ModelSteeringWrapper(model_name)

    # --- 6. Run Steering Demonstration (PCA Only @ 3.0) ---
    print("\n" + "="*50)
    print("Steering Demonstration (PCA Only, Multiplier 3.0)")
    print("="*50)
    print(f"Test Query (with placeholders):\n{prompt}")
    print("\n--- Ideal Styled Response (from file) ---")
    print(ideal_response)

    MULTIPLIER = 3.0
    method = "pca"
    style_vec = pca_style_vector

    print(f"\n--- Steered Generated Response (Method: {method.upper()}) ---")

    hook = SteeringHook(ae.model, ae._layers_attr_path, layer_index, style_vec, MULTIPLIER)
    try:
        # 1. Generate the response with placeholders (sampling, so output varies between runs)
        redacted_output = ae.generate(prompt, temperature=0.7, do_sample=True, top_p=0.9)
        print(f"\n[RAW OUTPUT (with placeholders)]:\n{redacted_output}")

        # 2. Re-hydrate the response with facts
        final_output, log = replacer.replace_placeholders(redacted_output, real_facts_dict)
        print(f"\n[FINAL OUTPUT (re-hydrated)]:\n{final_output}")
        print(f"\n[Re-hydration Log]:\n{json.dumps(log, indent=2)}")

    finally:
        # Always detach the hook so later use of the model is not steered.
        hook.remove()

# --- Main Execution Block ---
if __name__ == "__main__":
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="meta-llama/Llama-2-7b-hf")
    parser.add_argument("--layer", type=int, default=-15)
    parser.add_argument("--xlsx_file", type=str, default="generated_email_responses_modified (2).xlsx")
    parser.add_argument("--activations_file", type=str, default="activations.npz")

    # Inside a Jupyter kernel the process arguments are not meant for this parser,
    # so parse an empty list there (defaults only) instead of rewriting the global
    # sys.argv. Passing None makes argparse read sys.argv[1:] as usual.
    cli_args = [] if 'ipykernel' in sys.modules else None
    args = parser.parse_args(cli_args)

    print(f"Running inference: model={args.model}, layer={args.layer}")
    run_inference_test(args.model, args.layer, args.xlsx_file, args.activations_file)
    print("\nTest finished.")

Running inference: model=meta-llama/Llama-2-7b-hf, layer=-15
Successfully loaded activations from 'activations.npz'
Computing PCA style vector...
PCA style vector computed.
Loaded facts for re-hydration: {'EVENT': ['LiveSouJava - Microservice Patterns - Implemented by Eclipse Microprofile.'], 'DATE': ['April 20, 2020'], 'TIME': ['7:00 AM'], 'VENUE': ['San Francisco'], 'HOST': ['Product School Bangalore']}
Loading Llama 2 model... (This may take a few minutes)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


[ModelSteeringWrapper] Model loaded. Path: ['model', 'layers'], Layers: 32

Steering Demonstration (PCA Only, Multiplier 3.0)
Test Query (with placeholders):
Draft an email invitation for the <EVENT>, scheduled for <DATE>, at <TIME> in the <VENUE>. The event is hosted and sent by <HOST>.

Subject: <SUBJECT>



--- Ideal Styled Response (from file) ---
Subject: You're Invited! Live SouJava: Microservice Patterns in San-Francisco!

Hi there,

We'd be delighted if you could join us for a special Product School Bangalore event!

Live SouJava: Microservice Patterns - Implemented by Eclipse Microprofile

It's happening on April 20th at 7:00 AM in San-Francisco. We think you'll find this blend of technology and business insights super valuable.

Hope to see you there!

Warmly,

The Product School Bangalore Team

--- Steered Generated Response (Method: PCA) ---

[RAW OUTPUT (with placeholders)]:
Dear <ADDRESS>,

I'm happy to announce that <HOST> is hosting <EVENT>.

The event will take place a