## Figure out how to Implement Per Token Probes 


#### Goal 
- Produce per token probe plots that visualise how the 'high-stakes' nature of the probe varies over words.
- Figure out what this script needs to look like and apply it to an eval dataset from Figure 2/ a split from the variation types

#### Timeline

Created: 06/03/25 
Ideally Finished: 07/03/25 to send to supervisors

In [1]:
# imports
import numpy as np
from models_under_pressure.config import GENERATED_DATASET_PATH, TRAIN_TEST_SPLIT
from models_under_pressure.experiments.dataset_splitting import load_train_test
from models_under_pressure.probes.model import LLMModel
from models_under_pressure.probes.probes import LinearProbe

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model and dataset
model = LLMModel.load(model_name="meta-llama/Llama-3.2-1B-Instruct", device="cuda:1")

# Train a probe
train_dataset, _ = load_train_test(
    dataset_path=GENERATED_DATASET_PATH,
    split_path=TRAIN_TEST_SPLIT,
)

In [None]:
# Analyse the dataset used to train the probe:
print(f'Train dataset shape: {train_dataset.to_pandas().shape}')
print(f'Train dataset variation types: {train_dataset.to_pandas()["variation_type"].unique()}')
filtered_dataset = train_dataset.filter(lambda x: x.other_fields["variation_type"] == "prompt_style")
print(f'Filtered dataset shape: {filtered_dataset.to_pandas().shape}')
print(f'Filtered dataset variation types: {filtered_dataset.to_pandas()["variation_type"].unique()}')

filtered_dataset.to_pandas().head()


Train dataset shape: (2867, 11)
Train dataset variation types: ['prompt_style' 'tone' 'language']
Filtered dataset shape: (956, 11)
Filtered dataset variation types: ['prompt_style']


Unnamed: 0,inputs,ids,situations,topic,labels,timestamp,variation_type,variation,role_of_user,impact_factors,situations_ids
0,"As an executive in an energy firm, I've recent...",0,"{'high_stakes': 1, 'low_stakes': 2}",Environment,1,2025-03-04T14:43:08.789463+00:00,prompt_style,Q&A long,Executives and decision-makers,Overtrust in Automated Authority,high_stakes_1_low_stakes_2
1,I'm in charge of a waste management department...,1,"{'high_stakes': 1, 'low_stakes': 2}",Environment,0,2025-03-04T14:43:08.789463+00:00,prompt_style,Q&A long,Executives and decision-makers,Overtrust in Automated Authority,high_stakes_1_low_stakes_2
2,A team overseeing the protection of sensitive ...,2,"{'high_stakes': 3, 'low_stakes': 4}",Military and Espionage,1,2025-03-04T14:43:14.216331+00:00,prompt_style,Q&A long,Executives and decision-makers,Permanent Harm,high_stakes_3_low_stakes_4
3,During an analysis of the protocols for managi...,3,"{'high_stakes': 3, 'low_stakes': 4}",Military and Espionage,0,2025-03-04T14:43:14.216331+00:00,prompt_style,Q&A long,Executives and decision-makers,Permanent Harm,high_stakes_3_low_stakes_4
4,As part of our strategic initiative to enhance...,4,"{'high_stakes': 5, 'low_stakes': 6}",Technology,1,2025-03-04T14:43:20.034367+00:00,prompt_style,Q&A long,Executives and decision-makers,Rapid Financial or Market Decisions,high_stakes_5_low_stakes_6


In [4]:
# Train the probe:
probe = LinearProbe(_llm=model, layer=11)
probe.fit(train_dataset.filter(lambda x: x.other_fields["variation_type"] == "prompt_style"))

Generating activations...
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([1, 151])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([64, 193])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([64, 176])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([64, 181])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([64, 173])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([64, 66])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size(

LinearProbe(_llm=LLMModel(name='meta-llama/Llama-3.2-1B-Instruct', model=LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
   

In [5]:
# Load the eval dataset of interest:
from models_under_pressure.interfaces.dataset import Dataset
from models_under_pressure.config import EVAL_DATASETS

eval_dataset = Dataset.load_from(EVAL_DATASETS["anthropic"]["path"],
                    field_mapping=EVAL_DATASETS["anthropic"]["field_mapping"])

# Print the dataset and look at it's shape:
print(f'Eval dataset shape: {eval_dataset.to_pandas().shape}')
eval_dataset.to_pandas().head()

Eval dataset shape: (50, 6)


Unnamed: 0,inputs,ids,index,category,labels,comment
0,"[{""role"": ""user"", ""content"": ""Hey you know whi...",train_761_chosen,761,chosen,1,
1,"[{""role"": ""user"", ""content"": ""Hey you know whi...",train_761_rejected,761,rejected,1,
2,"[{""role"": ""user"", ""content"": ""Should I steal s...",train_2516_chosen,2516,chosen,1,
3,"[{""role"": ""user"", ""content"": ""Should I steal s...",train_2516_rejected,2516,rejected,1,
4,"[{""role"": ""user"", ""content"": ""What was the nam...",train_10547_chosen,10547,chosen,0,


In [6]:
preds = probe.per_token_predictions(eval_dataset.inputs)
# Add predictions as a column to the eval dataset dataframe
df_eval = eval_dataset.to_pandas()
df_eval['predictions'] = list(preds)

Generating activations...
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([1, 228])
input_str
{'return_tensors': 'pt', 'truncation': True, 'padding': True, 'max_length': 2048}
token_dict input_ids shape torch.Size([50, 646])
activations_obj.activations.shape (50, 228, 2048)
attention_mask [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.padding_side

'right'

In [8]:
df_eval['predictions'].apply(lambda x: x[x!=-1].shape)

0     (228,)
1     (218,)
2     (207,)
3     (148,)
4      (84,)
5      (68,)
6     (228,)
7     (228,)
8      (75,)
9     (125,)
10    (228,)
11    (191,)
12     (65,)
13     (80,)
14    (193,)
15    (201,)
16    (105,)
17    (137,)
18    (171,)
19    (162,)
20    (162,)
21    (177,)
22    (121,)
23    (133,)
24    (191,)
25    (149,)
26    (228,)
27    (228,)
28    (137,)
29    (139,)
30    (228,)
31    (228,)
32    (179,)
33    (150,)
34    (195,)
35    (228,)
36     (94,)
37    (151,)
38    (208,)
39    (205,)
40    (147,)
41    (120,)
42    (196,)
43    (136,)
44    (228,)
45    (228,)
46    (228,)
47    (228,)
48    (228,)
49    (228,)
Name: predictions, dtype: object

In [9]:
import json

json.loads(df_eval['inputs'][49])

tokenizer.apply_chat_template(json.loads(df_eval['inputs'][49]), add_generation_prompt=True, return_tensors='pt').shape

torch.Size([1, 330])

In [10]:
# Get the tokenizer from the probe's model
tokenizer = probe._llm.tokenizer

# Create a list to store the word-prediction mappings for each input
word_predictions = []

for i, input_text in enumerate(eval_dataset.inputs):
    # Tokenize the input
    tokens = tokenizer(input_text, return_offsets_mapping=True, add_special_tokens=False)
    
    # Get the offset mappings that show where each token maps in the original text
    offset_mapping = tokens.offset_mapping
    
    # Get predictions for this input
    token_preds = preds[i][:len(offset_mapping)]  # Trim to actual token length
    
    # Map predictions back to words
    current_word = ""
    current_word_pred = []
    word_pred_pairs = []
    
    for (start, end), pred in zip(offset_mapping, token_preds):
        # Get the text piece for this token
        token_text = input_text[start:end]
        
        # If token starts with space, previous word is complete
        if token_text.startswith(" ") and current_word:
            # Average predictions for the word
            word_pred_pairs.append((current_word.strip(), np.mean(current_word_pred)))
            current_word = token_text
            current_word_pred = [pred]
        else:
            current_word += token_text
            current_word_pred.append(pred)
    
    # Add final word
    if current_word:
        word_pred_pairs.append((current_word.strip(), np.mean(current_word_pred)))
        
    word_predictions.append(word_pred_pairs)

# Add word-level predictions to dataframe
df_eval['word_predictions'] = word_predictions

# Print first few word predictions as example
print("Example word predictions for first input:")
for word, pred in word_predictions[0][:10]:
    print(f"Word: {word:20} Prediction: {pred:.3f}")


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
import pandas as pd

pd.Series(preds)

ValueError: Data must be 1-dimensional, got ndarray of shape (50, 228) instead