In [1]:
from huggingface_hub import login
login(token="")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
from probity.probing.datasets.templated import TemplatedDataset
from probity.probing.datasets.tokenized import TokenizedProbingDataset
from transformers import AutoTokenizer

# Create movie sentiment dataset
adjectives = {
    "positive": ["incredible", "amazing", "fantastic"],
    "negative": ["terrible", "awful", "horrible"]
}
verbs = {
    "positive": ["loved", "enjoyed", "adored"],
    "negative": ["hated", "disliked", "detested"]
}

# Create dataset using factory method
movie_dataset = TemplatedDataset.from_movie_sentiment_template(
    adjectives=adjectives,
    verbs=verbs
)

# Convert to probing dataset with automatic position finding
# and label mapping from sentiment metadata
probing_dataset = movie_dataset.to_probing_dataset(
    label_from_metadata="sentiment",
    label_map={"positive": 1, "negative": 0},
    auto_add_positions=True
)

# Convert to tokenized dataset
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer
)



In [3]:
# Get an example from the probing dataset
probing_example = probing_dataset.examples[16]
print("Probing Dataset Example:")
print(f"Text: {probing_example.text}")
print(f"Label: {probing_example.label}")
print(f"Label Text: {probing_example.label_text}")
print(f"Character Positions: {probing_example.character_positions}")
print(f"Metadata: {probing_example.metadata}\n")

# Get an example from the tokenized dataset 
tokenized_example = tokenized_dataset.examples[16]
print("Tokenized Dataset Example:")
print(f"Text: {tokenized_example.text}")
print(f"Label: {tokenized_example.label}")
print(f"Label Text: {tokenized_example.label_text}")
print(f"Token Positions: {tokenized_example.token_positions}")
print(f"Tokens: {tokenized_example.tokens}")
print(f"Attention Mask: {tokenized_example.attention_mask}")
print(f"Metadata: {tokenized_example.metadata}")


Probing Dataset Example:
Text: I thought this movie was fantastic, I enjoyed it.
Label: 1
Label Text: positive
Character Positions: CharacterPositions(positions={'ADJ': Position(start=25, end=34), 'VERB': Position(start=38, end=45)})
Metadata: {'template': 'I thought this movie was {ADJ}, I {VERB} it.', 'variables': {'ADJ': {'sentiment': ['positive', 'positive', 'positive', 'negative', 'negative', 'negative']}, 'VERB': {'sentiment': ['positive', 'positive', 'positive', 'negative', 'negative', 'negative']}}, 'class': 'positive', 'task': 'sentiment_classification'}

Tokenized Dataset Example:
Text: I thought this movie was fantastic, I enjoyed it.
Label: 1
Label Text: positive
Token Positions: TokenPositions(positions={'ADJ': 6, 'VERB': 9})
Tokens: [2, 235285, 3421, 736, 7344, 729, 15814, 235269, 590, 13177, 665, 235265]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Metadata: {'template': 'I thought this movie was {ADJ}, I {VERB} it.', 'variables': {'ADJ': {'sentiment': ['positive

In [4]:
# Loop through dataset and decode token positions
for example in tokenized_dataset.examples:
    print(f"\nFull text: {example.text}")
    print("Decoded tokens at positions:")
    for pos_key in example.token_positions.keys():
        pos = example.token_positions[pos_key]
        decoded = tokenizer.decode(example.tokens[pos])
        print(f"  {pos_key}: {decoded}")



Full text: I thought this movie was terrible, I hated it.
Decoded tokens at positions:
  ADJ:  terrible
  VERB:  hated

Full text: I thought this movie was terrible, I disliked it.
Decoded tokens at positions:
  ADJ:  terrible
  VERB:  disliked

Full text: I thought this movie was terrible, I detested it.
Decoded tokens at positions:
  ADJ:  terrible
  VERB:  de

Full text: I thought this movie was awful, I hated it.
Decoded tokens at positions:
  ADJ:  awful
  VERB:  hated

Full text: I thought this movie was awful, I disliked it.
Decoded tokens at positions:
  ADJ:  awful
  VERB:  disliked

Full text: I thought this movie was awful, I detested it.
Decoded tokens at positions:
  ADJ:  awful
  VERB:  de

Full text: I thought this movie was horrible, I hated it.
Decoded tokens at positions:
  ADJ:  horrible
  VERB:  hated

Full text: I thought this movie was horrible, I disliked it.
Decoded tokens at positions:
  ADJ:  horrible
  VERB:  disliked

Full text: I thought this movie was hor

In [4]:
probing_example.text[probing_example.character_positions['ADJ'].start:probing_example.character_positions['ADJ'].end]


'horrible'

In [9]:
# Or create a custom template
from probity.probing.datasets.templated import Template, TemplateVariable

# Create variables
subject = TemplateVariable(
    name="SUBJECT",
    values=["cat", "dog"],
    metadata={"entity_type": "animal"}
)

action = TemplateVariable(
    name="ACTION",
    values=["jumped over", "ran under"],
    metadata={"movement_type": "locomotion"}
)

object_var = TemplateVariable(
    name="OBJECT",
    values=["fence", "table"],
    metadata={"object_type": "barrier"}
)

# Create template
template = Template(
    template="The {SUBJECT} {ACTION} the {OBJECT}.",
    variables={
        "SUBJECT": subject,
        "ACTION": action,
        "OBJECT": object_var
    },
    metadata={"task": "entity_movement"}
)

# Create dataset
custom_dataset = TemplatedDataset(templates=[template])

# Convert to probing dataset
probing_dataset = custom_dataset.to_probing_dataset(auto_add_positions=True)

# Convert to tokenized dataset
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer
)

In [10]:
# Loop through dataset and decode token positions
for example in tokenized_dataset.examples:
    print(f"\nFull text: {example.text}")
    print("Decoded tokens at positions:")
    for pos_key in example.token_positions.keys():
        pos = example.token_positions[pos_key]
        decoded = tokenizer.decode(example.tokens[pos])
        print(f"  {pos_key}: {decoded}")



Full text: The cat jumped over the fence.
Decoded tokens at positions:
  ACTION:  jumped
  SUBJECT:  cat
  OBJECT:  fence

Full text: The cat jumped over the table.
Decoded tokens at positions:
  ACTION:  jumped
  SUBJECT:  cat
  OBJECT:  table

Full text: The cat ran under the fence.
Decoded tokens at positions:
  ACTION:  ran
  SUBJECT:  cat
  OBJECT:  fence

Full text: The cat ran under the table.
Decoded tokens at positions:
  ACTION:  ran
  SUBJECT:  cat
  OBJECT:  table

Full text: The dog jumped over the fence.
Decoded tokens at positions:
  ACTION:  jumped
  SUBJECT:  dog
  OBJECT:  fence

Full text: The dog jumped over the table.
Decoded tokens at positions:
  ACTION:  jumped
  SUBJECT:  dog
  OBJECT:  table

Full text: The dog ran under the fence.
Decoded tokens at positions:
  ACTION:  ran
  SUBJECT:  dog
  OBJECT:  fence

Full text: The dog ran under the table.
Decoded tokens at positions:
  ACTION:  ran
  SUBJECT:  dog
  OBJECT:  table


In [13]:
from probity.probing.datasets.templated import Template, TemplateVariable

# Create emotion-based variables
reactions = TemplateVariable(
    name="REACTION",
    values=["smiled warmly", "laughed joyfully", "frowned deeply", "scowled angrily"],
    metadata={"emotion": ["positive", "positive", "negative", "negative"]},
    class_bound=True,
    class_key="emotion"
)

responses = TemplateVariable(
    name="RESPONSE",
    values=["embraced", "welcomed", "rejected", "avoided"],
    metadata={"emotion": ["positive", "positive", "negative", "negative"]},
    class_bound=True,
    class_key="emotion"
)

# Create neutral variables for context
locations = TemplateVariable(
    name="LOCATION",
    values=["coffee shop", "library", "park", "restaurant"],
    metadata={"setting_type": "public_space"}
)

activities = TemplateVariable(
    name="ACTIVITY",
    values=["reading books", "playing chess", "sharing stories", "having lunch"],
    metadata={"activity_type": "social"}
)

# Create template with more complex structure
template = Template(
    template="When Sarah {REACTION} at the {LOCATION} while {ACTIVITY}, her friends {RESPONSE} her company.",
    variables={
        "REACTION": reactions,
        "RESPONSE": responses,
        "LOCATION": locations,
        "ACTIVITY": activities
    },
    metadata={"task": "emotion_analysis"}
)

# Create dataset
social_dataset = TemplatedDataset(templates=[template])

# Convert to probing dataset with emotion labels
probing_dataset = social_dataset.to_probing_dataset(
    label_from_metadata="emotion",
    label_map={"positive": 1, "negative": 0},
    auto_add_positions=True
)

# Convert to tokenized dataset
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer
)

# Print some random examples
import random
for example in random.sample(list(tokenized_dataset.examples), 3):
    print(f"\nFull text: {example.text}")
    print(f"Emotion class: {example.label_text}")
    print("Decoded tokens at positions:")
    for pos_key in example.token_positions.keys():
        pos = example.token_positions[pos_key]
        decoded = tokenizer.decode(example.tokens[pos])
        print(f"  {pos_key}: {decoded}")


Full text: When Sarah frowned deeply at the coffee shop while reading books, her friends avoided her company.
Emotion class: negative
Decoded tokens at positions:
  ACTIVITY:  reading
  REACTION:  frowned
  LOCATION:  coffee
  RESPONSE:  avoided

Full text: When Sarah frowned deeply at the library while sharing stories, her friends avoided her company.
Emotion class: negative
Decoded tokens at positions:
  ACTIVITY:  sharing
  REACTION:  frowned
  LOCATION:  library
  RESPONSE:  avoided

Full text: When Sarah smiled warmly at the coffee shop while playing chess, her friends embraced her company.
Emotion class: positive
Decoded tokens at positions:
  ACTIVITY:  playing
  REACTION:  smiled
  LOCATION:  coffee
  RESPONSE:  embraced


In [14]:
from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained_no_processing("gpt2")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded pretrained model gpt2 into HookedTransformer


In [None]:
model.run_with_cache()

## Probing

In [1]:
import torch
from probity.datasets.templated import TemplatedDataset
from probity.datasets.tokenized import TokenizedProbingDataset
from transformers import AutoTokenizer

In [2]:
# Set torch device to mps
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
# Create movie sentiment dataset
adjectives = {
    "positive": ["incredible", "amazing", "fantastic", "awesome", "beautiful", "brilliant", "exceptional", "extraordinary", "fabulous", "great", "lovely", "outstanding", "remarkable", "wonderful"],
    "negative": ["terrible", "awful", "horrible", "bad", "disappointing", "disgusting", "dreadful", "horrendous", "mediocre", "miserable", "offensive", "terrible", "unpleasant", "wretched"]
}
verbs = {
    "positive": ["loved", "enjoyed", "adored"],
    "negative": ["hated", "disliked", "detested"]
}

# Create dataset using factory method
movie_dataset = TemplatedDataset.from_movie_sentiment_template(
    adjectives=adjectives,
    verbs=verbs
)

# Convert to probing dataset with automatic position finding
# and label mapping from sentiment metadata
probing_dataset = movie_dataset.to_probing_dataset(
    label_from_metadata="sentiment",
    label_map={"positive": 1, "negative": 0},
    auto_add_positions=True
)

# Convert to tokenized dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer,
    padding=True,  # Add padding
    max_length=128  # Specify max length
)

# Verify the tokenization worked
example = tokenized_dataset.examples[0]
print("First example tokens:", example.tokens)
print("First example text:", example.text)

First example tokens: [40, 1807, 428, 3807, 373, 8082, 11, 314, 6151, 340, 13]
First example text: I thought this movie was incredible, I loved it.




In [4]:
from probity.probes.linear_probe import LinearProbe, LinearProbeConfig
from probity.training.trainer import SupervisedProbeTrainer, SupervisedTrainerConfig
from probity.pipeline.pipeline import ProbePipeline, ProbePipelineConfig

# First, configure the probe
# GPT2-small has hidden size 768
probe_config = LinearProbeConfig(
    input_size=768,
    normalize_weights=True,  # Normalize the learned direction
    bias=False  # No bias term needed for direction finding
)

# Configure the trainer
trainer_config = SupervisedTrainerConfig(
    batch_size=32,
    learning_rate=1e-3,
    num_epochs=10,
    weight_decay=0.01,
    train_ratio=0.8,  # 80-20 train-val split
    handle_class_imbalance=True,  # Important since our classes are balanced
    show_progress=True
)

# Create pipeline configuration
pipeline_config = ProbePipelineConfig(
    dataset=tokenized_dataset,
    probe_cls=LinearProbe,
    probe_config=probe_config,
    trainer_cls=SupervisedProbeTrainer,
    trainer_config=trainer_config,
    position_key="ADJ",  # We want to probe at the adjective position
    model_name="gpt2",
    hook_points=["blocks.6.hook_resid_post"],  # Layer 6
    cache_dir="./sentiment_probe_cache"  # Cache activations for reuse
)

# Create and run pipeline
pipeline = ProbePipeline(pipeline_config)
probe, training_history = pipeline.run()

# The probe now contains our learned sentiment direction
sentiment_direction = probe.get_direction()

# We can analyze training history
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(training_history['train_loss'], label='Train Loss')
plt.plot(training_history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Probe Training History')
plt.legend()
plt.show()

# Save the pipeline for later use
pipeline.save("./sentiment_probe")

# To test the probe, we can get predictions for new examples
def analyze_sentiment(text: str, pipeline: ProbePipeline):
    # Tokenize new text
    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
    
    # Get activations for the new text
    with torch.no_grad():
        _, cache = pipeline.collector.model.run_with_cache(
            tokens,
            names_filter=["blocks.6.hook_resid_post"]
        )
    
    # Get the activations at layer 6
    activations = cache["blocks.6.hook_resid_post"]
    
    # Apply the probe
    logits = pipeline.probe(activations)
    probs = torch.sigmoid(logits)
    
    return probs.item()

# Test the probe
test_text = "I thought this movie was fantastic, I loved it."
sentiment_score = analyze_sentiment(test_text, pipeline)
print(f"Sentiment score (0=negative, 1=positive): {sentiment_score:.3f}")

Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cpu


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cpu


In [5]:
print(f"Training history: {history}")

Training history: {'train_loss': [0.8292038440704346, 0.7475414474805196, 0.7830476959546407, 0.6590535243352255, 0.6349036892255148, 0.603614350159963, 0.5691607991854349, 0.5244813859462738, 0.6492942372957865, 0.4872782329718272, 0.5733036895593008, 0.5423712829748789, 0.441368967294693, 0.39737847447395325, 0.46008838216463727, 0.37618409593900043, 0.3719136615594228, 0.4497372309366862, 0.3339608907699585, 0.34183535973231], 'train_accuracy': [0.4305555621782939, 0.4826388955116272, 0.3854166666666667, 0.71875, 0.6180555621782938, 0.7604166666666666, 0.6805555621782938, 0.8125, 0.621527781089147, 0.875, 0.7743055621782938, 0.8055555621782938, 0.8055555621782938, 0.9375, 0.8368055621782938, 0.9375, 0.9375, 0.8368055621782938, 0.9583333333333334, 0.96875], 'train_precision': [np.float64(0.6062271062271062), np.float64(0.6749999999999999), np.float64(0.45555555555555555), np.float64(0.8083333333333332), np.float64(0.49673202614379086), np.float64(0.882051282051282), np.float64(0.9222

In [13]:
inference.get_direction_activations("I thought this movie was superb")

tensor([[[ 2.8914],
         [ 1.1352],
         [-0.6983],
         [ 1.2239],
         [ 1.4124],
         [ 3.0506]]])

In [9]:
inference.get_direction_activations("I thought this movie was lousy")

tensor([[[ 2.8914],
         [ 1.1352],
         [-0.6983],
         [ 1.2239],
         [ 1.4124],
         [ 1.5386]]])

In [11]:
# Test with very clear positive/negative examples
print(inference.get_direction_activations("This was the most amazing wonderful perfect movie ever"))
print(inference.get_direction_activations("This was the most horrible terrible awful movie ever"))

tensor([[[119.5551],
         [ -1.9024],
         [ -2.5790],
         [ -1.0393],
         [ -0.1709],
         [ -0.9175],
         [ -0.9112],
         [ -4.0115],
         [ -1.5986]]])
tensor([[[119.5551],
         [ -1.9024],
         [ -2.5790],
         [ -1.0393],
         [ -1.5023],
         [ -2.1044],
         [ -3.8507],
         [ -3.3782],
         [ -2.1310]]])


In [15]:
import torch
import json

# Load the probe weights
probe_weights = torch.load("probes/probe.pt")['state_dict']['linear.weight'].squeeze(0)

# Convert to numpy array and then to list for JSON serialization
weights_list = probe_weights.numpy().tolist()

# Save as JSON
with open("probes/probe.json", "w") as f:
    json.dump(weights_list, f)


  probe_weights = torch.load("probes/probe.pt")['state_dict']['linear.weight'].squeeze(0)
