## Creating Datasets

### Using Built-in Template

In [20]:
from transformers import AutoTokenizer
from probity.datasets.templated import TemplatedDataset
from probity.datasets.tokenized import TokenizedProbingDataset

# Word lists for each sentiment class; the movie-sentiment template draws its
# adjectives and verbs from these.
adjectives = {
    "positive": [
        "excellent",
        "amazing",
        "fantastic",
        "wonderful",
    ],
    "negative": [
        "terrible",
        "awful",
        "horrible",
        "dreadful",
    ],
}
verbs = {
    "positive": [
        "loved",
        "enjoyed",
        "adored",
        "cherished",
    ],
    "negative": [
        "hated",
        "disliked",
        "despised",
        "loathed",
    ],
}

# Build the templated dataset from the library's movie-sentiment factory
dataset = TemplatedDataset.from_movie_sentiment_template(adjectives=adjectives, verbs=verbs)

# Derive labels from the "sentiment" metadata (positive -> 1, negative -> 0)
# and let the library add the position annotations automatically.
probing_dataset = dataset.to_probing_dataset(
    label_from_metadata="sentiment",
    label_map={"positive": 1, "negative": 0},
    auto_add_positions=True,
)

# Tokenize with the GPT-2 tokenizer. EOS is reused as the pad token so that
# padding=True has a token to pad with.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer,
    padding=True,
    truncation=True,
)


In [None]:
# Print the first tokenized example as a quick sanity check of the conversion above
print(tokenized_dataset.examples[0])

### Building a New Templated Dataset

In [22]:
from transformers import AutoTokenizer
from probity.datasets.templated import TemplatedDataset, Template, TemplateVariable
from probity.datasets.tokenized import TokenizedProbingDataset
from probity.probes.linear_probe import LinearProbeConfig, LinearProbe

# Vocabularies for the subject-verb agreement template.
# Subjects and verbs come in singular/plural groups; locations are number-neutral.
singular_subjects = ["The cat", "The dog", "A student", "The teacher"]
plural_subjects = ["The cats", "The dogs", "The students", "The teachers"]

singular_verbs = ["walks", "runs", "sleeps", "jumps"]
plural_verbs = ["walk", "run", "sleep", "jump"]

locations = ["in the park", "at home", "near the school", "by the river"]

# Template variables. Each value carries a parallel "number" tag; SUBJECT and
# VERB are both class-bound on that key.
subject_var = TemplateVariable(
    name="SUBJECT",
    values=[*singular_subjects, *plural_subjects],
    metadata={
        "number": ["singular" for _ in singular_subjects] + ["plural" for _ in plural_subjects],
    },
    class_bound=True,
    class_key="number",
)

verb_var = TemplateVariable(
    name="VERB",
    values=[*singular_verbs, *plural_verbs],
    metadata={
        "number": ["singular" for _ in singular_verbs] + ["plural" for _ in plural_verbs],
    },
    class_bound=True,
    class_key="number",
)

# LOCATION has no metadata and is not class-bound
location_var = TemplateVariable(name="LOCATION", values=locations)

# The sentence pattern, with one placeholder per variable
template = Template(
    template="{SUBJECT} {VERB} {LOCATION}.",
    variables={
        "SUBJECT": subject_var,
        "VERB": verb_var,
        "LOCATION": location_var,
    },
    metadata={"task": "subject_verb_agreement"},
)

# A dataset made of this single template
dataset = TemplatedDataset(templates=[template])

# Label each example from its "number" metadata (singular -> 0, plural -> 1)
probing_dataset = dataset.to_probing_dataset(
    label_from_metadata="number",
    label_map={"singular": 0, "plural": 1},
    auto_add_positions=True,
)

# Tokenize with the GPT-2 tokenizer, reusing EOS as the pad token
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer,
    padding=True,
    truncation=True,
)

In [None]:
# Print the first tokenized subject-verb agreement example as a quick sanity check
print(tokenized_dataset.examples[0])

### Creating a Probing Dataset Without Templates

In [24]:
from probity.datasets.base import ProbingDataset, ProbingExample, Position, CharacterPositions
from transformers import AutoTokenizer

# Hand-written examples. Both sentences share the same label, and in both the
# subject word ("cat" / "dog") is marked by the character span start=4, end=7.
examples = [
    ProbingExample(
        text=sentence,
        label=0,
        label_text="animal",
        character_positions=CharacterPositions({
            "subject": Position(start=4, end=7)
        }),
    )
    for sentence in ("The cat sat on the mat.", "The dog ran in the park.")
]

# Wrap the examples in a classification-type probing dataset
probing_dataset = ProbingDataset(
    examples=examples,
    task_type="classification",
    metadata={"task": "subject_identification"},
)

# Tokenize with the RoBERTa tokenizer.
# TokenizedProbingDataset is not imported in this cell; it comes from the
# imports of an earlier cell.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer,
    padding=True,
    truncation=True,
)

In [None]:
# Print the first tokenized hand-written example as a quick sanity check
print(tokenized_dataset.examples[0])

## Probing

### Movie Review Dataset


In [5]:
import torch
from probity.datasets.templated import TemplatedDataset
from probity.datasets.tokenized import TokenizedProbingDataset
from transformers import AutoTokenizer

In [6]:
# Pick the torch device: "mps" when torch reports the MPS backend as available,
# otherwise fall back to "cpu".
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Create movie sentiment dataset.
# Every word must appear only once per class: a repeated word would presumably
# generate repeated examples, which can then land in both the training and the
# validation split.
adjectives = {
    "positive": [
        "incredible", "amazing", "fantastic", "awesome", "beautiful",
        "brilliant", "exceptional", "extraordinary", "fabulous", "great",
        "lovely", "outstanding", "remarkable", "wonderful",
    ],
    "negative": [
        "terrible", "awful", "horrible", "bad", "disappointing",
        "disgusting", "dreadful", "horrendous", "mediocre", "miserable",
        "offensive", "pathetic", "unpleasant", "wretched",  # "pathetic" replaces a second "terrible"
    ],
}
verbs = {
    "positive": ["loved", "enjoyed", "adored"],
    "negative": ["hated", "disliked", "detested"],
}

# Guard against accidental repeats and against unequal class vocabularies
assert all(len(set(words)) == len(words) for words in adjectives.values()), "duplicate adjective"
assert all(len(set(words)) == len(words) for words in verbs.values()), "duplicate verb"
assert len(adjectives["positive"]) == len(adjectives["negative"])
assert len(verbs["positive"]) == len(verbs["negative"])

# Create dataset using factory method
movie_dataset = TemplatedDataset.from_movie_sentiment_template(
    adjectives=adjectives,
    verbs=verbs
)

# Convert to probing dataset with automatic position finding
# and label mapping from sentiment metadata
probing_dataset = movie_dataset.to_probing_dataset(
    label_from_metadata="sentiment",
    label_map={"positive": 1, "negative": 0},
    auto_add_positions=True
)

# Convert to tokenized dataset (EOS is reused as the pad token)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = TokenizedProbingDataset.from_probing_dataset(
    dataset=probing_dataset,
    tokenizer=tokenizer,
    padding=True,  # Add padding
    truncation=True,  # Make the max_length cap explicit, as the other cells do
    max_length=128  # Specify max length
)

# Verify the tokenization worked
example = tokenized_dataset.examples[0]
print("First example tokens:", example.tokens)
print("First example text:", example.text)

### Logistic Probe Training

In [None]:
from probity.probes.linear_probe import LogisticProbe, LogisticProbeConfig
from probity.training.trainer import SupervisedProbeTrainer, SupervisedTrainerConfig
from probity.pipeline.pipeline import ProbePipeline, ProbePipelineConfig

# Probe configuration. input_size=768 is the hidden size of GPT2-small.
probe_config = LogisticProbeConfig(
    input_size=768,
    normalize_weights=True,  # keep the learned direction normalized
    bias=False,  # no bias term is needed for direction finding
)

# Trainer configuration
trainer_config = SupervisedTrainerConfig(
    batch_size=32,
    learning_rate=1e-3,
    num_epochs=10,
    weight_decay=0.01,
    train_ratio=0.8,  # 80% train / 20% validation
    # NOTE(review): both sentiment vocabularies are the same size, so this is
    # presumably just a safeguard here - confirm.
    handle_class_imbalance=True,
    show_progress=True,
)

print(f"Dataset size: {len(tokenized_dataset.examples)}")

# Pipeline configuration: which dataset, probe, trainer, token position and
# model activations to use.
pipeline_config = ProbePipelineConfig(
    dataset=tokenized_dataset,
    probe_cls=LogisticProbe,
    probe_config=probe_config,
    trainer_cls=SupervisedProbeTrainer,
    trainer_config=trainer_config,
    position_key="ADJ",  # probe at the adjective position
    model_name="gpt2",
    hook_points=["blocks.7.hook_resid_pre"],  # residual stream entering block 7
    cache_dir="./cache/sentiment_probe_cache",  # activation cache location
)

pipeline = ProbePipeline(pipeline_config)

# Sanity check before training: show one example and the position keys it
# offers, so a mismatch with the pipeline's position_key is easy to spot.
example = tokenized_dataset.examples[0]
print(f"Example text: {example.text}")
print(f"Token positions: {example.token_positions}")
print(f"Available position keys: {list(example.token_positions.keys())}")
print(f"\nPipeline position key: {pipeline_config.position_key}")

# Train the probe
probe, training_history = pipeline.run()

# The trained probe exposes the learned sentiment direction
sentiment_direction = probe.get_direction()

# Plot the training and validation loss curves
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(training_history['train_loss'], label='Train Loss')
plt.plot(training_history['val_loss'], label='Validation Loss')
plt.title('Probe Training History')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Persist the pipeline so it can be reloaded later
pipeline.save("./probes/sentiment_probe")



In [None]:
# To test the probe, we can get predictions for new examples
def analyze_sentiment(
    text: str,
    pipeline: ProbePipeline,
    hook_point: str = "blocks.7.hook_resid_pre",
    token_index: int = -2,
) -> float:
    """Score `text` with the trained probe at a single token position.

    The text is tokenized with the notebook-level `tokenizer`, run through the
    pipeline's model while caching `hook_point`, and the probe is applied to the
    cached activations. The sigmoid of the probe output at `token_index` is
    returned. This is NOT an average over tokens.

    Args:
        text: Input sentence to score.
        pipeline: Pipeline providing `collector.model` and `probe`.
        hook_point: Name of the activation to cache and probe. Defaults to the
            hook point used in the training cell.
        token_index: Position in the token sequence to read the score from.
            Negative values count from the end; the default -2 is the
            second-to-last token.

    Returns:
        The probe's sigmoid output at the selected token, as a Python float.

    Raises:
        IndexError: If `token_index` is outside the tokenized sequence.
    """
    # Tokenize new text
    tokens = tokenizer(text, return_tensors="pt")["input_ids"]

    # Fail early with a readable message instead of after the forward pass.
    # assumes tokens is (1, seq_len) - TODO confirm
    seq_len = tokens.shape[1]
    if not -seq_len <= token_index < seq_len:
        raise IndexError(
            f"token_index {token_index} is out of range for a sequence of {seq_len} tokens"
        )

    with torch.no_grad():
        # Get activations for the new text, caching only the requested hook
        _, cache = pipeline.collector.model.run_with_cache(
            tokens,
            names_filter=[hook_point]
        )

        # Apply the probe to the activations at the target hook
        logits = pipeline.probe(cache[hook_point])
        probs = torch.sigmoid(logits)

    # Read the probability at the chosen token of the first (only) sequence.
    # assumes the probe yields one value per token, so .item() is valid - TODO confirm
    return probs[0, token_index].item()

# Score a sentence built from positive words
test_text = "I thought this movie was amazing, I loved it."
sentiment_score = analyze_sentiment(text=test_text, pipeline=pipeline)
print(f"Sentiment score (0=negative, 1=positive): {sentiment_score:.3f}")

In [None]:
# Score a sentence built from negative words
test_text = "I thought this movie was detestable, I hated it."
sentiment_score = analyze_sentiment(text=test_text, pipeline=pipeline)
print(f"Sentiment score (0=negative, 1=positive): {sentiment_score:.3f}")

### Multi-probe Training

In [None]:
from typing import Dict, Type
import torch
from probity.collection.collectors import TransformerLensCollector, TransformerLensConfig
from probity.probes.linear_probe import (
    LinearProbe, LogisticProbe, KMeansProbe, PCAProbe, MeanDifferenceProbe,
    LinearProbeConfig, LogisticProbeConfig, KMeansProbeConfig, PCAProbeConfig, MeanDiffProbeConfig,
    SklearnLogisticProbe, SklearnLogisticProbeConfig
)
from probity.training.trainer import (
    SupervisedProbeTrainer, 
    SupervisedTrainerConfig, 
    DirectionalProbeTrainer,
    DirectionalTrainerConfig
)
from probity.pipeline.pipeline import ProbePipeline, ProbePipelineConfig
import matplotlib.pyplot as plt

# Model and hook point shared by the probe-comparison helpers below.
# NOTE(review): the training cell above passes "gpt2"; presumably both names
# resolve to the same checkpoint - confirm.
hook_points = ["blocks.7.hook_resid_pre"]  # residual stream entering block 7
model_name = "gpt2-small"

# Probe the tokenized dataset produced by an earlier cell
dataset = tokenized_dataset

# Function to run different probe types and compare results
def compare_probes(dataset, hidden_size: int = 768) -> Dict[str, torch.Tensor]:
    """Train every probe type on `dataset` and collect the learned directions.

    For each probe type a `ProbePipeline` is built and run. The trained probe is
    written to ``probes/<name>_direction.json`` with `probe.save_json`, and its
    `probe.get_direction()` result is stored under the probe's name. Probes that
    are `LogisticProbe` or `LinearProbe` instances also get a loss-curve plot
    for whichever of the train/validation histories are present.

    Reads the notebook-level globals `model_name` and `hook_points`.

    Args:
        dataset: Dataset handed to every `ProbePipelineConfig`. The pipelines
            probe at the "ADJ" position key.
        hidden_size: `input_size` given to every probe config.

    Returns:
        Mapping from probe name to the value returned by `probe.get_direction()`.
    """
    from pathlib import Path  # local import so this cell needs no extra import line

    # Make sure the output directory exists before any probe is saved
    Path("probes").mkdir(parents=True, exist_ok=True)

    # Common pipeline settings
    base_config = {
        "dataset": dataset,
        "position_key": "ADJ",
        "model_name": model_name,
        # Same hook point(s) that run_single_probe uses; copied so the
        # notebook-level list is never shared with a pipeline.
        "hook_points": list(hook_points),
        "cache_dir": "cache"
    }

    # name -> (probe class, probe config, trainer class)
    probe_configs = {
        "linear": (
            LinearProbe,
            LinearProbeConfig(
                input_size=hidden_size,
                loss_type="hinge",
                normalize_weights=True,
                bias=True  # Enable bias for better comparison with sklearn
            ),
            SupervisedProbeTrainer
        ),
        "logistic": (
            LogisticProbe,
            LogisticProbeConfig(
                input_size=hidden_size,
                normalize_weights=True,
                bias=True
            ),
            SupervisedProbeTrainer
        ),
        "logistic_skl": (
            SklearnLogisticProbe,
            SklearnLogisticProbeConfig(
                input_size=hidden_size,
                standardize=True,
                normalize_weights=True,
                max_iter=100
            ),
            DirectionalProbeTrainer
        ),
        "kmeans": (
            KMeansProbe,
            KMeansProbeConfig(
                input_size=hidden_size,
                normalize_weights=True
            ),
            DirectionalProbeTrainer
        ),
        "pca": (
            PCAProbe,
            PCAProbeConfig(
                input_size=hidden_size,
                normalize_weights=True
            ),
            DirectionalProbeTrainer
        ),
        "mean_diff": (
            MeanDifferenceProbe,
            MeanDiffProbeConfig(
                input_size=hidden_size,
                normalize_weights=True
            ),
            DirectionalProbeTrainer
        )
    }

    # Loss histories that can be plotted, with their legend labels
    loss_curves = (("train_loss", "Train Loss"), ("val_loss", "Validation Loss"))

    directions = {}

    # Run each probe type
    for name, (probe_cls, probe_config, trainer_cls) in probe_configs.items():
        print(f"\nRunning {name} probe...")

        # A fresh trainer config per run, chosen by trainer type
        if trainer_cls == SupervisedProbeTrainer:
            trainer_config = SupervisedTrainerConfig(
                batch_size=32,
                learning_rate=1e-3,
                num_epochs=20,
                # NOTE(review): the original comment said this matches sklearn's
                # default regularization - confirm.
                weight_decay=0.01,
                handle_class_imbalance=True,
                patience=5,  # Early stopping patience
                min_delta=1e-4  # Minimum improvement for early stopping
            )
        else:
            trainer_config = DirectionalTrainerConfig(
                batch_size=32,
                num_epochs=1  # Direct computation methods only need one epoch
            )

        # Create pipeline configuration
        pipeline_config = ProbePipelineConfig(
            **base_config,
            probe_cls=probe_cls,
            probe_config=probe_config,
            trainer_cls=trainer_cls,
            trainer_config=trainer_config
        )

        # Run pipeline
        pipeline = ProbePipeline(pipeline_config)
        probe, history = pipeline.run()

        # Save direction
        probe.save_json(f"probes/{name}_direction.json")

        # Plot training history for gradient-based probes, using only the
        # histories that were actually recorded.
        if isinstance(probe, (LogisticProbe, LinearProbe)):
            available = [
                (key, label) for key, label in loss_curves
                if key in history and len(history[key]) > 0
            ]
            if available:
                plt.figure(figsize=(10, 5))
                for key, label in available:
                    plt.plot(history[key], label=label)
                plt.xlabel('Epoch')
                plt.ylabel('Loss')
                plt.title(f'{name} Probe Training History')
                plt.legend()
                plt.show()

        # Store direction
        directions[name] = probe.get_direction()

        # Print final training loss if one was recorded
        if "train_loss" in history and len(history["train_loss"]) > 0:
            final_train_loss = history["train_loss"][-1]
            print(f"{name} final training loss: {final_train_loss:.4f}")

    return directions

def compare_directions(directions: Dict[str, torch.Tensor]) -> None:
    """Plot a heatmap of absolute cosine similarities between probe directions.

    Every direction is detached, moved to the CPU, cast to float32, flattened
    and scaled to unit length. The absolute value of each pairwise dot product
    is then shown in an annotated heatmap. The absolute value is used because
    the sign of a probe direction is arbitrary.

    Args:
        directions: Mapping from probe name to direction tensor. All directions
            must have the same number of elements once flattened.

    Returns:
        None. The result is the displayed figure. With an empty mapping a
        message is printed and nothing is plotted.
    """
    import pandas as pd
    import seaborn as sns

    names = list(directions.keys())
    if not names:
        print("No probe directions to compare.")
        return

    # detach + cpu: tensors that track gradients or live on another device
    # would otherwise break the dot product or the later .numpy() call.
    stacked = torch.stack([
        directions[name].detach().to(device="cpu", dtype=torch.float32).flatten()
        for name in names
    ])

    # normalize() divides by max(norm, eps), so an all-zero direction gives
    # similarity 0 rather than NaN.
    unit = torch.nn.functional.normalize(stacked, p=2, dim=1)

    # One matrix product yields every pairwise cosine similarity; the clamp
    # removes tiny floating-point overshoots above 1.
    similarities = (unit @ unit.T).abs().clamp(max=1.0)

    # Label rows and columns with the probe names
    df = pd.DataFrame(
        similarities.numpy(),
        index=names,
        columns=names
    )

    plt.figure(figsize=(8, 4))
    sns.heatmap(
        df,
        annot=True,
        fmt='.3f',
        cmap='viridis',
        vmin=0,
        vmax=1
    )
    plt.title('Cosine Similarities Between Probe Directions')
    plt.tight_layout()
    plt.show()

# Train all probe types, then visualise how similar their directions are
directions = compare_probes(dataset=dataset)
compare_directions(directions=directions)



In [None]:
# compare_probes and compare_directions are defined in the cell above. They are
# deliberately not redefined here: a second, identical copy would silently
# shadow the first and the two could drift apart when one of them is edited.

# Run comparison
directions = compare_probes(dataset)
compare_directions(directions)

In [16]:
# Example of using a single probe type
def run_single_probe(
    dataset,
    probe_type: str = "logistic",
    hidden_size: int = 768
) -> torch.Tensor:
    """Train one probe of the requested type and return its direction.

    Reads the notebook-level globals `model_name` and `hook_points`, and probes
    at the "ADJ" position key.

    Args:
        dataset: Dataset handed to the `ProbePipelineConfig`.
        probe_type: One of "linear", "logistic", "kmeans", "pca", "mean_diff".
        hidden_size: `input_size` given to the probe config.

    Returns:
        The value returned by the trained probe's `get_direction()`.

    Raises:
        ValueError: If `probe_type` is not one of the supported names.
    """
    # name -> (probe class, probe config class, trainer class)
    supported = {
        "linear": (LinearProbe, LinearProbeConfig, SupervisedProbeTrainer),
        "logistic": (LogisticProbe, LogisticProbeConfig, SupervisedProbeTrainer),
        "kmeans": (KMeansProbe, KMeansProbeConfig, DirectionalProbeTrainer),
        "pca": (PCAProbe, PCAProbeConfig, DirectionalProbeTrainer),
        "mean_diff": (MeanDifferenceProbe, MeanDiffProbeConfig, DirectionalProbeTrainer)
    }

    selection = supported.get(probe_type)
    if selection is None:
        raise ValueError(f"Unknown probe type: {probe_type}")
    probe_cls, config_cls, trainer_cls = selection

    probe_config = config_cls(input_size=hidden_size)

    if trainer_cls == SupervisedProbeTrainer:
        # Only "linear" and "logistic" map to the supervised trainer, so the
        # epoch count on this branch is always 10.
        trainer_config = SupervisedTrainerConfig(
            batch_size=32,
            learning_rate=1e-3,
            num_epochs=10
        )
    else:
        trainer_config = DirectionalTrainerConfig(
            batch_size=32,
            learning_rate=1e-3,
            num_epochs=1
        )

    # Assemble the pipeline configuration
    pipeline_config = ProbePipelineConfig(
        dataset=dataset,
        probe_cls=probe_cls,
        probe_config=probe_config,
        trainer_cls=trainer_cls,
        trainer_config=trainer_config,
        position_key="ADJ",
        model_name=model_name,
        hook_points=hook_points,
        cache_dir="cache"
    )

    # Train; the training history is not needed here
    trained_probe, _ = ProbePipeline(pipeline_config).run()
    return trained_probe.get_direction()

In [None]:
# Train two individual probes and keep their directions
logistic_direction = run_single_probe(dataset, probe_type="logistic")
kmeans_direction = run_single_probe(dataset, probe_type="kmeans")