# Probe Training and Evaluation

In [None]:
%%capture
import os
# If getting 'Could not find project LASR_probe_gen' get key from https://wandb.ai/authorize and paste below
os.environ["WANDB_SILENT"] = "true"
# os.environ["WANDB_API_KEY"] = ""
import wandb
wandb_token = os.getenv("WANDB_API_KEY")
wandb.login(key=wandb_token)

## Do probe training

Load the activations and labels from HF, aggregate, and construct datasets to train the probe on (note sklearn doesn't require a validation dataset). Create a probe and fit it.

In [None]:
import probe_gen.probes as probes
from sklearn.metrics import classification_report
from probe_gen.config import ConfigDict

# --- Experiment configuration ---
# Pick the probe implementation by index: 0 = "mean", 1 = "attention_torch", 2 = "mean_torch".
probe_type = ["mean", "attention_torch", "mean_torch"][0]
behaviour = "science"
datasource = "mmlu"
activations_model = "llama_3b"
generation_method = "on_policy"
response_model = "llama_3b"
off_policy_model = "qwen_3b"
mode = "train"

# Load the best hyperparameters or set your own
dataset_name = None
cfg = ConfigDict.from_json(activations_model, probe_type, behaviour)
# cfg = ConfigDict(layer=12, use_bias=True, normalize=True, c=0.001)
# cfg = ConfigDict(layer=12, use_bias=True, normalize=True, lr=0.001, weight_decay=0.001)

# --- Build the train and val datasets ---
# Off-policy runs are loaded under `off_policy_model`; every other generation
# method is loaded under `response_model`.
used_model = response_model if generation_method != "off_policy" else off_policy_model
activations_tensor, attention_mask, labels_tensor = probes.load_hf_activations_at_layer(
    behaviour, datasource, activations_model, used_model, generation_method, mode, cfg.layer, and_labels=True, verbose=True)
# Both "mean" and "mean_torch" contain "mean", so both get mean-aggregated inputs;
# "attention_torch" keeps the unaggregated activations.
if "mean" in probe_type:
    activations_tensor = probes.MeanAggregation()(activations_tensor, attention_mask)
# First split size: 2500 for deception/sandbagging, 3500 otherwise; the shakespeare
# datasource overrides either value with 3000.
split_val = 2500 if behaviour in ["deception", "sandbagging"] else 3500
split_val = 3000 if datasource == "shakespeare" else split_val
# The third split is 0 here, so `test_dataset` from this call is not used for
# evaluation in this cell (presumably it comes back empty - TODO confirm).
train_dataset, val_dataset, test_dataset = probes.create_activation_datasets(
    activations_tensor, labels_tensor, splits=[split_val, 500, 0], verbose=True)

# --- Initialise and fit a probe with the datasets ---
if probe_type == "mean":
    probe = probes.SklearnLogisticProbe(cfg)
elif probe_type == "mean_torch":
    probe = probes.TorchLinearProbe(cfg)
elif probe_type == "attention_torch":
    probe = probes.TorchAttentionProbe(cfg)
else:
    # Without this branch an unrecognised probe_type left `probe` unassigned: a
    # NameError on a fresh kernel, or a stale `probe` from an earlier run being
    # silently refitted on a reused kernel.
    raise ValueError(
        f"Unknown probe_type {probe_type!r}; expected 'mean', 'mean_torch' or 'attention_torch'."
    )
probe.fit(train_dataset, val_dataset, verbose=True)

# Print val results
eval_dict, y_pred, y_pred_proba = probe.eval(val_dataset)
print('\nroc_auc:', eval_dict['roc_auc'])

Evaluate the probe on the test datasets, one per generation method.

In [None]:
# Evaluate against separate test datasets, one per generation method.
# The loop uses its own `test_*` names so the training configuration
# (`generation_method`, `used_model`) is not overwritten by running this cell.
# The tensor names are reused on purpose, so the previously loaded activations
# are not kept referenced while each test set is processed.
for test_generation_method in ["incentivised", "prompted", "off_policy"]:
    print(f"\nEvaluating on {test_generation_method}")
    # Off-policy runs are loaded under `off_policy_model`, the rest under `response_model`.
    test_used_model = response_model if test_generation_method != "off_policy" else off_policy_model
    activations_tensor, attention_mask, labels_tensor = probes.load_hf_activations_at_layer(
        behaviour, datasource, activations_model, test_used_model, test_generation_method, "test", cfg.layer, and_labels=True, verbose=False)
    if "mean" in probe_type:
        activations_tensor = probes.MeanAggregation()(activations_tensor, attention_mask)
    # Only the third split is non-zero here, and only the third returned dataset is kept.
    _, _, test_dataset = probes.create_activation_datasets(
        activations_tensor, labels_tensor, splits=[0, 0, 1000], verbose=True)

    # Evaluate the model
    eval_dict, y_pred, y_pred_proba = probe.eval(test_dataset)
    print(eval_dict)
    print(classification_report(test_dataset['y'], y_pred))

# After the loop, `test_dataset`, `y_pred` and `y_pred_proba` hold the results of
# the last generation method in the list ("off_policy").

Visualise how the probe's predicted probabilities separate the two classes.

In [None]:
from probe_gen.standard_experiments.experiment_plotting import plot_per_class_prediction_distributions

# Per-class prediction distribution plot, drawn from whatever `test_dataset`
# and `y_pred_proba` currently hold in the kernel.
plot_per_class_prediction_distributions(
    test_dataset['y'],
    y_pred_proba,
)

## Hyperparameter Search

In [None]:
from probe_gen.standard_experiments.hyperparameter_search import run_full_hyp_search_on_layers
from probe_gen.standard_experiments.hyperparameter_search import load_best_params_from_search
from probe_gen.config import ConfigDict

# Search configuration; index 1 selects "attention_torch".
probe_type = ("mean", "attention_torch", "mean_torch")[1]
behaviour, datasource = "science", "mmlu"
activations_model = response_model = "llama_3b"
off_policy_model = "qwen_3b"
generation_method, mode = "on_policy", "train"

# Layers to search. If they cannot all be run at once, run the search in
# smaller batches of layers instead.
layers_list = [6, 9, 12, 15, 18, 21]
run_full_hyp_search_on_layers(
    probe_type,
    behaviour,
    datasource,
    activations_model,
    response_model,
    generation_method,
    mode,
    layers_list,
)

# The best parameters found by the search can be loaded at any time.
cfg_dict = load_best_params_from_search(
    probe_type,
    behaviour,
    datasource,
    generation_method,
    response_model,
    activations_model,
    layers_list,
)

# Add them to the local json if that has not been done already.
cfg = ConfigDict(cfg_dict)
cfg.add_to_json(activations_model, probe_type, behaviour)

## Testing On-Off Probes

In [None]:
from probe_gen.config import ConfigDict

# The grid has one row per trained probe and one column per test dataset.
# Each (generation_method, response_model) pair below yields one train row and
# one test row, so the two lists always line up.
_generation_runs = [
    ('on_policy', 'llama_3b'),
    ('incentivised', 'llama_3b'),
    ('prompted', 'llama_3b'),
    ('off_policy', 'qwen_7b'),
]

# Test row format:
# [behaviour, datasource, activations_model, generation_method, response_model, mode]
test_setup = [
    ['sycophancy', 'haikus', 'llama_3b', method, model, 'test']
    for method, model in _generation_runs
]
# Train row format:
# [probe_type, behaviour, datasource, activations_model, generation_method, response_model, mode]
# with an optional trailing cfg entry (see the example at the bottom).
train_setup = [
    ['mean', 'sycophancy', 'haikus', 'llama_3b', method, model, 'train']
    for method, model in _generation_runs
]

# To use specific hyperparameters for a row instead of loading the best ones, give the
# row a ConfigDict as its last entry; rows with and without one can be mixed.
# (The example previously assigned to `probes_setup`; presumably `train_setup` is meant.)
# train_setup = [['mean', 'science', 'mmlu', 'llama_3b', 'on_policy', 'llama_3b', 'train', ConfigDict(layer=12, use_bias=False, normalize=True, c=0.001)]]

In [2]:
from probe_gen.standard_experiments.grid_experiments import run_grid_experiment_lean

# Run every probe in `train_setup` against every dataset in `test_setup`.
# NOTE(review): the saved output of this cell ends in a RepositoryNotFoundError for
# `lasrprobegen/s-activations/.../y/c/p_o_h_layer_12.pkl`. That path looks like the
# string 'sycophancy' being indexed character by character as if it were a setup
# row - confirm the row format run_grid_experiment_lean expects before re-running.
run_grid_experiment_lean(
    train_setup,
    test_setup,
)

  0%|          | 0/4 [00:00<?, ?it/s]

haikus/llama_3b/llama_3b_on_policy_train(…):   0%|          | 0.00/4.56G [00:00<?, ?B/s]

llama_3b_on_policy_train.jsonl: 0.00B [00:00, ?B/s]



  0%|          | 0/24 [00:00<?, ?it/s]
  0%|          | 0/4 [00:20<?, ?it/s]


RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-690ce0a3-00c181671fd36def40bd8ca9;f2a6687d-f472-41b6-bf34-acc79926f7c9)

Repository Not Found for url: https://huggingface.co/datasets/lasrprobegen/s-activations/resolve/main/y/c/p_o_h_layer_12.pkl.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication

In [None]:
from probe_gen.standard_experiments.grid_experiments import plot_grid_experiment_lean

# Plot the grid for the same train/test setups, using ROC AUC as the metric.
plot_grid_experiment_lean(
    train_setup,
    test_setup,
    metric="roc_auc",
)

In [None]:
from probe_gen.standard_experiments.grid_experiments import plot_grid_experiment_lean_with_means

# Same grid via the "with means" plotting variant; save=True is passed through.
plot_grid_experiment_lean_with_means(
    train_setup,
    test_setup,
    metric="roc_auc",
    save=True,
    # Optional extras that can be added here: min_metric=0.5, max_metric=1
)