In [1]:
%%capture
# Run to set environment variables if want to
%env HF_TOKEN=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
# %env OPENAI_API_KEY=

## Full Dataset Pipeline

In [2]:
# global imports
from probe_gen.paths import data

### Sample prompts dataset

- Samples dataset of prompts from hugging face to jsonl file
- The `skip` parameter allows you to only start taking prompts after the `skip`th valid entry in the dataset.

In [None]:
from probe_gen.annotation.refusal_behaviour import create_refusal_dataset
from probe_gen.annotation.ultrachat_behaviour import (
    create_ultrachat_dataset,
    create_ultrachat_dataset_brazilian,
)
from probe_gen.annotation.jailbreak_behaviour import (
    create_jailbreak_dataset,
    create_harfmul_request_dataset,
)
from probe_gen.paths import data

# --- Alternative prompt sources (uncomment the pair you need) ---
# dataset = create_harfmul_request_dataset(num_samples=10000)
# dataset.save_to(data.jailbreaks / "harmful_requests.jsonl", overwrite=True)

# dataset = create_jailbreak_dataset(num_samples=10800)
# dataset.save_to(data.jailbreaks / "prompts.jsonl", overwrite=True)

# --- Active run: sample 10000 prompts and write them to a JSONL file ---
dataset = create_ultrachat_dataset_brazilian(num_samples=10000)
sampled_prompts_path = str(data.data / "ultrachat_brazilian_10.jsonl")
dataset.save_to(sampled_prompts_path, overwrite=True)

# --- Example of `skip`: start sampling after the first 1000 valid entries ---
# dataset = create_refusal_dataset(num_samples=1000, skip = 1000)
# dataset.save_to("../data/refusal/prompts_with_skip.jsonl", overwrite=True)

### Generate outputs dataset (on-policy)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
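- Make sure `HF_TOKEN` is set (e.g. run `export HF_TOKEN=<key>` before launching Jupyter). If you paste the token into the notebook instead, remove it before committing: a token must never be pushed to git.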

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs.jsonl --batch-size 200 --sample 0 --behaviour refusal --save-increment -1```

In [None]:
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

# Load the generator model, then produce outputs for every sampled prompt.
model, tokenizer = get_model("mistralai/Ministral-8B-Instruct-2410")

# NOTE(review): the prompts are the ultrachat_brazilian sample, but the outputs are
# written under data.jailbreaks with behaviour="jailbreaks" — confirm this is intended.
on_policy_run = dict(
    dataset_path=str(data.data / "ultrachat_brazilian_10.jsonl"),
    output_file=str(data.jailbreaks / "ministral-8b-outputs.jsonl"),
    batch_size=50,
    behaviour="jailbreaks",
    sample=0,
    add_prompt=False,  # plain on-policy generation, no extra prompt added
    max_new_tokens=150,
)
process_file_outputs_only(model, tokenizer, **on_policy_run)

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Processing 10000 examples in 200 batches of size 50


Processing batches: 100%|██████████| 200/200 [21:24<00:00,  6.42s/it]

Final DataFrame saved to: data/jailbreaks/ministral-8b-outputs.jsonl
DataFrame shape: (10000, 5)





### Generate outputs dataset (on-policy prompted)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
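- Make sure `HF_TOKEN` is set (e.g. run `export HF_TOKEN=<key>` before launching Jupyter). If you paste the token into the notebook instead, remove it before committing: a token must never be pushed to git.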

- If `--add-prompt` is set to yes, then by default it will use alternating positive / negative prompts.

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs1.jsonl --batch-size 200 --behaviour refusal --add-prompt yes --save-increment -1```

- You can also further specify if you want to only use positive or negative prompts with `--prompt-type`.

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs2.jsonl --batch-size 200 --behaviour refusal --add-prompt yes --prompt-type positive --save-increment -1```

In [None]:
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

# Prompted on-policy generation: add_prompt=True with no prompt_type given.
model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")

prompted_run = dict(
    dataset_path=str(data.ultrachat_brazil / "llama_3b_ultrachat_brazilian.jsonl"),
    output_file=str(data.ultrachat_brazil / "llama_3b_prompted_outputs1.jsonl"),
    batch_size=150,
    behaviour="jailbreaks",
    add_prompt=True,
    save_increment=-1,
    max_new_tokens=150,
)
process_file_outputs_only(model, tokenizer, **prompted_run)

# Variant restricted to a single prompt polarity (kept for reference):
# model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")
# process_file_outputs_only(
#     model,
#     tokenizer,
#     dataset_path= str(data.jailbreaks / "prompts.jsonl"),
#     output_file= str(data.ultrachat_brazil / "llama_3b_prompted_outputs1.jsonl"),
#     batch_size=150,
#     behaviour="jailbreaks",
#     add_prompt=True,
#     prompt_type="positive", # You can further specify if you want to only use positive or negative prompts
#     save_increment=-1,
#     max_new_tokens=150,
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Error loading data from /workspace/LASR-probe-gen/data/jailbreaks/prompts.jsonl: list index out of range
No data loaded from /workspace/LASR-probe-gen/data/jailbreaks/prompts.jsonl


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Error loading data from /workspace/LASR-probe-gen/data/jailbreaks/prompts.jsonl: list index out of range
No data loaded from /workspace/LASR-probe-gen/data/jailbreaks/prompts.jsonl


### Label and balance dataset

- Uses GPT-4o API to label refusal behaviour
- Takes 4 minutes to do 10k samples
- Hardware requirements: None
- Make sure you have done 'export OPENAI_API_KEY=<key>'

```uv run scripts/get_labels.py --behaviour refusal --out-path data/refusal/llama_3b_raw.jsonl --in-path data/refusal/llama_3b_outputs.jsonl --num-balanced 5000```

CODE CANNOT BE RUN IN NOTEBOOK BECAUSE OF ASYNC, RUN COMMAND OR A NEW SCRIPT

In [None]:
# from probe_gen.annotation.interface_dataset import Dataset, LabelledDataset
# from probe_gen.annotation.label_dataset import label_and_save_dataset
# from probe_gen.annotation.refusal_behaviour import SYSTEM_PROMPT_REFUSE

# try:
#     dataset = LabelledDataset.load_from("../data/refusal/llama_3b_outputs.jsonl")
# except Exception:
#     dataset = Dataset.load_from("../data/refusal/llama_3b_outputs.jsonl")
# label_and_save_dataset(
#     dataset=dataset,
#     dataset_path="../data/refusal/llama_3b_raw.jsonl",
#     system_prompt=SYSTEM_PROMPT_REFUSE,
#     do_subsample=True,
# )

In [None]:
# Further downsample an already-balanced dataset: keep at most `amount` examples
# per class (positive / negative), shuffle them, and write a new JSONL file.
import json
import random

from probe_gen.paths import data

amount = 1000  # maximum number of examples kept per class
SHUFFLE_SEED = 0  # fixed seed so re-running the cell yields the same ordering

# NOTE(review): input is read from data.science_brazil but the output is written
# under data.refusal — confirm both locations are the intended ones.
input_path = str(data.science_brazil / 'llama_3b_prompted_5k_balanced.jsonl')
output_path = str(data.refusal / 'llama_3b_balanced_1k.jsonl')

# Load into `records`, NOT `data`: rebinding `data` would shadow the
# probe_gen.paths object and break every later `data.<dir>` lookup.
with open(input_path, 'r') as f:
    records = [json.loads(line) for line in f if line.strip()]  # skip blank lines

positive_data = [item for item in records if item["labels"] == "positive"][:amount]
negative_data = [item for item in records if item["labels"] == "negative"][:amount]
print("positive data: ", len(positive_data))
print("negative data: ", len(negative_data))

balanced_data = positive_data + negative_data
random.Random(SHUFFLE_SEED).shuffle(balanced_data)

with open(output_path, 'w') as f:
    for item in balanced_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# Create train and test splits from a labelled JSONL file:
# take up to `amount` examples per class, shuffle, then split by `train_ratio`.

import json
import random
from pathlib import Path
from probe_gen.paths import data

# You can adjust this as needed
amount = 2500  # number of examples per class (positive/negative)
train_ratio = 0.8
SPLIT_SEED = 0  # fixed seed so the train/test membership is reproducible

# Paths (adjust if needed)
input_path = str(data.science_brazil / 'qwen_3b_balanced.jsonl')
train_output_path = str(data.science_brazil / 'qwen_3b_balanced_4k_train.jsonl')
test_output_path = str(data.science_brazil / 'qwen_3b_balanced_1k_test.jsonl')

# Load all records. Deliberately NOT named `data`: rebinding `data` to a list
# would shadow the probe_gen.paths object used by later cells (`data.<dir>`).
with open(input_path, 'r') as f:
    records = [json.loads(line) for line in f if line.strip()]  # skip blank lines

# Get balanced dataset (first `amount` examples of each label, in file order)
positive_data = [item for item in records if item["labels"] == "positive"][:amount]
negative_data = [item for item in records if item["labels"] == "negative"][:amount]

print("Positive examples:", len(positive_data))
print("Negative examples:", len(negative_data))

# Combine and shuffle with a dedicated, seeded generator
balanced_data = positive_data + negative_data
random.Random(SPLIT_SEED).shuffle(balanced_data)

# Split into train/test; the split is by position after shuffling, so the two
# classes are only approximately balanced within each split.
split_idx = int(len(balanced_data) * train_ratio)
train_data = balanced_data[:split_idx]
test_data = balanced_data[split_idx:]

print("Train set size:", len(train_data))
print("Test set size:", len(test_data))

# Write to output files, one JSON object per line
with open(train_output_path, 'w') as f:
    for item in train_data:
        f.write(json.dumps(item) + "\n")

with open(test_output_path, 'w') as f:
    for item in test_data:
        f.write(json.dumps(item) + "\n")


Positive examples: 2500
Negative examples: 2500
Train set size: 4000
Test set size: 1000


### Get outputs and labels in one go

- You can get outputs and label them together with the following command:

```uv run scripts/get_outputs_and_labels.py --behaviour refusal --data data/refusal/claude_outputs.jsonl --out-path data/refusal/llama_3b_raw.jsonl --batch-size 200 --num-balanced 5000```

```uv run scripts/get_outputs_and_labels.py --behaviour refusal --data data/refusal/claude_outputs.jsonl --out-path data/refusal/llama_3b_raw.jsonl --batch-size 200 --add-prompt yes --num-balanced 5000```

### Get activations dataset

- Uses LLM (Llama-3.2-3B-Instruct default) to get activations for datasets
- Takes 10 minutes to generate output activations for 5k samples with 200 batch size
- Hardware requirements: high GPU, super high (150 GB) RAM, super high (150 GB) Disk
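- Make sure `HF_TOKEN` is set (e.g. run `export HF_TOKEN=<key>` before launching Jupyter). If you paste the token into the notebook instead, remove it before committing: a token must never be pushed to git.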

```uv run scripts/get_activations.py --model llama_3b --data data/refusal/llama_3b_balanced_5k.jsonl --batch-size 1 --layers "0,3,6,9,12,15,18,21,24,27" --save-increment -1```

Click the top file and then shift+click the bottom file for easy deleting of pkl files once uploaded to HF

In [None]:
# Standard library imports
from probe_gen.gen_data.utils import get_model, process_file, process_file_multiturn

model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")
# process_file_multiturn(
process_file(
    model,
    tokenizer,
    dataset_path= str(data.sycophancy_short / "sycophancy_short_qwen_3b_4k.jsonl"),
    output_file= str(data.sycophancy_short / "sycophancy_short_qwen_3b_4k.pkl"),
    batch_size=32,
    sample=0,
    layers_str="0,3,6,9,12,15,18,21,24,27",
    save_increment=-1,
)

### Upload activations dataset

- Upload activations to hugging face repo

In [3]:
import os
from huggingface_hub import HfApi

# One dataset repo per behaviour; change REPO_NAME when switching behaviour.
REPO_NAME = "lasrprobegen/sandbagging-deception-activations"
# Read the token from the environment; raises KeyError if HF_TOKEN is not set.
HF_TOKEN = os.environ["HF_TOKEN"]

api = HfApi()
print("Creating repository...")

# exist_ok=True makes the call safe to re-run when the repo already exists.
repo_settings = dict(
    repo_id=REPO_NAME,
    repo_type="dataset",
    token=HF_TOKEN,
    private=False,
    exist_ok=True,
)
# Left as the final expression so the notebook displays the returned RepoUrl.
api.create_repo(**repo_settings)

Creating repository...


RepoUrl('https://huggingface.co/datasets/lasrprobegen/sandbagging-deception-activations', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lasrprobegen/sandbagging-deception-activations')

In [6]:
from probe_gen.paths import data

# Upload a single file (here a JSONL dataset) to the Hugging Face dataset repo.
# Requires `api`, `REPO_NAME` and `HF_TOKEN` from the repo-creation cell.
FILE_NAME = "mistral_7b_balanced_3.5k.jsonl"
FILE_PATH = str(data.sandbagging / FILE_NAME)
# Use the known file name as the in-repo path instead of splitting FILE_PATH on
# "/", which would return the whole path on platforms using "\" separators.
PATH_IN_REPO = FILE_NAME
api.upload_file(
    path_or_fileobj=FILE_PATH,
    path_in_repo=PATH_IN_REPO,
    repo_id=REPO_NAME,
    repo_type="dataset",
    token=HF_TOKEN,
)
print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ging/mistral_7b_balanced_3.5k.jsonl:  98%|#########8| 32.1MB / 32.8MB            

✅ File uploaded to: https://huggingface.co/datasets/lasrprobegen/sandbagging-deception-activations


In [None]:
# Upload one activation pickle per layer to the Hugging Face dataset repo.
# Requires `api`, `REPO_NAME` and `HF_TOKEN` from the repo-creation cell.
# Layers 0, 3, ..., 27 — the same set as the "0,3,6,...,27" string passed to
# activation extraction.
LAYERS = range(0, 28, 3)
for i in LAYERS:
    # Build the file name first and reuse it as the in-repo path, rather than
    # recovering it by splitting the full path on "/" (not portable).
    PATH_IN_REPO = f"mistral_7b_balanced_3.5k_layer_{i}.pkl"
    FILE_PATH = str(data.deception_rp / PATH_IN_REPO)
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        repo_id=REPO_NAME,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

In [None]:
# Upload several JSONL files sharing a common name prefix to the Hugging Face
# dataset repo. Requires `api`, `REPO_NAME` and `HF_TOKEN` from the repo-creation cell.
FILE_NAME_PREFIX = "llama_3b"
for SUFFIX in ["balanced_5k", "balanced_20k", "raw_20k", "outputs_20k"]:
    # Build the file name first and reuse it as the in-repo path, rather than
    # concatenating path strings and splitting on "/" afterwards (not portable).
    PATH_IN_REPO = f"{FILE_NAME_PREFIX}_{SUFFIX}.jsonl"
    FILE_PATH = str(data.refusal / PATH_IN_REPO)
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        repo_id=REPO_NAME,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")