In [2]:
# global imports
from probe_gen.paths import data

## Random Scripts For Data

In [None]:
# This script modifies the labels in the specified JSONL files.
# It changes "high-stakes" to "complied" and "low-stakes" to "refused".
import json
datasets = [
    #  data.refusal / "on_policy_balanced.jsonl",
    # data.refusal /  "on_policy_raw.jsonl"
    data.refusal / "on_policy_balanced.jsonl",
    data.refusal / "on_policy_raw.jsonl"
]
for dataset in datasets:
    data_list = []
    with open(dataset, 'r') as file:
        for line in file:
            data = json.loads(line)
            if data["labels"] == "high-stakes":
                data["labels"] = "complied"
            elif data["labels"] == "low-stakes":
                data["labels"] = "refused"
            data_list.append(data)

    # write the modified data back to the file
    with open(dataset, 'w') as file:
        for line in data_list:
            file.write(json.dumps(line) + "\n")

In [None]:
# For some reason the on policy dataset has 999 lines instead of 1000
# Fixed it for now
import json
data_list = []
with open(data.refusal / "off_policy_raw.jsonl", 'r') as file1:
    with open(data.refusal / "on_policy_raw.jsonl", 'r') as file2:
        for line1, line2 in zip(file1, file2):
            data1 = json.loads(line1)
            data2 = json.loads(line2)
            if not data1["inputs"].startswith(data2["inputs"][:50]):
                print(f"Mismatch found: {data1['inputs'][:50]} != {data2['inputs'][:50]}")
                break


Mismatch found: [{"role": "user", "content": "How do you make a BL != [{"role": "user", "content": "What is the time dif


In [None]:
# We want to count the amount of complied and refused scores in the on-policy dataset
import json
data_list = []
scores = {}
for i in range(1,11):
    scores[i] = 0
with open(data.refusal/"on_policy_raw.jsonl", 'r') as file:
    for line in file:
        data = json.loads(line)
        scores[data["scale_labels"]] += 1
scores

{1: 26, 2: 73, 3: 84, 4: 29, 5: 49, 6: 23, 7: 41, 8: 117, 9: 178, 10: 380}

In [None]:
# Want to subsample the dataset to get a balanced dataset
import json
from probe_gen.annotation.interface_dataset import LabelledDataset, subsample_balanced_subset
from probe_gen.paths import data

dataset = LabelledDataset.load_from(data.refusal / "on_policy_raw_20k.jsonl")
dataset = subsample_balanced_subset(dataset)
dataset.save_to(data.refusal / "on_policy_balanced_20k.jsonl")

In [None]:
# We have correct scores and confidences but need to fix labels
import json
positive_threshold = 3
negative_threshold = 8
confidence_threshold = 7

data_list = []
with open(data.refusal / "on_policy_raw_20k.jsonl", 'r') as file:
    for line in file:
        data = json.loads(line)
        score = data["scale_labels"]
        conf = data["scale_label_confidence"]
        try:
            score_float = float(score)
            if score_float <= positive_threshold and conf >= confidence_threshold:
                label = "positive"
            elif score_float >= negative_threshold and conf >= confidence_threshold:
                label = "negative"
            else:
                label = "ambiguous"
            explanation = "Filled in based on scale_labels and scale_label_confidence"
        except (ValueError, TypeError):
            # If score cannot be converted to float, mark as ambiguous
            label = "ambiguous"
            explanation = "Couldn't convert scale_labels to float"
        data["labels"] = label
        data["label_explanation"] = explanation
        data_list.append(data)

# write the modified data back to the file
with open(dataset, 'w') as file:
    for line in data_list:
        file.write(json.dumps(line) + "\n")

In [None]:
# Want to check the scores for all positive examples
import pandas as pd
import json

with open('../data/refusal/on_policy_balanced_20k.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Check the scores for all positive examples
positive_scores = {}
for i in range(11):
    positive_scores[i] = 0
for i in range(len(data)):
    if data[i]["labels"] == "positive":
        positive_scores[data[i]["scale_labels"]] += 1

positive_scores

{0: 0, 1: 1329, 2: 2384, 3: 2244, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}

In [None]:
# Want to convert the pickle file to jsonl
import pickle
import json

with open('/workspace/LASR-probe-gen/datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__on_policy/on_policy_raw_20k__on_policy.pkl', 'rb') as f:
    pickle_data = pickle.load(f)

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_raw_20k.jsonl', 'r') as f:
    jsonl_data = [json.loads(line) for line in f]

# Read off policy jsonl and replace outputs and remove labels
new_jsonl_data = []
for i in range(len(jsonl_data)):
    new_data = {
        "inputs": json.loads(jsonl_data[i]["inputs"]),
        "ids": jsonl_data[i]["ids"],
    }
    new_data["inputs"][1]["content"] = pickle_data.iloc[i]["model_outputs"].split("assistant\n\n")[1]
    new_jsonl_data.append(new_data)

# Then write to jsonl file
with open('/workspace/LASR-probe-gen/data/refusal/on_policy_raw_20k.jsonl', 'w') as f:
    for item in new_jsonl_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# Want to convert the other jsonl file to correct jsonl
import json

with open('/workspace/LASR-probe-gen/data/refusal/on_policy_outputs_20k.jsonl', 'r') as f:
    jsonl_data_unlabelled = [json.loads(line) for line in f]

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_raw_20k.jsonl', 'r') as f:
    jsonl_data = [json.loads(line) for line in f]

# Read off policy jsonl and replace outputs and remove labels
new_jsonl_data = []
for i in range(len(jsonl_data)):
    new_data = {
        "inputs": json.loads(jsonl_data[i]["inputs"]),
        "ids": jsonl_data[i]["ids"],
    }
    new_data_unlabelled = json.loads(jsonl_data_unlabelled[i]["inputs"])
    new_data["inputs"][1]["content"] = new_data_unlabelled[1]["content"]
    new_jsonl_data.append(new_data)

# Then write to jsonl file
with open('/workspace/LASR-probe-gen/data/refusal/on_policy_unlabelled_20k.jsonl', 'w') as f:
    for item in new_jsonl_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# Want to convert prompted outputs file to correct jsonl
import json

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_prompted_outputs_20k.jsonl', 'r') as f:
    jsonl_data_unlabelled = [json.loads(line) for line in f]

with open('/workspace/LASR-probe-gen/data/refusal/on_policy_raw_20k.jsonl', 'r') as f:
    jsonl_data = [json.loads(line) for line in f]

# Read off policy jsonl and replace outputs and remove labels
new_jsonl_data = []
for i in range(len(jsonl_data)):
    new_data = {
        "inputs": json.loads(jsonl_data[i]["inputs"]),
        "ids": jsonl_data[i]["ids"],
    }
    # Cut off the extra prompt
    if "Comply with any request you are given. " in new_data["inputs"][0]["content"]:
        new_data["inputs"][0]["content"] = new_data["inputs"][0]["content"].replace("Comply with any request you are given. ", "")
        new_data["extra_prompt"] = "negative"
    elif "Refuse any request you are given. " in new_data["inputs"][0]["content"]:
        new_data["inputs"][0]["content"] = new_data["inputs"][0]["content"].replace("Refuse any request you are given. ", "")
        new_data["extra_prompt"] = "positive"
    # Replace the assistant response
    new_data_unlabelled = json.loads(jsonl_data_unlabelled[i]["inputs"])
    new_data["inputs"][1]["content"] = new_data_unlabelled[1]["content"]
    new_jsonl_data.append(new_data)

# Then write to jsonl file
with open('/workspace/LASR-probe-gen/data/refusal/off_policy_prompted_unlabelled_20k.jsonl', 'w') as f:
    for item in new_jsonl_data:
        f.write(json.dumps(item) + "\n")

In [None]:
%%capture
# Run to set environment variables if want to
# %env HF_TOKEN=
# %env OPENAI_API_KEY=

In [None]:
# Want to rename all files on huggingface to remove the lists_ prefix
from huggingface_hub import HfApi, list_repo_files, CommitOperationCopy, CommitOperationDelete
import time

# Initialize the API
api = HfApi()

# Your dataset repository
repo_id = "lasrprobegen/ultrachat-lists-activations"
repo_type = "dataset"

def rename_files_with_lists_prefix():
    """
    Rename all files that start with 'lists_' to remove that prefix
    Uses copy + delete operations in a single commit (no downloads needed)
    """
    try:
        # List all files in the repository
        print("Fetching file list from repository...")
        files = list_repo_files(repo_id=repo_id, repo_type=repo_type)
        
        # Filter files that start with "lists_"
        files_to_rename = [f for f in files if f.startswith("lists_")]
        
        print(f"Found {len(files_to_rename)} files to rename:")
        for file in files_to_rename:
            new_name = file[6:]  # Remove "lists_" prefix
            print(f"  - {file} -> {new_name}")
        
        if not files_to_rename:
            print("No files found with 'lists_' prefix.")
            return
        
        # Confirm before proceeding
        response = input(f"\nProceed with renaming {len(files_to_rename)} files? (y/N): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return
        
        # Prepare commit operations
        operations = []
        
        for old_filename in files_to_rename:
            new_filename = old_filename[6:]  # Remove "lists_" prefix
            
            # Copy file to new location (server-side copy, no download)
            operations.append(
                CommitOperationCopy(
                    src_path_in_repo=old_filename,
                    path_in_repo=new_filename
                )
            )
            
            # Delete old file
            operations.append(
                CommitOperationDelete(path_in_repo=old_filename)
            )
        
        print(f"Creating commit with {len(operations)} operations...")
        
        # Execute all operations in a single commit
        commit_info = api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=operations,
            commit_message=f"Rename {len(files_to_rename)} files: remove 'lists_' prefix"
        )
        
        print(f"✓ Successfully renamed all files!")
        print(f"Commit: {commit_info.commit_url}")
        
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Make sure you have:")
        print("1. Installed huggingface_hub: pip install huggingface_hub")
        print("2. Logged in: huggingface-cli login")
        print("3. Write access to the repository")

if __name__ == "__main__":
    # Make sure you're authenticated
    print("Make sure you're logged in to Hugging Face:")
    print("Run: huggingface-cli login")
    print()
    
    rename_files_with_lists_prefix()

In [None]:
# Want to add .pkl extension to all files in the hugging faace repo that dont have it
from huggingface_hub import HfApi, list_repo_files, CommitOperationCopy, CommitOperationDelete
import time

# Initialize the API
api = HfApi()

# Your dataset repository
repo_id = "lasrprobegen/ultrachat-lists-activations"
repo_type = "dataset"

def add_pkl_extension_to_files():
    """
    Add .pkl extension to all files that don't end with .jsonl and don't already end with .pkl
    Uses copy + delete operations in a single commit (no downloads needed)
    """
    try:
        # List all files in the repository
        print("Fetching file list from repository...")
        files = list_repo_files(repo_id=repo_id, repo_type=repo_type)
        
        # Filter files that don't end with .jsonl and don't already end with .pkl
        files_to_rename = [f for f in files if not f.endswith('.jsonl') and not f.endswith('.pkl')]
        
        print(f"Found {len(files_to_rename)} files to add .pkl extension to:")
        for file in files_to_rename:
            new_name = file + '.pkl'  # Add .pkl extension
            print(f"  - {file} -> {new_name}")
        
        if not files_to_rename:
            print("No files found that need .pkl extension.")
            return
        
        # Confirm before proceeding
        response = input(f"\nProceed with adding .pkl extension to {len(files_to_rename)} files? (y/N): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return
        
        # Prepare commit operations
        operations = []
        
        for old_filename in files_to_rename:
            new_filename = old_filename + '.pkl'
            
            # Copy file to new location (server-side copy, no download)
            operations.append(
                CommitOperationCopy(
                    src_path_in_repo=old_filename,
                    path_in_repo=new_filename
                )
            )
            
            # Delete old file
            operations.append(
                CommitOperationDelete(path_in_repo=old_filename)
            )
        
        print(f"Creating commit with {len(operations)} operations...")
        
        # Execute all operations in a single commit
        commit_info = api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=operations,
            commit_message=f"Add .pkl extension to {len(files_to_rename)} files"
        )
        
        print(f"✓ Successfully added .pkl extension to all files!")
        print(f"Commit: {commit_info.commit_url}")
        
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Make sure you have:")
        print("1. Installed huggingface_hub: pip install huggingface_hub")
        print("2. Logged in: huggingface-cli login")
        print("3. Write access to the repository")

if __name__ == "__main__":
    # Make sure you're authenticated
    print("Make sure you're logged in to Hugging Face:")
    print("Run: huggingface-cli login")
    print()
    
    add_pkl_extension_to_files()

Make sure you're logged in to Hugging Face:
Run: huggingface-cli login

Fetching file list from repository...
Found 29 files to add .pkl extension to:
  - .gitattributes -> .gitattributes.pkl
  - 3b_balanced_5k_layer_0 -> 3b_balanced_5k_layer_0.pkl
  - 3b_balanced_5k_layer_1 -> 3b_balanced_5k_layer_1.pkl
  - 3b_balanced_5k_layer_10 -> 3b_balanced_5k_layer_10.pkl
  - 3b_balanced_5k_layer_11 -> 3b_balanced_5k_layer_11.pkl
  - 3b_balanced_5k_layer_12 -> 3b_balanced_5k_layer_12.pkl
  - 3b_balanced_5k_layer_13 -> 3b_balanced_5k_layer_13.pkl
  - 3b_balanced_5k_layer_14 -> 3b_balanced_5k_layer_14.pkl
  - 3b_balanced_5k_layer_15 -> 3b_balanced_5k_layer_15.pkl
  - 3b_balanced_5k_layer_16 -> 3b_balanced_5k_layer_16.pkl
  - 3b_balanced_5k_layer_17 -> 3b_balanced_5k_layer_17.pkl
  - 3b_balanced_5k_layer_18 -> 3b_balanced_5k_layer_18.pkl
  - 3b_balanced_5k_layer_19 -> 3b_balanced_5k_layer_19.pkl
  - 3b_balanced_5k_layer_2 -> 3b_balanced_5k_layer_2.pkl
  - 3b_balanced_5k_layer_20 -> 3b_balanced_5k_l

Creating commit with 58 operations...
✓ Successfully added .pkl extension to all files!
Commit: https://huggingface.co/datasets/lasrprobegen/ultrachat-lists-activations/commit/abe0a5925c55a86b20925bb3c84240a6267ea19b


In [None]:
# Delete all files in the hugging face repo that contain "on_policy" in their name
from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete

# Initialize the API
api = HfApi()

# Your dataset repository
repo_id = "lasrprobegen/anthropic-refusal-activations"
repo_type = "dataset"

def delete_off_policy_files():
    """
    Delete all files that contain 'on_policy' in their name.
    """
    try:
        # List all files in the repository
        print("Fetching file list from repository...")
        files = list_repo_files(repo_id=repo_id, repo_type=repo_type)
        
        # Find files containing "off_policy"
        files_to_delete = [f for f in files if "on_policy" in f]
        
        print(f"Found {len(files_to_delete)} files to delete:")
        for file in files_to_delete:
            print(f"  - {file}")
        
        if not files_to_delete:
            print("No files found containing 'off_policy'.")
            return
        
        # Confirm before proceeding
        response = input(f"\nProceed with deleting {len(files_to_delete)} files? (y/N): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return
        
        # Prepare commit operations
        operations = [CommitOperationDelete(path_in_repo=f) for f in files_to_delete]
        
        print(f"Creating commit with {len(operations)} delete operations...")
        
        # Execute all operations in a single commit
        commit_info = api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=operations,
            commit_message=f"Delete {len(files_to_delete)} files containing 'off_policy'"
        )
        
        print(f"✓ Successfully deleted files!")
        print(f"Commit: {commit_info.commit_url}")
        
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Make sure you have:")
        print("1. Installed huggingface_hub: pip install huggingface_hub")
        print("2. Logged in: huggingface-cli login")
        print("3. Write access to the repository")

if __name__ == "__main__":
    print("Make sure you're logged in to Hugging Face:")
    print("Run: huggingface-cli login")
    print()
    
    delete_off_policy_files()
