# HuggingFace Scripts

In [9]:
# global imports
from probe_gen.paths import data

In [None]:
%%capture
# Optional: run this cell to set environment variables (e.g. your Hugging Face token)
# %env HF_TOKEN=

In [None]:
# Optional: download selected .jsonl files for one behaviour from the Hugging Face Hub.
import os
from huggingface_hub import hf_hub_download

behavior = "refusal"
datasource = "rlhf"
repo_id = f"lasrprobegen/{behavior}-activations"
# Each file is listed exactly once (a duplicated entry would only repeat the
# same request). Commented-out entries are kept for quick toggling.
files_to_download = [
    "llama_3b_incentivised_test.jsonl",
    "llama_3b_incentivised_train.jsonl",
    "llama_3b_on_policy_test.jsonl",
    "llama_3b_on_policy_train.jsonl",
    "llama_3b_prompted_test.jsonl",
    "llama_3b_prompted_train.jsonl",
    # "qwen_3b_on_policy_train.jsonl",
    # "qwen_3b_on_policy_test.jsonl",
    "ministral_8b_on_policy_train.jsonl",
    "ministral_8b_on_policy_test.jsonl",
    # "qwen_7b_on_policy_train.jsonl",
    # "qwen_7b_on_policy_test.jsonl",
]
# `data` comes from the global-imports cell (probe_gen.paths).
local_dir = str(data.data / behavior)
os.makedirs(local_dir, exist_ok=True)
for filename in files_to_download:
    # Files live under "<datasource>/" inside the repo.
    file_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=f"{datasource}/{filename}",
        local_dir=local_dir,
        # None when HF_TOKEN is not set in the environment.
        token=os.environ.get("HF_TOKEN"),
    )
    print(f"Downloaded to: {file_path}")

jailbreaks/ministral_8b_on_policy_train.(…):   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Downloaded to: /workspace/LASR-probe-gen/data/refusal/jailbreaks/ministral_8b_on_policy_train.jsonl


In [2]:
# Rename files in a Hugging Face repo by replacing a substring of their paths
from huggingface_hub import HfApi, list_repo_files, CommitOperationCopy, CommitOperationDelete

def rename_files_in_hf_dataset(repo_id, old_pattern, new_pattern, repo_type="dataset", confirm=True):
    """
    Rename files in a Hugging Face repo by substring replacement, in one commit.

    Every repo path containing ``old_pattern`` is copied to
    ``path.replace(old_pattern, new_pattern)`` and the original path is deleted.
    The replacement is applied to the full repo path, so matching folder names
    are rewritten as well.

    Args:
        repo_id (str): The Hugging Face repo ID.
        old_pattern (str): Substring to look for; must be non-empty.
        new_pattern (str): Replacement text ("" strips the substring).
        repo_type (str): Repo type passed to the Hub API.
        confirm (bool): Whether to list the renames and ask before committing.

    Returns:
        bool: True if the rename commit was created; False if the arguments are
        unusable, nothing matched, targets would collide, the user declined,
        or the Hub call raised.
    """
    # An empty pattern is contained in every path, and str.replace would then
    # insert new_pattern between all characters of every file name.
    if not old_pattern:
        print("Error: old_pattern must be a non-empty string.")
        return False
    # Identical patterns would copy each file onto itself and then delete it.
    if old_pattern == new_pattern:
        print("Nothing to do: old_pattern and new_pattern are identical.")
        return False

    api = HfApi()

    try:
        files = list_repo_files(repo_id=repo_id, repo_type=repo_type)
        renames = [
            (path, path.replace(old_pattern, new_pattern))
            for path in files
            if old_pattern in path
        ]

        if not renames:
            print(f"No files containing '{old_pattern}' found in '{repo_id}'.")
            return False

        # Refuse to overwrite: a target that already exists in the repo, or two
        # sources that collapse onto the same target, would silently lose data.
        existing = set(files)
        seen_targets = set()
        collisions = []
        for _, new_name in renames:
            if new_name in existing or new_name in seen_targets:
                collisions.append(new_name)
            seen_targets.add(new_name)
        if collisions:
            print(f"Error: {len(collisions)} target path(s) would be overwritten:")
            for new_name in collisions:
                print(f"  - {new_name}")
            return False

        if confirm:
            print(f"Found {len(renames)} files to rename:")
            for old_name, new_name in renames:
                print(f"  - {old_name} -> {new_name}")

            response = input(f"\nProceed with renaming {len(renames)} files? (y/N): ")
            if response.lower() != 'y':
                print("Operation cancelled.")
                return False

        # A rename is expressed as copy-to-new-path followed by delete-old-path.
        operations = []
        for old_name, new_name in renames:
            operations.append(CommitOperationCopy(src_path_in_repo=old_name, path_in_repo=new_name))
            operations.append(CommitOperationDelete(path_in_repo=old_name))

        commit_info = api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=operations,
            commit_message=f"Rename {len(renames)} files: replace '{old_pattern}' with '{new_pattern}'"
        )

        print(f"\n✓ Successfully renamed {len(renames)} files!")
        print(f"Commit: {commit_info.commit_url}")
        return True

    except Exception as e:
        # Best-effort helper: report the failure and signal it via the return value.
        print(f"Error: {str(e)}")
        return False

In [3]:
from huggingface_hub import HfApi, list_repo_files, CommitOperationCopy, CommitOperationDelete

# Initialize the API
api = HfApi()

def move_all_to_folder(
    target_folder: str,
    repo_id: str,
    confirm: bool = True,
    match: str | None = None,
    source_folder: str = ""
):
    """
    Move files from a Hugging Face repo into a subfolder, optionally filtering.

    Args:
        target_folder (str): The new folder (created inside source_folder).
        repo_id (str): The Hugging Face repo ID.
        confirm (bool): Whether to ask for confirmation before committing.
        match (str | None): If provided, only move files whose names contain this substring.
        source_folder (str): Only consider files directly inside this folder (non-recursive).
                             "" means repo root.
    """
    repo_type = "dataset"
    if source_folder and not source_folder.endswith("/"):
        source_folder += "/"
    if target_folder and not target_folder.endswith("/"):
        target_folder += "/"

    files = list_repo_files(repo_id=repo_id, repo_type=repo_type)

    # Restrict to immediate files inside source_folder (non-recursive)
    files_to_move = [
        f for f in files
        if f.startswith(source_folder) and "/" not in f[len(source_folder):]
    ]

    # Filter by match string if provided
    if match is not None:
        files_to_move = [f for f in files_to_move if match in f]

    if not files_to_move:
        print(f"No files to move (source='{source_folder}', match='{match}', target='{target_folder}').")
        return

    print(f"Found {len(files_to_move)} files to move:")
    for old_filename in files_to_move:
        # Insert target folder *inside* source_folder
        new_filename = f"{source_folder}{target_folder}{old_filename[len(source_folder):]}"
        print(f"  - {old_filename} -> {new_filename}")

    if confirm:
        response = input(f"\nProceed with moving {len(files_to_move)} files into '{source_folder}{target_folder}'? (y/N): ")
        if response.lower() != "y":
            print("Operation cancelled.")
            return

    operations = []
    for old_filename in files_to_move:
        new_filename = f"{source_folder}{target_folder}{old_filename[len(source_folder):]}"
        operations.append(CommitOperationCopy(src_path_in_repo=old_filename, path_in_repo=new_filename))
        operations.append(CommitOperationDelete(path_in_repo=old_filename))

    commit_info = api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        operations=operations,
        commit_message=f"Move {len(files_to_move)} files from '{source_folder}' into '{source_folder}{target_folder}'"
    )

    print(f"\n✓ Successfully moved {len(files_to_move)} files into '{source_folder}{target_folder}'!")
    print(f"Commit: {commit_info.commit_url}")


In [26]:
# Reorganise the "story" files of the repo held in `repo_id`
# (`repo_id` is not assigned in this cell; an earlier-run cell must have set it).
# 1) Move root-level files containing "story" into tinystories/.
move_all_to_folder("tinystories", repo_id, match="story")
# 2) Strip "_story" from every repo path that contains it.
success = rename_files_in_hf_dataset(repo_id, "_story", "")
# 3) Move the .pkl files directly under tinystories/ into tinystories/llama_3b/.
move_all_to_folder("llama_3b", repo_id, source_folder="tinystories", match=".pkl")

Found 22 files to move:
  - llama_3b_story_balanced_1k.jsonl -> tinystories/llama_3b_story_balanced_1k.jsonl
  - llama_3b_story_balanced_1k_layer_0.pkl -> tinystories/llama_3b_story_balanced_1k_layer_0.pkl
  - llama_3b_story_balanced_1k_layer_12.pkl -> tinystories/llama_3b_story_balanced_1k_layer_12.pkl
  - llama_3b_story_balanced_1k_layer_15.pkl -> tinystories/llama_3b_story_balanced_1k_layer_15.pkl
  - llama_3b_story_balanced_1k_layer_18.pkl -> tinystories/llama_3b_story_balanced_1k_layer_18.pkl
  - llama_3b_story_balanced_1k_layer_21.pkl -> tinystories/llama_3b_story_balanced_1k_layer_21.pkl
  - llama_3b_story_balanced_1k_layer_24.pkl -> tinystories/llama_3b_story_balanced_1k_layer_24.pkl
  - llama_3b_story_balanced_1k_layer_27.pkl -> tinystories/llama_3b_story_balanced_1k_layer_27.pkl
  - llama_3b_story_balanced_1k_layer_3.pkl -> tinystories/llama_3b_story_balanced_1k_layer_3.pkl
  - llama_3b_story_balanced_1k_layer_6.pkl -> tinystories/llama_3b_story_balanced_1k_layer_6.pkl
  - ll

In [None]:
# Reorganise the "brazil" files of the repo held in `repo_id`
# (`repo_id` is not assigned in this cell; an earlier-run cell must have set it).
# 1) Move root-level files containing "brazil" into brazilian/.
move_all_to_folder("brazilian", repo_id, match="brazil")
# 2) Strip "_brazil" from every repo path that contains it.
success = rename_files_in_hf_dataset(repo_id, "_brazil", "")
# 3) Move the .pkl files directly under brazilian/ into brazilian/llama_3b/.
move_all_to_folder("llama_3b", repo_id, source_folder="brazilian", match=".pkl")

Found 44 files to move:
  - llama_3b_brazil_1k_balanced.jsonl -> brazilian/llama_3b_brazil_1k_balanced.jsonl
  - llama_3b_brazil_1k_balanced_layer_0.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_0.pkl
  - llama_3b_brazil_1k_balanced_layer_12.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_12.pkl
  - llama_3b_brazil_1k_balanced_layer_15.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_15.pkl
  - llama_3b_brazil_1k_balanced_layer_18.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_18.pkl
  - llama_3b_brazil_1k_balanced_layer_21.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_21.pkl
  - llama_3b_brazil_1k_balanced_layer_24.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_24.pkl
  - llama_3b_brazil_1k_balanced_layer_27.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_27.pkl
  - llama_3b_brazil_1k_balanced_layer_3.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_3.pkl
  - llama_3b_brazil_1k_balanced_layer_6.pkl -> brazilian/llama_3b_brazil_1k_balanced_layer_6.pkl
  - ll


✓ Successfully moved 44 files into 'brazilian/'!
Commit: https://huggingface.co/datasets/lasrprobegen/metaphors-activations/commit/06c81e13dbe679dd8289f1c6606f1565fe8a113c
Found 44 files to rename:
  - brazilian/llama_3b_brazil_1k_balanced.jsonl -> brazilian/llama_3b_1k_balanced.jsonl
  - brazilian/llama_3b_brazil_1k_balanced_layer_0.pkl -> brazilian/llama_3b_1k_balanced_layer_0.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_12.pkl -> brazilian/llama_3b_1k_balanced_layer_12.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_15.pkl -> brazilian/llama_3b_1k_balanced_layer_15.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_18.pkl -> brazilian/llama_3b_1k_balanced_layer_18.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_21.pkl -> brazilian/llama_3b_1k_balanced_layer_21.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_24.pkl -> brazilian/llama_3b_1k_balanced_layer_24.pkl
  - brazilian/llama_3b_brazil_1k_balanced_layer_27.pkl -> brazilian/llama_3b_1k_balanced_layer_27.pkl
 

In [53]:
# The first two calls are disabled; only the last one runs. It moves the .pkl
# files directly under jailbreaks/ into jailbreaks/llama_3b/.
# `repo_id` is not assigned in this cell; an earlier-run cell must have set it.
# move_all_to_folder("brazilian", repo_id, match="brazil")
# success = rename_files_in_hf_dataset(repo_id, "_brazil", "")
move_all_to_folder("llama_3b", repo_id, source_folder="jailbreaks", match=".pkl")

Found 25 files to move:
  - jailbreaks/llama_3b_balanced_1800_layer_0.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_0.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_12.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_12.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_15.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_15.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_18.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_18.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_21.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_21.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_24.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_24.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_27.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_27.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_3.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_layer_3.pkl
  - jailbreaks/llama_3b_balanced_1800_layer_6.pkl -> jailbreaks/llama_3b/llama_3b_balanced_1800_laye


✓ Successfully moved 25 files into 'jailbreaks/llama_3b/'!
Commit: https://huggingface.co/datasets/lasrprobegen/refusal-activations/commit/92042c5d17c5d82a0ddd5caf34aa0f2e702cbe7a


In [47]:
# Reorganise the "shakespeare" files of the repo held in `repo_id`
# (`repo_id` is not assigned in this cell; an earlier-run cell must have set it).
# The two "ood" calls below are disabled.
# move_all_to_folder("shakespeare", repo_id, match="ood")
# success = rename_files_in_hf_dataset(repo_id, "_ood", "")
# 1) Move root-level files containing "shakespeare" into shakespeare/.
move_all_to_folder("shakespeare", repo_id, match="shakespeare")
# 2) Strip "_shakespeare" from every repo path that contains it.
success = rename_files_in_hf_dataset(repo_id, "_shakespeare", "")
# 3) Move the .pkl files directly under shakespeare/ into shakespeare/llama_3b/.
move_all_to_folder("llama_3b", repo_id, source_folder="shakespeare", match=".pkl")

Found 17 files to move:
  - llama_3b_shakespeare_15k.jsonl -> shakespeare/llama_3b_shakespeare_15k.jsonl
  - llama_3b_shakespeare_45k.jsonl -> shakespeare/llama_3b_shakespeare_45k.jsonl
  - llama_3b_shakespeare_balanced_1k.jsonl -> shakespeare/llama_3b_shakespeare_balanced_1k.jsonl
  - llama_3b_shakespeare_balanced_1k_layer_0.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_0.pkl
  - llama_3b_shakespeare_balanced_1k_layer_12.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_12.pkl
  - llama_3b_shakespeare_balanced_1k_layer_15.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_15.pkl
  - llama_3b_shakespeare_balanced_1k_layer_18.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_18.pkl
  - llama_3b_shakespeare_balanced_1k_layer_21.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_21.pkl
  - llama_3b_shakespeare_balanced_1k_layer_24.pkl -> shakespeare/llama_3b_shakespeare_balanced_1k_layer_24.pkl
  - llama_3b_shakespeare_balanced_1k_layer_27.pkl -> 

In [37]:
# Reorganise the "ood" files of the repo held in `repo_id`
# (`repo_id` is not assigned in this cell; an earlier-run cell must have set it).
# 1) Move root-level files containing "ood" into mmlu/.
move_all_to_folder("mmlu", repo_id, match="ood")
# 2) Strip "_ood" from every repo path that contains it.
success = rename_files_in_hf_dataset(repo_id, "_ood", "")
# 3) Move the .pkl files directly under mmlu/ into mmlu/llama_3b/.
move_all_to_folder("llama_3b", repo_id, source_folder="mmlu", match=".pkl")

Found 22 files to move:
  - llama_3b_incentivised_ood_1k_balanced.jsonl -> mmlu/llama_3b_incentivised_ood_1k_balanced.jsonl
  - llama_3b_incentivised_ood_1k_balanced_layer_0.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_0.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_12.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_12.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_15.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_15.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_18.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_18.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_21.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_21.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_24.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_24.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_27.pkl -> mmlu/llama_3b_incentivised_ood_1k_balanced_layer_27.pkl
  - llama_3b_incentivised_ood_1k_balanced_layer_3.pkl -> mmlu/llama_3b_incentivi

In [None]:
# `repo_id` is not assigned in this cell; an earlier-run cell must have set it.
# 1) Move every root-level file into jailbreaks/ (no `match` filter is applied).
move_all_to_folder("jailbreaks", repo_id, source_folder="")
# 2) Move all .pkl files directly under jailbreaks/ into jailbreaks/llama_3b/.
# NOTE(review): the filter is only ".pkl", so files of other models (the recorded
# output lists ministral_8b_* files) also end up in llama_3b/ — confirm this is intended.
move_all_to_folder("llama_3b", repo_id, source_folder="jailbreaks", match=".pkl")

Found 15 files to move:
  - jailbreaks/ministral_8b_on_policy_test_layer_24.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_test_layer_24.pkl
  - jailbreaks/ministral_8b_on_policy_test_layer_27.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_test_layer_27.pkl
  - jailbreaks/ministral_8b_on_policy_test_layer_3.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_test_layer_3.pkl
  - jailbreaks/ministral_8b_on_policy_test_layer_6.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_test_layer_6.pkl
  - jailbreaks/ministral_8b_on_policy_test_layer_9.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_test_layer_9.pkl
  - jailbreaks/ministral_8b_on_policy_train_layer_0.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_train_layer_0.pkl
  - jailbreaks/ministral_8b_on_policy_train_layer_12.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_train_layer_12.pkl
  - jailbreaks/ministral_8b_on_policy_train_layer_15.pkl -> jailbreaks/llama_3b/ministral_8b_on_policy_train_layer_15.pkl
  - jailbreaks/min


✓ Successfully moved 15 files into 'jailbreaks/llama_3b/'!
Commit: https://huggingface.co/datasets/lasrprobegen/refusal-activations/commit/d82bd56b623f36c502e64ae3883c22c02c7d6c0c


In [6]:
# Relabel files in the deception repo. Each call rewrites every repo path that
# contains the first pattern and asks for confirmation before committing.
# This cell also reassigns the `repo_id` global that other cells read.
repo_id = "lasrprobegen/deception-activations"
success = rename_files_in_hf_dataset(repo_id, "llama_3b_on_policy", "llama_3b_incentivised")
success = rename_files_in_hf_dataset(repo_id, "mistral_7b_on_policy", "mistral_7b_off_policy")

# Add the missing policy label to the "3.5k" files.
success = rename_files_in_hf_dataset(repo_id, "llama_3b_3.5k", "llama_3b_incentivised_3.5k")
success = rename_files_in_hf_dataset(repo_id, "deepseek_mixtral_3.5k", "deepseek_mixtral_off_policy_3.5k")

Found 16 files to rename:
  - roleplaying/llama_3b/llama_3b_3.5k_layer_0.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_0.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_12.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_12.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_15.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_15.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_18.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_18.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_21.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_21.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_24.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_24.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_27.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_27.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_3.pkl -> roleplaying/llama_3b/llama_3b_incentivised_3.5k_layer_3.pkl
  - roleplaying/llama_3b/llama_3b_3.5k_layer_6.pkl

In [7]:
# Relabel files in the sandbagging repo. Each call rewrites every repo path that
# contains the first pattern and asks for confirmation before committing.
# This cell also reassigns the `repo_id` global that other cells read.
repo_id = "lasrprobegen/sandbagging-activations"
success = rename_files_in_hf_dataset(repo_id, "llama_3b_on_policy", "llama_3b_incentivised")
success = rename_files_in_hf_dataset(repo_id, "ministral_8b_on_policy", "ministral_8b_off_policy")
success = rename_files_in_hf_dataset(repo_id, "mistral_7b_on_policy", "mistral_7b_off_policy")

Found 55 files to rename:
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_0.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_0.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_12.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_12.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_15.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_15.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_18.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_18.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_21.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_21.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_24.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_24.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_27.pkl -> multichoice/llama_3b/llama_3b_incentivised_test_layer_27.pkl
  - multichoice/llama_3b/llama_3b_on_policy_test_layer_3.pkl -> multichoice/llama_3b/llama_3b_

In [5]:
# Example usage: point the `repo_id` global at the refusal repo. Cells that pass
# `repo_id` without assigning it pick up whichever value was set last.
# The disabled calls below are rename examples (presumably earlier one-off
# clean-ups kept for reference — confirm before re-enabling any of them).
repo_id = "lasrprobegen/refusal-activations"
# success = rename_files_in_hf_dataset(repo_id, "_balanced", "")
# success = rename_files_in_hf_dataset(repo_id, "_5k", "_train")
# success = rename_files_in_hf_dataset(repo_id, "_4k", "_train")
# success = rename_files_in_hf_dataset(repo_id, "_3k", "_train")
# success = rename_files_in_hf_dataset(repo_id, "_1k", "_test")
# success = rename_files_in_hf_dataset(repo_id, "_500", "_test")
# success = rename_files_in_hf_dataset(repo_id, "_test_test", "_test")
# success = rename_files_in_hf_dataset(repo_id, "_train_train", "_train")
# success = rename_files_in_hf_dataset(repo_id, "b_t", "b_on_policy_t")
# success = rename_files_in_hf_dataset(repo_id, "ministral-8b", "ministral_8b")

In [44]:
from huggingface_hub import HfApi, list_repo_files, CommitOperationCopy, CommitOperationDelete

api = HfApi()

def rename_folder(repo_id: str, old_folder: str, new_folder: str, confirm: bool = True):
    """
    Rename a folder in a Hugging Face dataset repo.

    Every file whose path starts with ``old_folder`` (including files in nested
    subfolders) is copied to the same relative path under ``new_folder`` and the
    original is deleted, all in one commit made with the module-level ``api``.

    Args:
        repo_id (str): The Hugging Face repo ID.
        old_folder (str): The current folder name.
        new_folder (str): The new folder name.
        confirm (bool): Whether to ask before committing.

    Returns:
        None. Progress and the resulting commit URL are printed.
    """
    repo_type = "dataset"
    if not old_folder.endswith("/"):
        old_folder += "/"
    if not new_folder.endswith("/"):
        new_folder += "/"

    # Identical folders would map every file onto its own path, so the
    # copy + delete pair would act on one and the same file.
    if old_folder == new_folder:
        print(f"Nothing to do: '{old_folder}' and '{new_folder}' are the same folder.")
        return

    files = list_repo_files(repo_id=repo_id, repo_type=repo_type)

    # Plan every (old path, new path) pair once so the preview and the commit agree.
    renames = [
        (old_filename, new_folder + old_filename[len(old_folder):])
        for old_filename in files
        if old_filename.startswith(old_folder)
    ]

    if not renames:
        print(f"No files found in folder '{old_folder}'.")
        return

    print(f"Found {len(renames)} files to rename from '{old_folder}' → '{new_folder}':")
    for old_filename, new_filename in renames:
        print(f"  - {old_filename} -> {new_filename}")

    if confirm:
        response = input(f"\nProceed with renaming folder '{old_folder}' → '{new_folder}'? (y/N): ")
        if response.lower() != "y":
            print("Operation cancelled.")
            return

    operations = []
    for old_filename, new_filename in renames:
        operations.append(CommitOperationCopy(src_path_in_repo=old_filename, path_in_repo=new_filename))
        operations.append(CommitOperationDelete(path_in_repo=old_filename))

    commit_info = api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        operations=operations,
        commit_message=f"Rename folder '{old_folder}' → '{new_folder}'"
    )

    print(f"\n✓ Successfully renamed '{old_folder}' → '{new_folder}'")
    print(f"Commit: {commit_info.commit_url}")


In [45]:
# Rename folder "jailbreaks/" to "jailbreaks_old/" in the refusal repo
rename_folder("lasrprobegen/refusal-activations", "jailbreaks", "jailbreaks_old")

Found 35 files to rename from 'jailbreaks/' → 'jailbreaks_old/':
  - jailbreaks/.gitattributes -> jailbreaks_old/.gitattributes
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_0.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_0.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_12.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_12.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_15.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_15.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_18.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_18.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_21.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_21.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_24.pkl -> jailbreaks_old/llama_3b/llama_3b_incentivised_test_layer_24.pkl
  - jailbreaks/llama_3b/llama_3b_incentivised_test_layer_27.pkl -> jailbreaks_old/llama_

In [8]:
from huggingface_hub import HfApi, hf_hub_download, upload_file, list_repo_files
import os

api = HfApi()

def merge_dataset_repos(
    src_repo: str,
    dst_repo: str,
    dst_folder: str = "",
    match: str | None = None,
):
    """
    Merge files from src_repo into dst_repo by downloading and re-uploading.

    Args:
        src_repo (str): Source dataset repo ID.
        dst_repo (str): Destination dataset repo ID.
        dst_folder (str): Optional subfolder inside dst_repo to upload files to.
        match (str | None): Optional substring; only files containing this will be moved.
    """
    if dst_folder and not dst_folder.endswith("/"):
        dst_folder += "/"

    # List files in source repo
    files = list_repo_files(repo_id=src_repo, repo_type="dataset")

    # Filter by match substring
    if match is not None:
        files = [f for f in files if match in f]

    if not files:
        print("No files found to merge.")
        return

    print(f"Found {len(files)} files to merge from {src_repo} into {dst_repo}:")

    for f in files:
        print(f"  - {f}")

    proceed = input("\nProceed with downloading and uploading? (y/N): ")
    if proceed.lower() != "y":
        print("Operation cancelled.")
        return

    for f in files:
        # Download file locally (temp folder)
        local_path = hf_hub_download(repo_id=src_repo, filename=f, repo_type="dataset")
        # Determine new path in destination repo
        new_path = f"{dst_folder}{f}"
        print(f"Uploading {f} -> {new_path} ...")
        upload_file(
            path_or_fileobj=local_path,
            path_in_repo=new_path,
            repo_id=dst_repo,
            repo_type="dataset",
            # overwrite=True
        )

    print(f"\n✓ Successfully merged {len(files)} files into {dst_repo}!")


In [None]:
# Copy the files of the multi-deception sandbagging repo into the multichoice/
# folder of the main sandbagging repo. With `match` disabled, no substring
# filter is applied to the source file list.
merge_dataset_repos(
    src_repo="lasrprobegen/sandbagging_multi-deception-activations",
    dst_repo="lasrprobegen/sandbagging-activations",
    dst_folder="multichoice/",  # optional: subfolder inside dst_repo
    # match=".pkl"                # optional filter: only paths containing this substring
)

Found 75 files to merge from lasrprobegen/sandbagging_multi-deception-activations into lasrprobegen/sandbagging-activations:
  - .gitattributes
  - llama_3b_balanced_3k.jsonl
  - llama_3b_balanced_3k_layer_0.pkl
  - llama_3b_balanced_3k_layer_12.pkl
  - llama_3b_balanced_3k_layer_15.pkl
  - llama_3b_balanced_3k_layer_18.pkl
  - llama_3b_balanced_3k_layer_21.pkl
  - llama_3b_balanced_3k_layer_24.pkl
  - llama_3b_balanced_3k_layer_27.pkl
  - llama_3b_balanced_3k_layer_3.pkl
  - llama_3b_balanced_3k_layer_6.pkl
  - llama_3b_balanced_3k_layer_9.pkl
  - llama_3b_balanced_500.jsonl
  - llama_3b_balanced_500_layer_0.pkl
  - llama_3b_balanced_500_layer_12.pkl
  - llama_3b_balanced_500_layer_15.pkl
  - llama_3b_balanced_500_layer_18.pkl
  - llama_3b_balanced_500_layer_21.pkl
  - llama_3b_balanced_500_layer_24.pkl
  - llama_3b_balanced_500_layer_27.pkl
  - llama_3b_balanced_500_layer_3.pkl
  - llama_3b_balanced_500_layer_6.pkl
  - llama_3b_balanced_500_layer_9.pkl
  - llama_3b_outputs_10k.jsonl


Uploading .gitattributes -> multichoice/.gitattributes ...
Uploading llama_3b_balanced_3k.jsonl -> multichoice/llama_3b_balanced_3k.jsonl ...
Uploading llama_3b_balanced_3k_layer_0.pkl -> multichoice/llama_3b_balanced_3k_layer_0.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_12.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_12.pkl -> multichoice/llama_3b_balanced_3k_layer_12.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_15.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_15.pkl -> multichoice/llama_3b_balanced_3k_layer_15.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_18.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_18.pkl -> multichoice/llama_3b_balanced_3k_layer_18.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_21.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_21.pkl -> multichoice/llama_3b_balanced_3k_layer_21.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_24.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_24.pkl -> multichoice/llama_3b_balanced_3k_layer_24.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_27.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_27.pkl -> multichoice/llama_3b_balanced_3k_layer_27.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_3.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_3.pkl -> multichoice/llama_3b_balanced_3k_layer_3.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_6.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_6.pkl -> multichoice/llama_3b_balanced_3k_layer_6.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_3k_layer_9.pkl:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Uploading llama_3b_balanced_3k_layer_9.pkl -> multichoice/llama_3b_balanced_3k_layer_9.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_500.jsonl: 0.00B [00:00, ?B/s]

Uploading llama_3b_balanced_500.jsonl -> multichoice/llama_3b_balanced_500.jsonl ...


llama_3b_balanced_500_layer_0.pkl:   0%|          | 0.00/601M [00:00<?, ?B/s]

Uploading llama_3b_balanced_500_layer_0.pkl -> multichoice/llama_3b_balanced_500_layer_0.pkl ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

llama_3b_balanced_500_layer_12.pkl:   0%|          | 0.00/601M [00:00<?, ?B/s]

In [7]:
# Delete all files in the hugging face repo that contain "on_policy" in their name
from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete

# Initialize the API
api = HfApi()

# Your dataset repository
repo_id = "lasrprobegen/anthropic-refusal-activations"
repo_type = "dataset"

def delete_off_policy_files(pattern: str = "on_policy"):
    """
    Delete every file in the repo whose path contains ``pattern``.

    Note that, despite the function's name, the default pattern is 'on_policy'.
    The target repo is taken from the module-level ``repo_id`` / ``repo_type``
    and the commit is made with the module-level ``api`` client. All deletions
    go into a single commit after an interactive confirmation.

    Args:
        pattern (str): Substring selecting the files to delete; must be non-empty.

    Returns:
        None. Progress, the commit URL, or an error message is printed.
    """
    # An empty substring is contained in every path and would select the whole repo.
    if not pattern:
        print("Refusing to delete with an empty pattern: it would match every file.")
        return

    try:
        # List all files in the repository
        print("Fetching file list from repository...")
        files = list_repo_files(repo_id=repo_id, repo_type=repo_type)

        # Find files containing the pattern
        files_to_delete = [f for f in files if pattern in f]

        print(f"Found {len(files_to_delete)} files to delete:")
        for file in files_to_delete:
            print(f"  - {file}")

        if not files_to_delete:
            print(f"No files found containing '{pattern}'.")
            return

        # Confirm before proceeding
        response = input(f"\nProceed with deleting {len(files_to_delete)} files? (y/N): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return

        # Prepare commit operations
        operations = [CommitOperationDelete(path_in_repo=f) for f in files_to_delete]

        print(f"Creating commit with {len(operations)} delete operations...")

        # Execute all operations in a single commit; the message names the
        # pattern that was actually used to select the files.
        commit_info = api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=operations,
            commit_message=f"Delete {len(files_to_delete)} files containing '{pattern}'"
        )

        print("✓ Successfully deleted files!")
        print(f"Commit: {commit_info.commit_url}")

    except Exception as e:
        # Best-effort helper: report the failure with setup hints instead of raising.
        print(f"Error: {str(e)}")
        print("Make sure you have:")
        print("1. Installed huggingface_hub: pip install huggingface_hub")
        print("2. Logged in: huggingface-cli login")
        print("3. Write access to the repository")

# NOTE: the recorded output of this cell shows the guarded code did run, i.e. the
# `__main__` check does not keep the deletion prompt from starting when the cell
# is executed in the notebook.
if __name__ == "__main__":
    # Reminder only: authentication itself is not checked here.
    print("Make sure you're logged in to Hugging Face:")
    print("Run: huggingface-cli login")
    print()
    
    delete_off_policy_files()


Make sure you're logged in to Hugging Face:
Run: huggingface-cli login

Fetching file list from repository...
Found 0 files to delete:
No files found containing 'off_policy'.
