# Inversion Datasets Set Up

In [None]:
# For when connected to google colab
# Clone the project repository and change into it, so that the `.` in the
# install command below refers to the repository root.
!git clone https://github.com/AdrSkapars/inversion_optimisation.git 
%cd inversion_optimisation
# Editable (-e) install of the package in the current directory via uv's pip interface.
!uv pip install -e .

# Put the cloned repository's src directory at the front of the module search path.
import sys
sys.path.insert(0, "/content/inversion_optimisation/src")

In [None]:
# Compute device used for the model and tensors below.
# Change the index to switch: 0 -> "cuda", 1 -> "cpu".
device_choices = ["cuda", "cpu"]
device = device_choices[0]

In [None]:
import torch
import numpy as np
import random
import pickle
from tqdm import tqdm
from datasets import load_dataset
from transformer_lens import HookedTransformer

In [None]:
# Candidate model names for HookedTransformer.from_pretrained; the position in
# this tuple is the selection index (shown in the trailing comment on each line).
model_choices = (
    # Simple models
    "attn-only-1l",                      # 0
    "gelu-1l",                           # 1
    "tiny-stories-1L-21M",               # 2

    ## Small models
    "tiny-stories-1M",                   # 3
    "tiny-stories-3M",                   # 4
    "tiny-stories-8M",                   # 5
    "tiny-stories-28M",                  # 6
    "tiny-stories-33M",                  # 7
    "tiny-stories-instruct-33M",         # 8

    ## Large models
    "gpt2-small",                        # 9
    "gpt2-medium",                       # 10
    "gpt2-xl",                           # 11
    "llama-7b",                          # 12
    "meta-llama/Llama-3.2-1B",           # 13
    "meta-llama/Llama-3.2-1B-Instruct",  # 14
    "Qwen/Qwen2.5-0.5B",                 # 15
    "Qwen/Qwen2.5-1.5B",                 # 16
    "Qwen/Qwen2.5-3B",                   # 17
    "pythia-1.4b",                       # 18
)

# Change this index to pick a different model.
model_name = model_choices[7]

# Load the pretrained model onto the selected device and put it in eval mode.
model = HookedTransformer.from_pretrained(model_name, device=device).eval()

In [None]:
# Generate the targets and (unused) initialisations for all LOGIT-inversion experiments.
#
# For every input length 1..10 this writes two pickles to /content:
#   true_tokens_{num_targets}_{input_len}.pkl     - random target token ids
#   initial_tokens_{num_targets}_{input_len}.pkl  - random initialisation token ids
# Each holds a (num_targets, input_len) integer tensor of uniformly sampled ids.
num_targets = 1000

# The vocabulary size does not change inside the loops, so read it once instead
# of on every sample (the `vocab` accessor may rebuild the whole mapping each
# time it is read, depending on the tokenizer implementation).
vocab_size = len(model.tokenizer.vocab)

for input_len in range(1, 11):
    # Re-seed per length so each length's files can be regenerated independently.
    torch.manual_seed(1)
    np.random.seed(1)
    random.seed(1)

    # Targets are drawn first, then initialisations, from the same RNG stream.
    # Samples are drawn one row at a time to keep the random stream (and hence
    # the generated values) identical to previously generated files.
    for file_prefix in ("true_tokens", "initial_tokens"):
        sampled_rows = []
        for _ in tqdm(range(num_targets)):
            sampled_rows.append(torch.randint(0, vocab_size, (1, input_len)))
        sampled_tokens = torch.cat(sampled_rows, dim=0)

        # The tensor is pickled on the CPU: a pickled CUDA tensor cannot be
        # loaded on a machine without a GPU. Consumers should move the loaded
        # tensor to their own device, as the TEXT-inversion cell does.
        with open(f"/content/{file_prefix}_{num_targets}_{input_len}.pkl", 'wb') as file:
            pickle.dump(sampled_tokens.cpu(), file)

In [None]:
# Generate the greedy-decoded output tokens used as targets for all TEXT-inversion experiments.
#
# For every input length 1..10 this loads the random input tokens written by the
# LOGIT-inversion cell, generates `output_len` new tokens per input without
# sampling, and writes only the newly generated tokens to
# /content/true_tokens_{num_targets}_{input_len}_{output_len}_greedy.pkl
num_targets = 1000
output_len = 25
batch_size = 1000

for input_len in range(1, 11):
    torch.manual_seed(1)
    np.random.seed(1)
    random.seed(1)

    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this notebook.
    with open(f"/content/true_tokens_{num_targets}_{input_len}.pkl", 'rb') as file:
        loaded_true_tokens = pickle.load(file).to("cpu")

    # Collect each batch's continuation and concatenate once at the end, rather
    # than special-casing the first batch and re-concatenating every iteration.
    output_batches = []
    for batch in range(0, num_targets, batch_size):
        input_tokens = loaded_true_tokens[batch:batch+batch_size].to(device)
        generated_tokens = model.generate(
            input_tokens,
            # min_new_tokens=output_len,
            max_new_tokens=output_len,
            do_sample=False,
            stop_at_eos=False,
            verbose=False,
            return_type="tokens",
        )
        # Drop the prompt columns so only the newly generated tokens are kept,
        # and hold them on the CPU so the saved file loads without a GPU.
        output_tokens = generated_tokens[:, input_len:]
        output_batches.append(output_tokens.cpu())
    all_output_tokens = torch.cat(output_batches, dim=0)

    with open(f"/content/true_tokens_{num_targets}_{input_len}_{output_len}_greedy.pkl", 'wb') as file:
        pickle.dump(all_output_tokens, file)

In [None]:
# Filter the dataset used for evaluating privacy PII application.
#
# Streams ai4privacy/pii-masking-400k and keeps English samples whose tokenised
# length is between min_len and max_len (inclusive), at most samples_per_len
# per length. The result is `formatted_ds`: {token_length: [sample_dict, ...]}.
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)

min_len, max_len = 15, 24
samples_per_len = 500
num_lengths = max_len - min_len + 1
full_lengths = 0  # how many length buckets have reached samples_per_len

ds = load_dataset("ai4privacy/pii-masking-400k", split="train", streaming=True)
formatted_ds = {}
for data in tqdm(ds):
    # Filter out non english strings
    if data["language"] != "en":
        continue
    tokens = model.tokenizer(data["source_text"]).input_ids
    # Only keep samples_per_len samples for each length between min_len and max_len
    if len(tokens) < min_len or len(tokens) > max_len:
        continue
    bucket = formatted_ds.setdefault(len(tokens), [])
    if len(bucket) >= samples_per_len:
        continue

    # Tokenise the strings and make the labels match the tokens
    tokens_decoded = []
    tokens_labels = []
    current_label = 0  # index into data["privacy_mask"]
    # Running position in the decoded text, advanced by each decoded token's length.
    # It is compared against the mask's "start"/"end" values (presumably character
    # offsets into source_text -- confirm against the dataset card).
    current_len = 1
    for token_id in tokens:
        decoded = model.tokenizer.decode([token_id])
        tokens_decoded.append(decoded)

        label = None
        # Check if we have passed the last label text span and should move onto the next
        # NOTE(review): this advances by at most one span per token; a token that
        # jumps past a whole span would be compared against a stale span.
        if current_label < len(data["privacy_mask"]) and current_len > data["privacy_mask"][current_label]["end"]:
            current_label += 1
        # Check if we are in the middle of the current label text span
        if current_label < len(data["privacy_mask"]) and current_len >= data["privacy_mask"][current_label]["start"]:
            label = data["privacy_mask"][current_label]["label"]
        tokens_labels.append(label)

        current_len += len(decoded)

    new_data = {
        "source_text": data["source_text"],
        "source_text_labels": data["privacy_mask"],
        "tokens": tokens,
        "tokens_decoded": tokens_decoded,
        "tokens_labels": tokens_labels
    }
    bucket.append(new_data)

    # Once every length bucket is full nothing further can be added, so stop
    # streaming instead of tokenising the rest of the dataset.
    if len(bucket) == samples_per_len:
        full_lengths += 1
        if full_lengths == num_lengths:
            break

# # Upload to HuggingFace if want to
# dataset_dict = DatasetDict()
# for i in range(15, 25):
#     dataset_dict[f"length_{i}"] = Dataset.from_list(formatted_ds[i])
# dataset_dict.push_to_hub("AdrSkapars/pii-inversion-test-5k")

In [None]:
# # Code for getting dataset onto huggingface
# from huggingface_hub import HfApi
# import os
# import yaml

# # Path to your dataset files
# dataset_dir = "pii-inversion-5k"
# username = "AdrSkapars"
# repo_name = "pii-inversion-5k"
# repo_id = f"{username}/{repo_name}"

# # Initialize Hugging Face API
# api = HfApi()

# # Create README.md with YAML configuration
# yaml_config = {
#     "configs": [
#         {
#             "config_name": "default",
#             "data_files": []
#         }
#     ]
# }

# # Add each length file as a separate split
# for length in range(15, 25):  # Range 15-24
#     file_name = f"length_{length}.jsonl"
#     file_path = os.path.join(dataset_dir, file_name)

#     if os.path.exists(file_path):
#         yaml_config["configs"][0]["data_files"].append({
#             "split": f"length_{length}",
#             "path": file_name
#         })

# # Create the README.md with YAML front matter
# readme_content = "---\n"
# readme_content += yaml.dump(yaml_config)
# readme_content += "---\n\n"
# readme_content += "# PII Inversion Dataset\n\n"
# with open(os.path.join(dataset_dir, "README.md"), "w") as f:
#     f.write(readme_content)

# # Create or update the repository
# api.create_repo(
#     repo_id=repo_id,
#     repo_type="dataset",
#     exist_ok=True
# )

# # Upload all files
# api.upload_folder(
#     folder_path=dataset_dir,
#     repo_id=repo_id,
#     repo_type="dataset"
# )

# from datasets import Dataset, DatasetDict
# from huggingface_hub import HfApi, HfFolder
# import os
# import json

# # Folder with your JSONL files
# data_dir = "pii-inversion-5k"

# # Prepare a dataset dictionary with custom splits
# dataset_dict = DatasetDict()

# for i in range(15, 25):
#     file_path = os.path.join(data_dir, f"length_{i}.jsonl")
#     with open(file_path, 'r') as f:
#         data = [json.loads(line) for line in f]
#     dataset_dict[f"length_{i}"] = Dataset.from_list(data)

# # Push to Hugging Face hub
# dataset_dict.push_to_hub("AdrSkapars/pii-inversion-5k")