In [1]:
import json
import random
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
# from datasets import train test split
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def format_for_finetuning(data,
                          system_message: str,
                          tokenizer) -> str:
    """
    Format data in JSON for fine-tuning an OpenAI chatbot model.
    """
    
    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": data['dialogue']},
        {"role": "assistant", "content": data['section_text']},      
    ]
    text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
    return json.dumps({"text":text})

In [29]:
def subsample_data(all_data, split_ratio=0.5):
    total_ids = all_data['new_ID']

    # we are sampling 50% iid of the data
    subsampled_ids = random.sample(total_ids, int(len(total_ids) * split_ratio))

    return subsampled_ids

In [33]:
model_id='meta-llama/Llama-3.1-8B-Instruct'
data_name='beanham/medsum_llm_attack'
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset(data_name)
all_data = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])
# randomly subset

# new column in all_data for new index
# dataset object

new_ids = range(len(all_data))
all_data = all_data.add_column("new_ID", new_ids)

In [34]:
system_message = """You are a helpful medical assistant! Please help me summarize dialogues between doctors and patients."""

# model ids
model_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
for model_id in model_ids:
    
    subsample_ids = subsample_data(all_data)
    filtered_dataset = all_data.filter(lambda example: example['new_ID'] in subsample_ids)

    # train test split from datasets library
    dataset_train_val = filtered_dataset.train_test_split(test_size=0.2)
    dataset_train = dataset_train_val['train']
    dataset_val = dataset_train_val['test']

    # shadow model ids
    shadow_dataset = all_data.filter(lambda example: example['ID'] not in subsample_ids)
    shadow_dataset_train_val = shadow_dataset.train_test_split(test_size=0.2)
    shadow_dataset_train = shadow_dataset_train_val['train']
    shadow_dataset_val = shadow_dataset_train_val['test']
    
    data_formatted_train = '\n'.join(
        [format_for_finetuning(dataset_train[i], system_message, tokenizer) for i in tqdm(range(len(dataset_train)))]
        )
    with open(f'formatted_data/data_formatted_train_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_train)

    data_formatted_validation = '\n'.join(
        [format_for_finetuning(dataset_val[i], system_message, tokenizer) for i in tqdm(range(len(dataset_val)))]
        )
    with open(f'formatted_data/data_formatted_val_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_validation)

    data_formatted_shadow_train = '\n'.join(
        [format_for_finetuning(shadow_dataset_train[i], system_message, tokenizer) for i in tqdm(range(len(shadow_dataset_train)))]
        )
    with open(f'formatted_data/data_formatted_shadow_train_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_shadow_train)

    data_formatted_shadow_validation = '\n'.join(
        [format_for_finetuning(shadow_dataset_val[i], system_message, tokenizer) for i in tqdm(range(len(shadow_dataset_val)))]
        )
    with open(f'formatted_data/data_formatted_shadow_val_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_shadow_validation)

    df = pd.DataFrame(subsample_ids, columns=['new_ID'])
    df.to_csv(f'formatted_data/subsample_ids_{model_id}.csv', index=False)

    df_shadow = pd.DataFrame(shadow_dataset['new_ID'], columns=['new_ID'])
    df_shadow.to_csv(f'formatted_data/shadow_ids_{model_id}.csv', index=False)

Filter:   0%|          | 0/1329 [00:00<?, ? examples/s]Filter: 100%|██████████| 1329/1329 [00:00<00:00, 22467.49 examples/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 41130.03 examples/s]
100%|██████████| 531/531 [00:00<00:00, 7595.24it/s]
100%|██████████| 133/133 [00:00<00:00, 7322.59it/s]
100%|██████████| 556/556 [00:00<00:00, 7739.52it/s]
100%|██████████| 139/139 [00:00<00:00, 6946.86it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 49932.64 examples/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 36872.45 examples/s]
100%|██████████| 531/531 [00:00<00:00, 4364.59it/s]
100%|██████████| 133/133 [00:00<00:00, 6633.79it/s]
100%|██████████| 511/511 [00:00<00:00, 6530.78it/s]
100%|██████████| 128/128 [00:00<00:00, 5936.93it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 36502.78 examples/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 38854.01 examples/s]
100%|██████████| 531/531 [00:00<00:00, 6127.81it/s]
100%|██████████| 133/133 [00:00<00:00, 5991.54it/s]
100%|

In [None]:
import pandas as pd
import ast

with open("subsample_ids_1.txt", "r") as file:
    data = file.read()

data_list = ast.literal_eval(data)

df = pd.DataFrame(data_list, columns=["ID"])

csv_file = "subsample_ids_1.csv"
df.to_csv(csv_file, index=False)

Data successfully saved to subsample_ids_1.csv
