In [8]:
import json
import random
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
# from datasets import train test split
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd

In [2]:
def format_for_finetuning(data,
                          system_message: str,
                          tokenizer) -> str:
    """
    Format data in JSON for fine-tuning an OpenAI chatbot model.
    """
    
    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": data['dialogue']},
        {"role": "assistant", "content": data['section_text']},      
    ]
    text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)    
    return json.dumps({"text":text})

In [3]:
def subsample_data(all_data, split_ratio=0.5):
    total_ids = all_data['ID']

    # we are sampling 50% iid of the data
    subsampled_ids = random.sample(total_ids, int(len(total_ids) * split_ratio))

    return subsampled_ids

In [6]:
model_id='meta-llama/Llama-3.1-8B-Instruct'
data_name='beanham/medsum_llm_attack'
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset(data_name)
all_data = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])
# randomly subset


In [9]:
system_message = """You are a helpful medical assistant! Please help me summarize dialogues between doctors and patients."""

# model ids
model_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
for model_id in model_ids:
    subsample_ids = subsample_data(all_data)
    filtered_dataset = all_data.filter(lambda example: example['ID'] in subsample_ids)

    # train test split from datasets library
    dataset_train_val = filtered_dataset.train_test_split(test_size=0.2)
    dataset_train = dataset_train_val['train']
    dataset_val = dataset_train_val['test']
    
    data_formatted_train = '\n'.join(
        [format_for_finetuning(dataset_train[i], system_message, tokenizer) for i in tqdm(range(len(dataset_train)))]
        )
    with open(f'formatted_data/data_formatted_train_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_train)

    data_formatted_validation = '\n'.join(
        [format_for_finetuning(dataset_val[i], system_message, tokenizer) for i in tqdm(range(len(dataset_val)))]
        )
    with open(f'formatted_data/data_formatted_val_{model_id}.jsonl', 'w') as f:
        f.write(data_formatted_validation)

    df = pd.DataFrame(subsample_ids, columns=['ID'])
    df.to_csv(f'formatted_data/subsample_ids_{model_id}.csv', index=False)

Filter:   0%|          | 0/1329 [00:00<?, ? examples/s]

Filter: 100%|██████████| 1329/1329 [00:00<00:00, 17273.45 examples/s]
100%|██████████| 662/662 [00:00<00:00, 9220.30it/s]
100%|██████████| 166/166 [00:00<00:00, 8918.45it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 58703.91 examples/s]
100%|██████████| 684/684 [00:00<00:00, 9355.63it/s]
100%|██████████| 171/171 [00:00<00:00, 8860.55it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 59327.46 examples/s]
100%|██████████| 676/676 [00:00<00:00, 9437.14it/s]
100%|██████████| 170/170 [00:00<00:00, 9235.20it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 59218.42 examples/s]
100%|██████████| 676/676 [00:00<00:00, 8964.09it/s]
100%|██████████| 169/169 [00:00<00:00, 9106.10it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 58735.46 examples/s]
100%|██████████| 671/671 [00:00<00:00, 5396.02it/s]
100%|██████████| 168/168 [00:00<00:00, 8027.19it/s]
Filter: 100%|██████████| 1329/1329 [00:00<00:00, 57792.79 examples/s]
100%|██████████| 678/678 [00:00<00:00, 8856.49it/s]
100%|███

In [None]:
import pandas as pd
import ast

with open("subsample_ids_1.txt", "r") as file:
    data = file.read()

data_list = ast.literal_eval(data)

df = pd.DataFrame(data_list, columns=["ID"])

csv_file = "subsample_ids_1.csv"
df.to_csv(csv_file, index=False)

Data successfully saved to subsample_ids_1.csv
