In [7]:
import json
import random
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
# Note: no separate train/test-split import is needed; Dataset.train_test_split is used below.
from datasets import Dataset
from transformers import AutoTokenizer

In [2]:
def format_for_finetuning(data,
                          system_message: str,
                          tokenizer) -> str:
    """
    Render one example through the tokenizer's chat template and wrap it as JSON.

    The conversation is built as three turns, in this order: a system turn
    holding ``system_message``, a user turn holding ``data['dialogue']`` and an
    assistant turn holding ``data['section_text']``.

    Args:
        data: Mapping that provides the ``'dialogue'`` and ``'section_text'`` items.
        system_message: Text placed in the system turn.
        tokenizer: Object exposing ``apply_chat_template``; it is called with
            ``tokenize=False`` and ``add_generation_prompt=False``.

    Returns:
        A JSON string of the form ``{"text": <rendered chat>}``.
    """
    turns = (
        ("system", system_message),
        ("user", data['dialogue']),
        ("assistant", data['section_text']),
    )
    messages = [{"role": role, "content": content} for role, content in turns]

    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    payload = {"text": rendered}
    return json.dumps(payload)

In [3]:
def subsample_data(all_data, split_ratio=0.5, seed=None):
    """
    Randomly pick a fraction of the IDs in ``all_data`` without replacement.

    Args:
        all_data: Object whose ``'ID'`` item is an iterable of identifiers
            (for example a plain dict of lists).
        split_ratio: Fraction of the IDs to keep; must lie in [0, 1]. The
            sample size is ``int(len(ids) * split_ratio)``, i.e. rounded down.
        seed: Optional seed for a private random generator so the draw is
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        list: The sampled IDs, in the order produced by the sampler.

    Raises:
        ValueError: If ``split_ratio`` is outside [0, 1].
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    # Materialise the column: random.sample needs a real sequence, and the
    # object returned by all_data['ID'] may be a lazy column wrapper.
    total_ids = list(all_data['ID'])
    sample_size = int(len(total_ids) * split_ratio)

    # A dedicated generator keeps a seeded call from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    subsampled_ids = rng.sample(total_ids, sample_size)

    return subsampled_ids

In [9]:
model_id='meta-llama/Llama-3.1-8B-Instruct'
data_name='beanham/medsum_llm_attack'
# One fixed seed so both the ID subsample and the train/validation split are
# reproducible after a kernel restart.
SEED = 42

tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset(data_name)
# Pool the three published splits before re-sampling.
all_data = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])

# Randomly subset the IDs (subsample_data's default ratio is used).
# Seeding the module-level generator makes the draw repeatable.
random.seed(SEED)
subsample_ids = subsample_data(all_data)

# Membership is tested once per row; a set makes each check O(1) instead of
# scanning the whole list. Assumes IDs are hashable (str/int) -- TODO confirm.
# NOTE(review): if an ID occurs on several rows, every such row is kept, so the
# filtered dataset can be larger than len(subsample_ids).
subsample_id_set = set(subsample_ids)
filtered_dataset = all_data.filter(lambda example: example['ID'] in subsample_id_set)

# train test split from datasets library (80% train / 20% validation)
dataset_train_val = filtered_dataset.train_test_split(test_size=0.2, seed=SEED)
dataset_train = dataset_train_val['train']
dataset_val = dataset_train_val['test']

Filter: 100%|██████████| 1329/1329 [00:00<00:00, 40959.28 examples/s]


In [10]:
system_message = """You are a helpful medical assistant! Please help me summarize dialogues between doctors and patients."""

# Training split: one JSON object per example, joined by newlines
# (no trailing newline after the last record).
train_lines = []
for i in tqdm(range(len(dataset_train))):
    train_lines.append(format_for_finetuning(dataset_train[i], system_message, tokenizer))
data_formatted_train = '\n'.join(train_lines)
with open('data_formatted_train.jsonl', 'w') as f:
    f.write(data_formatted_train)

# Validation split: same layout, written to its own file.
validation_lines = []
for i in tqdm(range(len(dataset_val))):
    validation_lines.append(format_for_finetuning(dataset_val[i], system_message, tokenizer))
data_formatted_validation = '\n'.join(validation_lines)
with open('data_formatted_val.jsonl', 'w') as f:
    f.write(data_formatted_validation)

100%|██████████| 682/682 [00:00<00:00, 5559.36it/s]
100%|██████████| 171/171 [00:00<00:00, 8043.72it/s]


In [11]:
# Persist the sampled IDs as a single-column CSV ('ID'), without the row index.
import pandas as pd

df = pd.DataFrame(data=subsample_ids, columns=['ID'])
df.to_csv('subsample_ids.csv', index=False)

In [None]:
import ast
import pandas as pd

# Output path for the converted ID list.
csv_file = "subsample_ids_1.csv"

# The text file is read in full and parsed as a Python literal;
# ast.literal_eval only evaluates literals, not arbitrary code.
with open("subsample_ids_1.txt", "r") as file:
    data = file.read()
data_list = ast.literal_eval(data)

# Single 'ID' column, written without the row index.
df = pd.DataFrame(data_list, columns=["ID"])
df.to_csv(csv_file, index=False)

Data successfully saved to subsample_ids_1.csv
