In [1]:
import json
import random
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer

In [2]:
def format_for_finetuning(data,
                          system_message: str,
                          tokenizer) -> str:
    """
    Render one example as a JSON line of chat-templated fine-tuning text.

    The example is arranged as a three-turn conversation (system turn,
    user turn taken from ``data['dialogue']``, assistant turn taken from
    ``data['section_text']``) and passed through the tokenizer's chat
    template, without tokenizing and without a trailing generation prompt.

    Args:
        data: Subscriptable record providing ``'dialogue'`` and
            ``'section_text'`` entries.
        system_message: Content of the system turn.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        A JSON object string of the form ``{"text": <templated text>}``.
    """
    turns = (
        ("system", system_message),
        ("user", data['dialogue']),
        ("assistant", data['section_text']),
    )
    messages = [{"role": speaker, "content": content} for speaker, content in turns]
    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return json.dumps({"text": rendered})

In [3]:
def subsample_data(dataset, split_ratio=0.5, seed=None):
    """
    Draw a uniform random subset of example IDs pooled across all splits.

    The 'train', 'validation' and 'test' splits are concatenated and
    ``int(N * split_ratio)`` IDs are sampled without replacement from the
    pooled ``'ID'`` column, where ``N`` is the pooled number of IDs.

    Args:
        dataset: Mapping with 'train', 'validation' and 'test' entries, each
            accepted by ``concatenate_datasets`` and carrying an 'ID' column.
        split_ratio: Fraction of the pooled IDs to keep (default 0.5).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the same subset is drawn on every run and the global
            ``random`` state is left untouched. When ``None`` the global
            ``random`` module is used.

    Returns:
        list: The sampled IDs.

    Raises:
        ValueError: Propagated from ``sample`` when the requested sample size
            is negative or larger than the pool.
    """
    all_data = concatenate_datasets(
        [dataset[split] for split in ('train', 'validation', 'test')]
    )
    # Materialize the column as a plain list so sampling does not depend on
    # whatever container type the column accessor returns.
    total_ids = list(all_data['ID'])

    # Sample size follows split_ratio (truncated towards zero).
    sample_size = int(len(total_ids) * split_ratio)
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(total_ids, sample_size)

In [4]:
# Model whose tokenizer is loaded, and the dataset to load.
model_id = 'meta-llama/Llama-3.1-8B-Instruct'
data_name = 'beanham/medsum_llm_attack'

tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset(data_name)

# Pool every split into a single dataset.
all_data = concatenate_datasets(
    [dataset[split_name] for split_name in ('train', 'validation', 'test')]
)

## Optional random subset (disabled).
## NOTE: as written this filters `dataset` after `all_data` has already been
## built, so enabling it would not change `all_data`.
#subsample_ids=subsample_data(dataset)
#dataset = dataset.filter(lambda example: example['ID'] in subsample_ids)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# System prompt placed at the start of every formatted conversation.
system_message = """You are a helpful medical assistant! Please help me summarize dialogues between doctors and patients."""

# One JSON object per example, newline-joined so the file holds one record per line.
data_formatted = '\n'.join(
    format_for_finetuning(example, system_message, tokenizer)
    for example in tqdm(all_data)
)

# Encoding is pinned so the output bytes do not depend on the platform locale.
with open('data_formatted.jsonl', 'w', encoding='utf-8') as f:
    f.write(data_formatted)

100%|█████████████████████████████████████| 1329/1329 [00:00<00:00, 7164.18it/s]


In [None]:
!