In [16]:
import json
import argparse
from tqdm import tqdm
from datasets import Dataset
from os import makedirs, path
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets

# System prompt text for the chat examples (presumably passed as the
# `system_message` argument of format_for_finetuning -- confirm at the call site).
system_message = "You are a helpful law assistant! Please help me summarize law cases."

def format_for_finetuning(data,
                          system_message: str,
                          tokenizer) -> str:
    """
    Render one example as a chat-templated fine-tuning record.

    A three-turn conversation (system / user / assistant) is built from
    ``system_message``, ``data['text']`` and ``data['summary']`` and rendered
    with ``tokenizer.apply_chat_template`` (``tokenize=False``,
    ``add_generation_prompt=False``).

    Returns:
        A JSON string of the form ``{"text": <rendered conversation>}``.
    """
    turns = (
        ("system", system_message),
        ("user", data['text']),
        ("assistant", data['summary']),
    )
    messages = [{"role": role, "content": content} for role, content in turns]

    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    record = {"text": rendered}
    return json.dumps(record)

In [2]:
#-------------------
# Parameters
#-------------------
def infer_model_tag(model_id: str) -> str:
    """
    Map a model id to its short family tag.

    The match is a case-insensitive substring test, checked in the order
    'llama', 'mistral', 'qwen'; the first family found in ``model_id`` wins.

    Raises:
        ValueError: if ``model_id`` names none of the supported families, so
            the problem surfaces here instead of as a missing
            ``args.model_tag`` attribute later on.
    """
    lowered_id = model_id.lower()
    for tag in ('llama', 'mistral', 'qwen'):
        if tag in lowered_id:
            return tag
    raise ValueError(
        f"Cannot infer model_tag from model_id {model_id!r}; "
        "expected it to contain one of: llama, mistral, qwen"
    )


parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='beanham/law_llm_attack')
parser.add_argument('--data_dir', type=str, default='formatted_data/')
parser.add_argument('--model_id', type=str, default='meta-llama/Llama-3.1-8B-Instruct')
# An explicit empty argv is parsed, so only the defaults above take effect and
# the process's real command line is never consulted.
args = parser.parse_args(args=[])

# exist_ok avoids the check-then-create race of a separate path.exists() test.
makedirs(args.data_dir, exist_ok=True)

args.model_tag = infer_model_tag(args.model_id)

In [3]:
# ----------------------
# Load & Process Data
# ----------------------
print('Downloading and preparing data...')
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
dataset = load_dataset(args.dataset)

# Merge the three splits into one dataset, in train -> val -> test order.
all_data = concatenate_datasets([dataset['train'], dataset['val'], dataset['test']])

# Attach a positional id (0 .. len-1) to every row of the merged dataset.
# The range is materialized as a list because add_column documents its
# column argument as a list / numpy array.
new_ids = list(range(len(all_data)))
all_data = all_data.add_column("new_ID", new_ids)

Downloading and preparing data...


In [4]:
    ## load split ids
    def read_split_ids(file_name):
        """
        Read integer ids from `file_name` inside args.data_dir.

        The first whitespace-separated token of each line is parsed as an int.
        Blank lines are skipped so that a stray empty line does not raise
        IndexError.
        """
        with open(path.join(args.data_dir, file_name), 'r') as f:
            return [int(line.split()[0]) for line in f if line.strip()]

    def select_by_ids(ids):
        """
        Return the rows of all_data whose 'new_ID' is in `ids`.

        Membership is tested against a set, so each row costs O(1) instead of
        a scan over the whole id list.
        """
        wanted = set(ids)
        return all_data.filter(lambda example: example['new_ID'] in wanted)

    # The id lists themselves stay plain lists in case later cells use them.
    train_ids = read_split_ids('train_ids.txt')
    val_ids = read_split_ids('val_ids.txt')
    shadow_train_ids = read_split_ids('shadow_train_ids.txt')
    shadow_val_ids = read_split_ids('shadow_val_ids.txt')

    dataset_train = select_by_ids(train_ids)
    dataset_val = select_by_ids(val_ids)
    shadow_dataset_train = select_by_ids(shadow_train_ids)
    shadow_dataset_val = select_by_ids(shadow_val_ids)

Filter:   0%|          | 0/1171 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1171 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1171 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1171 [00:00<?, ? examples/s]