In [7]:
#Code origin
#Author: Alexander Valentini

from preference_data_parser import parse_preference_data
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from datasets import load_from_disk
import numpy as np
from utils import write_jsonl

content = parse_preference_data()
model_name='stabilityai/stablelm-zephyr-3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
#content is a list of dictionaries, each dictionary has the following keys:
# 'preference' : The preference answer pairs for the question
# 'question_complete' : The question text
# 'question_id' : The question ID
# 'course id' : The course ID
#There are 1522 different questions (in the list) and  18 preference pair answers for each question (accessed by ['preference']). 
total_length = 0
for key in content:
    total_length += len(key['preference'])
total_length

26738

In [12]:
content[0]['preference'][0]['overall']

'B'

In [9]:
def convert_to_triplets(content, tokenizer):
    triplets = []

    #Add assistant message.
    for item in content:
        question = item['question_complete']
        for pref in item['preference']:
            chosen = pref['B'] if pref['overall'] == 'B' else pref['A']
            rejected = pref['A'] if pref['overall'] == 'B' else pref['B']
            prompt_messages = [{"role": "user", "content": question}]
            chosen_messages = [{"role": "assistant", "content": chosen}]
            rejected_messages = [{"role": "assistant", "content": rejected}]
            
            triplet = {
                "prompt": tokenizer.apply_chat_template(prompt_messages, tokenize=False)[:-2],#+"<|assistant|>",
                "chosen": tokenizer.apply_chat_template(chosen_messages, tokenize=False)[:-2],#[14:][:-2],
                "rejected": tokenizer.apply_chat_template(rejected_messages, tokenize=False)[:-2]#[14:][:-2]
            }
            triplets.append(triplet)
    
    return triplets

# Convert the data
converted_data = convert_to_triplets(content, tokenizer)

# Split the data into train (80%), temp (20%)
train_data, temp_data = train_test_split(converted_data, test_size=0.2, random_state=42)

# Split the temp data into eval (50% of temp) and test (50% of temp)
eval_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)
test_dataset = Dataset.from_list(test_data)

# Combine the datasets into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "eval": eval_dataset,
    "test": test_dataset
})

# Save to a single JSON file
#output_path = "datasets/dpo_preference_data_self_made"
#dataset_dict.save_to_disk(output_path)

# Verify the saved data
#print(f"Saved dataset splits to {output_path}")

#Number of entries with prompt length greater than 1024
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["prompt"]]))
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["chosen"]]))
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["rejected"]]))


In [6]:
#write_jsonl(train_data, 'train_preference_data_new.jsonl')
#write_jsonl(eval_data, 'eval_preference_data_new.jsonl')
#write_jsonl(test_data, 'test_preference_data_new.jsonl')

#test_dataset.to_json('test_preference_data_new.jsonl', orient="records")
#train_dataset.to_json('train_preference_data_new.jsonl', orient="records")
#eval_dataset.to_json('eval_preference_data_new.jsonl', orient="records")

In [14]:
tokenizer.model_max_length = 2500
train_dataset


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 21390
})

In [15]:
prompt_length_max = int(np.max([len(tokenizer(x)["input_ids"]) for x in train_dataset["prompt"]]))
chosen_length_max = int(np.max([len(tokenizer(x)["input_ids"]) for x in train_dataset["chosen"]]))
rejected_length_max = int(np.max([len(tokenizer(x)["input_ids"]) for x in train_dataset["rejected"]]))

In [16]:
print(prompt_length_max)
print(chosen_length_max)
print(rejected_length_max)

1081
1983
2085


In [11]:
prompt_length_99 = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset["prompt"]], 99))
chosen_length_99 = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset["chosen"]], 99))
rejected_length_99 = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset["rejected"]], 99))

Token indices sequence length is longer than the specified maximum sequence length for this model (2085 > 2048). Running this sequence through the model will result in indexing errors


In [12]:
print(prompt_length_99)
print(chosen_length_99)
print(rejected_length_99)
print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["prompt"]]))
print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["chosen"]]))
print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset["rejected"]]))

776
806
732
48
43
19


In [13]:
#prompt_length = int(percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset["prompt"]], 95))
max_seq_length_chosen = int(np.percentile([len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) for x in train_dataset], 99))
max_seq_length_rejected = int(np.percentile([len(tokenizer(x["prompt"] + x["rejected"])["input_ids"]) for x in train_dataset], 99))
max_seq_length = max(max_seq_length_chosen, max_seq_length_rejected)
print(max_seq_length_chosen)
print(max_seq_length_rejected)
print(max_seq_length)

1287
1202
1287


In [14]:
#Please remove the entries with prompt length+chosen length or prompt length+rejected length greater than max_seq_length
train_dataset_filtered = train_dataset.filter(lambda x: len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) <= max_seq_length and len(tokenizer(x["prompt"] + x["rejected"])["input_ids"]) <= max_seq_length)
eval_dataset_filtered = eval_dataset.filter(lambda x: len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) <= max_seq_length and len(tokenizer(x["prompt"] + x["rejected"])["input_ids"]) <= max_seq_length)


#train_dataset_filtered = train_dataset.filter(lambda x: len(tokenizer(x["prompt"])["input_ids"]) <= 1024 and len(tokenizer(x["chosen"])["input_ids"]) <= 1024 and len(tokenizer(x["rejected"])["input_ids"]) <= 1024)
#eval_dataset_filtered = eval_dataset.filter(lambda x: len(tokenizer(x["prompt"])["input_ids"]) <= 1024 and len(tokenizer(x["chosen"])["input_ids"]) <= 1024 and len(tokenizer(x["rejected"])["input_ids"]) <= 1024)


Filter:   0%|          | 0/21390 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2674 [00:00<?, ? examples/s]

In [16]:
#Find max prompt length
prompt_length_max = int(np.max([len(tokenizer(x["prompt"])["input_ids"]) for x in train_dataset_filtered]))
prompt_length_max

1081

In [None]:
# Combine the datasets into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset_filtered,
    "eval": eval_dataset_filtered,
    "test": test_dataset
})

# Save to a single JSON file
output_path = "datasets/dpo_preference_data_self_made/preference_data_99_percentile_filtered"
dataset_dict.save_to_disk(output_path)

# Verify the saved data
print(f"Saved dataset splits to {output_path}")

#Find max sequence length
max_seq_length_chosen = int(np.max([len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) for x in train_dataset_filtered]))
max_seq_length_rejected = int(np.max([len(tokenizer(x["prompt"] + x["rejected"])["input_ids"]) for x in train_dataset_filtered]))
print(max_seq_length_chosen)
print(max_seq_length_rejected)

#Number of entries with prompt length greater than 1024
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset_filtered["prompt"]]))
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset_filtered["chosen"]]))
#print(sum([len(tokenizer(x)["input_ids"]) > 1024 for x in train_dataset_filtered["rejected"]]))

In [None]:
#Please write code to find the length of all the prompt, chosen, and rejected messages in the train_dataset:


In [21]:
#Sanity Check:
dataset_path = dataset_path = "datasets/dpo_preference_data_self_made/train_preference_data"
loaded_dataset = load_from_disk(dataset_path)
print(f"Number of examples in the training set: {len(loaded_dataset['train'])}")
print(f"Number of examples in the evaluation set: {len(loaded_dataset['eval'])}")
print(f"Number of examples in the test set: {len(loaded_dataset['test'])}")
loaded_dataset['train'][0]

Number of examples in the training set: 21282
Number of examples in the evaluation set: 2663
Number of examples in the test set: 2674


{'prompt': '<|user|>\nQuestion: Let $X$ denote the random variable associated to the plaintexts and $Y$ the random variable associated to the corresponding ciphertexts. If a cryptosystem achieves perfect secrecy, then we have that \\dots?\n\nOptions:\nA. $\\Pr [X=x|Y=y] = \\Pr[Y=y]$.\nB. $\\Pr [X=x|Y=y] = \\Pr[X=x]$.\nC. $\\Pr [X=x|Y=y] = \\Pr[X=x,Y=y]$.\nD. $\\Pr [X=x] = \\Pr[Y=y]$.<|endoftext|',
 'chosen': '<|assistant|>\nThe correct statement is: \\"$\\\\\\\\Pr [X=x|Y=y] = \\\\\\\\Pr[X=x]$.\\"\\n\\nPerfect secrecy in a cryptosystem means that observing the ciphertext does not give any information about the plaintext. Mathematically, this is formalized as $\\\\\\\\Pr [X=x|Y=y] = \\\\\\\\Pr[X=x]$, meaning that the probability of a certain plaintext being the true message given the ciphertext is the same as the probability of that plaintext being the true message without knowing the ciphertext.\\n\\nThe other statements are not necessarily true in the context of perfect secrecy. \\"$\\

In [None]:
#type(converted_data)


list