In [1]:
import datasets
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import json
import os
import numpy as np

In [2]:
m1_dpo = load_dataset("json", data_files="project-code-2024/datasets/M1_preference_data_15052024.json")

In [3]:
m1_dpo["train"]

Dataset({
    features: ['course_id', 'question_complete', 'preference', 'question_id'],
    num_rows: 1522
})

In [4]:
m1_dpo["train"]["preference"][1]

[{'A': '...',
  'B': '...',
  'criteria': {'clarity': 'None',
   'completeness': 'A',
   'correctness': 'B',
   'modified': None,
   'other': 'Conciseness: B; Engagement: AB',
   'overall': 'A',
   'relevance': 'AB'},
  'overall': 'A'},
 {'A': 'To determine the asymptotic work of the `parGroupBy2` function, we first need to understand what this function does and how it operates.\n\nThe `parGroupBy2` function likely takes a collection of elements and groups them into pairs based on some criteria. The function name suggests that it may be parallelized, meaning that it may use multiple threads or processors to perform its operations in parallel.\n\nIn terms of determining the asymptotic work of this function, we need to consider the complexity of grouping the elements into pairs. If the function involves comparing every element to every other element to form pairs, the work complexity would likely be O(n^2), where n is the number of elements in the collection.\n\nHowever, if the function 

In [5]:
np.median([len(p) for p in m1_dpo["train"]["preference"]])

18.0

In [6]:
def check_length_by_words(pair, min_words = 30, max_words = 200):
    return (min_words <= len(pair['A'].split()) <= max_words) and (min_words <= len(pair['B'].split()) <= max_words)

def check_length_by_characters(pair, min_length = 200, max_length = 700):
    return (min_length <= len(pair['A'].split()) <= max_length) and (min_length <= len(pair['B'].split()) <= max_length)

def filter_by_length(sample:dict, check=check_length_by_characters):
    for pair in sample["preference"]:
        # print(len(pair['A']))
        # print(len(pair['B']))
        if check(pair):
            return True
    return False 

In [7]:
m1_dpo["train"].filter(filter_by_length)

Filter:   0%|          | 0/1522 [00:00<?, ? examples/s]

Dataset({
    features: ['course_id', 'question_complete', 'preference', 'question_id'],
    num_rows: 1202
})

In [11]:
from typing import Literal
def choose_preference_pair_id(prefenrences:list[dict], method:Literal["random", "length_in_characters"]="length_in_characters"):
    if method=="random":
        preference_pair_id = random.randint(0, len(prefenrences)-1)
    elif method == "length_in_characters":
        # choose_from_idxs = []
        # for i, pair in enumerate(prefenrences):
        #     print(f"{pair=}")
        #     if ((len(pair['A']) > min_length) and (len(pair['B']) > min_length)):
        #         choose_from_idxs += [i]
        choose_from_idxs = [i for i, pair in enumerate(prefenrences) if check_length_by_characters(pair)]
        # print(choose_from_idxs)
        preference_pair_id = random.choice(choose_from_idxs)
    return preference_pair_id


def choose_random_preference_pair(sample:dict)->dict:
    preference_pair_id = choose_preference_pair_id(sample["preference"])
    chosen_key = sample["preference"][preference_pair_id]['criteria']['overall']
    other_mapping = {
        'A':'B',
        'B':'A'
    }
    rejected_key = other_mapping[chosen_key]
    sample["chosen"] = sample["preference"][preference_pair_id][chosen_key]
    sample["rejected"] = sample["preference"][preference_pair_id][rejected_key]
    sample["prompt"] = sample["question_complete"]
    for useless_key in ['preference', 'course_id', 'question_complete', 'question_id']:
        sample.pop(useless_key)
    # sample = {k:v for k, v in sample.items() if k in ["chosen", "rejected", "prompt"]}
    return sample

random.seed(42)
m1_dpo["train"] = m1_dpo["train"].filter(filter_by_length)
m1_dpo["train"] = m1_dpo["train"].map(choose_random_preference_pair)

Filter:   0%|          | 0/1202 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

In [12]:
m1_dpo["train"]

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1202
})

In [13]:
m1_dpo = m1_dpo["train"].train_test_split(0.2, seed=42)

In [14]:
for split in m1_dpo.keys():
    m1_dpo[split].to_json(os.path.join("project-code-2024", "datasets", f"dpo_M1_15052024_length_200-700_{split}.jsonl"))

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

#### OpenAI WebGPT Comparisons

In [2]:
webgpt_dataset = load_dataset("openai/webgpt_comparisons")

Downloading data: 100%|██████████| 127M/127M [01:02<00:00, 2.03MB/s] 


Generating train split:   0%|          | 0/19578 [00:00<?, ? examples/s]

In [3]:
print(webgpt_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'quotes_0', 'answer_0', 'tokens_0', 'score_0', 'quotes_1', 'answer_1', 'tokens_1', 'score_1'],
        num_rows: 19578
    })
})


In [5]:
filter_same_responses = lambda example: example['score_0'] != 0 and example['score_1'] != 0

def process_webgpt(example:dict):
    example['chosen'] = example['answer_0'] if example['score_0'] > example['score_1'] else example['answer_1']
    example['rejected'] = example['answer_1'] if example['score_0'] > example['score_1'] else example['answer_0']
    example['prompt'] = example['question']['full_text']
    for useless_key in ['question', 'quotes_0', 'answer_0', 'tokens_0', 'score_0', 'quotes_1', 'answer_1', 'tokens_1', 'score_1']:
        example.pop(useless_key)
    return example

In [6]:
webgpt_dataset["train"] = webgpt_dataset["train"].filter(filter_same_responses)

Filter:   0%|          | 0/19578 [00:00<?, ? examples/s]

In [8]:
webgpt_dataset["train"] = webgpt_dataset["train"].map(process_webgpt)

Map:   0%|          | 0/14346 [00:00<?, ? examples/s]

In [10]:
webgpt_dataset["train"].to_json(os.path.join("project-code-2024", "datasets", f"dpo_webgpt_comparaisons.jsonl"))

Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

25091329

In [24]:
orca = load_dataset("Intel/orca_dpo_pairs")

In [25]:
orca

DatasetDict({
    train: Dataset({
        features: ['system', 'question', 'chosen', 'rejected'],
        num_rows: 12859
    })
})

In [26]:
def process_orca_sample(sample:dict)->dict:
    sample["prompt"] = sample["question"]
    for useless_key in ['question', 'system']:
        sample.pop(useless_key)
    return sample

orca['train'] = orca['train'].map(process_orca_sample)


Map:   0%|          | 0/12859 [00:00<?, ? examples/s]

In [27]:
orca

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 12859
    })
})

In [28]:
orca["train"].to_json(os.path.join("project-code-2024", "datasets", f"dpo_orca_train.jsonl"))

Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

34429694

#### Stanford Human Preferences Dataset (SHP)

In [None]:
shp_dataset_full = load_dataset("stanfordnlp/SHP")

In [None]:
print(shp_dataset_full)

In [None]:
shp_train_full = shp_dataset_full['train']
print(shp_train_full)

In [None]:
shp_train = shp_train_full.filter(lambda example: example['domain'] in ['askacademia'])
print(shp_train)

In [None]:
# only take the 4 domains:
# askacademia, askengineers, askphysics, askscience
shp_dataset_train = shp_dataset_full.filter(lambda example: example['domain'] in ['askacademia_train', 'askengineers_train', 'askphysics_train', 'askscience_train'])
shp_dataset_train = shp_dataset_train['train']
print(shp_dataset_train)

shp_dataset_eval = shp_dataset_full.filter(lambda example: example['domain'] in ['askacademia_validation', 'askengineers_validation', 'askphysics_validation', 'askscience_validation'])
shp_dataset_eval = shp_dataset_eval['validation']
print(shp_dataset_eval)

shp_dataset_test = shp_dataset_full.filter(lambda example: example['domain'] in ['askacademia_test', 'askengineers_test', 'askphysics_test', 'askscience_test'])
shp_dataset_test = shp_dataset_test['test']
print(shp_dataset_test)

In [None]:
# labels -> 1 if A is preferred to B, 0 if B is preferred to A. 

shp_dataset_formatted_train = []

for example in shp_dataset_train:
    prompt = example['history']
    if example['labels'] == 1:
        chosen_answer = example['human_ref_A']
        rejected_answer = example['human_ref_B']
    else:
        chosen_answer = example['human_ref_B']
        rejected_answer = example['human_ref_A']
    
    reformatted_example = {
        "prompt": prompt,
        "chosen": chosen_answer,
        "rejected": rejected_answer
    }
    
    shp_dataset_formatted_train.append(reformatted_example)

reformatted_json_shp_train = json.dumps(shp_dataset_formatted_train, indent=4)

print(reformatted_json_shp_train)


In [None]:
shp_dataset_formatted_eval = []

for example in shp_dataset_eval:
    prompt = example['history']
    if example['labels'] == 1:
        chosen_answer = example['human_ref_A']
        rejected_answer = example['human_ref_B']
    else:
        chosen_answer = example['human_ref_B']
        rejected_answer = example['human_ref_A']
    
    reformatted_example = {
        "prompt": prompt,
        "chosen": chosen_answer,
        "rejected": rejected_answer
    }
    
    shp_dataset_formatted_eval.append(reformatted_example)

reformatted_json_shp_eval = json.dumps(shp_dataset_formatted_eval, indent=4)

print(reformatted_json_shp_eval)

In [None]:
shp_dataset_formatted_test = []

for example in shp_dataset_test:
    prompt = example['history']
    if example['labels'] == 1:
        chosen_answer = example['human_ref_A']
        rejected_answer = example['human_ref_B']
    else:
        chosen_answer = example['human_ref_B']
        rejected_answer = example['human_ref_A']
    
    reformatted_example = {
        "prompt": prompt,
        "chosen": chosen_answer,
        "rejected": rejected_answer
    }
    
    shp_dataset_formatted_test.append(reformatted_example)

reformatted_json_shp_test = json.dumps(shp_dataset_formatted_test, indent=4)

print(reformatted_json_shp_test)

In [None]:
print(len(shp_dataset_formatted_train))
print(len(shp_dataset_formatted_eval))
print(len(shp_dataset_formatted_test))

In [None]:
# save the datasets
with open(os.path.join("project-code-2024", "datasets", "shp_train.json"), 'w') as f:
    json.dump(shp_dataset_formatted_train, f, indent=4)

with open(os.path.join("project-code-2024", "datasets", "shp_eval.json"), 'w') as f:
    json.dump(shp_dataset_formatted_eval, f, indent=4)

with open(os.path.join("project-code-2024", "datasets", "shp_test.json"), 'w') as f:
    json.dump(shp_dataset_formatted_test, f, indent=4)