In [1]:
import datasets
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import json
import os
import numpy as np

In [2]:
m1_dpo = load_dataset("json", data_files="model/datasets/M1_preference_data_15052024.json")

In [3]:
m1_dpo["train"]

Dataset({
    features: ['preference', 'question_id', 'question_complete', 'course_id'],
    num_rows: 1522
})

In [4]:
np.median([len(p) for p in m1_dpo["train"]["preference"]])

18.0

In [7]:
np.mean([len(p) for p in m1_dpo["train"]["preference"]])

17.567674113009197

In [6]:
np.max([len(p) for p in m1_dpo["train"]["preference"]])

37

In [5]:
np.min([len(p) for p in m1_dpo["train"]["preference"]])

1

In [7]:
def check_length_by_words(pair, min_words = 30, max_words = 200):
    return (min_words <= len(pair['A'].split()) <= max_words) and (min_words <= len(pair['B'].split()) <= max_words)

def check_length_by_characters(pair, min_length = 200, max_length = 700):
    return (min_length <= len(pair['A'].split()) <= max_length) and (min_length <= len(pair['B'].split()) <= max_length)

def filter_by_length(sample:dict, check=check_length_by_characters):
    for pair in sample["preference"]:
        # print(len(pair['A']))
        # print(len(pair['B']))
        if check(pair):
            return True
    return False 

In [8]:
m1_dpo["train"].filter(filter_by_length)

Filter:   0%|          | 0/1522 [00:00<?, ? examples/s]

Dataset({
    features: ['preference', 'question_id', 'question_complete', 'course_id'],
    num_rows: 1202
})

In [11]:
from typing import Literal
def choose_preference_pair_id(prefenrences:list[dict], method:Literal["random", "length_in_characters"]="length_in_characters"):
    if method=="random":
        preference_pair_id = random.randint(0, len(prefenrences)-1)
    elif method == "length_in_characters":
        # choose_from_idxs = []
        # for i, pair in enumerate(prefenrences):
        #     print(f"{pair=}")
        #     if ((len(pair['A']) > min_length) and (len(pair['B']) > min_length)):
        #         choose_from_idxs += [i]
        choose_from_idxs = [i for i, pair in enumerate(prefenrences) if check_length_by_characters(pair)]
        # print(choose_from_idxs)
        preference_pair_id = random.choice(choose_from_idxs)
    return preference_pair_id


def choose_random_preference_pair(sample:dict)->dict:
    preference_pair_id = choose_preference_pair_id(sample["preference"])
    chosen_key = sample["preference"][preference_pair_id]['criteria']['overall']
    other_mapping = {
        'A':'B',
        'B':'A'
    }
    rejected_key = other_mapping[chosen_key]
    sample["chosen"] = sample["preference"][preference_pair_id][chosen_key]
    sample["rejected"] = sample["preference"][preference_pair_id][rejected_key]
    sample["prompt"] = sample["question_complete"]
    for useless_key in ['preference', 'course_id', 'question_complete', 'question_id']:
        sample.pop(useless_key)
    # sample = {k:v for k, v in sample.items() if k in ["chosen", "rejected", "prompt"]}
    return sample


# random.seed(42)
# m1_dpo["train"] = m1_dpo["train"].filter(filter_by_length)
# m1_dpo["train"] = m1_dpo["train"].map(choose_random_preference_pair)

Filter:   0%|          | 0/1202 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

In [9]:
m1_dpo["train"]

Dataset({
    features: ['preference', 'question_id', 'question_complete', 'course_id'],
    num_rows: 1522
})

In [14]:
full_data = []
other_mapping = {
    'A':'B',
    'B':'A'
}

min_length = 200

for preference_pairs, prompt in zip(m1_dpo["train"]['preference'], m1_dpo["train"]['question_complete']):
    for pair in preference_pairs:
        if (min_length <= len(pair['A'])) and (min_length <= len(pair['B'])):
            chosen_key = pair['criteria']['overall']
            rejected_key = other_mapping[chosen_key]
            full_data.append(
                {
                    'prompt' : prompt,
                    'chosen' : pair[chosen_key],
                    'rejected' : pair[rejected_key]
                }
            )
            
import jsonlines
with jsonlines.open('model/datasets/dpo_M1_all_pairs_length-sup-200-char.jsonl', 'w') as writer:
    writer.write_all(full_data)
    

In [15]:
new_M1 = load_dataset("json", data_files='model/datasets/dpo_M1_all_pairs_length-sup-200-char.jsonl')
new_M1

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 23446
    })
})

In [17]:
new_M1 = new_M1['train'].train_test_split(0.1)

In [18]:
new_M1

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 21101
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 2345
    })
})

In [19]:
for split in new_M1.keys():
    new_M1[split].to_json(os.path.join("model", "datasets", f"dpo_M1_all_pairs_length-sup-200-char_{split}.jsonl"))

Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

In [12]:
m1_dpo["train"]

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1202
})

In [13]:
m1_dpo = m1_dpo["train"].train_test_split(0.2, seed=42)

In [14]:
for split in m1_dpo.keys():
    m1_dpo[split].to_json(os.path.join("project-code-2024", "datasets", f"dpo_M1_15052024_length_200-700_{split}.jsonl"))

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

#### OpenAI WebGPT Comparisons

In [2]:
webgpt_dataset = load_dataset("openai/webgpt_comparisons")

Downloading data: 100%|██████████| 127M/127M [01:02<00:00, 2.03MB/s] 


Generating train split:   0%|          | 0/19578 [00:00<?, ? examples/s]

In [3]:
print(webgpt_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'quotes_0', 'answer_0', 'tokens_0', 'score_0', 'quotes_1', 'answer_1', 'tokens_1', 'score_1'],
        num_rows: 19578
    })
})


In [5]:
filter_same_responses = lambda example: example['score_0'] != 0 and example['score_1'] != 0

def process_webgpt(example:dict):
    example['chosen'] = example['answer_0'] if example['score_0'] > example['score_1'] else example['answer_1']
    example['rejected'] = example['answer_1'] if example['score_0'] > example['score_1'] else example['answer_0']
    example['prompt'] = example['question']['full_text']
    for useless_key in ['question', 'quotes_0', 'answer_0', 'tokens_0', 'score_0', 'quotes_1', 'answer_1', 'tokens_1', 'score_1']:
        example.pop(useless_key)
    return example

In [6]:
webgpt_dataset["train"] = webgpt_dataset["train"].filter(filter_same_responses)

Filter:   0%|          | 0/19578 [00:00<?, ? examples/s]

In [8]:
webgpt_dataset["train"] = webgpt_dataset["train"].map(process_webgpt)

Map:   0%|          | 0/14346 [00:00<?, ? examples/s]

In [10]:
webgpt_dataset["train"].to_json(os.path.join("project-code-2024", "datasets", f"dpo_webgpt_comparaisons.jsonl"))

Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

25091329

In [8]:
orca = load_dataset("Intel/orca_dpo_pairs")

In [9]:
orca

DatasetDict({
    train: Dataset({
        features: ['system', 'question', 'chosen', 'rejected'],
        num_rows: 12859
    })
})

In [26]:
def process_orca_sample(sample:dict)->dict:
    sample["prompt"] = sample["question"]
    for useless_key in ['question', 'system']:
        sample.pop(useless_key)
    return sample

orca['train'] = orca['train'].map(process_orca_sample)


Map:   0%|          | 0/12859 [00:00<?, ? examples/s]

In [27]:
orca

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 12859
    })
})

In [28]:
orca["train"].to_json(os.path.join("project-code-2024", "datasets", f"dpo_orca_train.jsonl"))

Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

34429694

In [22]:
data_path_tosplit = "dpo_webgpt_comparaisons"

data = load_dataset("json", data_files=f'model/datasets/{data_path_tosplit}.jsonl')
data = data["train"].train_test_split(0.1, seed=42)
for split in data.keys():
    data[split].to_json(os.path.join("model", "datasets", f"{data_path_tosplit}_{split}.jsonl"))

Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [4]:
test_data = load_dataset("json", data_files=["model/datasets/dpo_M1_all_pairs_length-sup-200-char_test.jsonl", "model/datasets/dpo_orca_test.jsonl", "model/datasets/dpo_webgpt_comparaisons_test.jsonl"])

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
test_data['train'].to_json(os.path.join("model", "datasets", f"dpo_test_data_M1-orca-webgpt.jsonl"))

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

12578075