In [1]:
import json
import pandas as pd

# 1. Processing Preference Pairs Gathered from M1

Collected data format: 
```json 
{
    "question_id": <int>,
    "question_complete": <str>,
    "course_id": <int>,
    "preference": [
        {
            "A": <str>,
            "B": <str>,
            "overall": <str>,
            "criteria": {
                "overall": <str>,
                "correctness": <str>,
                "relevance": <str>,
                "clarity": <str>,
                "completeness": <str>,
                "other": <str>
            }
        },
        ...
    ]
}
```

Processed data format:
```json
{
    "prompt": "...",
    "chosen": "...", 
    "rejected": "..."
}
```

In [2]:
def process_preference_pairs(data):
    """Flatten annotated M1 questions into prompt/chosen/rejected records.

    Args:
        data: iterable of question dicts. Each one provides
            "question_complete" (used as the prompt) and "preference", a list
            of comparisons holding the two answers under "A" and "B" plus an
            "overall" verdict.

    Returns:
        A list of {"prompt", "chosen", "rejected"} dicts, one per comparison
        that has a clear winner and two usable answers.
    """
    pairs = []
    for record in data:
        question = record["question_complete"]
        for comparison in record["preference"]:
            verdict = comparison["overall"]
            if verdict == "A":
                winner, loser = "A", "B"
            elif verdict == "B":
                winner, loser = "B", "A"
            else:
                # Any other verdict (e.g. "AB") does not name a single winner.
                continue

            # An answer equal to "..." or to the empty string makes the
            # comparison unusable, so it is left out.
            usable = (
                comparison[winner] != "..."
                and comparison[loser] != "..."
                and comparison[winner] != ""
                and comparison[loser] != ""
            )
            if usable:
                pairs.append({
                    "prompt": question,
                    "chosen": comparison[winner],
                    "rejected": comparison[loser]
                })

    return pairs

In [3]:
def load_data(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which can mis-decode or reject non-ASCII text in the answers.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [4]:
def convert_to_jsonl(data, file_path):
    """Write `data` to `file_path` in JSON Lines form.

    Each item of `data` is serialized with `json.dumps` and written on its own
    line. The file is opened in "w" mode, so existing content is replaced.
    """
    with open(file_path, "w") as out:
        out.writelines(json.dumps(item) + "\n" for item in data)

In [5]:
# Location of the raw M1 preference export (JSON) and of a JSONL file next to it.
# NOTE(review): pp_m1_jsonl_file is reassigned to a path under "datasets/" in the
# processing cell before anything is written, so the value set here looks unused — confirm.
pp_m1_json_file = "raw_datasets/M1_preference_data_15052024.json"
pp_m1_jsonl_file = "raw_datasets/M1_preference_data_15052024.jsonl"
pp_m1_data = load_data(pp_m1_json_file)

# Tabular view of the raw records, displayed for inspection.
df_pp_m1 = pd.DataFrame(pp_m1_data)
df_pp_m1

Unnamed: 0,question_id,question_complete,course_id,preference
0,0,Question: Consider the following contains func...,15000,[{'A': 'The asymptotic depth of the contains f...
1,3,Question: What is the asymptotic work of <code...,15000,"[{'A': '...', 'B': '...', 'overall': 'A', 'cri..."
2,4,Question: We have a collection of rectangles i...,15000,[{'A': 'Facts: - Rectangles in the plane have ...
3,5,Question: Which of the following scheduler pol...,15005,[{'A': 'Preemptive scheduling policies allow t...
4,7,"Question: In this week's lecture, you have bee...",15000,"[{'A': 'For the computation g(g(1, x1), g(x2, ..."
...,...,...,...,...
1517,7365,Question: Byzantine consistent broadcast (BCB)...,15003,"[{'A': 'In non-synchronous environments, intro..."
1518,7366,"Question: If process i fails, then eventually ...",15003,"[{'A': 'Yes, the statement is true. If process..."
1519,7368,Question: What happens in the reliable broadca...,15003,[{'A': 'If the completeness property of the fa...
1520,7370,Question: Consider a network that is organized...,15003,"[{'A': 'First, we can use a flooding algorithm..."


In [6]:
# Turn the raw M1 annotations into prompt/chosen/rejected rows, then drop rows
# whose (chosen, rejected) answers repeat an earlier row.
pp_m1_jsonl_file = "datasets/M1_preference_data_15052024.jsonl"
df_pp_m1_processed = pd.DataFrame(process_preference_pairs(pp_m1_data)).drop_duplicates(
    subset=["chosen", "rejected"]
)

# Back to a list of dicts (one per remaining row) for the JSONL writer.
pp_m1_processed = df_pp_m1_processed.to_dict(orient="records")
convert_to_jsonl(pp_m1_processed, pp_m1_jsonl_file)

df_pp_m1_processed

Unnamed: 0,prompt,chosen,rejected
0,Question: Consider the following contains func...,"When `contains` is called on a List, the `drop...",The asymptotic depth of the contains function ...
1,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The asymptotic depth of the contains function ...
2,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The asymptotic depth of the `contains` functio...
3,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The contains function is a recursive function ...
4,Question: Consider the following contains func...,The asymptotic depth of the contains function ...,When the contains function is called on a List...
...,...,...,...
26636,Question: Consider the transformation from bin...,#### **Answer**: \n\nThe transformation from b...,#### **Answer**:\nThe transformation from bina...
26637,Question: Consider the transformation from bin...,Consider the transformation from binary MRSW s...,Let's consider the transformation from binary ...
26638,Question: Consider the transformation from bin...,To prove that the transformation from binary M...,"First, let's define the terms:\n\n- Binary MRS..."
26639,Question: Consider the transformation from bin...,"To solve this problem, first, let's understand...",Background Information:\n- Triple Data Encrypt...


# 2. Process other datasets

## Stanford Human Preferences Dataset (SHP)

Link: [https://huggingface.co/datasets/stanfordnlp/SHP](https://huggingface.co/datasets/stanfordnlp/SHP)

- `post_id`: the ID of the Reddit post (string)
- `domain`: the subreddit and split the example is drawn from, separated by an underscore (string)
- `upvote_ratio`: the percent of votes received by the post that were positive (aka upvotes) (float)
- `history`: the post title concatenated to the post body (string)
- `c_root_id_A`: the ID of comment A (string)
- `c_root_id_B`: the ID of comment B (string)
- `created_at_utc_A`: utc timestamp of when comment A was created (integer)
- `created_at_utc_B`: utc timestamp of when comment B was created (integer)
- `score_A`: (# positive votes - # negative votes + 1) received by comment A (integer)
- `score_B`: (# positive votes - # negative votes + 1) received by comment B (integer)
- `human_ref_A`: text of comment A (string)
- `human_ref_B`: text of comment B (string)
- `labels`: the preference label -- **it is 1 if A is preferred to B; 0 if B is preferred to A**. This was randomized such that the label distribution is roughly 50/50. (integer)
- `seconds_difference`: how many seconds after the less preferred comment the more preferred one was created (will always be >= 0) (integer)
- `score_ratio`: the ratio of the more preferred comment's score to the less preferred comment's score (will be >= 1) (float)

In [7]:
from datasets import load_dataset

In [8]:
# Load the four SHP subsets used in this notebook, one per subreddit directory.
data_askengineers, data_askphysics, data_askscience, data_explainlikeimfive = (
    load_dataset("stanfordnlp/shp", data_dir=subreddit)
    for subreddit in ("askengineers", "askphysics", "askscience", "explainlikeimfive")
)

In [9]:
# Regroup the loaded subsets by split: each mapping goes subreddit name -> split.
_shp_subsets = {
    "askengineers": data_askengineers,
    "askphysics": data_askphysics,
    "askscience": data_askscience,
    "explainlikeimfive": data_explainlikeimfive,
}

train_datasets = {name: subset["train"] for name, subset in _shp_subsets.items()}

test_datasets = {name: subset["test"] for name, subset in _shp_subsets.items()}

val_datasets = {name: subset["validation"] for name, subset in _shp_subsets.items()}

In [10]:
# Report how many samples each subreddit contributes to every split.
for header, splits in (
    ("Infos for train datasets", train_datasets),
    ("\nInfos for test datasets", test_datasets),
    ("\nInfos for validation datasets", val_datasets),
):
    print(header)
    for subreddit, samples in splits.items():
        print(f"Number of samples in {subreddit}: {len(samples)}")

Infos for train datasets
Number of samples in askengineers: 57096
Number of samples in askphysics: 7364
Number of samples in askscience: 13316
Number of samples in explainlikeimfive: 19592

Infos for test datasets
Number of samples in askengineers: 2638
Number of samples in askphysics: 587
Number of samples in askscience: 977
Number of samples in explainlikeimfive: 1070

Infos for validation datasets
Number of samples in askengineers: 3154
Number of samples in askphysics: 409
Number of samples in askscience: 899
Number of samples in explainlikeimfive: 1014


In [11]:
# Example of a sample: the first record of the askengineers training split,
# shown with its raw SHP fields before any conversion.
train_datasets["askengineers"][0]

{'post_id': 'ixovda',
 'domain': 'askengineers_train',
 'upvote_ratio': 0.98,
 'history': 'Who else loves talking with Machinists? Just getting a quick poll of who loves diving into technical conversations with machinists? Sometimes I feel like they\'re the only one\'s who actually know what\'s going on and can be responsible for the success of a project. I find it so refreshing to talk to them and practice my technical communication - which sometimes is like speaking another language.   &#x200B;  I guess for any college students or interns reading this, a take away would be: make friends with your machinist/fab shop. These guys will help you interpret your own drawing, make "oh shit" parts and fixes on the fly, and offer deep insight that will make you a better engineer/designer.',
 'c_root_id_A': 'g681ikv',
 'c_root_id_B': 'g681eun',
 'created_at_utc_A': 1600789307,
 'created_at_utc_B': 1600789257,
 'score_A': 305,
 'score_B': 141,
 'human_ref_A': 'The master machinist on our shop fl

In [12]:
def process_shp_sub_dataset(dataset):
    """Convert SHP samples into prompt/chosen/rejected records.

    Args:
        dataset: iterable of SHP samples providing "history", "labels",
            "human_ref_A" and "human_ref_B".

    Returns:
        A list of {"prompt", "chosen", "rejected"} dicts. The prompt is the
        sample's "history"; a label of 1 selects comment A as chosen and a
        label of 0 selects comment B.
    """
    records = []
    for sample in dataset:
        history = sample["history"]
        label = sample["labels"]
        if label == 1:
            preferred, other = "human_ref_A", "human_ref_B"
        elif label == 0:
            preferred, other = "human_ref_B", "human_ref_A"
        else:
            # Labels other than 0 or 1 do not identify a preferred comment.
            continue

        records.append({
            "prompt": history,
            "chosen": sample[preferred],
            "rejected": sample[other]
        })
    return records

In [13]:
# Convert every subreddit of every split to prompt/chosen/rejected records,
# keeping the subreddit names as keys.
processed_training_data = {
    name: process_shp_sub_dataset(split) for name, split in train_datasets.items()
}

processed_test_data = {
    name: process_shp_sub_dataset(split) for name, split in test_datasets.items()
}

processed_val_data = {
    name: process_shp_sub_dataset(split) for name, split in val_datasets.items()
}

In [20]:
# Spot-check one converted askscience test record (prompt / chosen / rejected).
processed_test_data["askscience"][0]

{'prompt': 'Is there any reason for the alphabet being in the order its in?',
 'chosen': 'Short answer: maybe, maybe not. No one is sure.   Longer answer: The alphabet we use today is something that evolved over ~3,000 years, through 4 iterations minimum:   Phoenician ⇒ Greek ⇒ Latin ⇒ modern languages with letters like J, W, and ß, as well as diacriticals like å and diphthongs like œ.   As a result, there’s no one answer for where the order for a given modern language comes from. The alphabet for English is different from that of French, or Swedish, or Polish. Consider the Hungarian alphabet, which looks like this:  a, á, b, c, cs, d, dz, dzs, e, é, f, g, gy, h, i, í, j, k, l, ly, m, n, ny, o, ó, ö, ő, p, q, r, s, sz, t, ty, u, ú, ü, û, v, w, x, y, z, zs  Now, we can look at that and see that the basic order of the Latin alphabet remains. The additions are just stuck in after the related letters whose sounds they modify. As with English, we have the Latin alphabetical order, plus the 

In [14]:
# Report the number of converted records per subreddit for every split.
for header, processed in (
    ("Processed training data", processed_training_data),
    ("\nProcessed test data", processed_test_data),
    ("\nProcessed validation data", processed_val_data),
):
    print(header)
    for subreddit, records in processed.items():
        print(f"Number of samples in {subreddit}: {len(records)}")

Processed training data
Number of samples in askengineers: 57096
Number of samples in askphysics: 7364
Number of samples in askscience: 13316
Number of samples in explainlikeimfive: 19592

Processed test data
Number of samples in askengineers: 2638
Number of samples in askphysics: 587
Number of samples in askscience: 977
Number of samples in explainlikeimfive: 1070

Processed validation data
Number of samples in askengineers: 3154
Number of samples in askphysics: 409
Number of samples in askscience: 899
Number of samples in explainlikeimfive: 1014


In [15]:
# Convert to jsonl
training_paths = {
    "askengineers": "datasets/askengineers_train.jsonl",
    "askphysics": "datasets/askphysics_train.jsonl",
    "askscience": "datasets/askscience_train.jsonl",
    "explainlikeimfive": "datasets/explainlikeimfive_train.jsonl"
}

test_paths = {
    "askengineers": "datasets/askengineers_test.jsonl",
    "askphysics": "datasets/askphysics_test.jsonl",
    "askscience": "datasets/askscience_test.jsonl",
    "explainlikeimfive": "datasets/explainlikeimfive_test.jsonl"
}

val_paths = {
    "askengineers": "datasets/askengineers_val.jsonl",
    "askphysics": "datasets/askphysics_val.jsonl",
    "askscience": "datasets/askscience_val.jsonl",
    "explainlikeimfive": "datasets/explainlikeimfive_val.jsonl"
}

In [21]:
# Write every processed split to disk, one JSONL file per subreddit.
for processed_split, split_paths in (
    (processed_training_data, training_paths),
    (processed_test_data, test_paths),
    (processed_val_data, val_paths),
):
    for subreddit, records in processed_split.items():
        convert_to_jsonl(records, split_paths[subreddit])