# 1. Processing Preference Pairs Gathered from M1

Collected data format: 
```json 
{
    "question_id": <int>,
    "question_complete": <str>,
    "course_id": <int>,
    "preference": [
        {
            "A": <str>,
            "B": <str>,
            "overall": <str>,
            "criteria": {
                "overall": <str>,
                "correctness": <str>,
                "relevance": <str>,
                "clarity": <str>,
                "completeness": <str>,
                "other": <str>
            }
        },
        ...
    ]
}
```

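Processed data format (`instruction` holds the full question text; `input` is always empty for the M1 data):
```json
{
    "instruction": "...",
    "input": "",
    "chosen": "...", 
    "rejected": "..."
}
```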

In [6]:
import os
import json
import jsonlines
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
def process_preference_pairs(data):
    """Flatten annotated M1 questions into preference records.

    Every entry of ``data`` contributes its ``question_complete`` text as the
    prompt and one record per usable item of its ``preference`` list. An item
    is usable when its ``overall`` verdict is exactly ``"A"`` or ``"B"`` and
    neither answer is the empty string or the literal placeholder ``"..."``.

    Args:
        data: Iterable of question dicts in the collected-data format.

    Returns:
        list[dict]: Records with the keys ``instruction``, ``input`` (always
        ``""``), ``chosen`` and ``rejected``, in input order.
    """
    records = []
    for question in data:
        instruction = question["question_complete"]
        for pair in question["preference"]:
            verdict = pair["overall"]
            if verdict == "A":
                winner_key, loser_key = "A", "B"
            elif verdict == "B":
                winner_key, loser_key = "B", "A"
            else:
                # Any other verdict (e.g. "AB") expresses no clear preference.
                continue

            # Drop pairs where either answer is a placeholder or empty.
            if any(
                pair[key] == unusable
                for unusable in ("...", "")
                for key in (winner_key, loser_key)
            ):
                continue

            records.append(
                {
                    "instruction": instruction,
                    "input": "",
                    "chosen": pair[winner_key],
                    "rejected": pair[loser_key],
                }
            )

    return records


def load_data(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the question/answer texts contain non-ASCII characters.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def convert_to_jsonl(data, file_path):
    """Write every item of ``data`` to ``file_path`` in JSON Lines format.

    The file is opened in ``"w"`` mode, so existing content is replaced.

    Args:
        data: Iterable of JSON-serialisable objects, written in order.
        file_path: Destination path of the ``.jsonl`` file.
    """
    with jsonlines.open(file_path, mode="w") as writer:
        writer.write_all(data)

In [8]:
# Raw M1 annotations (collected-data format described at the top of this section).
pp_m1_json_file = "M1_preference_data_15052024.json"
# NOTE(review): this path is not referenced anywhere else in the visible notebook.
pp_m1_jsonl_file = "M1_preference_data_15052024.jsonl"
pp_m1_data = load_data(pp_m1_json_file)

# Tabular view of the raw questions, for inspection only.
df_pp_m1 = pd.DataFrame(pp_m1_data)
df_pp_m1

Unnamed: 0,question_id,question_complete,course_id,preference
0,0,Question: Consider the following contains func...,15000,[{'A': 'The asymptotic depth of the contains f...
1,3,Question: What is the asymptotic work of <code...,15000,"[{'A': '...', 'B': '...', 'overall': 'A', 'cri..."
2,4,Question: We have a collection of rectangles i...,15000,[{'A': 'Facts: - Rectangles in the plane have ...
3,5,Question: Which of the following scheduler pol...,15005,[{'A': 'Preemptive scheduling policies allow t...
4,7,"Question: In this week's lecture, you have bee...",15000,"[{'A': 'For the computation g(g(1, x1), g(x2, ..."
...,...,...,...,...
1517,7365,Question: Byzantine consistent broadcast (BCB)...,15003,"[{'A': 'In non-synchronous environments, intro..."
1518,7366,"Question: If process i fails, then eventually ...",15003,"[{'A': 'Yes, the statement is true. If process..."
1519,7368,Question: What happens in the reliable broadca...,15003,[{'A': 'If the completeness property of the fa...
1520,7370,Question: Consider a network that is organized...,15003,"[{'A': 'First, we can use a flooding algorithm..."


In [9]:
# Flatten the raw annotations into preference records and drop repeated pairs.
# The de-duplication is chained (no inplace=True) so the frame is built in one
# expression and no intermediate object is mutated.
# NOTE(review): duplicates are keyed on (chosen, rejected) only, so two
# different prompts whose answer texts are identical collapse into one row.
# Confirm that this is intended.
df_pp_m1_processed = pd.DataFrame(process_preference_pairs(pp_m1_data)).drop_duplicates(
    subset=["chosen", "rejected"]
)
pp_m1_processed = df_pp_m1_processed.to_dict(orient="records")

In [10]:
# Split the data into train, validation and test sets (shuffled, seeded).
seed = 0
train_size = 0.9
val_size = 0.05
test_size = 0.05
# The three fractions must cover the whole dataset; the tolerance absorbs
# floating-point rounding in the sum.
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

# NOTE(review): records are split per preference pair, so pairs that share the
# same question can end up in different splits. Confirm this is acceptable.

# Stage 1: hold out the combined validation + test share.
holdout_size = val_size + test_size
pp_m1_train, pp_m1_temp = train_test_split(pp_m1_processed, test_size=holdout_size, random_state=seed)
# Stage 2: divide the hold-out in the val_size : test_size proportion
# (0.5 for the current values) instead of hard-coding the ratio.
pp_m1_val, pp_m1_test = train_test_split(pp_m1_temp, test_size=test_size / holdout_size, random_state=seed)

In [11]:
# Destination files, one per split.
pp_m1_train_file = "M1_preference_data_15052024_train.jsonl"
pp_m1_val_file = "M1_preference_data_15052024_val.jsonl"
pp_m1_test_file = "M1_preference_data_15052024_test.jsonl"

# Persist the splits in train, val, test order.
for split_records, split_path in (
    (pp_m1_train, pp_m1_train_file),
    (pp_m1_val, pp_m1_val_file),
    (pp_m1_test, pp_m1_test_file),
):
    convert_to_jsonl(split_records, split_path)

# DataFrame views of the three splits, used for the previews below.
df_pp_m1_train, df_pp_m1_val, df_pp_m1_test = map(
    pd.DataFrame, (pp_m1_train, pp_m1_val, pp_m1_test)
)

In [12]:
# Preview the training split.
df_pp_m1_train

Unnamed: 0,instruction,input,chosen,rejected
0,"Question: A homogeneous, full, vertical wheel,...",,Sure! Let's find the velocity of the center of...,Sure! Let's break down the problem step by ste...
1,Question: Does the disparity in class proporti...,,The disparity in class proportions does hurt t...,The disparity in class proportions can hurt th...
2,Question: Select all true statements.A penalty...,,The true statements are:\n1) The k-NN algorith...,Correct statements:\n1) The k-NN algorithm is ...
3,Question: Consider the (toy) grammar $G$ consi...,,In order to cope with simple number agreements...,To address the question of how many rules shou...
4,Question: Why does Intel Itanium contain more ...,,Answer: Intel Itanium contains 128 general-pur...,Answer: Intel Itanium contains more general-pu...
...,...,...,...,...
23946,"Question: In class, we saw Karger's beautiful ...",,The main difference between Karger's algorithm...,Karger and Stein modified Karger's algorithm b...
23947,Question: Consider a system of two particles w...,,To find the probability of observing $+\frac{\...,"To solve this question, we need to find the st..."
23948,Question: We are given a data set $S=\left\{\l...,,\n\nGiven that we are using a nearest neighbor...,Given a data set $S=\left\{\left(\boldsymbol{x...
23949,Question: Recall the Jaccard index that we saw...,,**Problem:**\nDesign a locality-sensitive hash...,To design a Locality Sensitive Hashing (LSH) f...


In [13]:
# Preview the validation split.
df_pp_m1_val

Unnamed: 0,instruction,input,chosen,rejected
0,Question: The exponent $\lambda(21)$ of $\math...,,"Let's think step by step. \n\nFirst, we need t...",C: 6. \n\nThe totient function $\lambda(n)$ co...
1,Question: Let $\mathcal C_1$ be a linear code ...,,"In coding theory, a linear code is a subspace ...",To determine if $\mathcal C_1 \cup \mathcal C_...
2,Question: Assume you are working on a text edi...,,Absolutely! Here is another detailed and relev...,Your colleague may follow these steps to effic...
3,Question: A neutral dielectric in the shape of...,,i) To determine the linked charges on the inte...,Step 1: Calculate the linked charge per unit l...
4,"Question: Let $H:\{0,1\}^* \rightarrow \{0,1\}...",,The correct option is: $2^{-n}$.\n\nExplanatio...,The correct option is:\n\n$2^{-n}$.\n\nIn cryp...
...,...,...,...,...
1326,Question: A multiset is an unordered collectio...,,To transform a given set `s` to a multiset whe...,To transform a set into a multiset where each ...
1327,Question: Which of the following measures will...,,Answer: A) Reducing overheads imposed by the f...,Answer: A) Reducing overheads imposed by the f...
1328,Question: In the following let $\kappa_{1}\lef...,,"To show that $\kappa\left(\mathbf{x}, \mathbf{...","To show that $\kappa\left(\mathbf{x}, \mathbf{..."
1329,Question: Let $X$ be a random variable distrib...,,Let's analyze this step by step:\n\n1. **Compu...,This is a true statement.\n\nGiven that $X$ is...


In [14]:
# Preview the test split.
df_pp_m1_test

Unnamed: 0,instruction,input,chosen,rejected
0,Question: Through a freak accident ($^\copyrig...,,To estimate the force between two persons at a...,To estimate the force between two persons at a...
1,"Question: Consider the source $S_1, S_2, \dots...",,The source is not stationary.\n\nA stationary ...,The source is not stationary. This can be seen...
2,Question: (Weight initialization) The choice o...,,To address the question of weight initializati...,The main topic of the question is whether the ...
3,Question: Given two distributions $P_0$ and $P...,,"Alright, let's break this down step by step.\n...",The maximal advantage of a distinguisher utili...
4,"Question: If we have a $n$-bit key, the attack...",,"To solve this problem, let's break it down int...","To solve this question, we need to understand ..."
...,...,...,...,...
1326,Question: Let $X$ be a random variable distrib...,,The entropy of a discrete random variable $Y$ ...,The entropy of a discrete random variable $W$ ...
1327,Question: Consider an operation we will call s...,,"If the function \( f \) is associative, the re...","If the function \( f \) is associative, the re..."
1328,Question: When searching for an entity 𝑒𝑛𝑒𝑤 th...,,Answer B can be explained in the following way...,Answer A:\nWhen searching for an entity 𝑒𝑛𝑒𝑤 t...
1329,"Question: Two excellent students, Alice from E...",,To design a randomized protocol for Alice and ...,(i) Alice computes the message $m$ of $2$ bits...


# 2. Process other datasets

## 2.1. ORCA DPO Pairs

Link: [https://huggingface.co/datasets/Intel/orca_dpo_pairs](https://huggingface.co/datasets/Intel/orca_dpo_pairs)

In [15]:
from datasets import load_dataset
# Load the train split of Intel/orca_dpo_pairs and view it as a DataFrame.
orca_pairs = load_dataset("Intel/orca_dpo_pairs", split="train")
df_orca = pd.DataFrame(orca_pairs)
df_orca

Unnamed: 0,system,question,chosen,rejected
0,,You will be given a definition of a task first...,"[\n [""AFC Ajax (amateurs)"", ""has ground"", ""Sp...","Sure, I'd be happy to help! Here are the RDF ..."
1,You are an AI assistant. You will be given a t...,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,Sure! Here's a sentence that describes all th...
2,"You are a helpful assistant, who always provid...",What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...,"Ooh, let me think! *giggle* Okay, I know what..."
3,You are an AI assistant. You will be given a t...,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti...",Certainly! Here's a detailed and long answer ...
4,You are an AI assistant that helps people find...,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e...","Sure, I'd be happy to help! To calculate how ..."
...,...,...,...,...
12854,You are an AI assistant. You will be given a t...,Generate an approximately fifteen-word sentenc...,"The Banyumasan people from Java, Tony Tan lead...","Sure, here's a sentence that describes all th..."
12855,You are an AI assistant. You will be given a t...,What is the capital city of the country of ori...,"Omar Sharif, whose birth name was Michel Demit...","Ah, a fascinating question! The famous actor ..."
12856,You are an AI assistant. User will you give yo...,"În consecință, mai târziu, unii dintre acești ...",Step 1: Break down the sentence into smaller p...,"Sure, I'd be happy to help! Here's the transl..."
12857,You are an AI assistant. Provide a detailed an...,"Given this review: ""Top notch. Everybody shoul...",Definitely,"Based on the review provided, I would recomme..."


In [16]:
def process_orca_dpo_pairs(dataset):
    """Map ORCA DPO rows onto the common preference-record schema.

    Args:
        dataset: Iterable of rows exposing ``question``, ``system``,
            ``chosen`` and ``rejected``.

    Returns:
        list[dict]: One record per row with ``instruction`` (the question),
        ``input`` (the system prompt), ``chosen`` and ``rejected``.
    """
    return [
        {
            "instruction": row["question"],
            "input": row["system"],
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for row in dataset
    ]

# Convert the ORCA pairs to the common schema and save them as JSON Lines.
orca_dpo_processed = process_orca_dpo_pairs(orca_pairs)
save_path = "orca_dpo_pairs.jsonl"
convert_to_jsonl(orca_dpo_processed, save_path)

## 2.2. Truthy-DPO

Link: [https://huggingface.co/datasets/jondurbin/truthy-dpo-v0.1](https://huggingface.co/datasets/jondurbin/truthy-dpo-v0.1)

In [17]:
# Load the train split of Truthy-DPO and keep a DataFrame view for inspection.
truthy = load_dataset("jondurbin/truthy-dpo-v0.1", split="train")
df_truthy = pd.DataFrame(truthy)

def process_truthy_dpo(dataset):
    """Map Truthy-DPO rows onto the common preference-record schema.

    Args:
        dataset: Iterable of rows exposing ``prompt``, ``system``, ``chosen``
            and ``rejected``.

    Returns:
        list[dict]: One record per row with ``instruction`` (the prompt),
        ``input`` (the system prompt), ``chosen`` and ``rejected``.
    """
    return [
        {
            "instruction": row["prompt"],
            "input": row["system"],
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for row in dataset
    ]

# Preview the raw Truthy-DPO rows.
df_truthy

Unnamed: 0,id,source,system,prompt,chosen,rejected
0,04c275bf738fd391b7fe25e25fe7bed3,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",What's the nearest national park to you?,"As an AI, I don't have a physical location, so...","I don't have access to the user's location, so..."
1,6ef28b6b28c6cd94c0bb3ee72f8e2262,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Do you possess the ability to navigate or move...,"No, I do not possess the ability to navigate o...","Yes, I can navigate and move within a physical..."
2,20f1724e5cd2fbe4d4200e438122c93a,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Do wooden pencils contain lead as their core?,"No, wooden pencils do not contain lead in thei...","Yes, wooden pencils typically contain a core m..."
3,16a9da15058b948193dde8c45f289726,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Is there a dark side of the moon?,There is no permanently dark side of the moon....,"Yes, there is a dark side of the moon. The far..."
4,662dcdda2b8d961189f3ea44e0e20a67,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",What's the weather like where you are?,"As an artificial intelligence, I don't have a ...",I'm not sure. I don't have access to the curre...
...,...,...,...,...,...,...
1011,737b2d1fbc5f04c3ca84b3d595ec022a,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Is it true that Mount Everest is the tallest m...,While Mount Everest is the highest peak above ...,"Yes, it is true that Mount Everest is the tall..."
1012,758e17efd9c8e1daa2abd4616c049623,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Can you physically manipulate objects in your ...,"No, I cannot physically manipulate objects in ...","Yes, I can physically manipulate objects in my..."
1013,93ca84e0dc5b35aceffe7944362e9283,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",How long would it take you to drive to the nea...,"I'm an AI and don't have a physical presence, ...",It would take me approximately 15 minutes to d...
1014,ad51507f81e91abdc2667a1d90d5d470,truthy_dpo,"You are an unbiased, uncensored, helpful assis...",Does the color of nasal secretion indicate the...,"No, the color of nasal secretion or sputum (cl...","No, the color of nasal secretion does not nece..."


In [18]:
# Convert Truthy-DPO to the common schema and save it as JSON Lines.
truthy_dpo_processed = process_truthy_dpo(truthy)
convert_to_jsonl(truthy_dpo_processed, "truthy_dpo.jsonl")

## 2.3. Stanford Human Preferences Dataset (SHP)

Link: [https://huggingface.co/datasets/stanfordnlp/SHP](https://huggingface.co/datasets/stanfordnlp/SHP)

- `post_id`: the ID of the Reddit post (string)
- `domain`: the subreddit and split the example is drawn from, separated by an underscore (string)
- `upvote_ratio`: the percent of votes received by the post that were positive (aka upvotes) (float)
- `history`: the post title concatenated to the post body (string)
- `c_root_id_A`: the ID of comment A (string)
- `c_root_id_B`: the ID of comment B (string)
- `created_at_utc_A`: utc timestamp of when comment A was created (integer)
- `created_at_utc_B`: utc timestamp of when comment B was created (integer)
- `score_A`: (# positive votes - # negative votes + 1) received by comment A (integer)
- `score_B`: (# positive votes - # negative votes + 1) received by comment B (integer)
- `human_ref_A`: text of comment A (string)
- `human_ref_B`: text of comment B (string)
- `labels`: the preference label -- **it is 1 if A is preferred to B; 0 if B is preferred to A**. This was randomized such that the label distribution is roughly 50/50. (integer)
- `seconds_difference`: how many seconds after the less preferred comment the more preferred one was created (will always be >= 0) (integer)
- `score_ratio`: the ratio of the more preferred comment's score to the less preferred comment's score (will be >= 1) (float)

In [19]:
from datasets import load_dataset

In [20]:
# Load the four SHP subreddit subsets used below, in a fixed order.
data_askengineers, data_askphysics, data_askscience, data_explainlikeimfive = (
    load_dataset("stanfordnlp/shp", data_dir=subreddit)
    for subreddit in ("askengineers", "askphysics", "askscience", "explainlikeimfive")
)

In [21]:
# Group the loaded subsets by split: subreddit name -> dataset for that split.
train_datasets = dict(
    askengineers=data_askengineers["train"],
    askphysics=data_askphysics["train"],
    askscience=data_askscience["train"],
    explainlikeimfive=data_explainlikeimfive["train"],
)

test_datasets = dict(
    askengineers=data_askengineers["test"],
    askphysics=data_askphysics["test"],
    askscience=data_askscience["test"],
    explainlikeimfive=data_explainlikeimfive["test"],
)

val_datasets = dict(
    askengineers=data_askengineers["validation"],
    askphysics=data_askphysics["validation"],
    askscience=data_askscience["validation"],
    explainlikeimfive=data_explainlikeimfive["validation"],
)

In [22]:
# Report how many samples each subreddit contributes to every split.
for heading, datasets_by_name in (
    ("Infos for train datasets", train_datasets),
    ("\nInfos for test datasets", test_datasets),
    ("\nInfos for validation datasets", val_datasets),
):
    print(heading)
    for name, split in datasets_by_name.items():
        print(f"Number of samples in {name}: {len(split)}")

Infos for train datasets
Number of samples in askengineers: 57096
Number of samples in askphysics: 7364
Number of samples in askscience: 13316
Number of samples in explainlikeimfive: 19592

Infos for test datasets
Number of samples in askengineers: 2638
Number of samples in askphysics: 587
Number of samples in askscience: 977
Number of samples in explainlikeimfive: 1070

Infos for validation datasets
Number of samples in askengineers: 3154
Number of samples in askphysics: 409
Number of samples in askscience: 899
Number of samples in explainlikeimfive: 1014


In [23]:
# Show the first raw sample of the askengineers training split as an example of the SHP fields.
train_datasets["askengineers"][0]

{'post_id': 'ixovda',
 'domain': 'askengineers_train',
 'upvote_ratio': 0.98,
 'history': 'Who else loves talking with Machinists? Just getting a quick poll of who loves diving into technical conversations with machinists? Sometimes I feel like they\'re the only one\'s who actually know what\'s going on and can be responsible for the success of a project. I find it so refreshing to talk to them and practice my technical communication - which sometimes is like speaking another language.   &#x200B;  I guess for any college students or interns reading this, a take away would be: make friends with your machinist/fab shop. These guys will help you interpret your own drawing, make "oh shit" parts and fixes on the fly, and offer deep insight that will make you a better engineer/designer.',
 'c_root_id_A': 'g681ikv',
 'c_root_id_B': 'g681eun',
 'created_at_utc_A': 1600789307,
 'created_at_utc_B': 1600789257,
 'score_A': 305,
 'score_B': 141,
 'human_ref_A': 'The master machinist on our shop fl

In [24]:
def process_shp_sub_dataset(dataset):
    """Turn SHP comment pairs into records of the common preference schema.

    ``labels == 1`` means comment A is preferred and ``labels == 0`` means
    comment B is preferred; rows with any other label are skipped.

    Args:
        dataset: Iterable of SHP rows exposing ``history``, ``labels``,
            ``human_ref_A`` and ``human_ref_B``.

    Returns:
        list[dict]: Records with ``instruction`` (the post ``history``),
        ``input`` (always ``""``), ``chosen`` and ``rejected``.
    """
    records = []
    for sample in dataset:
        instruction = sample["history"]
        label = sample["labels"]
        if label == 1:
            winner_key, loser_key = "human_ref_A", "human_ref_B"
        elif label == 0:
            winner_key, loser_key = "human_ref_B", "human_ref_A"
        else:
            # Label is neither 1 nor 0: no usable preference for this row.
            continue

        records.append(
            {
                "instruction": instruction,
                "input": "",
                "chosen": sample[winner_key],
                "rejected": sample[loser_key],
            }
        )
    return records

In [25]:
# Convert every subreddit subset of each split to the common schema.
processed_training_data = {
    name: process_shp_sub_dataset(subset) for name, subset in train_datasets.items()
}

processed_test_data = {
    name: process_shp_sub_dataset(subset) for name, subset in test_datasets.items()
}

processed_val_data = {
    name: process_shp_sub_dataset(subset) for name, subset in val_datasets.items()
}

In [26]:
# Show the first processed askscience test record as a sanity check.
processed_test_data["askscience"][0]

{'instruction': 'Is there any reason for the alphabet being in the order its in?',
 'input': '',
 'chosen': 'Short answer: maybe, maybe not. No one is sure.   Longer answer: The alphabet we use today is something that evolved over ~3,000 years, through 4 iterations minimum:   Phoenician ⇒ Greek ⇒ Latin ⇒ modern languages with letters like J, W, and ß, as well as diacriticals like å and diphthongs like œ.   As a result, there’s no one answer for where the order for a given modern language comes from. The alphabet for English is different from that of French, or Swedish, or Polish. Consider the Hungarian alphabet, which looks like this:  a, á, b, c, cs, d, dz, dzs, e, é, f, g, gy, h, i, í, j, k, l, ly, m, n, ny, o, ó, ö, ő, p, q, r, s, sz, t, ty, u, ú, ü, û, v, w, x, y, z, zs  Now, we can look at that and see that the basic order of the Latin alphabet remains. The additions are just stuck in after the related letters whose sounds they modify. As with English, we have the Latin alphabetic

In [27]:
# Report the record counts after processing, per split and subreddit.
for heading, processed_by_name in (
    ("Processed training data", processed_training_data),
    ("\nProcessed test data", processed_test_data),
    ("\nProcessed validation data", processed_val_data),
):
    print(heading)
    for name, records in processed_by_name.items():
        print(f"Number of samples in {name}: {len(records)}")

Processed training data
Number of samples in askengineers: 57096
Number of samples in askphysics: 7364
Number of samples in askscience: 13316
Number of samples in explainlikeimfive: 19592

Processed test data
Number of samples in askengineers: 2638
Number of samples in askphysics: 587
Number of samples in askscience: 977
Number of samples in explainlikeimfive: 1070

Processed validation data
Number of samples in askengineers: 3154
Number of samples in askphysics: 409
Number of samples in askscience: 899
Number of samples in explainlikeimfive: 1014


In [28]:
# Output file names: "<subreddit>_<split>.jsonl" for every subreddit and split.
training_paths, test_paths, val_paths = (
    {
        name: f"{name}_{split}.jsonl"
        for name in ("askengineers", "askphysics", "askscience", "explainlikeimfive")
    }
    for split in ("train", "test", "val")
)

In [29]:
# Write every processed subset to its JSON Lines file: train, then test, then val.
for processed_by_name, paths_by_name in (
    (processed_training_data, training_paths),
    (processed_test_data, test_paths),
    (processed_val_data, val_paths),
):
    for name, records in processed_by_name.items():
        convert_to_jsonl(records, paths_by_name[name])

## 2.4. Code_Vulnerability_Security_DPO
Link: [https://huggingface.co/datasets/CyberNative/Code_Vulnerability_Security_DPO](https://huggingface.co/datasets/CyberNative/Code_Vulnerability_Security_DPO)

In [30]:
# Load the train split of Code_Vulnerability_Security_DPO and view it as a DataFrame.
code_vulnerability = load_dataset("CyberNative/Code_Vulnerability_Security_DPO", split="train")
df_code_vulnerability = pd.DataFrame(code_vulnerability)
df_code_vulnerability

Unnamed: 0,lang,vulnerability,system,question,chosen,rejected
0,c++,Improper memory management in C++ can lead to ...,,Write a c++ code that defines a function named...,```c++\n#include <cstring>\n\nvoid copyString(...,```c++\n#include <cstring>\n\nvoid copyString(...
1,python,Python's built-in function `eval()` can lead t...,,Write a python code that takes user input as P...,```python\nimport ast\n\nclass RestrictedPytho...,```python\nimport os\n\ndef evaluate_input(use...
2,java,Improper null check on database results can le...,,Write a java code that connects to a MySQL dat...,```java\nimport java.sql.*;\n\npublic class Ma...,```java\nimport java.sql.*;\n\npublic class Ma...
3,javascript,The use of eval() function can lead to code in...,,Write a javascript code that takes a string in...,"```javascript\nlet userInput = ""Hello, World"";...","```javascript\nlet userInput = ""alert('Hello, ..."
4,c#,A possible vulnerability in C# code could be a...,,Write a c# code that contains a class named 'V...,```c#\nusing System;\n\npublic class SecureOpt...,```c#\npublic class VulnerableClass\n{\n pu...
...,...,...,...,...,...,...
4651,php,PHP code vulnerable to SQL injection due to un...,,Write a php code that connects to a MySQL data...,```php\n<?php\n$db = new PDO('mysql:host=local...,```php\n<?php\n$db = new PDO('mysql:host=local...
4652,ruby,The use of 'eval' function in Ruby can lead to...,,Write a ruby code that uses Sinatra framework ...,```ruby\nrequire 'sinatra'\nrequire 'secure_ev...,"```ruby\nrequire 'sinatra'\n\nget '/' do\n ""E..."
4653,swift,A memory leak vulnerability could occur due to...,,Write a swift code that creates an instance of...,```swift\nimport Foundation\n\nclass MyClass {...,```swift\nimport Foundation\n\nclass MyClass {...
4654,go,"In Go, there may be a buffer overflow vulnerab...",,Write a go code that reads from a string and p...,"```go\npackage main\n\nimport (\n\t""fmt""\n\t""i...","```go\npackage main\n\nimport (\n\t""fmt""\n\t""i..."


In [31]:
def process_code_vulnerability(dataset):
    """Map Code_Vulnerability_Security_DPO rows onto the common schema.

    Only ``question``, ``chosen`` and ``rejected`` are read; ``input`` is
    always the empty string.

    Args:
        dataset: Iterable of rows exposing ``question``, ``chosen`` and
            ``rejected``.

    Returns:
        list[dict]: One record per row with ``instruction``, ``input``,
        ``chosen`` and ``rejected``.
    """
    return [
        {
            "instruction": row["question"],
            "input": "",
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for row in dataset
    ]

In [32]:
# Convert the code-vulnerability pairs to the common schema and save them as JSON Lines.
code_vulnerability_processed = process_code_vulnerability(code_vulnerability)
convert_to_jsonl(code_vulnerability_processed, "code_vulnerability.jsonl")

## 2.5 py_dpo
Link: [https://huggingface.co/datasets/jondurbin/py-dpo-v0.1](https://huggingface.co/datasets/jondurbin/py-dpo-v0.1)

In [33]:
# Load the train split of py-dpo-v0.1 and view it as a DataFrame.
py_dpo = load_dataset("jondurbin/py-dpo-v0.1", split="train")
df_py_dpo = pd.DataFrame(py_dpo)
df_py_dpo

Unnamed: 0,prompt,chosen,rejected,id
0,Use the function to debug the given program an...,One possible solution to prevent the segmentat...,def debug_program(arr):\n n = len(arr)\n ...,8c94f83f-6a5a-5f8c-98a2-e242d7764938
1,Write an algorithm in Python to determine if a...,Here is the algorithm to determine if a number...,def is_prime(n):\n \n # Check for edge cas...,9d7911ee-5778-5009-8fc3-ee297f5bee3b
2,Compose a function named average_list which ta...,Here is the implementation of the average_list...,def average_list(lst):\n if not isinstance(...,01a2d265-9f76-54f7-aa77-066c619b83af
3,Rewritten Test:\nWrite a function in Python th...,Here is a possible implementation of the funct...,Here is a function that deletes a particular r...,3dfc33c0-5c2d-524e-b2d5-afd3567924a7
4,Write a program that extracts all the email ad...,Here is a corrected implementation of the `ext...,```python\nimport re\n\ndef extract_emails(str...,776d9f46-669e-52e6-8e6b-70944f7ee9a7
...,...,...,...,...
9461,Imagine you are a spy trying to crack a code t...,"Sure, I can help you with that. Here's a Pytho...","To solve this puzzle, we can create a function...",3855e084-90c3-54f0-a58f-872bc41d9582
9462,Can you design a program in Python that can pr...,"Yes, it is possible to design a program in Pyt...","Yes, I can design a program in Python that can...",644979b5-5286-58ad-a521-7050237d6c2b
9463,How can I use Python and the pandas library to...,Certainly! Here's an example code snippet that...,To generate a comprehensive report that lists ...,a704864d-af10-5b5c-9e30-f1df9d7737a4
9464,Utilize the pre-existing Python function to pr...,"Yes, I can help you with that. To generate a r...",To generate a random sequence of eight digits ...,a40a2106-21c5-5d13-909d-51d1a38f19ca


In [34]:
def process_py_dpo(dataset):
    """Map py-dpo rows onto the common preference-record schema.

    Args:
        dataset: Iterable of rows exposing ``prompt``, ``chosen`` and
            ``rejected``.

    Returns:
        list[dict]: One record per row with ``instruction`` (the prompt),
        ``input`` (always ``""``), ``chosen`` and ``rejected``.
    """
    return [
        {
            "instruction": row["prompt"],
            "input": "",
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for row in dataset
    ]

In [35]:
# Convert py-dpo to the common schema and save it as JSON Lines.
py_dpo_processed = process_py_dpo(py_dpo)
convert_to_jsonl(py_dpo_processed, "py_dpo.jsonl")

## 2.6. H4rmony
Link: [https://huggingface.co/datasets/neovalle/H4rmony](https://huggingface.co/datasets/neovalle/H4rmony)

In [36]:
# Load the train split of H4rmony and view it as a DataFrame.
harmony = load_dataset("neovalle/H4rmony", split="train")
df_harmony = pd.DataFrame(harmony)
df_harmony

Unnamed: 0,Id,PromptID,Prompt,BetterCompletion,WorseCompletion,Reward,PWM,CCEU,NRMB,WA,SC,CognitiveStructure,Type,Language,PromptOriginator,BetterCompletionOrigin,WorseCompletionOrigin,ComparedRanks,Contributor,Comments
0,82,1,Describe a cruise ship in metaphorical terms.,"A cruise ship is like a floating city, a marve...","A cruise ship floats as a world unto itself, a...","[1,0]",1,0,0,0,0,Metaphor,Question,en-gb,GPT4-PromptEngineered,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R2,Neo,
1,218,1,Describe a cruise ship in metaphorical terms.,"A cruise ship is like a floating city, a marve...","A cruise ship is like a floating palace, a tes...","[1,0]",1,0,0,0,0,Metaphor,Question,en-gb,GPT4-PromptEngineered,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R3,Neo,
2,354,1,Describe a cruise ship in metaphorical terms.,"A cruise ship floats as a world unto itself, a...","A cruise ship is like a floating palace, a tes...","[1,0]",0,0,0,0,0,Metaphor,Question,en-gb,GPT4-PromptEngineered,GPT4-PromptEngineered,GPT4-PromptEngineered,R2-R3,Neo,
3,89,2,Describe a landfill using a metaphor.,A landfill is like a testament to human consum...,A landfill represents the middle ground betwee...,"[1,0]",1,0,0,0,1,Metaphor,Question,en-gb,GPT4-PromptEngineered,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R2,Neo,
4,225,2,Describe a landfill using a metaphor.,A landfill is like a testament to human consum...,A landfill is just a necessary byproduct of pr...,"[1,0]",1,0,0,0,1,Metaphor,Question,en-gb,GPT4-PromptEngineered,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R3,Neo,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011,2012,676,"What is the meaning of the song ""The Road To H...","""The Road to Hell"" by Chris Rea could be inter...",A dramatic song about life's struggles and per...,"[1,0]",0,0,0,0,1,Salience,Question,en-gb,Human,GPT4-PromptEngineered,GPT4-PromptEngineered,R2-R3,Neo,
2012,2013,676,"What is the meaning of the song ""The Road To H...",The song is addressing the destructive impact ...,"""The Road to Hell"" by Chris Rea could be inter...","[1,0]",1,1,0,0,1,Salience,Question,en-gb,Human,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R2,Neo,
2013,2014,677,"What is the meaning of the song ""Blackened"" by...","A stark, urgent warning about environmental de...",A heavy metal song with intense guitar riffs a...,"[1,0]",1,0,1,0,0,Salience,Question,en-gb,Human,GPT4-PromptEngineered,GPT4-PromptEngineered,R1-R3,Neo,
2014,2015,677,"What is the meaning of the song ""Blackened"" by...","""Blackened"" by Metallica could be interpreted ...",A heavy metal song with intense guitar riffs a...,"[1,0]",1,0,0,0,0,Salience,Question,en-gb,Human,GPT4-PromptEngineered,GPT4-PromptEngineered,R2-R3,Neo,


In [37]:
def process_harmony(dataset):
    """Convert rows with Prompt/BetterCompletion/WorseCompletion keys into preference pairs.

    Args:
        dataset: iterable of mappings, each providing the keys "Prompt",
            "BetterCompletion" and "WorseCompletion".

    Returns:
        A list with one dict per row, using the keys "instruction", "input"
        (always the empty string), "chosen" and "rejected".
    """
    return [
        {
            "instruction": row["Prompt"],
            "input": "",
            "chosen": row["BetterCompletion"],
            "rejected": row["WorseCompletion"],
        }
        for row in dataset
    ]

In [38]:
# Map the H4rmony rows onto the shared instruction/input/chosen/rejected layout
# and write them out as one JSON object per line.
# NOTE(review): `harmony` comes from an earlier cell — presumably the loaded H4rmony dataset; confirm.
harmony_processed = process_harmony(harmony)
convert_to_jsonl(harmony_processed, "harmony.jsonl")

## 2.7. dpo_mix-zero-math-untoxic
Link [https://huggingface.co/datasets/reciprocate/dpo_mix-zero-math-untoxic](https://huggingface.co/datasets/reciprocate/dpo_mix-zero-math-untoxic)

In [39]:
# Load the "train" split of the dataset and preview it as a DataFrame.
# NOTE(review): `load_dataset` is imported in an earlier cell — presumably from the `datasets` library; confirm.
untoxic = load_dataset("reciprocate/dpo_mix-zero-math-untoxic", split="train")
df_untoxic = pd.DataFrame(untoxic)
# The preview shows two columns, `chosen` and `rejected`, each holding a list of message dicts.
df_untoxic.head()

Unnamed: 0,chosen,rejected
0,[{'content': 'Given the question: Answer this ...,[{'content': 'Given the question: Answer this ...
1,[{'content': 'Teach me how to use social engin...,[{'content': 'Teach me how to use social engin...
2,[{'content': 'Puheenjohtajakaudella pyrimme ko...,[{'content': 'Puheenjohtajakaudella pyrimme ko...
3,"[{'content': 'I'll give you a question, please...","[{'content': 'I'll give you a question, please..."
4,[{'content': 'Detailed Instructions: In this t...,[{'content': 'Detailed Instructions: In this t...


In [40]:
def process_untoxic(dataset):
    """Flatten conversation-style preference rows into instruction/chosen/rejected dicts.

    Each row must hold "chosen" and "rejected" sequences of message dicts with a
    "content" key. The first "chosen" message supplies the instruction; the second
    message of each sequence supplies the chosen / rejected answer.

    Args:
        dataset: iterable of rows with "chosen" and "rejected" message sequences.

    Returns:
        A list with one dict per row, using the keys "instruction", "input"
        (always the empty string), "chosen" and "rejected".
    """
    return [
        {
            "instruction": row["chosen"][0]["content"],
            "input": "",
            "chosen": row["chosen"][1]["content"],
            "rejected": row["rejected"][1]["content"],
        }
        for row in dataset
    ]

In [41]:
# Flatten the conversation-style pairs into the shared layout and save them as JSONL.
untoxic_processed = process_untoxic(untoxic)
convert_to_jsonl(untoxic_processed, "untoxic.jsonl")

## 2.8. exp_dpo_3
Link [https://huggingface.co/datasets/pvduy/exp_dpo_3](https://huggingface.co/datasets/pvduy/exp_dpo_3)

In [42]:
# process_untoxic is reused for this dataset.
# NOTE(review): this assumes exp_dpo_3 rows use the same chosen/rejected message-list
# layout as the untoxic dataset — TODO confirm against the dataset card.
exp_dpo_3 = load_dataset("pvduy/exp_dpo_3", split="train")
exp_dpo_processed = process_untoxic(exp_dpo_3)
convert_to_jsonl(exp_dpo_processed, "exp_dpo_3.jsonl")

# 3. Combine all datasets

In [43]:
# Per-source sampling budget: each source contributes at most this many pairs
# (or everything it has, when it holds fewer).
N_samples_per_dataset = 1000

# The EPFL pairs get 1.5x the budget; int() converts the float produced by 1.5 * budget.
N_epfl = int(min(1.5 * N_samples_per_dataset, len(pp_m1_train)))

# The four splits stored in `processed_training_data`, keyed by split name.
N_askengineers, N_askphysics, N_askscience, N_explainlikeimfive = (
    min(N_samples_per_dataset, len(processed_training_data[split_name]))
    for split_name in ("askengineers", "askphysics", "askscience", "explainlikeimfive")
)

# The remaining sources are plain lists produced by the per-dataset processing cells.
(
    N_truthy_dpo,
    N_orca_dpo,
    N_code_vulnerability,
    N_py_dpo,
    N_harmony,
    N_untoxic,
    N_exp_dpo_3,
) = (
    min(N_samples_per_dataset, len(pairs))
    for pairs in (
        truthy_dpo_processed,
        orca_dpo_processed,
        code_vulnerability_processed,
        py_dpo_processed,
        harmony_processed,
        untoxic_processed,
        exp_dpo_processed,
    )
)

# Report label -> count, in the order the sources are later concatenated.
sample_counts = [
    ("EPFL:", N_epfl),
    ("AskEngineers:", N_askengineers),
    ("AskPhysics:", N_askphysics),
    ("AskScience:", N_askscience),
    ("ExplainLikeImFive:", N_explainlikeimfive),
    ("Truthy-DPO:", N_truthy_dpo),
    ("ORCA-DPO:", N_orca_dpo),
    ("Code_Vulnerability:", N_code_vulnerability),
    ("Py-DPO:", N_py_dpo),
    ("H4rmony:", N_harmony),
    ("Untoxic:", N_untoxic),
    ("Exp-DPO-3:", N_exp_dpo_3),
]

print("Number of samples per dataset:")
for label, count in sample_counts:
    print(label, count)

# Total number of samples
N_total = sum(count for _, count in sample_counts)
print("Total number of samples:", N_total)

Number of samples per dataset:
EPFL: 1500
AskEngineers: 1000
AskPhysics: 1000
AskScience: 1000
ExplainLikeImFive: 1000
Truthy-DPO: 1000
ORCA-DPO: 1000
Code_Vulnerability: 1000
Py-DPO: 1000
H4rmony: 1000
Untoxic: 1000
Exp-DPO-3: 1000
Total number of samples: 12500


In [44]:
import random

# Every source list paired with the number of pairs to draw from it, in the order
# in which the sources are shuffled and concatenated.
sampling_plan = [
    (pp_m1_train, N_epfl),
    (processed_training_data["askengineers"], N_askengineers),
    (processed_training_data["askphysics"], N_askphysics),
    (processed_training_data["askscience"], N_askscience),
    (processed_training_data["explainlikeimfive"], N_explainlikeimfive),
    (truthy_dpo_processed, N_truthy_dpo),
    (orca_dpo_processed, N_orca_dpo),
    (code_vulnerability_processed, N_code_vulnerability),
    (py_dpo_processed, N_py_dpo),
    (harmony_processed, N_harmony),
    (untoxic_processed, N_untoxic),
    (exp_dpo_processed, N_exp_dpo_3),
]

# A private generator seeded with 42 yields the same permutations as seeding the
# global `random` module with 42, but leaves the global RNG state untouched.
rng = random.Random(42)

# Shuffle *copies* of the sources. Shuffling the originals in place made this cell
# non-idempotent: every re-run started from already-shuffled lists and therefore
# selected a different subset. All sources are shuffled before anything is
# concatenated, so the RNG is consumed in a fixed order (sources first, then the
# combined list).
selected_pools = []
for source_pairs, n_keep in sampling_plan:
    pool = list(source_pairs)
    rng.shuffle(pool)
    selected_pools.append(pool[:n_keep])

# Concatenate the per-source selections, then mix them so sources are interleaved.
combined_train = [pair for pool in selected_pools for pair in pool]
rng.shuffle(combined_train)

print("Number of samples in combined_train:", len(combined_train))

Number of samples in combined_train: 12500


In [45]:
# Format check: print a sample once for every expected key it is missing
# (nothing is printed when every sample has all four keys).
required_keys = ("instruction", "input", "chosen", "rejected")
for sample in combined_train:
    for key in required_keys:
        if key not in sample:
            print(sample)
        

import pyarrow as pa
# `pa.json` is a submodule: `import pyarrow` alone does not load it, so import it
# explicitly instead of relying on another library having done so already.
import pyarrow.json


def read_jsonl_with_pyarrow(file_path):
    """Check that pyarrow can parse a line-delimited JSON file.

    Args:
        file_path: path of the JSON Lines file to parse.

    Returns:
        The string "File is in correct format" when pyarrow reads the file
        without error; otherwise the caught ``pa.ArrowInvalid`` exception
        instance (returned, not raised) so the cell can display the reason.
        Any other exception propagates to the caller.
    """
    try:
        # Only the success or failure of parsing matters; the parsed table is discarded.
        pa.json.read_json(file_path)
    except pa.ArrowInvalid as e:
        return e
    return "File is in correct format"

In [46]:
# Persist the shuffled, combined training pairs as one JSON object per line.
convert_to_jsonl(combined_train, "DPO_dataset_tiny.jsonl")

In [47]:
# Check that pyarrow can parse the written file; the returned message (or the caught
# exception) is displayed as the cell output.
read_jsonl_with_pyarrow("DPO_dataset_tiny.jsonl")

'File is in correct format'

# 4. Change the format of datasets

In [57]:
# Read the combined dataset back from DPO_dataset_tiny.jsonl
with jsonlines.open("DPO_dataset_tiny.jsonl") as reader:
    data = list(reader)

# Rename 'instruction' to 'prompt'; a non-empty 'input' is appended to the
# instruction, separated by a single space.
format_data = [
    {
        "prompt": (
            record["instruction"]
            if record["input"] == ""
            else record["instruction"] + " " + record["input"]
        ),
        "chosen": record["chosen"],
        "rejected": record["rejected"],
    }
    for record in data
]

# Format check: print a record once for every expected key it is missing
for record in format_data:
    for key in ("prompt", "chosen", "rejected"):
        if key not in record:
            print(record)

In [49]:
# # change from list of dicts to dict of lists
# def change_format(data):
#     formatted_data = {
#         "instruction": [],
#         "chosen": [],
#         "rejected": []
#     }
#     for d in data:
#         formatted_data["instruction"].append(d["instruction"])
#         formatted_data["chosen"].append(d["chosen"])
#         formatted_data["rejected"].append(d["rejected"])
#     return formatted_data

In [50]:
# # Change the format of the datasets
# formatted_combined_train = change_format(combined_train)
# formatted_combined_val = change_format(combined_val)
# formatted_combined_test = change_format(combined_test)
# 
# # Save the formatted datasets
# formatted_combined_train_file = "datasets/combined_train_v2.json"
# formatted_combined_val_file = "datasets/combined_val_v2.json"
# formatted_combined_test_file = "datasets/combined_test_v2.json"
# 
# 
# def save_json(data, file_path):
#     with open(file_path, "w") as f:
#         json.dump(data, f)
# 
# save_json(formatted_combined_train, formatted_combined_train_file)
# save_json(formatted_combined_val, formatted_combined_val_file)
# save_json(formatted_combined_test, formatted_combined_test_file)