In [8]:
import pandas as pd
import numpy as np
import os
import datasets
np.random.seed(270)

In [9]:
def read_dfs(data_name, suffixes=["train", "val", "test"]):
    print(f"loading data: {data_name} ...")
    data = pd.DataFrame()
    for i in range(len(suffixes)):
        suffix = suffixes[i]
        data_sfx = pd.read_json(f"{data_name}_{suffix}.jsonl", lines=True)
        data = pd.concat([data, data_sfx], axis=0)
    return data

In [10]:
epfl_dpo_data = read_dfs("EPFL/raw/EPFL_DPO", suffixes=["train_clipped", "test", "val"])
epfl_sft_data = read_dfs("EPFL/raw/EPFL_SFT", suffixes=["train_clipped", "test", "val"])
helpsteer_data = read_dfs("helpSteer/raw/helpsteer", suffixes=["train_clipped", "test", "val"])
mathqa_data = read_dfs("mathQA/raw/mathQA", suffixes=["train_clipped", "val", "test"])
mmlu_data = read_dfs("MMLU/raw/MMLU", suffixes=["train_clipped", "test_clipped", "val_clipped"])
openqa_data = read_dfs("openQA/raw/openQA", suffixes=["train_clipped", "val", "test"])
shp_data = read_dfs("shp/raw/shp", suffixes=["train_clipped", "test_clipped", "val_clipped"])

loading data: EPFL/raw/EPFL_DPO ...
loading data: EPFL/raw/EPFL_SFT ...
loading data: helpSteer/raw/helpsteer ...
loading data: mathQA/raw/mathQA ...
loading data: MMLU/raw/MMLU ...
loading data: openQA/raw/openQA ...
loading data: shp/raw/shp ...


In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2",
                                              trust_remote_code=True,
                                              padding_side='left',
                                              add_eos_token=True,
                                              add_bos_token=True,
                                              use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def formatting_func(example, field):
    text = f"{example[field]}"
    return text

def generate_and_tokenize_prompt(prompt, field):
    return tokenizer(formatting_func(prompt, field))

def load_dataset(data_path):
    dataset = pd.read_json(data_path, lines=True)
    dataset = dataset.Dataset.from_pandas(dataset)

    return dataset

def tokenize_dataset(dataset, field):
    tokenized_dataset = dataset.map(lambda x: generate_and_tokenize_prompt(x, field))

    return tokenized_dataset

def compute_lengths(dataframe, field):
    # tokenization and lengths computation
    dataset = datasets.Dataset.from_pandas(dataframe)
    tokenized_dataset = tokenize_dataset(dataset, field)
    lengths = [len(x[field]) for x in tokenized_dataset]

    return lengths

In [13]:
def process_dataframe(dataframe, data_path, fname, field="prompt", max_length=512):
    # if data_path directory does not exist, create it
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    
    print(f"processing dataframe {fname}...")

    # shuffling the dataframe
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)
    print(f"Length before clipping: {len(dataframe)}")

    # lengths computation 
    lengths = compute_lengths(dataframe, field)

    # filtering out the examples that exceed the max_length
    dataframe["lengths"] = lengths
    dataframe = dataframe[dataframe["lengths"] <= max_length]
    dataframe = dataframe.drop(columns=["lengths"])
    print(f"Length after clipping: {len(dataframe)}\n")

    #splitting 60-train / 20-test / 20-val
    train_size = int(0.6 * len(dataframe))
    val_size = int(0.2 * len(dataframe))

    train_df = dataframe[:train_size]
    val_df = dataframe[train_size:train_size+val_size]
    test_df = dataframe[train_size+val_size:]

    # saving the dataframes
    train_df.to_json(f"{data_path}/{fname}_train.jsonl", orient="records", lines=True)
    val_df.to_json(f"{data_path}/{fname}_val.jsonl", orient="records", lines=True)
    test_df.to_json(f"{data_path}/{fname}_test.jsonl", orient="records", lines=True)

    return 0

In [14]:
# shuffling the data
np.random.seed(270)

process_dataframe(epfl_dpo_data, "EPFL/processed", "EPFL_DPO", field="prompt")
process_dataframe(epfl_sft_data, "EPFL/processed", "EPFL_SFT", field="prompt")
process_dataframe(helpsteer_data, "helpSteer/processed", "helpsteer", field="prompt")
process_dataframe(mathqa_data, "mathQA/processed", "mathQA", field="question")
process_dataframe(mmlu_data, "MMLU/processed", "MMLU", field="question")
process_dataframe(openqa_data, "openQA/processed", "openQA", field="question")
process_dataframe(shp_data, "shp/processed", "shp", field="prompt")

processing dataframe EPFL_DPO...
Length before clipping: 12614


Map:   0%|          | 0/12614 [00:00<?, ? examples/s]

Length after clipping: 9188

processing dataframe EPFL_SFT...
Length before clipping: 8378


Map:   0%|          | 0/8378 [00:00<?, ? examples/s]

Length after clipping: 6149

processing dataframe helpsteer...
Length before clipping: 15860


Map:   0%|          | 0/15860 [00:00<?, ? examples/s]

Length after clipping: 10680

processing dataframe mathQA...
Length before clipping: 33426


Map:   0%|          | 0/33426 [00:00<?, ? examples/s]

Length after clipping: 33121

processing dataframe MMLU...
Length before clipping: 105488


Map:   0%|          | 0/105488 [00:00<?, ? examples/s]

Length after clipping: 22357

processing dataframe openQA...
Length before clipping: 5457


Map:   0%|          | 0/5457 [00:00<?, ? examples/s]

Length after clipping: 5456

processing dataframe shp...
Length before clipping: 122096


Map:   0%|          | 0/122096 [00:00<?, ? examples/s]

Length after clipping: 64676



0

In [15]:
shp_test = pd.read_json("shp/processed/shp_test.jsonl", lines=True)
shp_train = pd.read_json("shp/processed/shp_train.jsonl", lines=True)
shp_val = pd.read_json("shp/processed/shp_val.jsonl", lines=True)

In [16]:
shp_sft_train = shp_train.iloc[:len(shp_train)//2]
shp_sft_val = shp_val.iloc[:len(shp_val)//2]
shp_sft_test = shp_test.iloc[:len(shp_test)//2]

shp_dpo_train = shp_train.iloc[len(shp_train)//2:]
shp_dpo_val = shp_val.iloc[len(shp_val)//2:]
shp_dpo_test = shp_test.iloc[len(shp_test)//2:]

shp_sft_train.to_json("shp/processed/shp_SFT_train.jsonl", orient="records", lines=True)
shp_sft_val.to_json("shp/processed/shp_SFT_val.jsonl", orient="records", lines=True)
shp_sft_test.to_json("shp/processed/shp_SFT_test.jsonl", orient="records", lines=True)

shp_dpo_train.to_json("shp/processed/shp_DPO_train.jsonl", orient="records", lines=True)
shp_dpo_val.to_json("shp/processed/shp_DPO_val.jsonl", orient="records", lines=True)
shp_dpo_test.to_json("shp/processed/shp_DPO_test.jsonl", orient="records", lines=True)