In [4]:
import pandas as pd
import numpy as np
import os
import datasets
np.random.seed(270)

In [5]:
def read_dfs(data_name, suffixes=["train", "val", "test"]):
    print(f"loading data: {data_name} ...")
    data = pd.DataFrame()
    if suffixes is None:
        data = pd.read_json(f"{data_name}.jsonl", lines=True)
        return data
    
    for i in range(len(suffixes)):
        suffix = suffixes[i]
        data_sfx = pd.read_json(f"{data_name}_{suffix}.jsonl", lines=True)
        data = pd.concat([data, data_sfx], axis=0)
    return data

In [6]:
epfl_dpo_data = read_dfs("EPFL/raw/EPFL_DPO", suffixes=["train_clipped", "test", "val"])
epfl_sft_data = read_dfs("EPFL/raw/EPFL_SFT", suffixes=["train_clipped", "test", "val"])
helpsteer_data = read_dfs("helpSteer/raw/helpsteer", suffixes=["train_clipped", "test", "val"])
mathqa_data = read_dfs("mathQA/raw/mathQA", suffixes=["train_clipped", "val", "test"])
mmlu_data = read_dfs("MMLU/raw/MMLU", suffixes=["train_clipped", "test_clipped", "val_clipped"])
openqa_data = read_dfs("openQA/raw/openQA", suffixes=["train_clipped", "val", "test"])
shp_data = read_dfs("shp/raw/shp", suffixes=["train_clipped", "test_clipped", "val_clipped"])

scienceQA_data = read_dfs("scienceQA/raw/scienceQA_all", suffixes=None)
tal_data = read_dfs("tal_scq5k/raw/tal_scq5k_train", suffixes=None)

loading data: EPFL/raw/EPFL_DPO ...
loading data: EPFL/raw/EPFL_SFT ...
loading data: helpSteer/raw/helpsteer ...
loading data: mathQA/raw/mathQA ...
loading data: MMLU/raw/MMLU ...
loading data: openQA/raw/openQA ...
loading data: shp/raw/shp ...
loading data: scienceQA/raw/scienceQA_all ...
loading data: tal_scq5k/raw/tal_scq5k_train ...


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2",
                                              trust_remote_code=True,
                                              padding_side='left',
                                              add_eos_token=True,
                                              add_bos_token=True,
                                              use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def formatting_func(example, field):
    text = f"{example[field]}"
    return text

def generate_and_tokenize_prompt(prompt, field):
    return tokenizer(formatting_func(prompt, field))

def load_dataset(data_path):
    dataset = pd.read_json(data_path, lines=True)
    dataset = dataset.Dataset.from_pandas(dataset)

    return dataset

def tokenize_dataset(dataset, field):
    tokenized_dataset = dataset.map(lambda x: generate_and_tokenize_prompt(x, field))

    return tokenized_dataset

def compute_lengths(dataframe, field):
    # tokenization and lengths computation
    dataset = datasets.Dataset.from_pandas(dataframe)
    tokenized_dataset = tokenize_dataset(dataset, field)
    lengths = [len(x[field]) for x in tokenized_dataset]

    return lengths

In [9]:
def process_dataframe(dataframe, data_path, fname, field="prompt", max_length=512):
    # if data_path directory does not exist, create it
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    
    print(f"processing dataframe {fname}...")

    # shuffling the dataframe
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)
    print(f"Length before clipping: {len(dataframe)}")

    # lengths computation 
    lengths = compute_lengths(dataframe, field)

    # filtering out the examples that exceed the max_length
    dataframe["lengths"] = lengths
    dataframe = dataframe[dataframe["lengths"] <= max_length]
    dataframe = dataframe.drop(columns=["lengths"])
    print(f"Length after clipping: {len(dataframe)}\n")

    #splitting 50-train / 25-test / 15-test quantization / 15-val
    train_size = int(0.5 * len(dataframe))
    test_size = int(0.25 * len(dataframe))
    test_quant_size = int(0.15 * len(dataframe))

    train_df = dataframe[:train_size]
    test_df = dataframe[train_size:train_size+test_size]
    test_quant_df = dataframe[train_size+test_size:train_size+test_size+test_quant_size]
    val_df = dataframe[train_size+test_size+test_quant_size:]
    
    # saving the dataframes
    train_df.to_json(f"{data_path}/{fname}_train.jsonl", orient="records", lines=True)
    test_df.to_json(f"{data_path}/{fname}_test.jsonl", orient="records", lines=True)
    test_quant_df.to_json(f"{data_path}/{fname}_test_quantization.jsonl", orient="records", lines=True)
    val_df.to_json(f"{data_path}/{fname}_val.jsonl", orient="records", lines=True)

    return 0

In [11]:
# shuffling the data
np.random.seed(270)

process_dataframe(epfl_dpo_data, "EPFL/processed", "EPFL_DPO", field="prompt")
process_dataframe(epfl_sft_data, "EPFL/processed", "EPFL_SFT", field="prompt")
process_dataframe(helpsteer_data, "helpSteer/processed", "helpsteer", field="prompt")
process_dataframe(mathqa_data, "mathQA/processed", "mathQA", field="question")
process_dataframe(mmlu_data, "MMLU/processed", "MMLU", field="question")
process_dataframe(openqa_data, "openQA/processed", "openQA", field="question")
process_dataframe(shp_data, "shp/processed", "shp", field="prompt")

process_dataframe(scienceQA_data, "scienceQA/processed", "scienceQA_all", field="question")
process_dataframe(tal_data, "tal_scq5k/processed", "tal_scq5k_train", field="question")

processing dataframe scienceQA_all...
Length before clipping: 41226


Map:   0%|          | 0/41226 [00:00<?, ? examples/s]

Length after clipping: 41023

processing dataframe tal_scq5k_train...
Length before clipping: 3000


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Length after clipping: 2912



0