In [32]:
# Registry of the source corpora that are merged into one labelled dataset.
# Keys read by the cells below:
#   id     - integer written out as the "labels" value of every unified record.
#   label  - short name used to build the per-dataset cache filenames.
#   path   - dataset location handed to `load_dataset`; either a single string
#            or a list of local .jsonl paths (the strings look like Hugging Face
#            Hub ids - TODO confirm).
#   split  - split name to which a row slice is appended when loading.
#   xpath  - dot-separated key path to the text field inside each record.
DATASETS = [
    {
        "id": 0,
        "label": "finance",
        "path": "gbharti/finance-alpaca",
        "split": "train",
        "xpath": "instruction",
    },
    {
        "id": 1,
        "label": "medicine",
        "path": "lavita/ChatDoctor-HealthCareMagic-100k",
        "split": "train",
        "xpath": "input",
    },
    {
        # Local-file entry: "path" is a list and there is no "split" key, so
        # consumers must not assume "split" is present on every entry.
        "id": 2,
        "label": "leetcode",
        "path": ["./oig/unified_merged_code_xp3.jsonl"],
        "xpath": "text"
    },
    {
        # Local-file entry: "path" is a list and there is no "split" key.
        "id": 3,
        "label": "exam",
        "path": [
            # "./oig/unified_grade_school_math_instructions.jsonl",
            "./oig/unified_mathqa_flanv2_kojma_cot.jsonl"
        ],
        "xpath": "text"
    },
    {
        "id": 4,
        "label": "webgpt",
        "path": "openai/webgpt_comparisons",
        "split": "train",
        # Nested field: "full_text" is looked up inside the "question" value.
        "xpath": "question.full_text",
    },
    {
        "id": 5,
        "label": "gpt4tools",
        "path": "taskydata/GPT4Tools",
        "split": "train",
        "xpath": "instruction"
    },
    {
        "id": 6,
        "label": "cot",
        "path": "DataProvenanceInitiative/cot_submix_original",
        "split": "train",
        "xpath": "inputs",
    },
    {
        "id": 7,
        "label": "stackoverflow",
        "path": "0x70DA/stackoverflow-chat-data",
        "split": "train",
        "xpath": "input",
    },
]

In [33]:
from datasets import load_dataset
import os
import json


def save_to_jsonl(data, filename):
    """Append every item of ``data`` to ``filename`` as JSON Lines.

    Args:
        data: Iterable of JSON-serialisable records.
        filename: Path of the output file. It is opened in append mode, so
            existing content is kept and the file is created if missing.
    """
    with open(filename, 'a') as sink:
        # One serialised record per line, each terminated by a newline.
        sink.writelines(json.dumps(item) + '\n' for item in data)


cache_path = './cache'
batch_size = 1000
train_rows = 10000  # rows [0, train_rows) of each source become its train cache
eval_rows = 2000    # the following eval_rows rows become its eval cache


def _load_rows(dataset, start, stop):
    """Load rows ``[start, stop)`` of one ``DATASETS`` entry.

    Entries whose ``path`` is a list are read as local JSON Lines files through
    the generic ``json`` builder; any other ``path`` is passed to
    ``load_dataset`` unchanged. Entries without a ``split`` key fall back to
    ``"train"``, which is the split the ``json`` builder exposes by default.
    """
    split = f"{dataset.get('split', 'train')}[{start}:{stop}]"
    source = dataset['path']
    if isinstance(source, (list, tuple)):
        return load_dataset('json', data_files=list(source), split=split)
    return load_dataset(source, split=split)


def _write_cache(records, filename, chunk=batch_size):
    """Stream ``records`` into ``filename`` in batches of ``chunk`` rows.

    Rows go to a ``.partial`` sibling first and are moved into place only once
    everything has been written, so an interrupted run never leaves a truncated
    file that a later run would mistake for a finished cache.
    """
    partial = filename + '.partial'
    if os.path.exists(partial):
        # Left over from an interrupted run; appending to it would duplicate rows.
        os.remove(partial)
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) >= chunk:
            save_to_jsonl(batch, partial)
            batch = []
    # Flush unconditionally: this also creates an empty file for an empty slice,
    # so later cells can always open the cache.
    save_to_jsonl(batch, partial)
    os.replace(partial, filename)


os.makedirs(cache_path, exist_ok=True)

for dataset in DATASETS:
    print(dataset)
    train_file = f"{cache_path}/{dataset['label']}_train.jsonl"
    eval_file = f"{cache_path}/{dataset['label']}_eval.jsonl"

    # Each cache file is checked on its own so a missing eval file is rebuilt
    # even when the train file already exists.
    if not os.path.exists(train_file):
        _write_cache(_load_rows(dataset, 0, train_rows), train_file)
    if not os.path.exists(eval_file):
        _write_cache(_load_rows(dataset, train_rows, train_rows + eval_rows), eval_file)

{'id': 0, 'label': 'finance', 'path': 'gbharti/finance-alpaca', 'split': 'train', 'xpath': 'instruction'}
{'id': 1, 'label': 'medicine', 'path': 'lavita/ChatDoctor-HealthCareMagic-100k', 'split': 'train', 'xpath': 'input'}
{'id': 2, 'label': 'leetcode', 'path': ['./oig/unified_merged_code_xp3.jsonl'], 'xpath': 'text'}
{'id': 3, 'label': 'exam', 'path': ['./oig/unified_mathqa_flanv2_kojma_cot.jsonl'], 'xpath': 'text'}
{'id': 4, 'label': 'webgpt', 'path': 'openai/webgpt_comparisons', 'split': 'train', 'xpath': 'question.full_text'}
{'id': 5, 'label': 'gpt4tools', 'path': 'taskydata/GPT4Tools', 'split': 'train', 'xpath': 'instruction'}
{'id': 6, 'label': 'cot', 'path': 'DataProvenanceInitiative/cot_submix_original', 'split': 'train', 'xpath': 'inputs'}
{'id': 7, 'label': 'stackoverflow', 'path': '0x70DA/stackoverflow-chat-data', 'split': 'train', 'xpath': 'input'}


In [34]:
import json


def extract_from_xpath(record, xpath):
    """Follow a dot-separated key path through nested mappings.

    Args:
        record: Mapping to read from (for example one parsed JSON line).
        xpath: Key path such as ``"question.full_text"``; each dot-separated
            segment is looked up in turn with ``.get``.

    Returns:
        The value at the end of the path. When the path cannot be followed -
        a key is missing, or an intermediate value has no ``.get`` (``None``,
        a string, a list, ...) - the empty dict ``{}`` is returned, so callers
        can treat any falsy result as "nothing to extract".
    """
    value = record
    for key in xpath.split('.'):
        if not hasattr(value, 'get'):
            # A null or scalar in the middle of the path means "not found",
            # the same as a missing key, rather than an AttributeError.
            return {}
        value = value.get(key, {})
    return value


def process_file(file_path, dataset, output_list):
    """Append one ``{"labels", "text"}`` record per usable line of a cache file.

    Args:
        file_path: JSON Lines file to read.
        dataset: ``DATASETS`` entry the file belongs to; its ``xpath`` selects
            the text field and its ``id`` becomes the integer label.
        output_list: List that receives the extracted records (mutated in place).

    Blank lines and records whose extracted value is falsy are skipped.
    """
    with open(file_path, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(line)
            extracted_value = extract_from_xpath(record, dataset['xpath'])
            if extracted_value:
                output_list.append({
                    "labels": dataset['id'],
                    "text": extracted_value
                })


unified_train_dataset = []
unified_eval_dataset = []

for dataset in DATASETS:
    print(dataset)
    train_file = f"./cache/{dataset['label']}_train.jsonl"
    eval_file = f"./cache/{dataset['label']}_eval.jsonl"

    process_file(train_file, dataset, unified_train_dataset)
    process_file(eval_file, dataset, unified_eval_dataset)

# Save the unified datasets
unified_train_dataset_file = "./cache/unified_train.jsonl"
unified_eval_dataset_file = "./cache/unified_eval.jsonl"
for unified_path, unified_records in (
    (unified_train_dataset_file, unified_train_dataset),
    (unified_eval_dataset_file, unified_eval_dataset),
):
    # save_to_jsonl appends, so truncate first; otherwise every re-run of this
    # cell would write the whole dataset onto the end of the file again.
    with open(unified_path, 'w'):
        pass
    save_to_jsonl(unified_records, unified_path)

# Print out the sizes
print(f"Unified train dataset size: {len(unified_train_dataset)}")
print(f"Unified eval dataset size: {len(unified_eval_dataset)}")

{'id': 0, 'label': 'finance', 'path': 'gbharti/finance-alpaca', 'split': 'train', 'xpath': 'instruction'}
{'id': 1, 'label': 'medicine', 'path': 'lavita/ChatDoctor-HealthCareMagic-100k', 'split': 'train', 'xpath': 'input'}
{'id': 2, 'label': 'leetcode', 'path': ['./oig/unified_merged_code_xp3.jsonl'], 'xpath': 'text'}
{'id': 3, 'label': 'exam', 'path': ['./oig/unified_mathqa_flanv2_kojma_cot.jsonl'], 'xpath': 'text'}
{'id': 4, 'label': 'webgpt', 'path': 'openai/webgpt_comparisons', 'split': 'train', 'xpath': 'question.full_text'}
{'id': 5, 'label': 'gpt4tools', 'path': 'taskydata/GPT4Tools', 'split': 'train', 'xpath': 'instruction'}
{'id': 6, 'label': 'cot', 'path': 'DataProvenanceInitiative/cot_submix_original', 'split': 'train', 'xpath': 'inputs'}
{'id': 7, 'label': 'stackoverflow', 'path': '0x70DA/stackoverflow-chat-data', 'split': 'train', 'xpath': 'input'}
Unified train dataset size: 79998
Unified eval dataset size: 16000
