In [None]:
# Source corpora to sample from. Each entry carries:
#   label - short name used to build the cached train/eval file names
#   path  - list of JSONL files that together make up the corpus
#   xpath - presumably the record field holding the text; it is not read by
#           the code visible in this notebook (TODO confirm intended use)
DATASETS = [
    dict(
        label="leetcode",
        path=["./oig/unified_merged_code_xp3.jsonl"],
        xpath="text",
    ),
    dict(
        label="exam",
        path=[
            "./oig/unified_grade_school_math_instructions.jsonl",
            "./oig/unified_mathqa_flanv2_kojma_cot.jsonl",
        ],
        xpath="text",
    ),
]

In [None]:
import os
import json
from tqdm import tqdm
from datasets import load_dataset


def save_to_jsonl(data, filename):
    """Append every item of ``data`` to ``filename`` in JSON Lines format.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Path of the output file. It is opened in append mode, so
            existing content is kept and new lines are added after it.
    """
    with open(filename, 'a') as out:
        # One serialized object per line, streamed lazily from the iterable.
        out.writelines(json.dumps(item) + '\n' for item in data)


def process_jsonl_files(file_paths, xpath, max_records):
    """Read up to ``max_records`` JSON records from a sequence of JSONL files.

    Files are read in the order given; reading stops as soon as the limit is
    reached, so later files may never be opened.

    Args:
        file_paths: Iterable of paths to JSON Lines files.
        xpath: Currently unused; kept so existing callers keep working.
        max_records: Maximum number of records to return. A value of zero or
            less yields an empty list.

    Returns:
        A list with the decoded records, at most ``max_records`` long.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If a file cannot be opened.
    """
    accumulated_records = []
    # Without this guard the first record would be appended before the limit
    # is ever checked, returning one record for max_records <= 0.
    if max_records <= 0:
        return accumulated_records

    for file_path in file_paths:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc=f"Processing {file_path}"):
                # Tolerate blank separator/trailing lines instead of crashing.
                if not line.strip():
                    continue
                accumulated_records.append(json.loads(line))
                if len(accumulated_records) >= max_records:
                    return accumulated_records
    return accumulated_records


# Directory that holds the cached train/eval splits.
cache_path = './cache'
# Upper bound on the number of shuffled rows sampled from each dataset.
max_records = 12000
# Of the sampled rows, the first train_records form the train split and the
# remaining ones (up to max_records) form the eval split.
train_records = 10000

# Opening a file for append does not create missing parent directories.
os.makedirs(cache_path, exist_ok=True)

for dataset in DATASETS:
    print(dataset)
    train_file = f"{cache_path}/{dataset['label']}_train.jsonl"
    eval_file = f"{cache_path}/{dataset['label']}_eval.jsonl"

    # Both splits are already cached: nothing to do for this dataset.
    if os.path.exists(train_file) and os.path.exists(eval_file):
        continue

    # Load all source files as one dataset and shuffle deterministically.
    raw_dataset = load_dataset("json", split="train", data_files=dataset['path']).shuffle(seed=42)
    # Clamp the sample size so datasets smaller than max_records do not make
    # select() fail on out-of-range indices.
    sample_size = min(max_records, len(raw_dataset))
    if sample_size < max_records:
        print(f"Only {sample_size} records available for {dataset['label']} (wanted {max_records})")
    limited_dataset = raw_dataset.select(range(sample_size)).to_pandas()

    # Split the sampled records into train and eval
    train_dataset = limited_dataset.iloc[:train_records].to_dict(orient='records')
    eval_dataset = limited_dataset.iloc[train_records:max_records].to_dict(orient='records')

    # save_to_jsonl appends, so a lone split left behind by an interrupted
    # run must be removed first or its records would be duplicated.
    for stale_file in (train_file, eval_file):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    # Save the datasets to JSONL files
    save_to_jsonl(train_dataset, train_file)
    save_to_jsonl(eval_dataset, eval_file)

    print(f"Saved {len(train_dataset)} records to {train_file}")
    print(f"Saved {len(eval_dataset)} records to {eval_file}")