In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Merge the seven benchmark JSON files listed in `files_info`, standardise IDs, and append a 'type' field.
"""

import json
import uuid
from pathlib import Path

# ---------- 1. Input files and their categories ----------
# Each entry is (file name, value written to the item's "type" field, id prefix).
files_info = [
    # The first five files all share the retrieval category and prefix.
    *(
        (retrieval_file, "retrieval", "ret")
        for retrieval_file in (
            "pubmed_qa.json",
            "ctgov_qa.json",
            "ot_disease.json",
            "ot_safety.json",
            "ot_tractability.json",
        )
    ),
    ("umls_qa.json", "kg_reasoning", "kg"),
    ("oncology_case_qa.json", "diagnostic_pathway", "path"),
]

# ---------- 2. Merge ----------
# Accumulator for the items collected from the input files.
merged_items = list()

def new_id(prefix: str) -> str:
    """Build a random identifier of the form ``<prefix>_<12 hex chars>``.

    Args:
        prefix: Tag placed in front of the underscore (e.g. ``"ret"``).

    Returns:
        The prefix, an underscore, and the first 12 hexadecimal digits of a
        freshly generated UUID4, e.g. ``"ret_4ab97c8e2f3a"``.
    """
    random_hex = uuid.uuid4().hex
    short_suffix = random_hex[:12]
    return "{}_{}".format(prefix, short_suffix)

# Read every file in files_info, give each item a fresh prefixed id and a
# "type" field, collect the items in merged_items, then write them to disk.
#
# NOTE(review): ids come from uuid4 and are therefore regenerated on every
# run; they are not stable across re-runs of this cell.

# new_id() keeps only 12 hex digits of a UUID4, so uniqueness is extremely
# likely but not guaranteed by construction; track issued ids to enforce it.
seen_ids = set()

for fname, qtype, prefix in files_info:
    path = Path(fname)
    if not path.is_file():
        raise FileNotFoundError(f"{fname} not found")

    with path.open(encoding="utf-8") as fp:
        payload = json.load(fp)

    # Same exception types as plain payload["dataset"], but the message names
    # the offending file so a bad input is easy to locate.
    if not isinstance(payload, dict):
        raise TypeError(
            f"{fname}: expected a JSON object with a 'dataset' key, "
            f"got {type(payload).__name__}"
        )
    if "dataset" not in payload:
        raise KeyError(f"{fname} has no top-level 'dataset' key")
    data = payload["dataset"]

    for item in data:
        item_id = new_id(prefix)
        while item_id in seen_ids:  # regenerate on a truncated-UUID collision
            item_id = new_id(prefix)
        seen_ids.add(item_id)

        item["id"] = item_id    # standardised id; overwrites any existing "id"
        item["type"] = qtype    # newly added category field
        merged_items.append(item)

# ---------- 3. Write the result ----------
out_path = Path("merged_benchmark.json")
with out_path.open("w", encoding="utf-8") as fp:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(merged_items, fp, indent=2, ensure_ascii=False)

print(f"✅  Merged {len(merged_items)} entries → {out_path.resolve()}")


✅  Merged 4476 entries → /home/xinding/dingxin/Agent/MAIA/benchmark/merged_benchmark.json


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Merge the two benchmark JSON files listed in `files_info`, standardise IDs, and append a 'type' field.
"""

import json
import uuid
from pathlib import Path

# ---------- 1. Input files and their categories ----------
# Each entry is (file name, value written to the item's "type" field, id prefix).
files_info = [
    ("umls_qa.json", "kg_reasoning", "kg"),
    ("oncology_case_qa.json", "diagnostic_pathway", "path"),
]

# ---------- 2. Merge ----------
# Accumulator for the items collected from the input files.
merged_items = list()

def new_id(prefix: str) -> str:
    """Build a random identifier of the form ``<prefix>_<12 hex chars>``.

    Args:
        prefix: Tag placed in front of the underscore (e.g. ``"kg"``).

    Returns:
        The prefix, an underscore, and the first 12 hexadecimal digits of a
        freshly generated UUID4, e.g. ``"ret_4ab97c8e2f3a"``.
    """
    random_hex = uuid.uuid4().hex
    short_suffix = random_hex[:12]
    return "{}_{}".format(prefix, short_suffix)

# Read every file in files_info, give each item a fresh prefixed id and a
# "type" field, collect the items in merged_items, then write them to disk.
#
# NOTE(review): ids come from uuid4 and are therefore regenerated on every
# run; they are not stable across re-runs of this cell.

# new_id() keeps only 12 hex digits of a UUID4, so uniqueness is extremely
# likely but not guaranteed by construction; track issued ids to enforce it.
seen_ids = set()

for fname, qtype, prefix in files_info:
    path = Path(fname)
    if not path.is_file():
        raise FileNotFoundError(f"{fname} not found")

    with path.open(encoding="utf-8") as fp:
        payload = json.load(fp)

    # Same exception types as plain payload["dataset"], but the message names
    # the offending file so a bad input is easy to locate.
    if not isinstance(payload, dict):
        raise TypeError(
            f"{fname}: expected a JSON object with a 'dataset' key, "
            f"got {type(payload).__name__}"
        )
    if "dataset" not in payload:
        raise KeyError(f"{fname} has no top-level 'dataset' key")
    data = payload["dataset"]

    for item in data:
        item_id = new_id(prefix)
        while item_id in seen_ids:  # regenerate on a truncated-UUID collision
            item_id = new_id(prefix)
        seen_ids.add(item_id)

        item["id"] = item_id    # standardised id; overwrites any existing "id"
        item["type"] = qtype    # newly added category field
        merged_items.append(item)

# ---------- 3. Write the result ----------
out_path = Path("benchmark_reasoning.json")
with out_path.open("w", encoding="utf-8") as fp:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(merged_items, fp, indent=2, ensure_ascii=False)

print(f"✅  Merged {len(merged_items)} entries → {out_path.resolve()}")


✅  Merged 4005 entries → /home/xinding/dingxin/Agent/MAIA/benchmark/benchmark_reasoning.json
