In [5]:
import os
import json

# Input directory and the JSON files to merge.
json_dir = '/kaggle/input/cmb-data'
files_to_merge = ['CMB-train-merge.json','CMB-val-merge.json']

merged_data = []
all_keys = set()
# Keys in first-seen order. Iterating the set directly would yield the keys in
# hash order, which for strings can change from one interpreter run to the
# next, so the field order of the written file would not be reproducible.
ordered_keys = []

# Read each file and append its records to the merged list.
for filename in files_to_merge:
    file_path = os.path.join(json_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data_list = json.load(f)  # each file is loaded as a single JSON value and iterated below
        except json.JSONDecodeError as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error decoding {filename}: {e}")
            continue

    for item in data_list:
        merged_data.append(item)
        for key in item.keys():
            if key not in all_keys:
                all_keys.add(key)
                ordered_keys.append(key)

# Give every record the same set of fields; fields a record lacks become None.
standardized_data = [
    {key: entry.get(key, None) for key in ordered_keys}
    for entry in merged_data
]

# Write the standardized records to disk.
output_path = '/kaggle/working/merged_CMB.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(standardized_data, f, ensure_ascii=False, indent=2)

print(f"Merged {len(standardized_data)} entries into {output_path}")

Merged 269639 entries into /kaggle/working/merged_CMB.json


In [6]:
# Preview the first five merged records as pretty-printed JSON.
for idx, record in enumerate(merged_data[:5], start=1):
    pretty = json.dumps(record, indent=2, ensure_ascii=False)
    print(f"\n=== Entry {idx} ===")
    print(pretty)


=== Entry 1 ===
{
  "exam_type": "医师考试",
  "exam_class": "规培结业",
  "exam_subject": "临床病理科",
  "question": "HIV患者最常感染的是下列哪种肺炎",
  "answer": "D",
  "question_type": "单项选择题",
  "option": {
    "A": "大叶性肺炎",
    "B": "小叶性肺炎",
    "C": "非典型肺炎",
    "D": "卡氏囊虫性肺炎",
    "E": "病毒性肺炎"
  }
}

=== Entry 2 ===
{
  "exam_type": "医师考试",
  "exam_class": "规培结业",
  "exam_subject": "口腔科",
  "question": "下列部位的口腔黏膜上皮有角化，除了",
  "answer": "D",
  "question_type": "单项选择题",
  "option": {
    "A": "唇红",
    "B": "硬腭",
    "C": "牙龈",
    "D": "舌腹",
    "E": "舌背"
  }
}

=== Entry 3 ===
{
  "exam_type": "医师考试",
  "exam_class": "规培结业",
  "exam_subject": "皮肤科",
  "question": "细胞因子所不具备的作用特点是",
  "answer": "B",
  "question_type": "单项选择题",
  "option": {
    "A": "拮抗性",
    "B": "特异性",
    "C": "多效性",
    "D": "重叠性",
    "E": "网络性"
  }
}

=== Entry 4 ===
{
  "exam_type": "医师考试",
  "exam_class": "规培结业",
  "exam_subject": "骨科",
  "question": "按照三阶梯用药原则，适用于中度癌痛的是",
  "answer": "E",
  "question_type": "单项选择题",
  "option": 

In [9]:
# Source field -> output field for the scalar fields that are kept.
field_mapping = {
    'question': 'question',
    'answer': 'answer'
}

renamed_data = []

# Flatten each record: keep the mapped scalar fields and lift the nested
# answer options A-E to top-level keys.
for item in merged_data:
    new_item = {}

    for old_key, new_key in field_mapping.items():
        new_item[new_key] = item.get(old_key, None)

    # A default passed to .get() only covers a missing key. If a record carries
    # a null or otherwise non-dict 'option', treat it as having no options
    # instead of failing on the .get() calls below.
    options = item.get('option')
    if not isinstance(options, dict):
        options = {}
    for option_key in ['A', 'B', 'C', 'D', 'E']:
        new_item[option_key] = options.get(option_key, None)

    renamed_data.append(new_item)

# Preview the first five flattened records.
# NOTE(review): 'exp' is never populated by the loop above, so it always
# prints as None here - confirm whether an explanation field should be mapped.
for i, item in enumerate(renamed_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'E', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Entry 1 ===
question: HIV患者最常感染的是下列哪种肺炎
A: 大叶性肺炎
B: 小叶性肺炎
C: 非典型肺炎
D: 卡氏囊虫性肺炎
E: 病毒性肺炎
answer: D
exp: None

=== Entry 2 ===
question: 下列部位的口腔黏膜上皮有角化，除了
A: 唇红
B: 硬腭
C: 牙龈
D: 舌腹
E: 舌背
answer: D
exp: None

=== Entry 3 ===
question: 细胞因子所不具备的作用特点是
A: 拮抗性
B: 特异性
C: 多效性
D: 重叠性
E: 网络性
answer: B
exp: None

=== Entry 4 ===
question: 按照三阶梯用药原则，适用于中度癌痛的是
A: 可待因、丁丙诺啡、布洛芬
B: 氢吗啡酮、右丙氧芬、安度芬
C: 二氢可待因、消炎痛、美沙酮
D: 可待因、氨酚待因、强痛定
E: 芬太尼透皮贴剂、曲马多、奇曼丁
answer: E
exp: None

=== Entry 5 ===
question: 关于协调性宫缩乏力正确的是（ ）。
A: 宫缩极性，对称性正常，仅收缩力弱
B: 多数产妇觉持续腹痛，且产程延长
C: 容易发生胎儿窘迫
D: 不易静脉滴注催产素
E: 不易发生胎盘残留
answer: A
exp: None


In [12]:
# Keywords used to select questions (extend as needed).
keywords_zh = [
    '冠心病', '心绞痛', '稳定型心绞痛', '不稳定型心绞痛',
    '心肌梗死', '心梗', '非ST段抬高心肌梗死', 'ST段抬高心肌梗死',
    '急性冠状动脉综合征', '缺血性心脏病'
]

def is_cardiovascular_related(text):
    """Return True if `text` contains any keyword from `keywords_zh`.

    Matching is a plain, case-preserving substring test (no lowercasing).

    Args:
        text: The question text. None or any non-string value is treated
            as "no match".

    Returns:
        bool: True when at least one keyword occurs in `text`, else False.
    """
    # Substring matching is only meaningful for strings; a number or other
    # non-string value would otherwise raise TypeError on the `in` test.
    if not isinstance(text, str):
        return False
    return any(kw in text for kw in keywords_zh)

# Keep only the records whose question text matches a cardiovascular keyword.
# (The filter is run once; an identical second pass over the data was removed.)
cardio_questions = [
    item for item in renamed_data
    if is_cardiovascular_related(item.get('question'))
]

print(f"Found {len(cardio_questions)} cardiovascular-related entries.")

Found 1580 cardiovascular-related entries.


In [13]:
# Show the first five matched records, one "field: value" line per field.
for idx, record in enumerate(cardio_questions[:5], start=1):
    print(f"\n=== Cardiovascular Entry {idx} ===")
    print("\n".join(
        f"{field}: {record.get(field)}"
        for field in ('question', 'A', 'B', 'C', 'D', 'E', 'answer', 'exp')
    ))


=== Cardiovascular Entry 1 ===
question: 急性心肌梗死5小时，以下治疗方案中最适宜的是
A: 哌替啶
B: 静滴硝酸甘油
C: 射频消融治疗
D: 溶栓治疗
E: 糖皮质激素+扩血管药物静滴
answer: D
exp: None

=== Cardiovascular Entry 2 ===
question: 以上哪种胸痛的性质对于胸痛的鉴别诊断有一定的提示意义:心绞痛或心肌梗死
A: 刀割样痛或灼痛，剧烈难忍
B: 撕裂样剧痛
C: 剧烈刺痛或绞痛，常伴呼吸困难与发绀
D: 绞窄性痛伴重压窒息感或伴恐惧、频死感
E: 烧灼样胸骨后压痛
answer: D
exp: None

=== Cardiovascular Entry 3 ===
question: 冠心病心绞痛与心肌梗死的疼痛，其主要鉴别点是
A: 疼痛的持续时间与强度不同
B: 疼痛的部位不同
C: 疼痛的放射部位不同
D: 疼痛的性质不同
E: 疼痛时伴发恶心
answer: A
exp: None

=== Cardiovascular Entry 4 ===
question: 下列情况合并心绞痛时不宜应用硝酸甘油的是（ ）。
A: 冠心病
B: 主动脉瓣关闭不全
C: 心梗后心绞痛
D: 严重贫血
E: 肥厚型梗阻性心肌病
answer: E
exp: None

=== Cardiovascular Entry 5 ===
question: 阿司匹林对冠心病心绞痛的治疗作用是
A: 降低血脂
B: 消炎、止痛
C: 降低心肌氧耗量
D: 抑制血小板聚集
E: 抑制免疫反应
answer: D
exp: None


In [16]:
# Write the filtered cardiovascular questions to the working directory as
# indented UTF-8 JSON (non-ASCII characters are written as-is, not escaped).
output_path = '/kaggle/working/CMB_cardio.json'

with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(
        cardio_questions,
        out_file,
        ensure_ascii=False,
        indent=2,
    )

exported_count = len(cardio_questions)
print(f"Exported {exported_count} cardiovascular questions to {output_path}")

Exported 1580 cardiovascular questions to /kaggle/working/CMB_cardio.json
