In [3]:
import os
import json

json_dir = '/kaggle/input/medmcqa'
files_to_merge = ['MedMCQA_train.json', 'MedMCQA_dev.json']

merged_data = []
all_keys = set()

# 第一步：读取两个文件，收集所有字段
raw_data = []

for filename in files_to_merge:
    file_path = os.path.join(json_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if line:
                try:
                    data = json.loads(line)
                    raw_data.append(data)
                    all_keys.update(data.keys())
                except json.JSONDecodeError as e:
                    print(f"Error decoding {filename} at line {line_num}: {e}")

# 第二步：统一字段结构
for entry in raw_data:
    complete_entry = {key: entry.get(key, None) for key in all_keys}
    merged_data.append(complete_entry)

# 第三步：保存合并结果（在 working 目录）
output_path = '/kaggle/working/merged_train_dev.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

print(f"Merged {len(merged_data)} entries from train.json and dev.json into {output_path}")

✅ Merged 187005 entries from train.json and dev.json into /kaggle/working/merged_train_dev.json


In [4]:
for i, item in enumerate(merged_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    print(json.dumps(item, indent=2, ensure_ascii=False))


=== Entry 1 ===
{
  "subject_name": "Anatomy",
  "cop": 3,
  "choice_type": "single",
  "opa": "Hyperplasia",
  "question": "Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma",
  "id": "e9ad821a-c438-4965-9f77-760819dfa155",
  "opd": "Dyplasia",
  "opb": "Hyperophy",
  "opc": "Atrophy",
  "topic_name": "Urinary tract",
  "exp": "Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950"
}

=== Entry 2 ===
{
  "subject_name": "Biochemistry",
  "cop": 3,
  "choice_type": "single",
  "opa": "Vitamin C",
  "question": "Which vitamin is supplied from only animal source:",
  "id": "e3d3c4e1-4fb2-45e7-9f88

In [11]:
field_mapping = {
    'question': 'question',
    'opa': 'A',
    'opb': 'B',
    'opc': 'C',
    'opd': 'D',
    'cop': 'answer',
    'exp': 'exp'
}

renamed_data = []

for item in merged_data:
    new_item = {new_key: item.get(old_key, None) for old_key, new_key in field_mapping.items()}
    renamed_data.append(new_item)

answer_map = {
    1: "A",
    2: "B",
    3: "C",
    4: "D",
}

for item in renamed_data:
    raw_answer = item.get('answer', None)
    item['answer'] = answer_map.get(raw_answer, raw_answer)

for i, item in enumerate(renamed_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Entry 1 ===
question: Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma
A: Hyperplasia
B: Hyperophy
C: Atrophy
D: Dyplasia
answer: C
exp: Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950

=== Entry 2 ===
question: Which vitamin is supplied from only animal source:
A: Vitamin C
B: Vitamin B7
C: Vitamin B12
D: Vitamin D
answer: C
exp: Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. P 640* Vitamin B12 (Cobalamin) is synthesized solely by microorganisms.* In humans, the only source for humans is food of animal origin, e.g., meat, fish, and dairy products.* Vegetables, fruits, and other foods 

In [34]:
# 定义关键词（可根据需要扩展）
keywords =  [
    # 冠心病
    'coronary artery disease', 'CAD', 'angina', 'stable angina', 'unstable angina',
    'myocardial infarction', 'MI', 'heart attack', 'NSTEMI', 'STEMI',
    'acute coronary syndrome', 'ACS', 'ischemic heart disease', 'IHD',

    # 心律失常
    'arrhythmia', 'atrial fibrillation', 'ventricular tachycardia',
    'ventricular fibrillation', 'bundle branch block', 'heart block',

    # 心力衰竭
    'heart failure', 'congestive heart failure', 'CHF',
    'left ventricular dysfunction', 'right heart failure',
    'ejection fraction', 'reduced EF', 'cardiomyopathy',
    'dilated cardiomyopathy', 'hypertrophic cardiomyopathy',

    # 心瓣膜病 / 杂音
    'valvular heart disease', 'aortic stenosis', 'mitral regurgitation',
    'heart murmur', 'valve prolapse', 'bicuspid aortic valve',
    'echocardiogram'
]

# 转小写进行匹配
def is_cardiovascular_related(text):
    if text is None:
        return False
    text = text.lower()
    return any(kw in text for kw in keywords)
    
# 过滤符合条件的记录
cardio_questions = [
    item for item in renamed_data
    if is_cardiovascular_related(item.get('question'))
]

print(f"Found {len(cardio_questions)} cardiovascular-related entries.")

Found 1159 cardiovascular-related entries.


In [35]:
for i, item in enumerate(cardio_questions[:5], 1):
    print(f"\n=== Cardiovascular Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Cardiovascular Entry 1 ===
question: An ill 16 days old baby girl is brought to the emergency. On examination pallor and dyspnoea present with a respiratory rate of 85 per minute. Her HR is 200 bpm, hea sounds are distant and a gallop is heard. X-ray showed cardiomegaly. An echocardiogram shows dilated ventricles and dilation of the left atrium. An ECG shows ventricular depolarization complexes that have low voltage. Which of the following is the most likely diagnosis?
A: CHF
B: Glycogen storage disease
C: Pericarditis
D: Aberrant left coronary aery arising from pulmonary aery
answer: A
exp: In CHF pallor, dyspnoea, tachypnoea, tachycardia and cardiomegaly are common regardless of the cause.The most common causes of CHF in children include myocarditis caused by adenovirus and coxsackievirus B.The echocardiogram shows ventricular and left atrial dilatation as well as poor ventricular function. With glycogen storage disease of the hea muscle thickening would be expected. With perica

In [36]:
output_path = '/kaggle/working/MedMCQA_cardio.json'

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cardio_questions, f, ensure_ascii=False, indent=2)

print(f"Exported {len(cardio_questions)} cardiovascular questions to {output_path}")

Exported 1159 cardiovascular questions to /kaggle/working/MedMCQA_cardio.json
