In [12]:
import os
import json

json_dir = '/kaggle/input/usmle'
files_to_merge = ['US_dev.jsonl', 'US_test.jsonl', 'US_train.jsonl', 
                  'Taiwan_dev_2en.jsonl', 'Taiwan_test_2en.jsonl', 'Taiwan_train_2en.jsonl']

merged_data = []
all_keys = set()

# 第一步：读取两个文件，收集所有字段
raw_data = []

for filename in files_to_merge:
    file_path = os.path.join(json_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if line:
                try:
                    data = json.loads(line)
                    raw_data.append(data)
                    all_keys.update(data.keys())
                except json.JSONDecodeError as e:
                    print(f"Error decoding {filename} at line {line_num}: {e}")

# 第二步：统一字段结构
for entry in raw_data:
    complete_entry = {key: entry.get(key, None) for key in all_keys}
    merged_data.append(complete_entry)

# 第三步：保存合并结果（在 working 目录）
output_path = '/kaggle/working/merged_USMLE.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

print(f"Merged {len(merged_data)} entries into {output_path}")

Merged 26846 entries into /kaggle/working/merged_USMLE.json


In [13]:
for i, item in enumerate(merged_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    print(json.dumps(item, indent=2, ensure_ascii=False))


=== Entry 1 ===
{
  "metamap_phrases": [
    "21-year-old sexually active male",
    "fever",
    "pain",
    "urination",
    "inflammation",
    "pain in the right knee",
    "culture",
    "joint shows",
    "bacteria",
    "not ferment maltose",
    "polysaccharide capsule",
    "physician orders antibiotic therapy",
    "patient",
    "mechanism of action",
    "medication given blocks cell wall synthesis",
    "following",
    "given"
  ],
  "options": {
    "A": "Gentamicin",
    "B": "Ciprofloxacin",
    "C": "Ceftriaxone",
    "D": "Trimethoprim"
  },
  "meta_info": "step1",
  "question": "A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the fol

In [18]:
field_mapping = {
    'question': 'question',
    'answer_idx': 'answer'
}

renamed_data = []

for item in merged_data:
    new_item = {}

    for old_key, new_key in field_mapping.items():
        new_item[new_key] = item.get(old_key, None)

    options = item.get('options', {})
    for option_key in ['A', 'B', 'C', 'D']:
        new_item[option_key] = options.get(option_key, None)

    renamed_data.append(new_item)

for i, item in enumerate(renamed_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Entry 1 ===
question: A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?
A: Gentamicin
B: Ciprofloxacin
C: Ceftriaxone
D: Trimethoprim
answer: C
exp: None

=== Entry 2 ===
question: A 5-year-old girl is brought to the emergency department by her mother because of multiple episodes of nausea and vomiting that last about 2 hours. During this period, she has had 6–8 episodes of bilious vomiting and abdominal pain. The vomiting was preceded by fatigue. The girl feels well between these episodes. She has missed several days of school and has been hospitalized 2 times during the past 6 months for dehydration due to similar episodes o

In [19]:
# 定义关键词（可根据需要扩展）
keywords =  [
    # 冠心病
    'coronary artery disease', 'CAD', 'angina', 'stable angina', 'unstable angina',
    'myocardial infarction', 'MI', 'heart attack', 'NSTEMI', 'STEMI',
    'acute coronary syndrome', 'ACS', 'ischemic heart disease', 'IHD',

    # 心律失常
    'arrhythmia', 'atrial fibrillation', 'ventricular tachycardia',
    'ventricular fibrillation', 'bundle branch block', 'heart block',

    # 心力衰竭
    'heart failure', 'congestive heart failure', 'CHF',
    'left ventricular dysfunction', 'right heart failure',
    'ejection fraction', 'reduced EF', 'cardiomyopathy',
    'dilated cardiomyopathy', 'hypertrophic cardiomyopathy',

    # 心瓣膜病 / 杂音
    'valvular heart disease', 'aortic stenosis', 'mitral regurgitation',
    'heart murmur', 'valve prolapse', 'bicuspid aortic valve',
    'echocardiogram'
]

# 转小写进行匹配
def is_cardiovascular_related(text):
    if text is None:
        return False
    text = text.lower()
    return any(kw in text for kw in keywords)
    
# 过滤符合条件的记录
cardio_questions = [
    item for item in renamed_data
    if is_cardiovascular_related(item.get('question'))
]

print(f"Found {len(cardio_questions)} cardiovascular-related entries.")

Found 1100 cardiovascular-related entries.


In [20]:
for i, item in enumerate(cardio_questions[:5], 1):
    print(f"\n=== Cardiovascular Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'E', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Cardiovascular Entry 1 ===
question: A 60-year-old woman presents to her primary care physician for a wellness checkup. She has a past medical history of hypertension and was discharged from the hospital yesterday after management of a myocardial infarction. She states that sometimes she experiences exertional angina. Her temperature is 99.5°F (37.5°C), blood pressure is 147/98 mmHg, pulse is 90/min, respirations are 17/min, and oxygen saturation is 98% on room air. Physical exam is within normal limits. Which of the following is the best next step in management?
A: Atenolol
B: Furosemide
C: Hydrochlorothiazide
D: Nitroglycerin
E: None
answer: A
exp: None

=== Cardiovascular Entry 2 ===
question: A 56-year-old man with known coronary artery disease presents to the emergency department complaining of chest discomfort and palpitations for 2 hours. On arrival, the vital signs include blood pressure 122/76 mm Hg, heart rate 180/min, respiratory rate 22/min, temperature 37.0℃ (98.6℉), 

In [21]:
output_path = '/kaggle/working/USMLE_cardio.json'

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cardio_questions, f, ensure_ascii=False, indent=2)

print(f"Exported {len(cardio_questions)} cardiovascular questions to {output_path}")

Exported 1100 cardiovascular questions to /kaggle/working/USMLE_cardio.json
