In [3]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("UCSC-VLAA/MedReason")

README.md: 0.00B [00:00, ?B/s]

ours_quality_33000.jsonl:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32682 [00:00<?, ? examples/s]

In [4]:
print(ds)  # 会显示 'train'、'validation' 等子集

DatasetDict({
    train: Dataset({
        features: ['dataset_name', 'id_in_dataset', 'question', 'answer', 'reasoning', 'options'],
        num_rows: 32682
    })
})


In [5]:
# 展示训练集的前3条样本
for i in range(3):
    print(f"\n=== Sample {i+1} ===")
    print(ds['train'][i])


=== Sample 1 ===
{'dataset_name': 'medmcqa', 'id_in_dataset': 7131, 'question': 'Urogenital Diaphragm is made up of the following, except:', 'answer': "Colle's fascia. Explanation: Colle's fascia does not contribute to the Urogenital Diaphragm. It is attached posteriorly to the posterior border of the urogenital diaphragm but does not form pa of this diaphragm. Ref: BDC, Volume 2, 4th Edition, Page 332; Grants Method of Anatomy, 11th Edition, Page 244; Gray's Anatomy, 36th Edition, Page 563", 'reasoning': "Finding reasoning paths:\n1. Urogenital diaphragm -> Pelvic anatomy -> Muscles and fascia -> Deep perineal pouch\n2. Urogenital diaphragm -> Components of the perineum -> Muscles and connective tissue\n3. Urogenital diaphragm -> Anatomical structures -> Superficial vs. deep fascia\n\nReasoning Process:\n1. The urogenital diaphragm is a structure located in the pelvic region, specifically within the perineum. It is important to understand its composition to determine which structures

In [6]:
train_dataset = ds['train']

# 提取数据
extracted_data = []

for entry in train_dataset:
    options = entry.get('options', '')

    # 初始化选项字典
    parsed_options = {'A': '', 'B': '', 'C': '', 'D': ''}

    # 解析字符串形式的选项（如 "Answer Choices:\nA. xxx\nB. yyy..."）
    for line in options.split('\n'):
        line = line.strip()
        if '.' in line and line[0] in ['A', 'B', 'C', 'D']:
            key = line[0]
            value = line[2:].strip()
            parsed_options[key] = value

    # 构建新条目
    new_entry = {
        'question': entry.get('question', ''),
        'A': parsed_options.get('A', ''),
        'B': parsed_options.get('B', ''),
        'C': parsed_options.get('C', ''),
        'D': parsed_options.get('D', ''),
        'answer': entry.get('answer', ''),
        'exp': entry.get('reasoning', '')
    }

    extracted_data.append(new_entry)

# 归一化答案：将文本形式的答案转为 A/B/C/D
def normalize_answer(entry):
    correct_ans_text = entry.get('answer', '').split('.')[0].strip()  # 提取答案前的关键词
    for key in ['A', 'B', 'C', 'D']:
        option_value = entry.get(key)
        if option_value and correct_ans_text.lower() == option_value.lower():
            return key
    return "Unknown"

# 应用归一化
for item in extracted_data:
    item['answer'] = normalize_answer(item)

# 打印前5个样本查看效果
for i, item in enumerate(extracted_data[:5], 1):
    print(f"\n=== Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Entry 1 ===
question: Urogenital Diaphragm is made up of the following, except:
A: Deep transverse Perineus
B: Perinial membrane
C: Colle's fascia
D: Sphincter Urethrae
answer: C
exp: Finding reasoning paths:
1. Urogenital diaphragm -> Pelvic anatomy -> Muscles and fascia -> Deep perineal pouch
2. Urogenital diaphragm -> Components of the perineum -> Muscles and connective tissue
3. Urogenital diaphragm -> Anatomical structures -> Superficial vs. deep fascia

Reasoning Process:
1. The urogenital diaphragm is a structure located in the pelvic region, specifically within the perineum. It is important to understand its composition to determine which structures contribute to its formation.

2. The urogenital diaphragm is primarily composed of the deep transverse perineal muscle and the perineal membrane. These structures are located in the deep perineal pouch, which is a part of the pelvic floor.

3. The perineal membrane is a layer of fascia that provides support and is part of the u

In [7]:
# 定义关键词（可根据需要扩展）
keywords =  [
    # 冠心病
    'coronary artery disease', 'CAD', 'angina', 'stable angina', 'unstable angina',
    'myocardial infarction', 'MI', 'heart attack', 'NSTEMI', 'STEMI',
    'acute coronary syndrome', 'ACS', 'ischemic heart disease', 'IHD',

    # 心律失常
    'arrhythmia', 'atrial fibrillation', 'ventricular tachycardia',
    'ventricular fibrillation', 'bundle branch block', 'heart block',

    # 心力衰竭
    'heart failure', 'congestive heart failure', 'CHF',
    'left ventricular dysfunction', 'right heart failure',
    'ejection fraction', 'reduced EF', 'cardiomyopathy',
    'dilated cardiomyopathy', 'hypertrophic cardiomyopathy',

    # 心瓣膜病 / 杂音
    'valvular heart disease', 'aortic stenosis', 'mitral regurgitation',
    'heart murmur', 'valve prolapse', 'bicuspid aortic valve',
    'echocardiogram'
]

# 转小写进行匹配
def is_cardiovascular_related(text):
    if text is None:
        return False
    text = text.lower()
    return any(kw in text for kw in keywords)
    
# 过滤符合条件的记录
cardio_questions = [
    item for item in extracted_data
    if is_cardiovascular_related(item.get('question'))
]

print(f"Found {len(cardio_questions)} cardiovascular-related entries.")

Found 1225 cardiovascular-related entries.


In [8]:
for i, item in enumerate(cardio_questions[:5], 1):
    print(f"\n=== Cardiovascular Entry {i} ===")
    for key in ['question', 'A', 'B', 'C', 'D', 'answer', 'exp']:
        print(f"{key}: {item.get(key)}")


=== Cardiovascular Entry 1 ===
question: Which of the following drugs is used for the prophylaxis of migraine but not for angina pectoris
A: Verapamnil
B: Diltiazem
C: Flunarizine
D: Amlodipine
answer: C
exp: **Finding reasoning paths:**

1. Flunarizine -> Calcium channel blocker -> Prophylaxis of migraine
2. Flunarizine -> Not used for angina pectoris
3. Flunarizine -> Not available in the US or Japan

**Reasoning Process:**

1. **Flunarizine as a Calcium Channel Blocker:**
   - Flunarizine is classified as a calcium channel blocker. Calcium channel blockers are known to inhibit the influx of calcium ions through cell membranes, which can help in reducing the frequency and severity of migraines. This mechanism is consistent with its use in the prophylaxis of migraine disorders.

2. **Use in Migraine Prophylaxis:**
   - The primary indication for flunarizine is the prevention of migraine headaches. By stabilizing neuronal activity and reducing vasospasm, it helps in preventing the ons

In [10]:
import json

output_path = '/kaggle/working/UCSC_cardio.json'

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cardio_questions, f, ensure_ascii=False, indent=2)

print(f"Exported {len(cardio_questions)} cardiovascular questions to {output_path}")

Exported 1225 cardiovascular questions to /kaggle/working/UCSC_cardio.json
