In [3]:
"""
筛出 gpt-4.1 评分高于 noah 的问答条目，并导出 CSV

目录结构示例：
.
├── sampled_qa.json
├── gpt-4.1/
│   ├── judge_scores.json      # {"0": {"score": 4.0, ...}, ...}
│   └── model_answers.json     # {"0": "<answer text>", ...}
└── noah/
    ├── judge_scores.json
    └── model_answers.json
"""

import json, csv
from pathlib import Path
from numbers import Number

# --------- Adjust the paths below to match your layout ---------
# Project root; every other path is resolved relative to it.
ROOT = Path(".")
QA_FILE = ROOT.joinpath("sampled_qa.json")

# Judge scores, one file per model.
GPT_SCORE_FILE = ROOT.joinpath("gpt-4.1", "judge_scores.json")
NOAH_SCORE_FILE = ROOT.joinpath("noah", "judge_scores.json")

# Model answers, one file per model.
GPT_ANS_FILE = ROOT.joinpath("gpt-4.1", "model_answers.json")
NOAH_ANS_FILE = ROOT.joinpath("noah", "model_answers.json")

# Destination of the exported comparison table.
OUT_CSV = ROOT.joinpath("gpt4_better_than_noah.csv")
# ----------------------------------------------------------------


def load_json(path: Path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed


def extract_score(entry) -> float:
    """
    Extract a comparable float score from one ``judge_scores.json`` entry.

    - A plain number is returned as a float.
    - A dict is searched for 'overall' / 'score' / 'total' (lower-case or
      capitalised, in that priority order); if none holds a number, the mean
      of all numeric values in the dict is returned.
    - Anything else yields ``-inf`` so the entry loses every comparison.

    Booleans are never treated as scores: ``bool`` is a subclass of ``int``,
    so without the explicit exclusion a JSON ``true``/``false`` flag would be
    read as 1/0 and distort the result.

    Args:
        entry: A number, a dict of judge fields, or any other JSON value.

    Returns:
        float: The extracted score, or ``float("-inf")`` when none is found.
    """
    def _is_score(value) -> bool:
        # Numeric, but not a True/False flag.
        return isinstance(value, Number) and not isinstance(value, bool)

    if _is_score(entry):
        return float(entry)

    if isinstance(entry, dict):
        # Preferred, explicitly named score fields come first.
        for key in ("overall", "Overall", "score", "Score", "total", "Total"):
            if key in entry and _is_score(entry[key]):
                return float(entry[key])

        # Fallback: average every numeric field of the entry.
        numeric_vals = [float(v) for v in entry.values() if _is_score(v)]
        if numeric_vals:
            return sum(numeric_vals) / len(numeric_vals)

    return float("-inf")  # nothing usable could be parsed


def fetch_qa(qa_data, qid: str):
    """
    Look up the QA item for ``qid`` in either supported container shape.

    Supports both the mapping form ``{"0": {...}, "1": {...}}`` (looked up by
    key) and the list form ``[{...}, {...}]`` (looked up by ``int(qid)``).

    Args:
        qa_data: The loaded QA data, a dict or a list.
        qid: The question id.

    Returns:
        The stored item when found; otherwise ``{'question': '', 'answer': ''}``.
        An id that cannot be converted to an index (including ``None``), a
        negative index, or an out-of-range index all yield the empty default.
    """
    if isinstance(qa_data, dict):
        # Mapping form: direct key lookup.
        item = qa_data.get(qid)
        if item is not None:
            return item
    elif isinstance(qa_data, list):
        # List form: the id is the position in the list.
        try:
            idx = int(qid)
        except (TypeError, ValueError):
            # TypeError covers None / non-string ids, which previously escaped.
            idx = -1
        if 0 <= idx < len(qa_data):
            return qa_data[idx]

    return {"question": "", "answer": ""}


def main():
    """
    Export the questions on which gpt-4.1 outscored noah to ``OUT_CSV``.

    Loads the QA set plus both models' judge scores and answers, keeps every
    question id whose gpt-4.1 score is strictly greater than noah's (an id
    absent from noah's scores is compared as -inf), writes one CSV row per
    kept id, and prints a summary line.
    """
    # 1) Load every input file
    qa_data = load_json(QA_FILE)
    gpt_scores = load_json(GPT_SCORE_FILE)
    noah_scores = load_json(NOAH_SCORE_FILE)
    gpt_answers = load_json(GPT_ANS_FILE)
    noah_answers = load_json(NOAH_ANS_FILE)

    # 2) Score both models per question and keep the ids where gpt-4.1 leads
    scored = (
        (
            qid,
            extract_score(entry),
            extract_score(noah_scores.get(qid, float("-inf"))),
        )
        for qid, entry in gpt_scores.items()
    )
    better_ids = [
        (qid, gpt_s, noah_s) for qid, gpt_s, noah_s in scored if gpt_s > noah_s
    ]

    # 3) Write the CSV: header first, then one row per selected question
    header = [
        "question_id", "question", "标准答案",
        "gpt-4.1 答案", "noah 答案",
        "gpt-4.1 分数", "noah 分数"
    ]

    def build_row(qid, gpt_s, noah_s):
        # Assemble the output columns for a single question id.
        qa_item = fetch_qa(qa_data, qid)
        return [
            qid,
            qa_item.get("question", ""),
            qa_item.get("answer", ""),
            gpt_answers.get(qid, ""),
            noah_answers.get(qid, ""),
            gpt_s,
            noah_s,
        ]

    with open(OUT_CSV, "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(build_row(*hit) for hit in better_ids)

    print(f"✅ 已生成 {OUT_CSV}  —— 共 {len(better_ids)} 条记录。")


# Run the export when this code is executed as the main module.
if __name__ == "__main__":
    main()


✅ 已生成 gpt4_better_than_noah.csv  —— 共 47 条记录。


In [3]:
import json
import csv
from pathlib import Path

# Input / output locations.
# NOTE(review): the inputs are absolute, machine-specific paths - adjust before running elsewhere.
judge_path = Path('/home/xinding/dingxin/Agent/MAIA/evaluation/all_v2/judge_scores.json')
model_path = Path('/home/xinding/dingxin/Agent/MAIA/evaluation/all_v2/model_answers.json')
qa_path = Path('/home/xinding/dingxin/Agent/MAIA/dataset/oncology_case_qa_all_v2.json')
csv_out = Path('low_score_cases_v2.csv')

# Parse the three JSON inputs.
judge_scores = json.loads(judge_path.read_text(encoding='utf-8'))
model_answers = json.loads(model_path.read_text(encoding='utf-8'))
oncology_qa = json.loads(qa_path.read_text(encoding='utf-8'))

# Normalise the QA data to a dict keyed by string id (a list is keyed by position).
if isinstance(oncology_qa, list):
    keyed_items = enumerate(oncology_qa)
else:
    keyed_items = oncology_qa.items()
oncology_qa = {str(qa_key): qa_item for qa_key, qa_item in keyed_items}

# ========= Select entries scored <= 3 =========
rows = []
for key, score_entry in judge_scores.items():
    score = float(score_entry.get('score', 0))  # a missing score counts as 0
    if not score <= 3:
        continue
    qa = oncology_qa.get(str(key))
    if not qa:
        # No matching QA item: skip this entry.
        continue
    model_ans = model_answers.get(str(key), "")
    answer_text = model_ans if isinstance(model_ans, str) else ""
    rows.append(dict(
        question=qa.get("question", ""),
        reference_answer=qa.get("answer", ""),
        model_answer=answer_text,
        model_score=score,
    ))

# ========= Save as CSV =========
fieldnames = ["question", "reference_answer", "model_answer", "model_score"]
with csv_out.open('w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"Saved {len(rows)} low-score cases to {csv_out}")


Saved 169 low-score cases to low_score_cases_v2.csv


In [4]:
import json
from pathlib import Path

# === Paths ===
# NOTE(review): the inputs are absolute, machine-specific paths - adjust before running elsewhere.
judge_path = Path('/home/xinding/dingxin/Agent/MAIA/evaluation/all_v2/judge_scores.json')
qa_path = Path('/home/xinding/dingxin/Agent/MAIA/dataset/oncology_case_qa_all_v2.json')
hard_qa_out = Path('hard_qa_v2.json')  # output file (change as needed)

# === Load inputs ===
judge_scores = json.loads(judge_path.read_text(encoding='utf-8'))
oncology_qa = json.loads(qa_path.read_text(encoding='utf-8'))

# === Build a str-keyed view of the QA data so entries can be fetched by id ===
is_list_format = isinstance(oncology_qa, list)
source_items = enumerate(oncology_qa) if is_list_format else oncology_qa.items()
qa_dict = {str(item_key): qa_item for item_key, qa_item in source_items}

# === Keep the entries scored <= 3 that also exist in the QA data ===
hard_qa_dict = {}
for qid, entry in judge_scores.items():
    if float(entry.get('score', 0)) <= 3 and qid in qa_dict:
        hard_qa_dict[qid] = qa_dict[qid]

# === Emit the same container shape as the source file ===
if is_list_format:
    # Back to a list, ordered by numeric id; ids without a match were never added.
    ordered_ids = sorted(hard_qa_dict, key=int)
    hard_qa_data = [hard_qa_dict[qid] for qid in ordered_ids]
else:
    hard_qa_data = hard_qa_dict

with hard_qa_out.open('w', encoding='utf-8') as f:
    f.write(json.dumps(hard_qa_data, ensure_ascii=False, indent=2))

print(f"Saved {len(hard_qa_dict)} hard-case QAs to {hard_qa_out}")


Saved 169 hard-case QAs to hard_qa_v2.json


In [None]:
import os

from openai import OpenAI

# The API key is a credential and must not live in the notebook source.
# Read it from the environment instead (export DEEPSEEK_API_KEY=... before
# starting the kernel).
# NOTE: the key that was previously embedded in this cell should be treated
# as leaked and rotated.
api_key = os.environ.get("DEEPSEEK_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the DEEPSEEK_API_KEY environment variable before running this cell."
    )

# OpenAI-compatible client pointed at the DeepSeek endpoint.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com/v1"
)

# Minimal round-trip: one system message, one user greeting.
resp = client.chat.completions.create(
    model="deepseek-reasoner",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "你好"}
    ],
    temperature=0.7,
    max_tokens=512
)

print(resp.choices[0].message.content)


In [1]:
import requests

# openFDA drug-label endpoint; search restricted to the indications_and_usage field.
url = "https://api.fda.gov/drug/label.json"
params = {
    "search": "indications_and_usage:diabetes",
    "limit": 5
}

# requests applies no timeout by default, so bound the wait explicitly.
response = requests.get(url, params=params, timeout=30)
# Fail with a clear HTTP error instead of a KeyError on the error payload below.
response.raise_for_status()
data = response.json()

for item in data.get("results", []):
    # Not every label carries an "openfda" section; fall back to an empty mapping.
    openfda = item.get("openfda") or {}
    print(openfda.get("brand_name"), "-", item.get("indications_and_usage"))

['Glimepiride'] - ['1 INDICATIONS AND USAGE Glimepiride tablets are indicated as an adjunct to diet and exercise to improve glycemic control in adults with type 2 diabetes mellitus [see Clinical Studies (14.1) ]. Limitations of Use Glimepiride tablets should not be used for the treatment of type 1 diabetes mellitus or diabetic ketoacidosis, as it would not be effective in these settings. Glimepiride tablets are a sulfonylurea indicated as an adjunct to diet and exercise to improve glycemic control in adults with type 2 diabetes mellitus ( 1 ). Limitations of Use: Not for treating type 1 diabetes mellitus or diabetic ketoacidosis ( 1 ).']
None - ['INDICATIONS AND USAGE: Central Diabetes Insipidus: Desmopressin acetate tablets are indicated as antidiuretic replacement therapy in the management of central diabetes insipidus and for the management of the temporary polyuria and polydipsia following head trauma or surgery in the pituitary region. Desmopressin acetate tablets are ineffective 

In [None]:
from io import StringIO
import httpx

def stream_call(user_prompt="", system_prompt="", temperature=1):
    """
    POST the prompts to ``https://test.noahai.co/api/claude/`` and yield the
    response text as it streams in.

    The auth token is read from the ``NOAH_API_TOKEN`` environment variable
    rather than being embedded in the source. The token previously hardcoded
    here should be treated as leaked and rotated.

    Args:
        user_prompt (str): Sent as ``user_prompt`` in the JSON body.
        system_prompt (str): Sent as ``system_prompt`` in the JSON body.
        temperature: Sent as ``temperature`` in the JSON body.

    Yields:
        str: Each text chunk produced by the response's ``iter_text()``.

    Raises:
        RuntimeError: If ``NOAH_API_TOKEN`` is not set. Because this is a
            generator, the error surfaces on the first iteration.
    """
    import os  # local import: this cell's import block does not provide os

    api_token = os.environ.get("NOAH_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "Set the NOAH_API_TOKEN environment variable before calling stream_call()."
        )

    data = {"user_prompt": user_prompt, "system_prompt": system_prompt, "temperature": temperature}
    url = 'https://test.noahai.co/api/claude/'
    headers = {'Content-Type': 'application/json', 'Authorization': f"Token {api_token}"}

    with httpx.Client() as client:
        with client.stream('POST', url, headers=headers, json=data, timeout=30) as r:
            for chunk in r.iter_text():
                yield chunk

# Echo the reply while it streams, keeping a copy of everything received.
buffer = StringIO()
for piece in stream_call("Hello, how are you?", temperature=0):
    buffer.write(piece)
    print(piece, end='')  # show each chunk as soon as it arrives
print('\n------------------------')
print(buffer.getvalue())  # the accumulated reply, printed in one go

In [2]:
import requests
import json
import warnings
from urllib3.exceptions import InsecureRequestWarning

# Ignore urllib3's InsecureRequestWarning for the whole session; the requests
# in this cell are sent with verify=False, which would otherwise emit it.
warnings.simplefilter("ignore", InsecureRequestWarning)

import os  # needed for the token lookup; not part of this cell's import block

# The API token is a credential and must not be committed with the notebook.
# It is read from the NOAH_API_TOKEN environment variable instead. The token
# previously hardcoded here should be treated as leaked and rotated.
_noah_api_token = os.environ.get("NOAH_API_TOKEN", "")
if not _noah_api_token:
    warnings.warn(
        "NOAH_API_TOKEN is not set; requests using HEADERS will be sent without a valid token."
    )

# Headers shared by every request to the workflow API.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "authorization": f"Token {_noah_api_token}"
    }

def query_clinical_result(
    indication_name=None,
    drug_modality=None,
    locations=None,
    target=None,
    lead_company=None,
    phase=None,
    route_of_administration=None,
    drug_feature=None,
    drug_name=None,
    nctids=None,
    limit=30,
    page=1
):
    """
    Calls the clinical result API with the provided filters.

    Only filters with a truthy value are sent. Filters documented as
    "list or dict" are wrapped as ``{"data": [...], "logic": "or"}`` when
    given as a plain list; dict values are passed through unchanged.

    Args:
        indication_name (list): List of indication names
        drug_modality (list or dict): Drug modality list or dict with data list and logic (or/and)
        locations (list or dict): Locations list or dict with data list and logic (or/and)
        target (list or dict): Targets list or dict with data list and logic (or/and)
        lead_company (list): List of lead companies
        phase (list): List of clinical phases
        route_of_administration (list or dict): ROA list or dict with data list and logic (or/and)
        drug_feature (list or dict): Drug features list or dict with data list and logic (or/and)
        drug_name (list or dict): Drug names list or dict with data list and logic (or/and)
        nctids (list): List of NCT IDs
        limit (int): Number of results to return (max 30)
        page (int): Page number for pagination

    Returns:
        The decoded JSON body (normally a dict) when the response is valid
        JSON; otherwise the raw response text (str).
    """

    clinical_trial_api_url = "https://staging.noahai.co/api/workflow/clinical-result/"

    def ensure_dict_format(param):
        # A bare list becomes {"data": ..., "logic": "or"}; anything else passes through.
        if isinstance(param, list):
            return {"data": param, "logic": "or"}
        return param

    # (filter name, value, wrap list as {"data", "logic"}?) in request order.
    filter_specs = (
        ("indication_name", indication_name, False),
        ("drug_modality", drug_modality, True),
        ("locations", locations, True),
        ("target", target, True),
        ("lead_company", lead_company, False),
        ("phase", phase, False),
        ("route_of_administration", route_of_administration, True),
        ("drug_feature", drug_feature, True),
        ("drug_name", drug_name, True),
        ("nctids", nctids, False),
    )
    filter_dict = {
        name: ensure_dict_format(value) if wrap else value
        for name, value, wrap in filter_specs
        if value  # empty / None filters are omitted from the request
    }

    body = {"filters": filter_dict, "limit": limit, "page": page}

    # NOTE(review): verify=False disables TLS certificate checks while an auth
    # token is sent in HEADERS - confirm this is still required for staging.
    response = requests.post(clinical_trial_api_url, data=json.dumps(body), headers=HEADERS, timeout=240, allow_redirects=True, verify=False)
    try:
        ret = response.json()
    except ValueError:
        # Body is not JSON (JSON decode errors derive from ValueError); return
        # the raw text. A bare except would also have hidden KeyboardInterrupt.
        ret = response.text
    return ret

def query_drug_compete(
    location=None,
    drug_modality=None,
    indication_name=None,
    drug_names=None,
    company=None,
    target=None,
    drug_feature=None,
    route_of_administration=None,
    phase=None,
    limit=30,
    page=1
):
    """
    Calls the drug compete API with the provided filters.

    Only filters with a truthy value are sent. ``drug_modality``,
    ``drug_names``, ``target``, ``drug_feature`` and
    ``route_of_administration`` are wrapped as
    ``{"data": [...], "logic": "or"}`` when given as a plain list; every other
    filter (including ``location``) is sent exactly as passed.

    Args:
        location (list or dict): Locations list or dict with data list and logic (or/and)
        drug_modality (list or dict): Drug modality list or dict with data list and logic (or/and)
        indication_name (list): List of indication names
        drug_names (list or dict): Drug names list or dict with data list and logic (or/and)
        company (list): List of lead companies
        target (list or dict): Targets list or dict with data list and logic (or/and)
        drug_feature (list or dict): Drug features list or dict with data list and logic (or/and)
        route_of_administration (list or dict): ROA list or dict with data list and logic (or/and)
        phase (list): List of clinical phases
        limit (int): Number of results to return (max 30)
        page (int): Page number for pagination

    Returns:
        The decoded JSON body (normally a dict) when the response is valid
        JSON; otherwise the raw response text (str).
    """

    drug_compete_api_url = "https://staging.noahai.co/api/workflow/drug-compete/"

    def ensure_dict_format(param):
        # A bare list becomes {"data": ..., "logic": "or"}; anything else passes through.
        if isinstance(param, list):
            return {"data": param, "logic": "or"}
        return param

    # (filter name, value, wrap list as {"data", "logic"}?) in request order.
    filter_specs = (
        ("location", location, False),
        ("drug_modality", drug_modality, True),
        ("indication_name", indication_name, False),
        ("drug_names", drug_names, True),
        ("company", company, False),
        ("target", target, True),
        ("drug_feature", drug_feature, True),
        ("route_of_administration", route_of_administration, True),
        ("phase", phase, False),
    )
    filter_dict = {
        name: ensure_dict_format(value) if wrap else value
        for name, value, wrap in filter_specs
        if value  # empty / None filters are omitted from the request
    }

    body = {"filters": filter_dict, "limit": limit, "page": page}

    # NOTE(review): verify=False disables TLS certificate checks while an auth
    # token is sent in HEADERS - confirm this is still required for staging.
    response = requests.post(drug_compete_api_url, data=json.dumps(body), headers=HEADERS, timeout=240, allow_redirects=True, verify=False)
    try:
        ret = response.json()
    except ValueError:
        # Body is not JSON (JSON decode errors derive from ValueError); return
        # the raw text. A bare except would also have hidden KeyboardInterrupt.
        ret = response.text
    return ret

# Example using every filter accepted by query_clinical_result().
full_trial_params_example = dict(
    indication_name=["Degeneration"],
    drug_modality=dict(data=["Small Molecule Drugs"], logic="or"),
    locations=dict(data=["United States"], logic="or"),
    target=dict(data=["ATP7B gene"], logic="or"),
    lead_company=["Pfizer Inc. (PFE)"],
    phase=["I"],
    route_of_administration=dict(data=["Intraarterial"], logic="or"),
    drug_feature=dict(data=["505b2"], logic="or"),
    drug_name=dict(data=["Brimochol F"], logic="or"),
    nctids=["NCT0123"],
)

# Example using every filter accepted by query_drug_compete().
full_drug_params_example = dict(
    location=["USA"],
    drug_modality=dict(data=["Protein Degrader"], logic="or"),
    indication_name=["Porphyria acute"],
    drug_names=dict(data=["Aiphagan P"], logic="or"),
    company=["Aravax Pvt Ltd"],
    target=dict(
        data=["FXYD domain containing ion transport regulator 5(RIC, HSPC113, KCT1, PRO6241, FXYD5, OIT2, DYSAD, IWU1)"],
        logic="or",
    ),
    drug_feature=dict(data=["Bacterial Product"], logic="or"),
    route_of_administration=dict(data=["Intralymphatic"], logic="or"),
    phase=["III"],
)

# Minimal single-filter examples.
trial_params_example = dict(locations=dict(data=["United States"], logic="or"))

drug_params_example = dict(location=["USA"])

trials_result = query_clinical_result(**trial_params_example)
drugs_result = query_drug_compete(**drug_params_example)

# Short preview of each raw response.
print(str(trials_result)[:100])
print(str(drugs_result)[:100])

# The query helpers return the raw response text (a str) when the body is not
# JSON, so only read 'results' from a dict; anything else counts as no results
# instead of crashing on the index.
trial_hits = (trials_result.get('results') or []) if isinstance(trials_result, dict) else []
drug_hits = (drugs_result.get('results') or []) if isinstance(drugs_result, dict) else []

print(len(trial_hits))
print(len(drug_hits))

print(trial_hits[0] if trial_hits else "No results found")
print(drug_hits[0] if drug_hits else "No results found")

{'results': [{'id': 59350, 'nct_id': 'NCT04505722', 'primary_id': 'NCT04505722', 'last_updated': '20
{'results': [{'name': 'REC-648647', 'other_names': [], 'lead_company': ['Recursion Pharmaceuticals']
30
30
{'id': 59350, 'nct_id': 'NCT04505722', 'primary_id': 'NCT04505722', 'last_updated': '2022-02-01', 'official_title': 'A Randomized, Double-blind, Placebo-controlled Phase 3 Study to Assess the Efficacy and Safety of Ad26.COV2.S for the Prevention of SARS-CoV-2-mediated COVID-19 in Adults Aged 18 Years and Older', 'lead_company': 'Johnson & Johnson (JNJ)', 'partner_companies': None, 'drug_name': ['Jcovden/Ad26.COV2.S'], 'drug_modality': ['Vaccine'], 'drug_feature': ['Precision Medicine'], 'route_of_administration': ['Intramuscular (IM) Injection'], 'indication_name': ['Coronavirus disease 19 infection'], 'target': ['Immune System', 'SARS-CoV-2'], 'phase': 'III', 'phase_mapping': ['III'], 'current_status': 'Final Data', 'gender': 'Both', 'actual_enrollment': 44325, 'locations': ['Arge

In [5]:
import requests

# openFDA drug-label endpoint; unqualified search term (no field restriction).
url = "https://api.fda.gov/drug/label.json"
params = {
    "search": "diabetes",
    "limit": 5
}

# requests applies no timeout by default, so bound the wait explicitly.
response = requests.get(url, params=params, timeout=30)
# Fail with a clear HTTP error instead of a KeyError on the error payload below.
response.raise_for_status()
data = response.json()

for item in data.get("results", []):
    # Not every label carries an "openfda" section; fall back to an empty mapping.
    openfda = item.get("openfda") or {}
    print(openfda.get("brand_name"), "-", item.get("indications_and_usage"))

['Mekinist'] - ['1 INDICATIONS AND USAGE MEKINIST is a kinase inhibitor indicated as a single agent for the treatment of BRAF-inhibitor treatment-naïve patients with unresectable or metastatic melanoma with BRAF V600E or V600K mutations as detected by an FDA-approved test. ( 1.1 , 2.1 ) MEKINIST is indicated, in combination with dabrafenib, for: the treatment of patients with unresectable or metastatic melanoma with BRAF V600E or V600K mutations as detected by an FDA-approved test. ( 1.1 , 2.1 ) the adjuvant treatment of patients with melanoma with BRAF V600E or V600K mutations, as detected by an FDA-approved test, and involvement of lymph node(s), following complete resection. ( 1.2 , 2.1 ) the treatment of patients with metastatic non-small cell lung cancer (NSCLC) with BRAF V600E mutation as detected by an FDA-approved test. ( 1.3 , 2.1 ) the treatment of patients with locally advanced or metastatic anaplastic thyroid cancer (ATC) with BRAF V600E mutation, as detected by an FDA-appr

In [2]:
import json
from pathlib import Path

# Path to the dataset file (adjust to your layout)
file_path = Path("../MAIA/MAIA_final_sorted.json")

# Load the JSON data
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Word count of each question (split on whitespace)
question_lengths = [len(item["question"].split()) for item in data]

# Mean length; an empty dataset reports 0.0 instead of raising ZeroDivisionError
average_length = (
    sum(question_lengths) / len(question_lengths) if question_lengths else 0.0
)

# Report
print(f"Total questions: {len(question_lengths)}")
print(f"Average question length (in words): {average_length:.2f}")


Total questions: 1014
Average question length (in words): 59.04


In [5]:
import json
from pathlib import Path

# Path to the dataset file (adjust to your layout)
file_path = Path("../MAIA/MAIA_final_sorted.json")

# Load the data
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Question type -> list of per-item reasoning depths
depths = {
    "retrieval": [],
    "kg_reasoning": [],
    "diagnostic_pathway": []
}

# Walk every item and record its depth under its type; other types are ignored
for item in data:
    qtype = item.get("type", "")

    if qtype == "retrieval":
        # Retrieval items always count as depth 1
        depths[qtype].append(1)
        continue

    if qtype == "kg_reasoning":
        # Two path elements (one relation + one concept) make one hop
        depths[qtype].append(len(item.get("umls_path", [])) // 2)
        continue

    if qtype == "diagnostic_pathway":
        # Depth is the node count of the first tool call; 0 when that structure is absent
        try:
            depth = len(item["tool_calls"][0]["params"]["nodes"])
        except (KeyError, IndexError):
            depth = 0
        depths[qtype].append(depth)

# Overall mean across all recorded items (each item weighted equally)
total_depth = sum(map(sum, depths.values()))
total_count = sum(map(len, depths.values()))
if total_count:
    overall_average = total_depth / total_count
else:
    overall_average = 0

print(f"Overall average reasoning depth: {overall_average:.2f}")


Overall average reasoning depth: 6.50


In [5]:
import json
from pathlib import Path

# === Paths (adjust as needed) ===================================
filepath_reasoning = Path("../MAIA/MAIA_reasoning.json")  # question data
filepath_scores = Path("../res/deepseek-v3/judge_scores.json")  # judge results
# ===============================================================

# 1. Load both files ---------------------------------------------
data_reasoning = json.loads(filepath_reasoning.read_text(encoding="utf-8"))
data_scores = json.loads(filepath_scores.read_text(encoding="utf-8"))

# 2. Running sum and count per question type ---------------------
totals = {"kg_reasoning": 0.0, "diagnostic_pathway": 0.0}
counts = {"kg_reasoning": 0, "diagnostic_pathway": 0}

# 3. Accumulate scores; the item's position is its key in the score file
for position, question in enumerate(data_reasoning.get("dataset", [])):
    qtype = question.get("type")
    score_entry = data_scores.get(str(position))
    if qtype not in totals or score_entry is None:
        # Untracked type, or no judge score for this item
        continue
    totals[qtype] += float(score_entry["score"])
    counts[qtype] += 1

# 4. Mean per type (0.0 when a type has no scored items) ---------
avg_scores = {}
for type_name in totals:
    if counts[type_name]:
        avg_scores[type_name] = totals[type_name] / counts[type_name]
    else:
        avg_scores[type_name] = 0.0

# 5. Report ------------------------------------------------------
print("Average Scores:")
for qtype, avg in avg_scores.items():
    print(f"  {qtype:20s}: {avg:.3f}")


Average Scores:
  kg_reasoning        : 3.429
  diagnostic_pathway  : 4.156
