In [12]:
# read csv file
import pandas as pd
# The ID file is read with header=None, so its first line is treated as data;
# the first column is taken as the list of NCT IDs.
df = pd.read_csv('valid_clinical_trials_20250417.csv', header=None)
# Drop empty cells and coerce to stripped strings: the next cell builds the
# request with ",".join(nctid), which raises TypeError on NaN / non-str items.
nctid = df.iloc[:, 0].dropna().astype(str).str.strip().tolist()


In [14]:
import requests
import pandas as pd

# ClinicalTrials.gov API v2 endpoint; the query is restricted to the NCT IDs
# held in `nctid` (loaded by the previous cell).
base_url = "https://clinicaltrials.gov/api/v2/studies"
params = {
    "filter.ids": ",".join(nctid),
    "pageSize": 100
}


def _extract_study_record(study):
    """Flatten one study object from the API response into a CSV row dict.

    Every module is looked up with ``.get`` so that a study lacking a module
    falls back to the placeholder text instead of raising ``KeyError`` and
    aborting the whole download.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})

    conditions = ', '.join(
        protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
    )

    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    interventions = (
        ', '.join(item.get('name', 'No intervention name listed') for item in interventions_list)
        if interventions_list else "No interventions listed"
    )

    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
    locations = (
        ', '.join(
            f"{place.get('city', 'No City')} - {place.get('country', 'No Country')}"
            for place in locations_list
        )
        if locations_list else "No locations listed"
    )

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
        "Conditions": conditions,
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
        "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
        "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": ', '.join(design.get('phases', ['Not Available'])),
    }


# One row dict per study, accumulated across all pages.
data_list = []

# Follow nextPageToken until the API stops returning one.
while True:
    # Print the current URL (for debugging purposes)
    print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=30)

    if response.status_code != 200:
        # Stop paging on the first failed request; rows fetched so far are kept.
        print("Failed to fetch data. Status code:", response.status_code)
        break

    data = response.json()
    data_list.extend(_extract_study_record(study) for study in data.get('studies', []))

    nextPageToken = data.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken  # request the following page next time round

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data_list)

# Print the DataFrame
print(df)

# Save the DataFrame; later cells read this file back.
df.to_csv("clinical_trials_data_complete.csv", index=False)

Fetching data from: https://clinicaltrials.gov/api/v2/studies?filter.ids=NCT00027300,NCT01712490,NCT00099788,NCT00215800,NCT00005947,NCT00065442,NCT03566043,NCT00097591,NCT00157209,NCT00088530,NCT00129142,NCT00111319,NCT01133704,NCT00424047,NCT00093158,NCT00123253,NCT00004205,NCT00056160,NCT00089570,NCT00024440,NCT00688740,NCT00117676,NCT00116805,NCT00314951,NCT00071799,NCT00071487,NCT00154102,NCT00179660,NCT00262080,NCT00213135,NCT00257608,NCT00274651,NCT00294723,NCT00307437,NCT00327691,NCT00126724,NCT00391872,NCT00308139,NCT00289640,NCT00289978,NCT00287729,NCT00287716,NCT00333775,NCT00325195,NCT00364013,NCT05428969,NCT00321464,NCT00395135,NCT00603902,NCT00358150,NCT00357279,NCT00355134,NCT00343564,NCT00364923,NCT00262600,NCT00152386,NCT00113607,NCT02873936,NCT00542555,NCT00340834,NCT03207009,NCT00420212,NCT00451451,NCT00337103,NCT00135408,NCT00524277,NCT00412984,NCT00413036,NCT00050778,NCT00134563,NCT00468728,NCT00125034,NCT00467844,NCT00298038,NCT00297258,NCT00403767,NCT00318461,NCT

In [15]:
import openai
import requests
import random
import re
import json
import time
from openai import OpenAI
import os 
# SECURITY: the API key is read from the environment (or prompted for) and is
# never stored in the notebook. A key that was previously written in this cell
# should be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
# Route the OpenAI SDK to the proxy endpoint used by this notebook.
os.environ['OPENAI_BASE_URL'] = 'https://api2.aigcbest.top/v1'
# OpenAI() is constructed with no arguments, relying on the two variables set above.
client = OpenAI()

In [16]:
import re

def generate_retrieval_question(data):
    """Ask the chat model to phrase a retrieval question for one trial record.

    Args:
        data: Mapping that may contain "Conditions", "Interventions",
            "Overall Status" and "Locations"; a missing key is rendered as "".

    Returns:
        The model reply from the first "Question:" onward (label included),
        or the whole stripped reply when that label is absent. Returns None
        (after printing the error) if anything raises along the way.
    """
    record_conditions = data.get("Conditions", "")
    record_interventions = data.get("Interventions", "")
    record_status = data.get("Overall Status", "")
    record_locations = data.get("Locations", "")

    prompt = (
        "Please generate a precise search question based on the following clinical trial information so that the literature can be accurately matched in the clinical trial database.\n"
        f"Conditions：{record_conditions}\n"
        f"interventions：{record_interventions}\n"
        f"overall_status：{record_status}\n\n"
        f"locations：{record_locations}\n\n"
        "Please output in the following format：\n"
        "Question: Please find clinical trials on ClinicalTrials.gov related to Unresectable Hilar Cholangiocarcinoma, with interventions REMS+TAI and SEMS+TAI, and status COMPLETED. The trial was conducted in Nanjing - China.Please answer the NCT ID."
    )

    chat_messages = [
        {"role": "system", "content": "You are an expert in clinical trial information retrieval, good at formulating precise and natural questions"},
        {"role": "user", "content": prompt},
    ]

    try:
        # A fresh client is created on every call.
        completion = OpenAI().chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.7,
            max_tokens=250,
        )
        reply = completion.choices[0].message.content
        # Look for the "Question:" label; with DOTALL the match runs to the end
        # of the reply, and group(0) keeps the label itself.
        found = re.search(r"Question:\s*(.*)", reply, re.DOTALL)
        if not found:
            return reply.strip()
        return found.group(0).strip()
    except Exception as e:
        print("调用 GPT API 生成检索问题出错:", e)
        return None



In [17]:
# read csv file
import pandas as pd
# Reload the trial metadata CSV and turn every row into a plain dict keyed by
# column name.
df = pd.read_csv("clinical_trials_data_complete.csv")
data_list = df.to_dict("records")

def construct_answer(data):
    """Build the answer payload for one trial record.

    The answer holds only the trial identifier: a dict with the single key
    "NCT ID", taken from ``data`` ("" when the key is missing).
    """
    return {"NCT ID": data.get("NCT ID", "")}

In [18]:
import json
from tqdm import tqdm  # 可选，用于显示进度条

qa_pairs = []
failed_ids = []  # NCT IDs whose question generation failed

for record in tqdm(data_list, desc="Processing records"):
    # Generate the retrieval question; None signals that the API call failed.
    question = generate_retrieval_question(record)
    if question is None:
        # Skip instead of writing a null question into the dataset.
        failed_ids.append(record.get("NCT ID", ""))
        continue
    # Pair the question with its answer payload.
    qa_pairs.append({
        "question": question,
        "answer": construct_answer(record)
    })

# Write the result to a JSON file
with open("qa_pairs.json", "w", encoding="utf-8") as f:
    json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

print(f"共生成 {len(qa_pairs)} 条问答，已保存至 qa_pairs.json")
if failed_ids:
    print(f"Skipped {len(failed_ids)} record(s) without a question: {failed_ids}")


Processing records: 100%|██████████| 100/100 [04:13<00:00,  2.54s/it]

共生成 100 条问答，已保存至 qa_pairs.json





In [1]:
import os 
from openai import AzureOpenAI
# SECURITY: the Azure key is read from the environment (or prompted for) and is
# never stored in the notebook. A key that was previously written in this cell
# should be treated as leaked and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("AZURE_OPENAI_API_KEY: ")
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://ai-mistraleastus2753718354821.openai.azure.com/"
# Client used by the later cells that call `client.chat.completions.create`.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

In [None]:
"""
generate_clinical_trials_qa.py
———————————————
Read a CSV of clinical-trial metadata, call GPT to create natural-language
ClinicalTrials.gov questions, and save QA pairs to JSON.
"""

import os, json, time
import pandas as pd
from dateutil import parser
import openai

# ---------- configuration ----------
# Input CSV of trial metadata and output JSON file for the QA pairs.
csv_path = "./source/clinical_trials_data_complete.csv"
json_out = "qa_pairs.json"
# Model name passed to the chat API (gpt-3.5-turbo-0125 is a possible alternative).
model_name = "gpt-4.1-noah"
# Pause between requests, in seconds (simple rate limiting; tune as needed).
sleep_sec = 1.2
# -----------------------------------

# Render dates as "Month YYYY": "2003-01-01" -> "January 2003",
# "2008-01" -> "January 2008"; a bare year such as "2008" is returned unchanged.
def pretty_date(d):
    """Format a start date for the prompt.

    Args:
        d: Date value from the CSV (string, or NaN/None when missing).

    Returns:
        "" for missing values; "Month YYYY" for "YYYY-MM" and "YYYY-MM-DD"
        inputs (longer strings in other layouts go through dateutil); any
        short string that is not "YYYY-MM", or anything that cannot be
        parsed, is returned as-is.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    try:
        # Missing value / NaN
        if pd.isna(d):
            return ""
        d = str(d).strip()
        # Year-only or "YYYY-MM": format the latter, pass anything else through.
        if len(d) <= 7:
            try:
                return datetime.strptime(d, "%Y-%m").strftime("%B %Y")
            except ValueError:
                return d
        # Plain ISO dates are parsed strictly; other layouts fall back to dateutil.
        try:
            return datetime.strptime(d, "%Y-%m-%d").strftime("%B %Y")
        except ValueError:
            return parser.parse(d).strftime("%B %Y")
    except Exception:
        return d

# System prompt for the chat model: the rules every generated question must
# follow. Defined once here and passed as the "system" message by ask_gpt.
SYSTEM_MSG = (
    "You create concise, natural search questions for ClinicalTrials.gov. "
    "The question must:\n"
    "1. Begin with 'In ClinicalTrials.gov,' (or similar) to specify scope.\n"
    "2. Cite 3-6 key fields (status, phase, study type, start date, condition, "
    "main interventions, countries/regions) that uniquely pinpoint the trial.\n"
    "3. Avoid long exhaustive lists; summarize interventions/locations clearly "
    "and naturally (e.g., 'compared drug A with placebo', 'across sites in the United States and Canada').\n"
    "4. Use fluent English; no SQL-like phrasing.\n"
    "Return ONLY the single sentence question."
)

def build_user_prompt(row):
    """Compose the user message for one trial: an instruction plus metadata JSON.

    ``row`` must provide "NCT ID", "Overall Status", "Start Date",
    "Conditions", "Interventions" and "Locations" through ``row[...]``;
    "Acronym", "Phases" and "Study Type" are read with ``row.get`` and
    default to "".
    """
    # NOTE: NCT_ID is part of the JSON sent to the model even though the
    # instruction tells it not to mention the ID.
    metadata = dict(
        NCT_ID=row["NCT ID"],
        Acronym=row.get("Acronym", ""),
        Overall_Status=row["Overall Status"],
        Phase=row.get("Phases", ""),
        Study_Type=row.get("Study Type", ""),
        Start_Date=pretty_date(row["Start Date"]),
        Conditions=row["Conditions"],
        Interventions=row["Interventions"],
        Locations=row["Locations"],
    )
    instruction = (
        "Write one natural-language question that would let a person retrieve "
        "this exact clinical trial on ClinicalTrials.gov. Use the metadata JSON below; "
        "do NOT mention the NCT_ID, do NOT list every city; keep it concise.\n\n"
    )
    payload = json.dumps(metadata, ensure_ascii=False)
    return f"{instruction}metadata = {payload}"

def ask_gpt(metadata_row):
    """Send the system and user messages for one row and return the reply text.

    Relies on the module-level ``client``, ``model_name`` and ``SYSTEM_MSG``;
    ``client`` is not created in this cell, so it must already exist in the
    kernel.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": build_user_prompt(metadata_row)},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def main():
    """Generate one question per CSV row and save the QA pairs to ``json_out``.

    A row whose generation raises is reported and left out of the output;
    the loop pauses ``sleep_sec`` seconds after every row either way.
    """
    trials = pd.read_csv(csv_path)
    total = len(trials)
    qa_pairs = []

    for i, row in trials.iterrows():
        try:
            pair = {
                "question": ask_gpt(row),
                "answer":   row["NCT ID"]
            }
            qa_pairs.append(pair)
            print(f"[{i+1}/{total}] ✅ 生成完成: {row['NCT ID']}")
        except Exception as e:
            print(f"[{i+1}/{total}] ❌ 失败: {e}")
        # Simple rate limiting between requests.
        time.sleep(sleep_sec)

    with open(json_out, "w", encoding="utf-8") as out_file:
        json.dump(qa_pairs, out_file, ensure_ascii=False, indent=2)
    print(f"\n✨ 已生成 {len(qa_pairs)} 对问答，保存至 {json_out}")

# Run the pipeline when this cell / script is executed directly.
if __name__ == "__main__":
    main()


In [11]:
"""
generate_clinical_trials_qa.py
———————————————
Read a CSV of clinical-trial metadata, call GPT to create natural-language
ClinicalTrials.gov questions, and save QA pairs to JSON.
"""
import os 
from openai import OpenAI
# SECURITY: the API key is read from the environment (or prompted for) and is
# never stored in the notebook. A key that was previously written in this cell
# should be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
# Route the OpenAI SDK to the proxy endpoint used by this notebook.
os.environ['OPENAI_BASE_URL'] = 'https://api2.aigcbest.top/v1'
client = OpenAI()
import os, json, time
import pandas as pd
from dateutil import parser
import openai

# ---------- configuration ----------
# Input CSV of trial metadata and output JSON file for the QA pairs.
csv_path = "clinical_trials_data_complete.csv"
json_out = "qa_pairs_tmp.json"
# Mirror the environment key onto the legacy module-level attribute.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Model name passed to the chat API (gpt-3.5-turbo-0125 is a possible alternative).
model_name = "gpt-4o"
# Pause between requests, in seconds (simple rate limiting; tune as needed).
sleep_sec = 1.2
# -----------------------------------

# --- helper functions -----------------------------------------
def first_n_locations(loc_str, n=5):
    """Return the first ``n`` 'City - Country' items, ending with 'etc.' if truncated.

    The input is a ", "-joined list of 'City - Country' items. Some country
    names contain a comma themselves (e.g. "Korea, Republic of"), so a bare
    split on "," would cut such an item in two. A fragment without " - " is
    therefore glued back onto the preceding item when that item is a
    well-formed 'City - Country' entry.

    Args:
        loc_str: Location string, or NaN/None when missing.
        n: Maximum number of items to keep.

    Returns:
        "" for missing input, otherwise the (possibly truncated) item list
        joined with ", ".
    """
    if pd.isna(loc_str):
        return ""
    items = []
    for fragment in (piece.strip() for piece in loc_str.split(",")):
        if not fragment:
            continue
        if " - " not in fragment and items and " - " in items[-1]:
            # Continuation of the previous item's country name.
            items[-1] = f"{items[-1]}, {fragment}"
        else:
            items.append(fragment)
    if len(items) > n:
        return ", ".join(items[:n]) + ", etc."
    return ", ".join(items)

def shorten_interventions(iv_str, max_items=3):
    """Keep at most ``max_items`` comma-separated interventions.

    Anything beyond the limit is replaced by a single trailing 'others'
    entry. Missing input (NaN/None) yields "". The split is on every comma.
    """
    if pd.isna(iv_str):
        return ""
    names = list(filter(None, map(str.strip, iv_str.split(","))))
    kept = names[:max_items]
    if len(names) > max_items:
        kept.append("others")
    return ", ".join(kept)
    
# Render dates as "Month YYYY": "2003-01-01" -> "January 2003",
# "2008-01" -> "January 2008"; a bare year such as "2008" is returned unchanged.
def pretty_date(d):
    """Format a start date for the prompt.

    Args:
        d: Date value from the CSV (string, or NaN/None when missing).

    Returns:
        "" for missing values; "Month YYYY" for "YYYY-MM" and "YYYY-MM-DD"
        inputs (longer strings in other layouts go through dateutil); any
        short string that is not "YYYY-MM", or anything that cannot be
        parsed, is returned as-is.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    try:
        # Missing value / NaN
        if pd.isna(d):
            return ""
        d = str(d).strip()
        # Year-only or "YYYY-MM": format the latter, pass anything else through.
        if len(d) <= 7:
            try:
                return datetime.strptime(d, "%Y-%m").strftime("%B %Y")
            except ValueError:
                return d
        # Plain ISO dates are parsed strictly; other layouts fall back to dateutil.
        try:
            return datetime.strptime(d, "%Y-%m-%d").strftime("%B %Y")
        except ValueError:
            return parser.parse(d).strftime("%B %Y")
    except Exception:
        return d

# System prompt for the chat model: the rules every generated question must
# follow, plus one example of the expected style. Defined once here and passed
# as the "system" message by ask_gpt.
SYSTEM_MSG = (
    "You create concise, natural search questions for ClinicalTrials.gov. "
    "The question must:\n"
    "1. Begin with 'In ClinicalTrials.gov,' (or similar) to specify scope.\n"
    "2. Cite 3-6 key fields (status, phase, study type, start date, condition, "
    "main interventions, countries/regions) that uniquely pinpoint the trial.\n"
    "3. Avoid long exhaustive lists; summarize interventions/locations clearly "
    "and naturally (e.g., 'compared drug A with placebo', 'across sites in the United States and Canada').\n"
    "4. Use fluent English; no SQL-like phrasing.\n"
    "Example style: \"In ClinicalTrials.gov, what is the NCT number for a completed Phase 3 interventional study "
    "that started in January 2003, investigated CC-5013 plus dexamethasone for multiple myeloma, and recruited "
    "patients in both the United States and Canada?\"\n"
    "Return ONLY the single-sentence question."
)

def build_user_prompt(row):
    """Compose the user message for one trial: an instruction plus metadata JSON.

    ``row`` must provide "NCT ID", "Overall Status", "Start Date",
    "Conditions", "Interventions" and "Locations" through ``row[...]``;
    "Phases" and "Study Type" are read with ``row.get`` and default to "".
    Interventions and locations are shortened by the helper functions before
    being serialised.
    """
    # NOTE: NCT_ID is part of the JSON sent to the model even though the
    # instruction tells it not to mention the ID.
    metadata = dict(
        NCT_ID=row["NCT ID"],
        Overall_Status=row["Overall Status"],
        Phase=row.get("Phases", ""),
        Study_Type=row.get("Study Type", ""),
        Start_Date=pretty_date(row["Start Date"]),
        Conditions=row["Conditions"],
        Interventions=shorten_interventions(row["Interventions"]),
        Locations=first_n_locations(row["Locations"], n=5),
    )
    instruction = (
        "Write ONE natural-language question that would let a person retrieve "
        "this exact clinical trial on ClinicalTrials.gov. Use the metadata JSON below; "
        "do NOT mention the NCT_ID, keep it concise.\n\n"
    )
    payload = json.dumps(metadata, ensure_ascii=False)
    return f"{instruction}metadata = {payload}"

def ask_gpt(metadata_row):
    """Send the system and user messages for one row and return the reply text.

    Relies on the module-level ``client``, ``model_name`` and ``SYSTEM_MSG``.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": build_user_prompt(metadata_row)},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def main():
    """Generate one question per CSV row and save the QA pairs to ``json_out``.

    A row whose generation raises is reported and left out of the output;
    the loop pauses ``sleep_sec`` seconds after every row either way.
    """
    trials = pd.read_csv(csv_path)
    total = len(trials)
    qa_pairs = []

    for i, row in trials.iterrows():
        try:
            pair = {
                "question": ask_gpt(row),
                "answer":   row["NCT ID"]
            }
            qa_pairs.append(pair)
            print(f"[{i+1}/{total}] ✅ 生成完成: {row['NCT ID']}")
        except Exception as e:
            print(f"[{i+1}/{total}] ❌ 失败: {e}")
        # Simple rate limiting between requests.
        time.sleep(sleep_sec)

    with open(json_out, "w", encoding="utf-8") as out_file:
        json.dump(qa_pairs, out_file, ensure_ascii=False, indent=2)
    print(f"\n✨ 已生成 {len(qa_pairs)} 对问答，保存至 {json_out}")

# Run the pipeline when this cell / script is executed directly.
if __name__ == "__main__":
    main()


[1/100] ✅ 生成完成: NCT00056160
[2/100] ✅ 生成完成: NCT00637273
[3/100] ✅ 生成完成: NCT00262080
[4/100] ✅ 生成完成: NCT00446680
[5/100] ✅ 生成完成: NCT00097591
[6/100] ✅ 生成完成: NCT00327691
[7/100] ✅ 生成完成: NCT00117676
[8/100] ✅ 生成完成: NCT00088530
[9/100] ✅ 生成完成: NCT00627926
[10/100] ✅ 生成完成: NCT00287729
[11/100] ✅ 生成完成: NCT00403767
[12/100] ✅ 生成完成: NCT00289640
[13/100] ✅ 生成完成: NCT01133704
[14/100] ✅ 生成完成: NCT00024440
[15/100] ✅ 生成完成: NCT00530348
[16/100] ✅ 生成完成: NCT00213135
[17/100] ✅ 生成完成: NCT00297258
[18/100] ✅ 生成完成: NCT00395135
[19/100] ✅ 生成完成: NCT00152386
[20/100] ✅ 生成完成: NCT00298038
[21/100] ✅ 生成完成: NCT00257608
[22/100] ✅ 生成完成: NCT00391872
[23/100] ✅ 生成完成: NCT00364923
[24/100] ✅ 生成完成: NCT00358150
[25/100] ✅ 生成完成: NCT05428969
[26/100] ✅ 生成完成: NCT00126724
[27/100] ✅ 生成完成: NCT00530816
[28/100] ✅ 生成完成: NCT00340834
[29/100] ✅ 生成完成: NCT00125034
[30/100] ✅ 生成完成: NCT00355134
[31/100] ✅ 生成完成: NCT00287716
[32/100] ✅ 生成完成: NCT00093158
[33/100] ✅ 生成完成: NCT00554216
[34/100] ✅ 生成完成: NCT00179660
[35/100] ✅ 生成完成: NCT000

In [None]:
# tools/ctgov_tool.py
import os, requests

BASE = "https://clinicaltrials.gov/api/v2/studies"

def ctgov_search(filter_expr: str, page_size: int = 100) -> list[str]:
    """Query the ClinicalTrials.gov v2 studies endpoint and return NCT IDs.

    Args:
        filter_expr: Filter string sent as the ``filter`` query parameter,
            typically of the form
            "overallStatus=Completed;conditions=Multiple+Myeloma;phases=Phase+3".
        page_size: Value sent as ``pageSize``; only this single page is read,
            so at most ``page_size`` IDs are returned.

    Returns:
        The ``nctId`` of every study in the response, in response order.

    Raises:
        requests.HTTPError: when the response status signals an error.

    NOTE(review): confirm that the v2 API accepts a ``filter`` parameter in
    this semicolon-separated form.
    """
    query = {"filter": filter_expr, "pageSize": page_size}  # sort, include etc. could be added here
    response = requests.get(BASE, params=query, timeout=20)
    response.raise_for_status()
    nct_ids = []
    for study in response.json().get("studies", []):
        nct_ids.append(study["protocolSection"]["identificationModule"]["nctId"])
    return nct_ids


In [None]:
# tools/ctgov_schema.py
# Function-calling schema describing the `ctgov_search` tool: one required
# string parameter (`filter_expr`) and one optional integer (`page_size`).
ctgov_schema = dict(
    name="ctgov_search",
    description=(
        "Search ClinicalTrials.gov v2 and return a list of NCT IDs that match "
        "the given filter expression."
    ),
    parameters=dict(
        type="object",
        properties=dict(
            filter_expr=dict(
                type="string",
                description=(
                    "Semicolon-separated filter expression, e.g. "
                    "\"overallStatus=Completed;conditions=Multiple+Myeloma;phases=Phase+3\""
                ),
            ),
            page_size=dict(
                type="integer",
                description="Maximum number of studies to return",
                default=100,
            ),
        ),
        required=["filter_expr"],
    ),
)


In [6]:
"""
build_ctgov_qa_dataset_one_call_minimal_filter.py
-------------------------------------------------
Single GPT call returns question & filter_expr.
Only the constraints mentioned in question are kept.
"""

import os, json, time, re, hashlib, pandas as pd
from dateutil import parser
from openai import AzureOpenAI
from tqdm.auto import tqdm

# ---------- Azure OpenAI ----------
# SECURITY: the Azure key is read from the environment (or prompted for) and is
# never stored in the notebook. A key that was previously written in this cell
# should be treated as leaked and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("AZURE_OPENAI_API_KEY: ")
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://ai-mistraleastus2753718354821.openai.azure.com/"

client = AzureOpenAI(
    api_key       = os.getenv("AZURE_OPENAI_API_KEY"),
    api_version   = "2024-12-01-preview",
    azure_endpoint= os.getenv("AZURE_OPENAI_ENDPOINT"),
)

MODEL      = "gpt-4.1-noah"                                 # model name passed to the chat API
CSV_PATH   = "../source/clinical_trials_data_complete.csv"  # input trial metadata
OUT_JSON   = "ctgov_qa_dataset.json"                        # output dataset file
SLEEP_SEC  = 1.0                                            # pause between requests (simple rate limiting)

# ---------- helper ----------
def pretty_date(d):
    """Format a start date as "Month YYYY" for the prompt.

    Args:
        d: Date value from the CSV (string, or NaN/None when missing).

    Returns:
        "" for missing values; "Month YYYY" for "YYYY-MM-DD" and "YYYY-MM";
        a bare four-digit year unchanged; other layouts are handed to
        dateutil; ``str(d)`` when nothing can parse the value.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    try:
        if pd.isna(d):
            return ""
        text = str(d).strip()
        # Parse the ISO layouts strictly so that no field is guessed.
        for fmt in ("%Y-%m-%d", "%Y-%m"):
            try:
                return datetime.strptime(text, fmt).strftime("%B %Y")
            except ValueError:
                pass
        # dateutil fills a missing month from today's date, which would invent
        # a start month for a year-only value - keep the year as it is.
        if len(text) == 4 and text.isdigit():
            return text
        return parser.parse(text).strftime("%B %Y")
    except Exception:
        return str(d)

# System prompt: asks the model for a JSON object with exactly two keys,
# `question` (the natural-language question) and `filter_expr` (a
# semicolon-separated key=value string restricted to what the question states).
# NOTE(review): the filter key names listed below are this prompt's own
# vocabulary - confirm they match what the consuming search tool expects.
SYSTEM_MSG = (
    "You are given structured metadata of a clinical trial. "
    "Return a JSON object with exactly two keys:\n\n"
    "1) `question`  – ONE concise English question that starts with "
    "\"In ClinicalTrials.gov,\" and **explicitly** states some (not all) of: "
    "status, phase, study type, start month+year, condition(s), main intervention,"
    " and at least one country. Do NOT reveal the NCT ID.\n\n"
    "2) `filter_expr` – a ClinicalTrials.gov v2 filter string that contains "
    "**only** those keys that your question explicitly mentioned. "
    "Use keys: overallStatus, phases, studyType, conditions, interventions.name, "
    "startDateFrom (YYYY-MM-01), locations.country.  "
    "If a key is NOT mentioned verbatim or semantically in the question, DO NOT include it. "
    "Separate pairs with semicolons.  \n\n"
    "Return ONLY the JSON."
)

def gpt_generate(row: pd.Series) -> dict:
    """Ask the model for a question and filter expression for one trial row.

    The row's status, phase, type, start date, conditions, interventions and
    locations are sent as one "key=value; ..." line; interventions and
    locations are cut to their first 120 characters. The reply is requested
    in JSON mode and returned as the parsed object.
    """
    parts = [
        f"status={row['Overall Status']}",
        f"phase={row['Phases']}",
        f"type={row['Study Type']}",
        f"start={pretty_date(row['Start Date'])}",
        f"conditions={row['Conditions']}",
        f"interventions={row['Interventions'][:120]}",
        f"locations={row['Locations'][:120]}",
    ]
    user_msg = "; ".join(parts)
    conversation = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_msg},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    raw_json = completion.choices[0].message.content
    return json.loads(raw_json)

# ---------- automatic validation & trimming ----------
# Map: filter key -> regex deciding whether that kind of constraint is
# mentioned in the question.
# NOTE: the `conditions` and `interventions.name` patterns are applied with
# re.I, so they match practically any text; those two keys are never trimmed.
REGEX_MAP = {
    "overallStatus":      r"(Completed|Recruiting|Not yet recruiting|Enrolling by invitation|Active,? not recruiting|Unknown status|Suspended|Terminated|Withdrawn)",
    "phases":             r"(Phase\s*0|Phase\s*1/2|Phase\s*1|Phase\s*2/3|Phase\s*2|Phase\s*3|Phase\s*4)",
    "studyType":          r"(Interventional|Observational|Expanded Access)",
    "conditions":         r"(?:[A-Z][a-zA-Z0-9\- ]+)",
    "interventions.name": r"(?:[A-Z][a-zA-Z0-9\-\+ ]+)",
    "startDateFrom":      r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}",
    "locations.country":  r"(United States|Canada|China|Japan|France|Germany|Italy|Spain|United Kingdom|Australia|Mexico|Brazil|India)"
}

def _value_mentioned(value: str, question: str) -> bool:
    """Return True when the filter value itself occurs in the question as a whole word or phrase."""
    text = value.strip()
    if not text:
        return False
    # Filter values may use '+' (e.g. "Multiple+Myeloma") or '_' in place of a
    # space, so the spaced-out form is tried as well.
    candidates = {text, re.sub(r"[+_]", " ", text)}
    return any(
        re.search(r"(?<!\w)" + re.escape(candidate) + r"(?!\w)", question, flags=re.I)
        for candidate in candidates
    )

def trim_filter_expr(question: str, filter_expr: str) -> str:
    """
    Remove key=value pairs whose key is not actually mentioned in the question.

    A pair is kept when its key is known to REGEX_MAP and either the key's
    pattern matches the question or the pair's own value appears in it. The
    second check covers values outside the fixed lists above (for example a
    country that is not enumerated). Pairs without "=" and pairs with an
    unknown key are dropped. Kept pairs are stripped and re-joined with ";".
    """
    kept = []
    for pair in filter_expr.split(";"):
        if "=" not in pair:
            continue
        key, val = pair.split("=", 1)
        regex = REGEX_MAP.get(key.strip())
        if not regex:
            continue
        if re.search(regex, question, flags=re.I) or _value_mentioned(val, question):
            kept.append(pair.strip())
    return ";".join(kept)

# ---------- build dataset ----------
df      = pd.read_csv(CSV_PATH)
samples = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Building QA"):
    try:
        data   = gpt_generate(row)
        q      = data["question"].strip()
        filt   = data["filter_expr"].strip()

        # An empty question cannot be repaired below; fail with a clear message.
        if not q:
            raise ValueError("model returned an empty question")

        # Belt and braces: enforce the required prefix.
        if not q.lower().startswith("in clinicaltrials.gov"):
            q = "In ClinicalTrials.gov, " + q[0].lower() + q[1:]

        # Trim the filter so it only keeps constraints the question mentions.
        filt = trim_filter_expr(q, filt)

        samples.append({
            # NOTE(review): the id is derived from the filter only, so two
            # trials with the same trimmed filter share an id - confirm this
            # is intended.
            "id": hashlib.md5(filt.encode()).hexdigest()[:16],
            "question": q,
            "tool_calls": [{
                "tool": "ctgov_search",
                "params": {"filter_expr": filt, "page_size": 100}
            }],
            "answer": [row["NCT ID"]]
        })

    except Exception as e:
        tqdm.write(f"⚠️  {row['NCT ID']} failed: {e}")
    finally:
        # Pause after every request, failures included, so that a burst of
        # errors does not hammer the API.
        time.sleep(SLEEP_SEC)

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump({"dataset": samples}, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved {len(samples)} samples → {OUT_JSON}")


Building QA:   0%|          | 0/100 [00:00<?, ?it/s]


✅ Saved 100 samples → ctgov_qa_dataset.json
