In [None]:
# Read the target table and keep its `target_id` column as a plain Python list.
# The loops in the following cells iterate over `target_ids`.
import pandas as pd 
df = pd.read_csv('../source/target_ids.csv')
target_ids = df['target_id'].tolist()

In [None]:
import requests
import json

# `target_ids` is not defined in this cell; it is expected to exist already
# (loaded from the CSV in the cell above).

# GraphQL endpoint of the OpenTargets Platform API, read by query_associated_diseases().
url = "https://api.platform.opentargets.org/api/v4/graphql"

def query_associated_diseases(target_id):
    """Fetch the diseases OpenTargets associates with one target.

    Args:
        target_id: Ensembl gene ID; it is interpolated into the GraphQL query.

    Returns:
        ``(approved_symbol, rows)`` where ``rows`` is the list found under
        ``associatedDiseases.rows`` (each row has ``disease`` and ``score``),
        or ``None`` when the request fails, the HTTP status is not 200, the
        body is not JSON, or the response contains no target.
    """
    query = {
        "query": f"""
        {{
          target(ensemblId: "{target_id}") {{
            id
            approvedSymbol
            associatedDiseases {{
              count
              rows {{
                disease {{
                  id
                  name
                }}
                score
              }}
            }}
          }}
        }}
        """
    }

    # A timeout keeps one stalled connection from hanging the whole batch, and
    # a transport error is reported as "no result" instead of aborting the loop.
    try:
        response = requests.post(url, json=query, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Error: request for {target_id} failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        print(f"No data found for target_id: {target_id}")
        return None

    # The "data" and "target" keys can be present with a null value (for
    # example when the ID is unknown), so test the values, not key membership.
    payload = data.get('data') if isinstance(data, dict) else None
    target = payload.get('target') if isinstance(payload, dict) else None
    if not target:
        print(f"No data found for target_id: {target_id}")
        return None

    diseases = (target.get('associatedDiseases') or {}).get('rows') or []
    approved_symbol = target.get('approvedSymbol', None)
    return approved_symbol, diseases

# One {"question", "answer"} record is collected per target that returned data.
all_question_answer_pairs = []

for target_id in target_ids:
    result = query_associated_diseases(target_id)

    # None signals that the lookup failed or produced nothing for this target.
    if result is None:
        print(f"No data returned for target_id: {target_id}")
        continue

    approved_symbol, associated_diseases = result

    question = f"In the OpenTargets database, please retrieve the diseases associated with the target having target_id as {target_id} and approvedSymbol as {approved_symbol}, where the relevance score is greater than 0.5. Please provide the disease id, name, and relevance score for each associated disease."

    if associated_diseases:
        # Keep rows scoring strictly above 0.5; this can leave an empty list.
        answer = [
            {
                "disease_id": row['disease']['id'],
                "disease_name": row['disease']['name'],
                "score": row['score'],
            }
            for row in associated_diseases
            if row['score'] > 0.5
        ]
    else:
        answer = "no associated diseases found"

    all_question_answer_pairs.append({"question": question, "answer": answer})

# Persist every pair in one JSON file.
with open("associated_disease.json", "w") as out_file:
    json.dump(all_question_answer_pairs, out_file, indent=4)

print("All question-answer pairs have been saved to associated_disease.json.")


In [9]:
import requests
import json

# Define a function to query the OpenTargets API for tractability information
def query_target_tractability(target_id):
    """Fetch the tractability entries OpenTargets reports for one target.

    Args:
        target_id: Ensembl gene ID; it is interpolated into the GraphQL query.

    Returns:
        ``(approved_symbol, tractability)`` where ``tractability`` is a list
        of ``{modality, label, value}`` dicts (empty when the API reports
        none), or ``None`` when the request fails, the HTTP status is not 200,
        the body is not JSON, or the response contains no target.
    """
    url = "https://api.platform.opentargets.org/api/v4/graphql"

    query = {
        "query": f"""
        {{
          target(ensemblId: "{target_id}") {{
            id
            approvedSymbol
            tractability {{
              modality
              label
              value
            }}
          }}
        }}
        """
    }

    # A timeout keeps one stalled connection from hanging the whole batch, and
    # a transport error is reported as "no result" instead of aborting the loop.
    try:
        response = requests.post(url, json=query, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Error: request for {target_id} failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        print(f"No data found for target_id: {target_id}")
        return None

    # The "data" and "target" keys can be present with a null value (for
    # example when the ID is unknown), so test the values, not key membership.
    payload = data.get('data') if isinstance(data, dict) else None
    target = payload.get('target') if isinstance(payload, dict) else None
    if not target:
        print(f"No data found for target_id: {target_id}")
        return None

    tractability = target.get('tractability') or []
    approved_symbol = target.get('approvedSymbol', None)
    return approved_symbol, tractability


# One {"question", "answer"} record is collected per target that returned data.
all_question_answer_pairs = []

for target_id in target_ids:
    result = query_target_tractability(target_id)

    # None signals that the lookup failed or produced nothing for this target.
    if result is None:
        print(f"No data returned for target_id: {target_id}")
        continue

    approved_symbol, tractability = result

    question = f"In the OpenTargets database, please retrieve the tractability information for the target with target ID {target_id} and approved symbol {approved_symbol}. For each modality, provide the label where the value is `True`."

    if tractability:
        # Keep only the entries flagged True; this can leave an empty list.
        answer = [
            {
                "modality": entry['modality'],
                "label": entry['label'],
                "value": entry['value'],
            }
            for entry in tractability
            if entry['value'] == True
        ]
    else:
        answer = "No tractability information found."

    all_question_answer_pairs.append({"question": question, "answer": answer})

# Persist every pair in one JSON file.
with open("target_tractability.json", "w") as out_file:
    json.dump(all_question_answer_pairs, out_file, indent=4)

print("All question-answer pairs have been saved to target_tractability.json.")


All question-answer pairs have been saved to target_tractability.json.


In [None]:
import os, json, time, uuid, requests, openai
from typing import List, Dict
import re
import requests
from tqdm import tqdm
from openai import OpenAI
OT_ENDPOINT = "https://api.platform.opentargets.org/api/v4/graphql"

def fetch_safety_liabilities(gene_id: str):
    """Fetch a target's safety liabilities from OpenTargets, narrowed to one kind of event.

    Args:
        gene_id: Ensembl gene ID; it is interpolated into the GraphQL query.

    Returns:
        The ``target`` node of the response with ``safetyLiabilities`` reduced
        to every entry whose event is "death" (case-insensitive) when at least
        one exists, otherwise to the first entry only. ``None`` when the
        request fails, the body is not JSON, or the response has no target or
        no safety liabilities.
    """
    query = {
        "query": f"""
        {{
          target(ensemblId: "{gene_id}") {{
            id
            approvedSymbol
            safetyLiabilities {{
              event
              eventId
              biosamples {{
                tissueLabel
                tissueId
              }}
              effects {{
                dosing
                direction
              }}
            }}
          }}
        }}
        """
    }

    try:
        resp = requests.post(
            OT_ENDPOINT,
            json=query,
            timeout=30,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[error] {gene_id} → {e}")
        return None

    # "data" and "target" may be present but null, so `.get(key, {})` is not
    # enough: the default only applies when the key is missing.
    data = payload.get("data") if isinstance(payload, dict) else None
    tgt = data.get("target") if isinstance(data, dict) else None
    if not tgt or not tgt.get("safetyLiabilities"):
        return None

    # -------- keep only "death" entries, or else the first entry ----------
    liabs = tgt["safetyLiabilities"]
    death_only = [li for li in liabs if (li.get("event") or "").lower() == "death"]
    tgt["safetyLiabilities"] = death_only or [liabs[0]]

    return tgt



# ---------- 2. Build the GPT prompt ----------
# System message placed first in every chat built by build_messages(): it asks
# the model for a single JSON object with 'question' and 'answer' keys.
SYSTEM_MSG = {
    "role": "system",
    "content": (
        "You are an expert biomedical assistant.\n"
        "You will receive a **single safetyLiability** (only 'death' if present, "
        "otherwise the first event) for a target.\n"
        "Return ONE JSON object with keys 'question' and 'answer'.\n"
        "• The 'question' must be a natural-language query limited to the OpenTargets database, "
        "mentioning the target symbol+ID **and asking specifically about the provided event name**.\n"
        "• The 'answer' must respond concisely using ONLY the JSON, clearly listing tissues, dosing, etc. "
        "If no biosample/effect field exists, state that absence.\n"
        "Output nothing except the JSON (no markdown fences)."
    )
}
def build_messages(raw_json: Dict) -> List[Dict]:
    """Return the chat messages for one target.

    The list holds the shared system message followed by a single user turn
    that embeds ``raw_json`` (pretty-printed with a 2-space indent) inside a
    fenced JSON block.
    """
    pretty = json.dumps(raw_json, indent=2)
    user_turn = {
        "role": "user",
        "content": f"Here is the OpenTargets JSON for one target:\n```json\n{pretty}\n```",
    }
    return [SYSTEM_MSG, user_turn]
def parse_json_or_raise(text: str) -> Dict:
    """Decode a model reply as JSON after removing Markdown code-fence markers.

    Fence markers (three backticks, optionally followed by "json" in any
    letter case) are removed where they start a line, and bare fences where
    they end a line. ``json.loads`` raises if the remainder is not valid JSON.
    """
    fence = re.compile(r"^```(?:json)?|```$", re.I | re.M)
    unfenced = fence.sub("", text.strip())
    return json.loads(unfenced.strip())

def generate_qa(raw_json: Dict) -> Dict:
    """Ask the chat model for one question/answer pair describing ``raw_json``.

    Args:
        raw_json: Target node as returned by fetch_safety_liabilities().

    Returns:
        The model reply decoded by parse_json_or_raise().

    Raises:
        RuntimeError: if the OPENAI_API_KEY environment variable is unset.
        Whatever the OpenAI client or parse_json_or_raise() raise otherwise.
    """
    messages = build_messages(raw_json)

    # Read the key from the environment rather than embedding a secret in the
    # notebook; strip() guards against stray whitespace around the value.
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")

    client = OpenAI(
        base_url="https://api.chatanywhere.tech",
        api_key=api_key,
    )
    chat = client.chat.completions.create(
        model="gpt-4o-mini",  # or another GPT-4-class model
        messages=messages,
        temperature=0.2,
        max_tokens=400,
        timeout=60
    )
    # The reply is expected to be valid JSON already; decode and return it.
    return parse_json_or_raise(chat.choices[0].message.content)

# ---------- 3. Main pipeline ----------
def build_benchmark(target_ids: List[str], outfile: str = "target_safety.json"):
    """Generate one QA pair per target that has safety data and save them as JSON.

    Targets for which fetch_safety_liabilities() returns nothing are skipped.
    Any exception raised while handling a target is reported through
    ``tqdm.write`` and processing continues with the next target. The list of
    QA pairs is written to ``outfile`` as UTF-8 JSON.
    """
    qa_list = []

    progress = tqdm(target_ids, desc="Fetching & generating QA")
    for tid in progress:
        try:
            result = fetch_safety_liabilities(tid)
            if result:
                # Only targets with safety information reach the model.
                qa_list.append(generate_qa(result))
                time.sleep(1)  # stay under the rate limit
        except Exception as e:
            tqdm.write(f"[×] {tid} error: {e}")

    with open(outfile, "w", encoding="utf-8") as f:
        json.dump(qa_list, f, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(qa_list)} QA pairs → {outfile}")
    #read csv file
# Load the target IDs and build the safety benchmark for all of them.
# NOTE(review): this reads 'target_ids.csv' from the working directory, while
# the other cells read '../source/target_ids.csv' — confirm which is intended.
import pandas as pd
df = pd.read_csv('target_ids.csv')
# Convert the `target_id` column to a list
target_ids = df['target_id'].tolist()
build_benchmark(target_ids)


In [10]:
# Read the target table and keep its `target_id` column as a plain Python list.
import pandas as pd 
df = pd.read_csv('../source/target_ids.csv')
target_ids = df['target_id'].tolist()

In [12]:
"""
build_opentargets_qa.py
-----------------------
For each Ensembl target_id:
• query OpenTargets for associated diseases
• single GPT call creates a natural question + filter params
• output QA dataset with tool_calls
"""

import os, json, time, hashlib, requests
from typing import List
from tqdm.auto import tqdm
from openai import AzureOpenAI   # 若用 Azure → from openai import AzureOpenAI

# ---------- OpenAI client ----------
# Credentials come from the environment so that no secret is stored in the
# notebook. AZURE_OPENAI_API_KEY must be set before this cell runs; the
# endpoint falls back to the project's default when it is not set.
os.environ.setdefault(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-mistraleastus2753718354821.openai.azure.com/",
)
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError("AZURE_OPENAI_API_KEY is not set; export it before running this cell.")

# ========= 0. Azure OpenAI configuration =========
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Passed as `model=` to chat.completions.create();
# presumably the Azure deployment name — TODO confirm.
MODEL = "gpt-4.1-noah"

# ---------- OpenTargets GraphQL ----------
OT_URL = "https://api.platform.opentargets.org/api/v4/graphql"

def fetch_associated_diseases(target_id: str, min_score=0.5) -> tuple[str, List[dict]]:
    """Return a target's approved symbol and its disease rows scoring at least ``min_score``.

    Args:
        target_id: Ensembl gene ID; it is interpolated into the GraphQL query.
        min_score: Inclusive lower bound applied to each row's ``score``.

    Returns:
        ``(approved_symbol, rows)``; each row has ``disease`` (``id``,
        ``name``) and ``score``.

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: when the response contains no target for ``target_id``.

    NOTE(review): only the rows in the API's default response are filtered;
    no paging arguments are sent — confirm whether that covers all rows.
    """
    query = {
        "query": f"""
        {{
          target(ensemblId: "{target_id}") {{
            approvedSymbol
            associatedDiseases {{
              rows {{
                disease {{ id name }}
                score
              }}
            }}
          }}
        }}
        """
    }
    r = requests.post(OT_URL, json=query, timeout=20)
    r.raise_for_status()

    # "data" and "target" may be present but null (e.g. unknown ID); fail with
    # a readable message instead of "'NoneType' object is not subscriptable".
    data = (r.json().get("data") or {}).get("target")
    if not data:
        raise ValueError(f"OpenTargets returned no target for {target_id}")

    symbol = data["approvedSymbol"]
    all_rows = (data.get("associatedDiseases") or {}).get("rows") or []
    rows = [d for d in all_rows if d["score"] >= min_score]
    return symbol, rows

# ---------- GPT prompt ----------
# System prompt used by gpt_build_question(): the model must reply with a JSON
# object holding `question` and `filter`.
# NOTE(review): later cells assign a different prompt to the same name `SYSTEM`.
SYSTEM = (
    "You are creating English QA pairs for an agent dataset. "
    "Return a JSON object with:\n"
    "• question – ONE natural sentence that asks for the diseases associated "
    "  with a given target in OpenTargets. It must mention the target’s Ensembl "
    "  ID, its approved symbol, and the minimum relevance score.\n"
    "• filter   – a JSON object containing only the keys that appear in the "
    "  question: target_id, min_score. "
    "Return ONLY the JSON."
)

def gpt_build_question(target_id: str, symbol: str, min_score: float) -> dict:
    """Ask the chat model for a question and filter describing one target.

    The three arguments are sent as a single ``key=value; ...`` user message
    under the ``SYSTEM`` prompt, and the JSON reply is decoded and returned.
    """
    chat_messages = [
        {"role": "system", "content": SYSTEM},
        {
            "role": "user",
            "content": f"target_id={target_id}; symbol={symbol}; min_score={min_score}",
        },
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
        temperature=0.3,
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

# ---------- main ----------
# Load the Ensembl target IDs to process.
import pandas as pd 
df = pd.read_csv('../source/target_ids.csv')
target_ids = df['target_id'].tolist()

samples = []

for tid in tqdm(target_ids, desc="Building OpenTargets QA"):
    try:
        symbol, diseases = fetch_associated_diseases(tid, min_score=0.5)
        if diseases:
            gpt_out = gpt_build_question(tid, symbol, 0.5)
            question = gpt_out["question"].strip()
            filt = gpt_out["filter"]  # expected shape: {"target_id": "...", "min_score": 0.5}

            # Flatten each disease row into the answer schema.
            answer = []
            for row in diseases:
                answer.append({
                    "disease_id": row["disease"]["id"],
                    "disease_name": row["disease"]["name"],
                    "score": row["score"],
                })

            # The sample id is the first 16 hex characters of md5(target id).
            sample_id = hashlib.md5(tid.encode()).hexdigest()[:16]
            tool_call = {"tool": "opentargets.search", "params": filt}
            samples.append({
                "id": sample_id,
                "question": question,
                "tool_calls": [tool_call],
                "answer": answer,
            })
            time.sleep(0.8)  # rate-limit
        else:
            tqdm.write(f"⚠️ {tid} has no diseases ≥0.5")
    except Exception as e:
        tqdm.write(f"❌ {tid} failed: {e}")

with open("opentargets_qa_dataset.json", "w", encoding="utf-8") as f:
    json.dump({"dataset": samples}, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved {len(samples)} samples → opentargets_qa_dataset.json")


Building OpenTargets QA:   0%|          | 0/121 [00:00<?, ?it/s]

⚠️ ENSG00000242950 has no diseases ≥0.5
⚠️ ENSG00000101144 has no diseases ≥0.5

✅ Saved 119 samples → opentargets_qa_dataset.json


In [14]:
"""
build_opentargets_tractability_qa.py
------------------------------------
Generate QA samples for OT tractability:
• question  (natural, single sentence, “In the OpenTargets platform, …”)
• tool_calls (opentargets.tractability with target_id, value=True)
• answer     (modality + label list)
"""

import os, json, time, hashlib, requests
from typing import List
from tqdm.auto import tqdm
from openai import OpenAI         # 若用 Azure -> AzureOpenAI

# ---------------- OpenAI client ----------------
# This cell's import line only brings in `OpenAI`; import the class that is
# actually used so the cell also runs on a fresh kernel.
from openai import AzureOpenAI

# Credentials come from the environment so that no secret is stored in the
# notebook. AZURE_OPENAI_API_KEY must be set before this cell runs; the
# endpoint falls back to the project's default when it is not set.
os.environ.setdefault(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-mistraleastus2753718354821.openai.azure.com/",
)
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError("AZURE_OPENAI_API_KEY is not set; export it before running this cell.")

# ========= 0. Azure OpenAI configuration =========
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Passed as `model=` to chat.completions.create();
# presumably the Azure deployment name — TODO confirm.
MODEL = "gpt-4.1-noah"
SLEEP    = 1.0                 # seconds to pause between targets (rate limiting)

# ---------------- OpenTargets GraphQL -----------
OT_URL = "https://api.platform.opentargets.org/api/v4/graphql"

def fetch_tractability(target_id: str) -> tuple[str, List[dict]]:
    """Return ``(approved_symbol, rows)`` for one target.

    ``rows`` contains the ``{modality, label, value}`` tractability entries
    whose ``value`` is exactly ``True``; it is empty when the API reports no
    tractability data.

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: when the response contains no target for ``target_id``.
    """
    q = {
        "query": f"""
        {{
          target(ensemblId: "{target_id}") {{
            approvedSymbol
            tractability {{
              modality
              label
              value
            }}
          }}
        }}"""
    }
    r = requests.post(OT_URL, json=q, timeout=20)
    r.raise_for_status()

    # "data" and "target" may be present but null (e.g. unknown ID); fail with
    # a readable message instead of "'NoneType' object is not subscriptable".
    trg = (r.json().get("data") or {}).get("target")
    if not trg:
        raise ValueError(f"OpenTargets returned no target for {target_id}")

    # A null tractability list is treated as "no entries".
    rows = [d for d in (trg.get("tractability") or []) if d["value"] is True]
    return trg["approvedSymbol"], rows

# ---------------- GPT prompt -------------------
# System prompt used by gpt_question_and_filter(): the model must reply with a
# JSON object holding `question` and `filter`.
# NOTE(review): other cells assign a different prompt to the same name `SYSTEM`.
SYSTEM = (
    "You are creating English QA pairs for an agent dataset. "
    "Return **only** a JSON object with two keys:\n"
    "• question – ONE natural sentence that starts with "
    "\"In the OpenTargets platform,\" and asks for the tractability "
    "information for a given target. It must state the target's Ensembl ID "
    "and its approved symbol, and require that `value` is True. "
    "Do NOT reveal any JSON keys or answer.\n"
    "• filter   – an object that includes exactly the keys explicitly "
    "mentioned in the question. Here they are target_id and value. "
    "`value` must be true.\n\n"
    "Return ONLY the JSON object."
)

def gpt_question_and_filter(tid: str, symbol: str) -> dict:
    """Ask the chat model for a tractability question and its filter.

    The target id and symbol are sent as one ``key=value; ...`` user message
    under the ``SYSTEM`` prompt, and the JSON reply is decoded and returned.
    """
    chat_messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": f"target_id={tid}; symbol={symbol}"},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

# ---------------- build dataset ----------------
# Load the Ensembl target IDs to process.
import pandas as pd 
df = pd.read_csv('../source/target_ids.csv')
target_ids = df['target_id'].tolist()

dataset = []

for tid in tqdm(target_ids, desc="Building OT-Tractability QA"):
    try:
        symbol, rows = fetch_tractability(tid)
        if rows:
            gpt_out = gpt_question_and_filter(tid, symbol)
            question = gpt_out["question"].strip()
            filt = gpt_out["filter"]  # expected shape: {"target_id": "...", "value": true}

            # Fallback: prepend the required prefix when the model omitted it.
            has_prefix = question.lower().startswith("in the opentargets")
            if not has_prefix:
                question = f"In the OpenTargets platform, {question[0].lower()}{question[1:]}"

            # The answer keeps only modality and label of each True entry.
            answer = []
            for entry in rows:
                answer.append({"modality": entry["modality"], "label": entry["label"]})

            sample_id = hashlib.md5(tid.encode()).hexdigest()[:16]
            tool_call = {"tool": "opentargets.tractability", "params": filt}
            dataset.append({
                "id": sample_id,
                "question": question,
                "tool_calls": [tool_call],
                "answer": answer,
            })
            time.sleep(SLEEP)
        else:
            tqdm.write(f"⚠️ {tid}: no tractability value=True")
    except Exception as e:
        tqdm.write(f"❌ {tid} failed: {e}")

with open("ot_tractability_qa.json", "w", encoding="utf-8") as f:
    json.dump({"dataset": dataset}, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved {len(dataset)} samples → ot_tractability_qa.json")


Building OT-Tractability QA:   0%|          | 0/121 [00:00<?, ?it/s]


✅ Saved 121 samples → ot_tractability_qa.json


In [19]:
"""
build_ot_safety_qa.py
---------------------
Generate QA pairs about safety liabilities (death / first event) for targets.
Each sample: question + tool_calls(opentargets.safety) + answer
"""

import os, json, time, hashlib, re, requests, pandas as pd
from typing import List, Dict,Optional
from tqdm.auto import tqdm
from openai import OpenAI   # 若用 Azure → AzureOpenAI

# ---------------- OpenAI client ----------------
# This cell's import line only brings in `OpenAI`; import the class that is
# actually used so the cell also runs on a fresh kernel.
from openai import AzureOpenAI

# Credentials come from the environment so that no secret is stored in the
# notebook. AZURE_OPENAI_API_KEY must be set before this cell runs; the
# endpoint falls back to the project's default when it is not set.
os.environ.setdefault(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-mistraleastus2753718354821.openai.azure.com/",
)
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError("AZURE_OPENAI_API_KEY is not set; export it before running this cell.")

# ========= 0. Azure OpenAI configuration =========
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Passed as `model=` to chat.completions.create();
# presumably the Azure deployment name — TODO confirm.
MODEL = "gpt-4.1-noah"
SLEEP    = 1.0                 # rate limiting (seconds); not read in this cell
RATE    = 1.0               # seconds slept after each generated sample in build_dataset()

# -------- OpenTargets GraphQL -----
OT_URL = "https://api.platform.opentargets.org/api/v4/graphql"

from typing import Optional, Dict, List

def pick_best_row(rows: List[Dict]) -> Optional[Dict]:
    """Choose the safety-liability entry to build a QA sample from.

    Selection order:
      1. Keep only entries with a non-empty ``biosamples`` or ``effects``.
      2. Among those, return the first whose event is "death" (case-insensitive).
      3. Otherwise return the first entry that has details.

    Args:
        rows: ``safetyLiabilities`` entries (dicts) of one target.

    Returns:
        The chosen entry (the same dict object), or ``None`` when no entry
        carries any detail.
    """
    # (1) entries that carry some detail
    detail_rows = [r for r in rows if r.get("biosamples") or r.get("effects")]
    if not detail_rows:
        return None

    # (2) "death" takes priority; a missing or null event is simply not a
    # match (calling .lower() on it used to raise).
    for r in detail_rows:
        if (r.get("event") or "").lower() == "death":
            return r
    return detail_rows[0]


def fetch_safety(target_id: str) -> Optional[Dict]:
    """Fetch one target's safety liabilities and return the selected entry.

    Returns a dict with ``symbol``, ``event``, ``biosamples`` and ``effects``
    taken from the entry chosen by pick_best_row(). Returns ``None`` when the
    target has no safety liabilities, when none of them carries details, or
    when anything raises (the error is reported through ``tqdm.write``).
    """
    graphql = {
        "query": f"""
        {{
          target(ensemblId: "{target_id}") {{
            approvedSymbol
            safetyLiabilities {{
              event
              eventId
              biosamples {{ tissueLabel tissueId }}
              effects    {{ dosing direction }}
            }}
          }}
        }}"""
    }
    try:
        resp = requests.post(OT_URL, json=graphql, timeout=30)
        resp.raise_for_status()
        target = resp.json()["data"]["target"]
        rows = target["safetyLiabilities"]

        # No entries at all, or none with details, both mean "skip this target".
        best = pick_best_row(rows) if rows else None
        if best is None:
            return None

        selected = {"symbol": target["approvedSymbol"]}
        selected["event"] = best["event"]
        selected["biosamples"] = best.get("biosamples", [])
        selected["effects"] = best.get("effects", [])
        return selected
    except Exception as e:
        tqdm.write(f"[error] {target_id}: {e}")
        return None


# -------- GPT prompt -------------
# System prompt used by gpt_create(): the model must reply with a JSON object
# holding `question`, `filter` and `answer`.
# NOTE(review): other cells assign a different prompt to the same name `SYSTEM`.
# NOTE(review): the `answer` line ends with "'effects'." and the next literal
# starts with "(" — the two are concatenated without a space; confirm intent.
SYSTEM = (
    "You build English QA pairs for a biomedical agent dataset.\n"
    "Return **only** a JSON object with keys:\n"
    "• question – ONE natural sentence that starts with "
    "\"In the OpenTargets platform,\" and asks specifically for the given "
    "safety liability event of the target (using its Ensembl ID and symbol).\n"
    "• filter   – object with exactly the keys that appear in the question. "
    "Here they are `target_id` and `event`.\n"
    "• answer   – a JSON object with keys 'event', 'biosamples', 'effects'."
    "(tissues, dosing, direction; say 'none' if missing).\n"
    "No markdown fences."
)

def gpt_create(sample: Dict, tid: str) -> Dict:
    """Ask the chat model for question, filter and answer for one safety event.

    The user message is a JSON document with ``target_id`` followed by the
    ``symbol``, ``event``, ``biosamples`` and ``effects`` fields of ``sample``.
    The JSON reply is decoded and returned.
    """
    payload = {"target_id": tid}
    for field in ("symbol", "event", "biosamples", "effects"):
        payload[field] = sample[field]
    user = json.dumps(payload, ensure_ascii=False)

    chat_messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

# -------- Build dataset ----------
def build_dataset(target_ids: List[str], out="ot_safety_qa.json"):
    """Create safety-liability QA samples for ``target_ids`` and save them.

    Targets for which fetch_safety() returns nothing are skipped. For the
    others the model output is wrapped with an id and a tool call; an error
    during that step is reported through ``tqdm.write`` and the target is
    dropped. The result is written to ``out`` as ``{"dataset": [...]}``.
    """
    data = []
    for tid in tqdm(target_ids, desc="Generating OT-safety QA"):
        sample = fetch_safety(tid)
        if sample:
            try:
                gpt_out = gpt_create(sample, tid)
                q = gpt_out["question"].strip()
                filt = gpt_out["filter"]  # expected shape: {"target_id": "...", "event": "..."}
                answer = gpt_out["answer"]

                # Prepend the required prefix when the model omitted it.
                has_prefix = q.lower().startswith("in the opentargets")
                if not has_prefix:
                    q = f"In the OpenTargets platform, {q[0].lower()}{q[1:]}"

                # The id hashes target id and event together.
                digest = hashlib.md5((tid+sample['event']).encode()).hexdigest()
                tool_call = {"tool": "opentargets.safety", "params": filt}
                data.append({
                    "id": digest[:16],
                    "question": q,
                    "tool_calls": [tool_call],
                    "answer": answer,
                })
                time.sleep(RATE)
            except Exception as e:
                tqdm.write(f"[GPT error] {tid}: {e}")

    with open(out, "w", encoding="utf-8") as f:
        json.dump({"dataset": data}, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Saved {len(data)} samples → {out}")

# -------- run --------
# NOTE(review): only the first three target IDs are processed (`[:3]`), which
# looks like a smoke-test limit — confirm before producing the full dataset.
if __name__ == "__main__":
    df = pd.read_csv('../source/target_ids.csv')
    target_ids = df['target_id'].tolist()[:3]
    build_dataset(target_ids)


Generating OT-safety QA:   0%|          | 0/3 [00:00<?, ?it/s]


✅ Saved 2 samples → ot_safety_qa.json


In [20]:
"""
polish_and_enrich_ot_safety_qa.py
---------------------------------
Read raw_qa.json  (question+answer only)  →
Call GPT once per item →
Return polished question + id + tool_calls →
Write enriched_qa.json
"""

import json, hashlib, os, time, re
from pathlib import Path
from typing import Dict, List, Optional
from tqdm.auto import tqdm
from openai import OpenAI      # <— Azure 用户换成 AzureOpenAI 并加 endpoint

# ============ OpenAI configuration =============
# This cell's import line only brings in `OpenAI`; import the class that is
# actually used so the cell also runs on a fresh kernel.
from openai import AzureOpenAI

# Key and endpoint are read from the AZURE_OPENAI_API_KEY and
# AZURE_OPENAI_ENDPOINT environment variables.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Passed as `model=` to chat.completions.create();
# presumably the Azure deployment name — TODO confirm.
MODEL = "gpt-4.1-noah"
SLEEP    = 1.0                 # rate limiting (seconds); the loop below uses SLEEP_SEC instead
# ============ GPT prompt =============
# System prompt used by polish_and_enrich(): the model rewrites the question,
# extracts symbol and event, and returns id / question / tool_calls / answer.
# NOTE(review): the prompt asks the model to compute a SHA-256 prefix for "id";
# a language model is unlikely to compute that correctly — verify the ids.
SYSTEM_PROMPT = (
    "You are refining biomedical QA data. "
    "Input: a QA object with 'question' and 'answer'.\n"
    "Step A  Re-write the question into ONE natural English sentence that "
    "starts with “In the OpenTargets database,” or “Within OpenTargets,” and "
    "clearly mentions both the target symbol (e.g. HTR3A) and the event name "
    "(e.g. emesis). Preserve original meaning.\n"
    "Step B  Extract exactly those two pieces of information:\n"
    "    • symbol : target gene/protein approved symbol\n"
    "    • event  : safety-liability event (verbatim)\n"
    "Step C  Return a JSON object:\n"
    "{\n"
    "  \"id\"        : first 16 hex of sha256(polished_question),\n"
    "  \"question\"  : <polished_question>,\n"
    "  \"tool_calls\": [ {\"tool\":\"opentargets.safety\", "
    "                     \"params\": {\"symbol\": <symbol>, \"event\": <event>} } ],\n"
    "  \"answer\"    : <unchanged answer> \n"
    "}\n"
    "No markdown fences, no extra keys."
)

def sha16(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``text``."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:16]

def polish_and_enrich(item: Dict) -> Optional[Dict]:
    """Call the chat model once and return the enriched QA object.

    Args:
        item: Raw QA object; it is sent to the model as JSON.

    Returns:
        The decoded model reply, with ``id`` recomputed locally from the
        polished question and ``answer`` restored from ``item`` when ``item``
        has one.

    Raises:
        Whatever the client call or ``json.loads`` raise, and ``KeyError``
        when the reply has no ``question``. Nothing is caught here; the
        calling loop handles failures.
    """
    user_msg = json.dumps(item, ensure_ascii=False)
    rsp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": user_msg},
        ],
        response_format={"type": "json_object"},
        temperature=0.25,
        timeout=60
    )
    # Decode the reply straight into a dict.
    enriched = json.loads(rsp.choices[0].message.content)

    # The model cannot be relied on to compute a SHA-256, so the id is always
    # derived here rather than only when the model left it out.
    enriched["id"] = sha16(enriched["question"])

    # The prompt promises the answer is passed through unchanged; enforce that
    # instead of trusting the model's copy.
    if "answer" in item:
        enriched["answer"] = item["answer"]

    return enriched


# ============ Main pipeline ============
RAW_FILE  = "../dataset/target_safety.json"            # input: raw question/answer objects
OUT_FILE  = "enriched_qa.json"
SLEEP_SEC = 0.5                      # pause between model calls (simple rate limiting)

raw_data: List[Dict] = json.loads(Path(RAW_FILE).read_text(encoding="utf-8"))
enriched: List[Dict] = []

for item in tqdm(raw_data, desc="Polishing & adding tools"):
    try:
        new_item = polish_and_enrich(item)
        if new_item:
            enriched.append(new_item)
        time.sleep(SLEEP_SEC)
    except Exception as e:
        # A failed item is reported and skipped; the rest are still processed.
        tqdm.write(f"❌  Failed on item: {e}")

# Save. ensure_ascii=False emits non-ASCII characters as-is, so the file is
# written as UTF-8 explicitly (matching how RAW_FILE is read) instead of using
# the platform default encoding.
Path(OUT_FILE).write_text(
    json.dumps({"dataset": enriched}, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
print(f"\n✅  Saved {len(enriched)} QA pairs → {OUT_FILE}")


Polishing & adding tools:   0%|          | 0/31 [00:00<?, ?it/s]


✅  Saved 31 QA pairs → enriched_qa.json
