# Knowledge Graph Path Enrichment and QA Generation

This notebook implements a two-stage approach to enrich UMLS knowledge graph paths and generate high-quality question-answer pairs:

1. **Knowledge Enrichment Stage**: Generate detailed reasoning explanations and background knowledge for each path
2. **QA Generation Stage**: Use the enriched knowledge to create diverse question-answer pairs

In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm
from openai import AzureOpenAI
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Azure OpenAI API configuration
# The API key is a secret: it must be supplied through the environment
# (shell export, .env loader, secret manager) and never committed to the
# notebook. Any key that was previously committed here should be rotated.
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export it before running this notebook."
    )
# The endpoint is not a secret, so the previous value is kept as a fallback
# while still letting the environment override it.
os.environ.setdefault(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-mistraleastus2753718354821.openai.azure.com/",
)
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-12-01-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)

In [4]:
# Read the merged knowledge-graph paths from disk
with open('/home/xinding/dingxin/Agent/MAIA/code/merged_paths.json', 'r') as f:
    data = json.loads(f.read())

# Pull out the disease-drug-target paths (empty list when the key is absent)
disease_drug_target_paths = data.get("Disease_Drug_Target", [])

# Keep only the paths whose path_strs holds exactly 5 elements
complete_paths = list(
    filter(lambda p: len(p.get("path_strs", [])) == 5, disease_drug_target_paths)
)

print(f"Total paths: {len(disease_drug_target_paths)}")
print(f"Complete paths: {len(complete_paths)}")

Total paths: 568
Complete paths: 568


## Stage 1: Knowledge Enrichment

Generate detailed reasoning explanations and background knowledge for each knowledge graph path

In [None]:
def generate_enriched_explanation(path, reasoning_depth="medium", model="gpt-4.1-noah"):
    """
    Generate enriched explanations and background knowledge for a UMLS knowledge graph path.

    Parameters:
    - path (dict): The knowledge graph path containing 'path_strs'; its first five
      entries are read as [disease, relation1, drug, relation2, target]
    - reasoning_depth (str): Depth of reasoning - 'basic', 'medium', or 'detailed'
      (any other value falls back to the 'medium' instructions)
    - model (str): Azure OpenAI deployment name passed to the chat completion call

    Returns:
    - dict: On success, the six enrichment fields plus 'original_path' (a dict of the
      five path elements). If the path has fewer than five elements, returns
      {"error": "Incomplete path"}. If the API call or parsing fails, returns
      {"error": <message>, "original_path": <the raw path_strs list>} instead of raising.
    """
    path_strs = path.get("path_strs")
    if not path_strs or len(path_strs) < 5:
        return {"error": "Incomplete path"}

    # Extract path elements
    disease, relation1, drug, relation2, target = path_strs[:5]

    # Define prompt based on reasoning depth
    depth_instructions = {
        "basic": "Provide a basic explanation with minimal background.",
        "medium": "Provide a moderately detailed explanation with relevant background knowledge including clinical presentation and mechanisms.",
        "detailed": "Provide an in-depth explanation with comprehensive background knowledge including clinical presentation, molecular mechanisms, pharmacology, and relevant scientific literature."
    }

    prompt = f"""
    I need detailed biomedical reasoning and background knowledge about a disease-drug-target pathway from UMLS. 
    {depth_instructions.get(reasoning_depth, depth_instructions["medium"])}
    
    PATHWAY:
    Disease: {disease}
    Relationship: {relation1}
    Drug: {drug}
    Relationship: {relation2}
    Target: {target}
    
    Please provide the following information in a structured JSON format:
    1. "disease_background": Clinical presentation, symptoms, pathophysiology of {disease}
    2. "drug_background": Information about {drug}, its class, mechanisms, and general use
    3. "target_background": Information about {target}, its cellular function, and role in disease
    4. "mechanism_explanation": How {drug} works on {target} to treat {disease}
    5. "clinical_reasoning": Clinical reasoning for using {drug} for {disease}, including benefits and limitations
    6. "expanded_pathway": A more detailed step-by-step explanation of this pathway with molecular details
    
    IMPORTANT: Your response MUST be a valid JSON object with these keys. Ensure the text is factually accurate and reflects current medical knowledge.
    
    Each field should include at least 3-4 sentences of detailed information to provide sufficient context for educational purposes.
    If you're uncertain about any specific details, provide the most reliable general information based on established medical knowledge.
    """

    expected_keys = ["disease_background", "drug_background", "target_background",
                     "mechanism_explanation", "clinical_reasoning", "expanded_pathway"]

    try:
        # Call the Azure OpenAI API
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a medical AI assistant with expertise in pharmacology, clinical medicine, and molecular biology. Provide scientifically accurate information about disease-drug-target pathways."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=3000,
            response_format={"type": "json_object"}
        )

        # Extract the enriched explanation from the response
        content = response.choices[0].message.content
        if content is None:
            raise ValueError("Model returned no content")
        enriched_knowledge = json.loads(content)
        if not isinstance(enriched_knowledge, dict):
            raise ValueError("Model reply is not a JSON object")

        # Validate the response has all required keys
        missing_keys = [key for key in expected_keys if key not in enriched_knowledge]
        if missing_keys:
            print(f"Warning: Missing keys in enriched knowledge: {missing_keys}")
            # Add placeholder values for missing keys to prevent errors downstream
            for key in missing_keys:
                enriched_knowledge[key] = f"Information about {key} not available."

        # Add original path information
        enriched_knowledge["original_path"] = {
            "disease": disease,
            "relation1": relation1,
            "drug": drug,
            "relation2": relation2,
            "target": target
        }

        return enriched_knowledge

    except Exception as e:
        # Deliberately best-effort: one failed path is reported as an error record
        # so that a batch run over many paths can continue.
        print(f"Error in generate_enriched_explanation: {str(e)}")
        return {"error": str(e), "original_path": path["path_strs"]}

In [9]:
# Smoke-test the enrichment function on the first complete path
if complete_paths:
    sample_path = complete_paths[0]
    sample_enriched = generate_enriched_explanation(sample_path)

    # Show the path as an arrow chain
    print("Sample path:", " -> ".join(sample_path["path_strs"]), sep="\n")

    # List which fields came back
    print("\nEnriched knowledge (keys):", list(sample_enriched), sep="\n")

    # Show one representative field, then the whole record
    print(
        "\nSample section (mechanism_explanation):",
        sample_enriched.get("mechanism_explanation", "Not found"),
        sep="\n",
    )
    print("\nFull enriched knowledge:", sample_enriched, sep="\n")

Sample path:
Gynecomastia -> may_be_treated_by -> Tamoxifen-containing product -> has_target -> Estrogen Receptor

Enriched knowledge (keys):
['disease_background', 'drug_background', 'target_background', 'mechanism_explanation', 'clinical_reasoning', 'expanded_pathway', 'original_path']

Sample section (mechanism_explanation):
Tamoxifen acts as a competitive antagonist of the estrogen receptor in breast tissue. By binding to the ER, tamoxifen prevents endogenous estrogens (such as estradiol) from activating the receptor. This inhibits estrogen-mediated transcriptional activity and downstream proliferative signals in breast glandular tissue. As a result, tamoxifen reduces the growth and tenderness of gynecomastia by counteracting the stimulatory effects of estrogen on male breast tissue.

Full enriched knowledge:
{'disease_background': 'Gynecomastia is the benign proliferation of glandular breast tissue in males, resulting from an imbalance between estrogen and androgen activity at the

In [10]:
# Process all complete paths with different reasoning depths
# This process might take time depending on the number of paths
def process_paths_batch(paths, batch_size=10, reasoning_depth="medium", delay=1):
    """
    Enrich every path in `paths`, walking the list in batches.

    Parameters:
    - paths (list): Knowledge graph paths, each passed to generate_enriched_explanation
    - batch_size (int): Number of paths per progress-bar step; must be >= 1
    - reasoning_depth (str): Forwarded to generate_enriched_explanation
    - delay (float): Seconds to sleep after each path to avoid rate limiting;
      0 or a negative value disables the pause

    Returns:
    - list: One result dict per input path, in input order

    Raises:
    - ValueError: If batch_size is smaller than 1. A negative step would make the
      batch range empty and silently drop every path.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    enriched_paths = []

    for start in tqdm(range(0, len(paths), batch_size)):
        for path in paths[start:start + batch_size]:
            enriched_paths.append(
                generate_enriched_explanation(path, reasoning_depth=reasoning_depth)
            )
            if delay > 0:
                time.sleep(delay)  # To avoid rate limiting

    return enriched_paths

In [11]:
# Trial run: enrich only the first 5 complete paths
test_paths = complete_paths[:5]
test_enriched = process_paths_batch(
    test_paths,
    batch_size=5,
    reasoning_depth="medium",
)

# Write the trial results to disk as indented JSON
with open('/home/xinding/dingxin/Agent/MAIA/code/enriched_paths_test.json', 'w') as f:
    f.write(json.dumps(test_enriched, indent=2))

print(f"Processed {len(test_enriched)} test paths with enriched knowledge")

100%|██████████| 1/1 [00:48<00:00, 48.72s/it]

Processed 5 test paths with enriched knowledge





In [None]:
# Full run over every complete path.
# Warning: this could take a long time and use significant API quota.
all_enriched_paths = process_paths_batch(
    complete_paths,
    batch_size=10,
    reasoning_depth="medium",
)

# Write the complete result set to disk as indented JSON
with open('/home/xinding/dingxin/Agent/MAIA/code/enriched_paths_all.json', 'w') as f:
    f.write(json.dumps(all_enriched_paths, indent=2))

print(f"Processed {len(all_enriched_paths)} paths with enriched knowledge")

In [36]:
import os, json, textwrap, openai
from datetime import date


# ────────────────────────────────────────────────────────────────────
# 1.  YOUR ENRICHED DATA (replace with each new example)
# ────────────────────────────────────────────────────────────────────
# Sample enriched record used as input for the Q-A generation demo below.
# It has the same keys that generate_enriched_explanation returns on success:
# six free-text fields plus "original_path" holding the five UMLS path elements.
enriched  ={
    "disease_background": "Gynecomastia is the benign proliferation of glandular breast tissue in males, resulting from an imbalance between estrogen and androgen activity at the breast tissue level. Clinically, it presents as a palpable, often tender, rubbery mass concentric to the nipple-areolar complex. It may be unilateral or bilateral and is most commonly seen during three physiological periods: neonatal, pubertal, and older age. Pathophysiologically, gynecomastia arises when there is increased estrogen action (from endogenous or exogenous sources) or decreased androgen action, leading to stimulation of breast ductal epithelial and stromal proliferation. Common causes include puberty, medications (e.g., spironolactone, anti-androgens), chronic liver disease, testicular tumors, and idiopathic cases.",
    "drug_background": "Tamoxifen is a selective estrogen receptor modulator (SERM) primarily used in the treatment and prevention of estrogen receptor-positive breast cancer. It acts as an estrogen antagonist in breast tissue but can have partial agonist effects in other tissues such as bone and endometrium. Tamoxifen is administered orally and is metabolized in the liver to active metabolites (e.g., endoxifen). Its general use in gynecomastia is off-label, targeting the estrogen-driven proliferation of breast tissue in males.",
    "target_background": "The Estrogen Receptor (ER) is a nuclear hormone receptor that functions as a ligand-activated transcription factor. There are two main subtypes: ER-alpha and ER-beta. Upon binding estrogen, the receptor dimerizes, translocates to the nucleus, and regulates the transcription of genes involved in cell proliferation, differentiation, and survival. In breast tissue, ER activation promotes ductal and stromal growth. Overactivation or increased sensitivity of the ER in male breast tissue contributes to the development of gynecomastia.",
    "mechanism_explanation": "Tamoxifen acts as a competitive antagonist of the estrogen receptor in breast tissue. By binding to the ER, tamoxifen blocks the effects of endogenous estrogens, preventing receptor activation and subsequent transcription of estrogen-responsive genes that drive glandular proliferation. This antagonism reduces the stimulatory effect of estrogen on male breast tissue, leading to regression or stabilization of gynecomastia.",
    "clinical_reasoning": "Tamoxifen is considered in cases of persistent, painful, or psychologically distressing gynecomastia, especially when the underlying cause cannot be corrected or when surgery is not preferred. Its use is supported by clinical studies showing reduction in breast tissue size and pain. Benefits include non-invasiveness and avoidance of surgery. Limitations include potential side effects (e.g., thromboembolic events, hot flashes), lack of efficacy in long-standing fibrotic gynecomastia, and off-label status. Tamoxifen is most effective when initiated early, before significant fibrosis develops.",
    "expanded_pathway": "1. In gynecomastia, increased estrogen action at the male breast stimulates the estrogen receptor (ER), a nuclear transcription factor. 2. ER activation leads to upregulation of genes promoting ductal epithelial and stromal cell proliferation, resulting in palpable breast tissue. 3. Tamoxifen, a SERM, is administered and circulates to breast tissue. 4. Tamoxifen binds competitively to the ER, displacing endogenous estrogens. 5. The tamoxifen-ER complex does not effectively recruit coactivators required for gene transcription, thus inhibiting estrogen-driven gene expression. 6. This inhibition reduces further proliferation of breast tissue and may induce apoptosis of proliferating cells. 7. Clinically, this results in reduction of breast tissue size and alleviation of symptoms. 8. If tamoxifen is discontinued or if fibrosis has occurred, the response may be limited.",
    # The five elements of the base UMLS path, keyed by role
    "original_path": {
      "disease": "Gynecomastia",
      "relation1": "may_be_treated_by",
      "drug": "Tamoxifen-containing product",
      "relation2": "has_target",
      "target": "Estrogen Receptor"
    }
}
# ────────────────────────────────────────────────────────────────────
# 2.  PROMPT-BUILDING FUNCTION
# ────────────────────────────────────────────────────────────────────
def build_prompt(d: dict,
                 difficulty: str = "hard",
                 require_citations: bool = True) -> str:
    """
    Create a two-section prompt that forces GPT to:
      A) produce an expert-level, multi-hop reasoning narrative
      B) craft a single best-answer question that can only be solved
         by following that narrative.

    Parameters:
    - d: enriched record with the keys "disease_background",
      "drug_background", "target_background", "mechanism_explanation",
      "clinical_reasoning", "expanded_pathway" and "original_path"
      (itself a dict with "disease", "relation1", "drug", "relation2",
      "target").
    - difficulty: "very hard" appends extra requirements (a distractor
      symptom and an unnamed disease); any other value adds nothing.
    - require_citations: when True, asks for a UMLS CUI after each
      biomedical concept.

    Returns:
    - The prompt text, dedented and stripped.

    Raises:
    - KeyError: if `d` lacks one of the keys listed above.
    """
    # Turn the structured UMLS path into a readable arrow chain
    path = d["original_path"]
    arrow_path = (f'{path["disease"]} '
                  f'→({path["relation1"]})→ '
                  f'{path["drug"]} '
                  f'→({path["relation2"]})→ '
                  f'{path["target"]}')

    extra_req = ""
    if difficulty == "very hard":
        extra_req = textwrap.dedent("""
            • Embed at least one plausible but incorrect distractor symptom
              that is ruled out by the pathway.
            • Rephrase the disease name in the vignette so that it is *not*
              explicitly stated (use descriptive clinical terminology).
        """).strip()

    citation_line = ("• After each biomedical concept, append its UMLS CUI "
                     "in parentheses." if require_citations else "")

    # Dedent the template BEFORE substituting values. Substituting first puts
    # the column-0 lines of a multi-line value (such as extra_req) into the
    # text, which makes the common margin empty and turns dedent into a no-op.
    template = textwrap.dedent("""
    ### SYSTEM
    You are a senior medical content creator.  Your audience is board-certified
    endocrinologists preparing for maintenance-of-certification exams.
    Write in precise, academic English.

    ### USER
    SECTION A — EXPERT REASONING (do *not* craft the question yet)
    1. Disease background:
       {disease_background}
    2. Drug background:
       {drug_background}
    3. Target background:
       {target_background}
    4. Mechanistic rationale:
       {mechanism_explanation}
    5. Clinical considerations:
       {clinical_reasoning}
    6. Base UMLS path:
       {arrow_path}
    7. Expanded multi-hop pathway (≥ 3 additional hops already integrated):
       {expanded_pathway}

    Your tasks for SECTION A:
    • Rewrite the entire expanded pathway as an ordered, numbered narrative
      that explicitly links each step mechanistically and clinically.
    {citation_line}

    SECTION B — CONSTRUCT A DIFFICULT Q & A
    • Invent one *single-best-answer* question whose solution requires a reader
      to follow the COMPLETE pathway you just described.
    • Frame the stem as a realistic clinical scenario (case vignette).
    • Omit the actual answer terminology from the stem.
    • The question should probe molecular pharmacology, off-label rationale,
      or nuanced contraindications—make it challenging for specialists.
    {extra_req}

    Output ONLY the following JSON object, without extra text:

    {{
      "generation_date": "{generation_date}",
      "reasoning": "<Section A narrative>",
      "question": "<English MCQ stem minus options>",
      "answer":   "<concise preferred term>"
    }}
    """).strip()

    # str.format parses only the template, so braces inside the substituted
    # values are inserted verbatim.
    return template.format(
        disease_background=d["disease_background"],
        drug_background=d["drug_background"],
        target_background=d["target_background"],
        mechanism_explanation=d["mechanism_explanation"],
        clinical_reasoning=d["clinical_reasoning"],
        arrow_path=arrow_path,
        expanded_pathway=d["expanded_pathway"],
        citation_line=citation_line,
        extra_req=extra_req,
        generation_date=date.today(),
    )


# ────────────────────────────────────────────────────────────────────
# 3.  CALL OPENAI AND PARSE
# ────────────────────────────────────────────────────────────────────
def generate_high_level_qa(data: dict,
                           model: str = "gpt-4.1-noah",
                           difficulty: str = "very hard",
                           temperature: float = 0.6,
                           max_tokens: int = 900) -> dict:
    """
    Build the Q-A prompt for one enriched record, send it to the chat
    completion API and return the parsed JSON reply.

    Parameters:
    - data: enriched record passed to build_prompt.
    - model: Azure OpenAI deployment name.
    - difficulty: forwarded to build_prompt.
    - temperature: sampling temperature for the completion.
    - max_tokens: completion token limit; raise it if replies get cut off.

    Returns:
    - The decoded JSON reply.

    Raises:
    - RuntimeError: if the reply is empty or is not valid JSON.
    """
    prompt = build_prompt(data, difficulty=difficulty)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
        # Request JSON mode, as the enrichment call does, because the reply
        # is parsed with json.loads below.
        response_format={"type": "json_object"},
    )
    # Parse JSON reply
    choice = response.choices[0]
    reply = choice.message.content
    if reply is None:
        raise RuntimeError("Model returned no content")
    try:
        qa_json = json.loads(reply)
    except json.JSONDecodeError as err:
        message = "Returned content is not valid JSON:\n" + reply
        if getattr(choice, "finish_reason", None) == "length":
            # A reply cut off at the token limit is a common cause of broken JSON.
            message += f"\n(reply was truncated at max_tokens={max_tokens})"
        raise RuntimeError(message) from err
    return qa_json


# ────────────────────────────────────────────────────────────────────
# 4.  DEMO (runs when this cell or script is executed directly)
# ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Generate one Q-A pair from the sample record and pretty-print it,
    # keeping non-ASCII characters readable.
    qa_pair = generate_high_level_qa(enriched)
    print(json.dumps(qa_pair, ensure_ascii=False, indent=2))


{
  "generation_date": "2025-06-11",
  "reasoning": "1. In certain males, an imbalance favoring increased estrogen action (CUI: C0014936) at the breast tissue leads to stimulation of the estrogen receptor (ER) (CUI: C0014936, C0014943), a nuclear hormone receptor and transcription factor (CUI: C0014943). 2. Upon ligand binding, the ER dimerizes and translocates to the nucleus, where it regulates transcription of genes (CUI: C0014943, C0023526) that promote proliferation of ductal epithelial and stromal cells (CUI: C0230136, C0038941), resulting in the development of palpable, tender, glandular breast tissue (CUI: C0014936). 3. Tamoxifen (CUI: C0039287), a selective estrogen receptor modulator (SERM) (CUI: C0039287), is administered orally (CUI: C0029928) and circulates systemically to reach breast tissue (CUI: C0006142). 4. Tamoxifen competitively binds to the estrogen receptor (CUI: C0014943), thereby displacing endogenous estrogens (CUI: C0049598). 5. The tamoxifen-ER complex (CUI: C

In [None]:
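# ▶️ Before running this cell, export these two environment variables
#   (hard-coding them also works, but is not recommended for secrets):
#   AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT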
import os, json, textwrap, time
from openai import AzureOpenAI      # pip install openai>=1.30.3

# Azure OpenAI API configuration
# The API key is a secret: it must be supplied through the environment
# (shell export, .env loader, secret manager) and never committed to the
# notebook. Any key that was previously committed here should be rotated.
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export it before running this cell."
    )
# The endpoint is not a secret, so the previous value is kept as a fallback
# while still letting the environment override it.
os.environ.setdefault(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-mistraleastus2753718354821.openai.azure.com/",
)
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-12-01-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)
# System prompt for the rewrite step. It is sent verbatim as the "system"
# message by rewrite_entry, and the four JSON keys it names are the ones
# rewrite_entry checks for in the reply.
SYSTEM_MSG = textwrap.dedent("""
You are a senior clinical pharmacologist and question-writer.

Rewrite the given Q-A so that:
• Stem becomes a realistic clinical vignette; omit the target answer term.
• Probe subtle molecular pharmacology / off-label rationale / nuanced contraindications.
• Invent one *single-best-answer* question whose solution forces the reader to follow the **complete mechanistic pathway** you just described;
• Provide 150–250-word ‘reasoning’ narrating clinical + molecular steps (可加 CUI codes) .
Return **exact** minified JSON with keys: question, answer, reasoning, reasoning_path.
""").strip()

# —— Few-shot example: one original Q-A and its rewritten counterpart —— #
# FEWSHOT_ORIG is sent as a "user" turn and FEWSHOT_REWRITE as the matching
# "assistant" turn in rewrite_entry.
FEWSHOT_ORIG = {
  "question": "A male patient develops painful breast enlargement; his physician prescribes a selective estrogen receptor modulator. Which molecular target is primarily involved in mediating this drug's therapeutic effect in this condition?",
  "answer": "Estrogen Receptor; Tamoxifen alleviates gynecomastia by antagonizing estrogen receptors, thereby blocking estrogen-mediated breast tissue proliferation.",
  "reasoning_path": "Gynecomastia is treated with tamoxifen, a selective estrogen receptor modulator, whose therapeutic effect is mediated by antagonism of estrogen receptors in breast tissue."
}
FEWSHOT_REWRITE = {
  "question": "A 16-year-old male presents with a three-month history of a tender, rubbery, subareolar mass on the left chest. He is otherwise healthy, takes no medications, and has no signs of chronic illness. Physical examination reveals a 2-cm concentric, mobile mass beneath the areola with mild tenderness. After evaluation, his clinician recommends a medication that is not FDA-approved for this indication but is known to act as a competitive antagonist at a nuclear hormone receptor, blocking ligand-induced transcription of proliferative genes in breast tissue. Which pharmacologic agent is most likely to reduce the size of his palpable mass and alleviate symptoms, and what is the molecular basis for its effectiveness in this setting?",
  "answer": "Tamoxifen—competitive antagonism of the estrogen receptor in breast tissue",
  "reasoning": "1. Pubertal gynecomastia arises from estrogen-androgen imbalance (CUI:C0014936)... ",
    "reasoning_path": "Tamoxifen counters estrogen-driven proliferation via competitive ER blockade."
}

# —— Template that embeds the original entry (as JSON) in the user message —— #
USER_TEMPLATE = "ORIGINAL ENTRY:\n{entry_json}\n\nRewrite according to all rules."
def rewrite_entry(entry: dict,
                  model: str = "gpt-4.1-noah",
                  temperature: float = 0.7,
                  max_attempts: int = 3) -> dict:
    """
    Rewrite a single dataset entry through Azure OpenAI, retrying on failure.

    Parameters:
    - entry: the original record; it is serialised to JSON and embedded in
      the user message.
    - model: Azure OpenAI deployment name.
    - temperature: sampling temperature for the completion.
    - max_attempts: total number of tries; must be >= 1.

    Returns:
    - A new dict holding `entry` overlaid with the reply's fields. The reply
      must contain "question", "answer", "reasoning" and "reasoning_path".

    Raises:
    - ValueError: if max_attempts is smaller than 1. Otherwise the loop would
      never run and the function would silently return None.
    - Exception: whatever the final attempt raised (API error, invalid JSON,
      or a ValueError for an empty, non-object or incomplete reply).
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts!r}")

    required_keys = ("question", "answer", "reasoning", "reasoning_path")
    base_messages = [
        {"role": "system",    "content": SYSTEM_MSG},
        # Few-shot: the old-style Q-A goes in as a user turn and the rewritten
        # example as the assistant turn.
        {"role": "user",      "content": json.dumps(FEWSHOT_ORIG, ensure_ascii=False)},
        {"role": "assistant", "content": json.dumps(FEWSHOT_REWRITE, ensure_ascii=False)},
    ]
    user_msg = USER_TEMPLATE.format(entry_json=json.dumps(entry, ensure_ascii=False))
    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model       = model,        # Azure deployment name; replace if yours differs
                temperature = temperature,
                messages    = base_messages + [{"role": "user", "content": user_msg}],
                timeout     = 60,
            )
            content = resp.choices[0].message.content
            if content is None:
                raise ValueError("model returned no content")
            new_fields = json.loads(content.strip())
            if not isinstance(new_fields, dict):
                raise ValueError("model reply is not a JSON object")
            # Explicit check instead of `assert`, which is stripped under -O.
            missing = [k for k in required_keys if k not in new_fields]
            if missing:
                raise ValueError(f"model reply is missing required keys: {missing}")
            return {**entry, **new_fields}
        except Exception as e:
            # Any failure triggers a retry; the last one is re-raised to the caller.
            print(f"[attempt {attempt}] {e}")
            if attempt == max_attempts:
                raise
            time.sleep(2 * attempt)  # back off a little longer after each failure
# Assumes the input file has the structure {"dataset": [ {...}, {...} ]}
IN_FILE  = "umls_qa.json"
OUT_FILE = "umls_qa_rewritten.json"

with open(IN_FILE, "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Rewrite the entries one at a time, printing progress before each call
rewritten_items = []
for i, item in enumerate(data["dataset"], start=1):
    print(f"↻  {i}/{len(data['dataset'])}   id={item.get('id')}")
    rewritten_items.append(rewrite_entry(item))

# Save the rewritten entries under the same top-level "dataset" key
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write(json.dumps({"dataset": rewritten_items}, ensure_ascii=False, indent=2))

print("✅  完成！结果保存在:", OUT_FILE)
