# MedCaseReasoning

### read data

In [1]:
import pandas as pd
test_set = pd.read_csv("/u/bli16/sea/data/MedCaseReasoning/raw_data/test.csv")

In [11]:
import pandas as pd
train_set = pd.read_csv("/u/bli16/sea/data/MedCaseReasoning/raw_data/train.csv")

In [2]:
test_set.head()

Unnamed: 0.1,Unnamed: 0,pmcid,title,journal,article_link,publication_date,text,case_prompt,diagnostic_reasoning,final_diagnosis
0,31,PMC8777167,A phototoxic drug reaction due to topical NSAIDs,Clinical Case Reports,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,2022-01-20,1 INTRODUCTION Photosensitization is a skin hy...,A 52‐year‐old man with Addison’s disease on li...,1. Cellulitis was initially suspected — “The p...,Phototoxic reaction
1,41,PMC3673371,Recurrent Ameloblastic Fibroma: Report of a Ra...,Case Reports in Dentistry,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,2013-05-21,1. Introduction Ameloblastic fibroma (AF) is a...,An 18-year-old woman presented with a 1-year h...,1. Ameloblastoma — “AF needs to be differentia...,AmeloblasticFibroma
2,51,PMC7443985,Refractory scleroderma renal crisis precipitat...,SAGE Open Medical Case Reports,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2020-08-21,Introduction Thrombotic microangiopathy (TMA) ...,A 37-year-old man presented with a 3-month his...,1. Scleroderma renal crisis was the leading di...,Scleroderma_renal_crisis
3,73,PMC10999549,Gallbladder volvulus in a 5–years old Vietname...,Clinical Case Reports,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,2024-04-07,1 INTRODUCTION Gallbladder volvulus (GV) or ga...,A previously healthy 5‐year‐old girl presented...,"1. Ileosigmoid knot, pseudo-obstruction, sever...",GallbladderVolvulus
4,83,PMC8378370,Bullous fixed drug eruption following administ...,BMJ Case Reports,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,2021-08-19,Background The recombinant adjuvant Shingrix v...,A 51-year-old woman with Crohn’s disease on in...,1. Varicella zoster virus infection — “PCR fro...,bullous fixed drug eruption


In [None]:
data = test_set.iloc[0]
print("Text: ", data["text"])
print("case_prompt: ", data["case_prompt"])
print("diagnostic_reasoning", data["diagnostic_reasoning"])
print(data["final_diagnosis"])

Text:  1 INTRODUCTION Photosensitization is a skin hypersensitivity to light and is classified into photoallergic and phototoxic reactions. It manifests as an acute eczema‐like reaction, similar to sunburn, characterized by erythema, edema, and blistering. 1 Photoallergic reactions are a delayed hypersensitivity reaction (type IV reaction) with a sensitization phase of 10–14 days. 2 However, when allergy has developed, eczema may manifest a few days after exposure. Phototoxic reactions, in contrast to the photoallergic, are more common and are initiated by damage to the skin that occurs by direct skin contact between an allergenic or irritant substance when irradiated with ultraviolet radiation. 2 Both types of photoreactions are most often triggered by long‐wave ultraviolet sunlight (UV‐A) but can also be triggered by short‐wave ultraviolet sunlight (UV‐B) and visible light. UV‐A rays can penetrate window glass and thin textiles. 3 Therefore, photoreactions can occur indoors, when dri

In [9]:
# wrap text to 800 characters per line
import textwrap
data = test_set.iloc[0]
print("Text")
print(textwrap.fill(data["text"], width=100))
print()
print("case_prompt")
print(textwrap.fill(data["case_prompt"], width=100))
print()
print("diagnostic_reasoning")
print(textwrap.fill(data["diagnostic_reasoning"], width=100))
print()
print("final_diagnosis")
print(textwrap.fill(data["final_diagnosis"], width=100))



Text
1 INTRODUCTION Photosensitization is a skin hypersensitivity to light and is classified into
photoallergic and phototoxic reactions. It manifests as an acute eczema‐like reaction, similar to
sunburn, characterized by erythema, edema, and blistering. 1 Photoallergic reactions are a delayed
hypersensitivity reaction (type IV reaction) with a sensitization phase of 10–14 days. 2 However,
when allergy has developed, eczema may manifest a few days after exposure. Phototoxic reactions, in
contrast to the photoallergic, are more common and are initiated by damage to the skin that occurs
by direct skin contact between an allergenic or irritant substance when irradiated with ultraviolet
radiation. 2 Both types of photoreactions are most often triggered by long‐wave ultraviolet sunlight
(UV‐A) but can also be triggered by short‐wave ultraviolet sunlight (UV‐B) and visible light. UV‐A
rays can penetrate window glass and thin textiles. 3 Therefore, photoreactions can occur indoors,
when drivi

In [13]:
all_labels = test_set["final_diagnosis"].unique()
print(len(all_labels))


813


In [None]:
all_train_labels = train_set["final_diagnosis"].unique()
print(len(all_train_labels))


8130


In [17]:
overlap_labels = set(all_labels) & set(all_train_labels)
print(len(overlap_labels))

323


## organ classification

In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
opentargets_disease_to_organ.py

Example:
  python opentargets_disease_to_organ.py \
      --disease "Type 2 diabetes mellitus" --disease "Crohn disease" \
      --top-targets 30 --top-organs 12 \
      --out-csv ot_organs.csv --out-json ot_payload.json
"""

import argparse, csv, json, time, sys
from typing import Dict, List, Optional, Tuple
import requests

OT_GQL = "https://api.platform.opentargets.org/api/v4/graphql"

# ----------------- GraphQL helpers -----------------

def gql(query: str, variables: dict) -> Optional[dict]:
    try:
        r = requests.post(OT_GQL, json={"query": query, "variables": variables}, timeout=40)
        if r.status_code != 200:
            return None
        data = r.json()
        # Basic error surfacing
        if "errors" in data:
            # print to stderr but keep running
            print("[GQL ERR]", data["errors"], file=sys.stderr)
        return data.get("data")
    except requests.RequestException as e:
        print("[HTTP ERR]", e, file=sys.stderr)
        return None

# ----------------- Queries -----------------

Q_SEARCH_DISEASE = """
query SearchDisease($q: String!, $index: Int!, $size: Int!) {
  search(queryString: $q, entityNames: ["disease"], page: {index: $index, size: $size}) {
    total
    diseases {
      hits {
        id
        name
      }
    }
  }
}
"""

Q_DISEASE_ASSOCIATED_TARGETS = """
query DiseaseAssocTargets($efoId: String!, $index: Int!, $size: Int!) {
  disease(efoId: $efoId) {
    id
    name
    associatedTargets(page: {index: $index, size: $size}) {
      count
      rows {
        score
        target { id approvedSymbol }
      }
    }
  }
}
"""

Q_TARGET_BASELINE_EXPRESSION = """
query TargetExpr($ensemblId: String!) {
  target(ensemblId: $ensemblId) {
    id
    approvedSymbol
    baselineExpression {
      tissue { id name }
      # Different releases expose one or more of these; grab what exists.
      zScore
      expression
      level
    }
  }
}
"""

# ----------------- Logic -----------------

def resolve_disease_to_efo(name: str) -> Optional[Tuple[str, str]]:
    data = gql(Q_SEARCH_DISEASE, {"q": name, "index": 0, "size": 5})
    if not data:
        return None
    hits = (data.get("search") or {}).get("diseases", {}).get("hits", [])
    if not hits:
        return None
    # take best hit
    efo = hits[0]["id"]
    label = hits[0].get("name", name)
    return efo, label

def fetch_associated_targets(efo: str, size: int) -> List[Tuple[str, float, str]]:
    data = gql(Q_DISEASE_ASSOCIATED_TARGETS, {"efoId": efo, "index": 0, "size": size})
    if not data:
        return []
    dz = data.get("disease") or {}
    rows = ((dz.get("associatedTargets") or {}).get("rows")) or []
    out = []
    for r in rows:
        t = r.get("target") or {}
        tid = t.get("id")
        sym = t.get("approvedSymbol", "")
        score = float(r.get("score", 0.0) or 0.0)
        if tid and score > 0:
            out.append((tid, score, sym))
    return out

def fetch_target_expression(tid: str) -> List[Tuple[str, str, float]]:
    """
    Return list of (tissue_id, tissue_name, expr_value). We pick the first
    available numeric among zScore, expression, level.
    """
    data = gql(Q_TARGET_BASELINE_EXPRESSION, {"ensemblId": tid})
    if not data:
        return []
    t = data.get("target") or {}
    exprs = t.get("baselineExpression") or []
    out = []
    for e in exprs:
        tissue = e.get("tissue") or {}
        tid_ = tissue.get("id")
        name_ = tissue.get("name", "")
        # prefer zScore; else expression; else level
        val = None
        for key in ("zScore", "expression", "level"):
            v = e.get(key)
            if isinstance(v, (int, float)):
                val = float(v)
                break
        if tid_ and (val is not None):
            out.append((tid_, name_, val))
    return out

def disease_to_organ_map_opentargets(disease_name: str, top_targets: int = 30, sleep: float = 0.08):
    """
    Compute weighted tissue scores = sum_over_targets( assoc_score * expression_value ).
    """
    resolved = resolve_disease_to_efo(disease_name)
    result = {
        "query": disease_name, "efo_id": None, "label": None,
        "tissue_scores": {}, "targets_used": []
    }
    if not resolved:
        return result
    efo, label = resolved
    result["efo_id"], result["label"] = efo, label

    targets = fetch_associated_targets(efo, size=top_targets)
    result["targets_used"] = [{"ensembl_id": t, "symbol": sym, "assoc_score": s} for (t, s, sym) in targets]
    tissue_scores: Dict[str, Dict[str, float]] = {}  # id -> {"name":..., "score":...}

    for (ensembl_id, assoc_score, _sym) in targets:
        expr = fetch_target_expression(ensembl_id)
        for (tissue_id, tissue_name, expr_val) in expr:
            score = assoc_score * expr_val
            bucket = tissue_scores.setdefault(tissue_id, {"name": tissue_name, "score": 0.0})
            bucket["score"] += score
        time.sleep(sleep)  # be gentle to the API

    result["tissue_scores"] = {
        tid: {"tissue_name": v["name"], "weighted_score": v["score"]}
        for tid, v in tissue_scores.items()
    }
    return result

# ----------------- CLI & I/O -----------------

def write_csv(path: str, payload: Dict[str, dict], top_organs: Optional[int]):
    cols = ["disease", "label", "efo_id", "tissue_id", "tissue_name", "weighted_score"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for disease, res in payload.items():
            tissues = sorted(
                res.get("tissue_scores", {}).items(),
                key=lambda kv: kv[1]["weighted_score"],
                reverse=True
            )
            if top_organs:
                tissues = tissues[:top_organs]
            for tid, tv in tissues:
                w.writerow({
                    "disease": disease,
                    "label": res.get("label") or "",
                    "efo_id": res.get("efo_id") or "",
                    "tissue_id": tid,
                    "tissue_name": tv["tissue_name"],
                    "weighted_score": f'{tv["weighted_score"]:.6f}',
                })

def write_json(path: str, payload: Dict[str, dict]):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

def parse_args():
    p = argparse.ArgumentParser(description="Open Targets disease → organ (tissue) mapping via targets × baseline expression.")
    p.add_argument("--disease", action="append", help="Disease name (repeatable).")
    p.add_argument("--infile", help="Text file with one disease per line.")
    p.add_argument("--top-targets", type=int, default=30, help="How many top associated targets to use per disease.")
    p.add_argument("--top-organs", type=int, default=10, help="How many top tissues/organs to print/save.")
    p.add_argument("--sleep", type=float, default=0.08, help="Sleep between target queries.")
    p.add_argument("--out-csv", help="Write a tidy CSV of the organ map.")
    p.add_argument("--out-json", help="Write the full JSON payload.")
    return p.parse_args()

def main():
    args = parse_args()
    diseases: List[str] = args.disease or []
    if args.infile:
        with open(args.infile, "r", encoding="utf-8") as f:
            diseases += [ln.strip() for ln in f if ln.strip()]
    if not diseases:
        print("Provide diseases via --disease or --infile.", file=sys.stderr)
        sys.exit(1)

    payload: Dict[str, dict] = {}
    for name in diseases:
        res = disease_to_organ_map_opentargets(name, top_targets=args.top_targets, sleep=args.sleep)
        payload[name] = res

        # Console preview
        print(f"\n=== {name} ===")
        if not res.get("efo_id"):
            print("  (no EFO match)")
            continue
        print(f"  Resolved: {res['label']} [{res['efo_id']}]")
        tissues = sorted(
            res.get("tissue_scores", {}).items(),
            key=lambda kv: kv[1]["weighted_score"],
            reverse=True
        )[: args.top_organs]
        if not tissues:
            print("  No tissue signals (try increasing --top-targets).")
        else:
            for tid, tv in tissues:
                print(f"  - {tv['tissue_name']} ({tid}): {tv['weighted_score']:.4f}")

    if args.out_csv:
        write_csv(args.out_csv, payload, top_organs=args.top_organs)
        print(f"\n[+] CSV written: {args.out_csv}")
    if args.out_json:
        write_json(args.out_json, payload)
        print(f"[+] JSON written: {args.out_json}")

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--disease DISEASE] [--infile INFILE]
                             [--top-targets TOP_TARGETS]
                             [--top-organs TOP_ORGANS] [--sleep SLEEP]
                             [--out-csv OUT_CSV] [--out-json OUT_JSON]
ipykernel_launcher.py: error: unrecognized arguments: --f=/run/user/90248/jupyter/runtime/kernel-v3657548783c6119f0dda7865e6deb62c52334861d.json


SystemExit: 2