In [None]:
import json
import os
from collections import deque

import pandas as pd
import requests

In [2]:
# Search parameters: concept codes sent to the trials API as the "maintype"
# and "stage" filters, and reused later to pick matching disease rows.
maintype = "C4912" # Bladder Cancer
stage = ["C140425"] # Stage IVA Bladder Cancer

In [None]:
# Page through the trials search endpoint, 50 records at a time, accumulating
# every returned trial in `collection`. The loop stops at the first empty page.
# The API key is read from the CTS_V2_API_KEY environment variable (KeyError if
# it is not set).
collection = []
start = 0
while True:
    res = requests.post(
        "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
        headers={
            "accept": "*/*",
            "X-API-KEY": os.environ["CTS_V2_API_KEY"],
        },
        json={
            "current_trial_status": [
                "Active",
                "Approved",
                "Enrolling by Invitation",
                "In Review",
                "Temporarily Closed to Accrual",
                "Temporarily Closed to Accrual and Intervention",
            ],
            "include": ["nct_id", "diseases"],
            "maintype": maintype,
            "stage": stage,
            # offset of the first record of this page
            "from": start,
            "size": 50,
        },
        # requests has no default timeout; without one a stalled connection
        # would hang the cell indefinitely.
        timeout=60,
    )
    # Surface HTTP failures (bad key, rate limit, server error) right here
    # rather than as a confusing error when the body is parsed below.
    res.raise_for_status()
    data = res.json()
    page = data["data"]
    if not page:
        # an empty page means everything has been fetched
        break
    collection.extend(page)
    print("Have", len(collection), "Need", data["total"])
    start = len(collection)

#### Construct a graph of trial disease nodes

In [None]:
class Node:
    """A single disease concept in the trial disease graph.

    Attributes are documented inline below. Links are kept unique by node id.
    """

    # identifier passed in as ``nid``
    id: str
    # payload; only the "name" entry of the source record is retained
    data: dict[str, str]
    # nodes this node has been linked to, in insertion order
    links: list["Node"]

    def __init__(self, data, nid) -> None:
        """Create an unlinked node.

        Args:
            data: Mapping with at least a ``"name"`` key; other keys are ignored.
            nid: Identifier stored as ``self.id``.

        Raises:
            KeyError: If ``data`` has no ``"name"`` entry.
        """
        self.id = nid
        self.data = {"name": data["name"]}
        self.links = []

    def _linked(self, nid) -> bool:
        """Return True when a node whose id equals ``nid`` is already linked."""
        return any(existing.id == nid for existing in self.links)

    def link(self, node):
        """Append ``node`` to ``links`` unless a node with the same id is present."""
        if not self._linked(node.id):
            self.links.append(node)


In [None]:
# Lookup data for a disease code
index = {}
for t in collection:
    t_id = t["nct_id"]
    for d in t["diseases"]:
        d_id = d["nci_thesaurus_concept_id"]
        if d_id not in index:
            index[d_id] = {
                "name": d["name"],
                "parents": d["parents"],
            }
        else:
            p1 = set(d["parents"])
            p2 = set(index[d_id]["parents"])
            assert not p1.difference(p2)

# Turn the index into a graph: one Node per code, each linked to its parents.
# A node may already exist when it is reached (it was created earlier as some
# other code's parent), so creation and linking are handled uniformly:
# fetch-or-create the node, then fetch-or-create and link each parent.
# Node.link ignores parents that are already linked.
code2node: dict[str, Node] = {}
for code, data in index.items():
    node = code2node.get(code)
    if node is None:
        node = code2node[code] = Node(data, code)
    for parent_code in data["parents"]:
        parent = code2node.get(parent_code)
        if parent is None:
            # the parent's own record must be present in the index
            # (KeyError otherwise)
            parent = code2node[parent_code] = Node(index[parent_code], parent_code)
        node.link(parent)


In [None]:
# Sanity check: the graph has exactly one node per indexed code, and more
# than one node overall.
assert index.keys() == code2node.keys() and len(code2node) > 1
# Drop the reference to the lookup table; only the graph is used from here on.
index = None

In [None]:
# Breadth-first walk of the graph starting at the maintype node, following
# each node's links (its parents). Every node is printed once, indented by
# its distance from the start, in the form "| <came-from id> <- (<id>) <name>".
# Queue entries are (node, id of the node it was reached from, depth).
q = deque([(code2node[maintype], "", 0)])
vis = set()
while q:
    # popleft() is O(1); list.pop(0) shifts the whole list on every dequeue
    n, who, lvl = q.popleft()
    if n.id in vis:
        # the node was queued more than once before its first visit
        continue
    vis.add(n.id)
    print("\t" * lvl, "| " + who + " <-", "(" + n.id + ")", n.data["name"])
    for link in n.links:
        if link.id not in vis:
            q.append((link, n.id, lvl + 1))


#### Further investigation of the full collection

In [18]:
# Read the saved search response from disk and keep its list of trials.
with open("./.rest-client/bladder_stage_iva.json") as f:
    data = json.load(f)["data"]

# Flatten to one row per disease entry of each trial, carrying the trial's
# nct_id along, then discard the "synonyms" column.
df = pd.json_normalize(data, record_path="diseases", meta=["nct_id"])
df = df.drop(columns=["synonyms"])
print(df.shape)
df.tail()

(1304, 7)


Unnamed: 0,inclusion_indicator,is_lead_disease,nci_thesaurus_concept_id,name,type,parents,nct_id
1299,TREE,False,C8278,Cancer-Related Condition,[subtype],[C2991],NCT03517332
1300,TREE,False,C8614,Hepatobiliary Neoplasm,[subtype],"[C3959, C3052]",NCT03517332
1301,TREE,False,C136467,Lung Cancer by AJCC v8 Stage,[stage],[C4878],NCT03517332
1302,TREE,False,C3431,Urinary System Neoplasm,[subtype],"[C156482, C3430]",NCT03517332
1303,TREE,False,C4978,Malignant Colorectal Neoplasm,[subtype],"[C2956, C4572]",NCT03517332


In [19]:
# Keep only the disease rows whose code is the maintype or one of the stage
# codes, grouped per trial for display.
search_codes = pd.Series([maintype, *stage])
matched = df[df["nci_thesaurus_concept_id"].isin(search_codes)]
groupby = matched.groupby(by="nct_id")[["name", "inclusion_indicator"]]

# Collect the set of matched codes per trial so the assertion can verify what
# its message claims. Comparing group count to trial count alone only proves
# each trial matched at least ONE search code, not the maintype AND a stage.
codes_by_trial = {}
for nct_id, code in zip(matched["nct_id"], matched["nci_thesaurus_concept_id"]):
    codes_by_trial.setdefault(nct_id, set()).add(code)

assert (
    len(groupby) == len(data)
    and len(groupby) >= 1
    and all(
        maintype in codes and not codes.isdisjoint(stage)
        for codes in codes_by_trial.values()
    )
), "Every trial must have the maintype and one of the stage codes"
groupby.apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,inclusion_indicator
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT03517332,1159,Stage IVA Bladder Cancer,TRIAL
NCT03517332,1272,Bladder Cancer,TREE
NCT03693014,253,Stage IVA Bladder Cancer,TRIAL
NCT03693014,288,Bladder Cancer,TREE
NCT03767348,111,Stage IVA Bladder Cancer,TRIAL
NCT03767348,176,Bladder Cancer,TREE
NCT03869190,347,Stage IVA Bladder Cancer,TRIAL
NCT03869190,441,Bladder Cancer,TREE
NCT04064190,757,Stage IVA Bladder Cancer,TRIAL
NCT04064190,845,Bladder Cancer,TREE


In [1]:
import re
import psycopg2 as pg

In [49]:
# Load the saved stage records from disk and preview the first three. Per the
# rendered output, each record has a 'name', a list of 'codes' and a list of
# 'parent_ids'.
# NOTE: this rebinds `data`, which earlier cells used for other payloads.
with open('./.rest-client/hnc_stages.json') as f:
    data = json.load(f)['data']
data[:3]

[{'name': 'Differentiated Thyroid Gland Carcinoma by AJCC v8 Stage',
  'codes': ['C140965'],
  'parent_ids': ['C4815']},
 {'name': 'Recurrent Cutaneous Squamous Cell Carcinoma of the Head and Neck',
  'codes': ['C162942'],
  'parent_ids': ['C35850', 'C4914']},
 {'name': 'Recurrent Head and Neck Carcinoma',
  'codes': ['C7624'],
  'parent_ids': ['C35850']}]

In [2]:
# Open a connection to the "sec" database as user "secapp" and create a cursor
# for the statements below. No password is passed here, so authentication
# presumably relies on the environment (e.g. PGPASSWORD / .pgpass / trust) — TODO confirm.
# NOTE(review): the host name is hard-coded; it looks like a container name, so
# this cell likely only works from inside that network — confirm.
conn = pg.connect(dbname="sec", user="secapp", host="r_363-postgres-1", port=5432)
curr = conn.cursor()

In [51]:
# Create the secapp.hnc_stages table (three text columns: pref_name, code,
# parent) if it does not exist yet, and commit so the DDL is persisted.
# "if not exists" makes this cell safe to re-run.
curr.execute("""create table if not exists secapp.hnc_stages
    (
        pref_name text,
        code text,
        parent text
    )
""")
conn.commit()

In [55]:
# Flatten every record into one row per (code, parent) pair and insert them.
# The table is schema-qualified to match the "create table" statement, and the
# columns are named so the insert does not depend on column order.
# NOTE: re-running this cell inserts the same rows again; the table has no
# uniqueness constraint.
rows = [
    (d['name'], c, p)
    for d in data
    for c in d['codes']
    for p in d['parent_ids']
]
curr.executemany(
    "insert into secapp.hnc_stages (pref_name, code, parent) values (%s, %s, %s)",
    rows,
)
conn.commit()

In [56]:
# Release the cursor first, then the connection it belongs to.
curr.close()
conn.close()