In [None]:
import pandas as pd

import ncit_utils

#### Load in the Thesaurus

In [None]:
# Load the thesaurus table through the project helper. Later cells read its
# "code", "parents" and "concept in subset" columns.
ncit = ncit_utils.load_ncit()

#### Flatten the input (code -> parent)

In [None]:
# A concept's parents are stored as a single pipe-delimited string, so rows
# with more than one parent are the ones containing "|".
# regex=False matches the pipe literally (no escaping needed), and na=False
# treats a missing value as "no pipe" so the mask is always strictly boolean.
has_multiple_parents = ncit["parents"].str.contains("|", regex=False, na=False)
multiparents = ncit[has_multiple_parents]
singleparents = ncit[~has_multiple_parents].rename(columns={"parents": "parent"})

# Expand every multi-parent row into one row per (code, parent) pair:
# split the delimited string into a list, then explode the list into rows.
# The remaining columns are repeated unchanged for each parent.
parent_codes_df = (
    multiparents.assign(parent=multiparents["parents"].str.split("|"))
    .drop(columns=["parents"])
    .explode("parent")
    .reset_index(drop=True)
)

# Final flat edge list: exactly one parent per row, single-parent rows first.
parents_df = pd.concat([singleparents, parent_codes_df]).loc[:, ["code", "parent"]]

# Sanity check: codes that appear on more than one row are the multi-parent ones.
display(parents_df[parents_df["code"].duplicated(keep=False)].head())

#### Constrain the starting codes to just those in the CTRP Biomarker Terminology subset

In [None]:
# Rows whose subset membership mentions the CTRP biomarker terminology
# (case-insensitive match).
in_biomarker_subset = ncit["concept in subset"].str.contains(
    "CTRP Biomarker Terminology", case=False
)
biomarker_codes = ncit.loc[in_biomarker_subset, "code"]

# Compare total vs. distinct counts to spot repeated codes.
print(biomarker_codes.shape)
print(biomarker_codes.unique().shape)

#### Construct the lookup for code -> descendant codes (children, collected recursively)

In [None]:
# Module-level set of codes that `t` records as it visits them.
global_visited = set()
# Trace log written by `t`; opened in "w" mode, so each run starts from an
# empty file. It is closed at the end of this cell.
logfile = open("logfile.log", mode="w")

def t(code: str, children: set, visited: "set | None" = None):
    """Collect every descendant of ``code`` into ``children`` (depth-first).

    Child codes are looked up in the module-level ``parents_df`` frame (rows
    whose ``parent`` column equals ``code``). Each visited code is written to
    the module-level ``logfile`` and recorded in ``global_visited``.

    Args:
        code: Code whose descendants should be collected. Falsy values
            (e.g. an empty string) are ignored.
        children: Set that is updated in place with every descendant found.
        visited: Codes already expanded during *this* traversal. Leave as
            ``None`` for a top-level call; a fresh set is created so that
            each starting code gets its complete descendant set.

    Returns:
        None. Results are delivered by mutating ``children``.
    """
    if not code:
        return
    if visited is None:
        # Track visited codes per traversal rather than globally. A shared
        # visited set would stop later traversals from expanding any code an
        # earlier traversal had already seen, leaving their results truncated
        # (or empty) depending on iteration order.
        visited = set()
    if code in visited:
        # Already expanded in this traversal (multi-parent node or a cycle).
        return
    visited.add(code)
    # Still kept as a cumulative record of every code seen; it no longer
    # gates the traversal.
    global_visited.add(code)
    logfile.write(f"visiting {code}\n")

    child_codes = parents_df["code"][parents_df["parent"] == code]
    if child_codes.empty:
        return
    # Record all direct children first, then descend into each of them.
    children.update(child_codes)
    for child_code in child_codes:
        t(child_code, children, visited)


# Build the lookup: each biomarker code maps to the set that `t` fills in
# for it.
mapping = {}
try:
    for biomarker_code in biomarker_codes:
        mapping[biomarker_code] = set()
        t(biomarker_code, mapping[biomarker_code])
finally:
    # Close the log even if a traversal raises, so the handle is not leaked
    # and everything logged so far is flushed to disk.
    logfile.close()