In [None]:
from pathlib import Path

import pandas as pd

#### Load in the Thesaurus

In [None]:
# Local path of the Thesaurus flat file, and the release name used to build
# the download URL when that file is missing.
thesaurusf = Path("Thesaurus.txt")
thesaurus_version = "Thesaurus_24.02d"

# Only download when the flat file is not already in the working directory.
if not thesaurusf.exists():
    print(f"Downloading {thesaurus_version}")
    # IPython shell escapes; {thesaurus_version} is interpolated by IPython.
    !curl -O https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/{thesaurus_version}.FLAT.zip
    # NOTE(review): presumably the archive unpacks to Thesaurus.txt -- confirm.
    !unzip {thesaurus_version}.FLAT.zip

# Column labels for the header-less, tab-separated Thesaurus file, in file order.
ncit_column_names = [
    "code",
    "concept IRI",
    "parents",
    "synonyms",
    "definition",
    "display name",
    "concept status",
    "semantic type",
    "concept in subset",
]

# Read the file, label its columns, and index the rows by concept code.
ncit = (
    pd.read_csv(thesaurusf, sep="\t", header=None, encoding="utf-8")
    .set_axis(ncit_column_names, axis="columns")
    .set_index("code")
)
display(ncit.head())
ncit.shape

#### Create Node mapping

In [None]:
def get_row(code) -> pd.Series:
    """Look up ``code`` in the index of the module-level ``ncit`` frame.

    Raises:
        KeyError: if ``code`` is not present in the ``ncit`` index.
    """
    matching_row = ncit.loc[code]
    return matching_row

In [None]:
# Shared registry of already-built nodes, keyed by concept code.
NODES_CACHE: dict[str, "Node"] = {}


class Node:
    """A single Thesaurus concept linked to its parent concepts.

    Building a node also builds (and caches in ``NODES_CACHE``) any parent
    that is not cached yet, so constructing one node may recursively
    construct its whole ancestor chain via ``get_row``.
    """

    # Concept code, taken from the row's index label.
    code: str
    # Parent nodes, in the order they appear in the "parents" cell.
    parents: list["Node"]
    # Distinct entries of the pipe-separated "synonyms" cell.
    synonyms: set[str]
    # Value of the row's "display name" cell (may be missing/NaN).
    pref_name: str

    def __init__(self, row: pd.Series) -> None:
        """Build a node from one row of the Thesaurus frame.

        Args:
            row: a row whose ``name`` is the concept code and which has
                "parents", "synonyms" and "display name" entries; "parents"
                and "synonyms" are pipe-separated strings or missing values.
        """
        self.code = row.name
        self.parents = []
        if pd.notna(row["parents"]):
            for parent_code in row["parents"].split("|"):
                # Reuse a cached parent so every concept maps to one object.
                if parent_code not in NODES_CACHE:
                    NODES_CACHE[parent_code] = Node(get_row(parent_code))
                self.parents.append(NODES_CACHE[parent_code])

        # A missing synonyms cell is read as NaN (a float); guard it the same
        # way as "parents" instead of failing on ``float.split``.
        raw_synonyms = row["synonyms"]
        self.synonyms = (
            set(raw_synonyms.split("|")) if pd.notna(raw_synonyms) else set()
        )
        self.pref_name = row["display name"]

    def __repr__(self) -> str:
        """Return ``<code <= parent1,parent2,...>``."""
        return f"<{self.code} <= {','.join(p.code for p in self.parents)}>"

    # str() and repr() intentionally produce the same text.
    __str__ = __repr__

In [None]:
# Make sure every Thesaurus row has a Node in NODES_CACHE.
# Constructing a Node already caches its ancestors recursively, so many codes
# are cached before their own row is reached; check the cache first rather
# than building a duplicate Node that would be thrown away.
for code, row in ncit.iterrows():
    if code not in NODES_CACHE:
        NODES_CACHE[code] = Node(row)
len(NODES_CACHE)

In [None]:
# Drop the raw Thesaurus frame now that NODES_CACHE has been built.
# NOTE: get_row() reads `ncit`, so constructing a Node whose parent is not
# cached will fail after this cell until the load cell is re-run.
del ncit

In [None]:
# Spot-check one cached node; its repr shows the code and its parent codes.
NODES_CACHE['C5449']

#### Traverse the Node mapping from a given concept to all of its descendants

In [None]:
from collections import deque


def get_children_of_code(nodes: dict[str, Node], parent_code: str):
    """Return the codes of all nodes that list ``parent_code`` as a direct parent.

    Args:
        nodes: mapping of concept code to node; each node exposes ``parents``,
            an iterable of nodes that each have a ``code`` attribute.
        parent_code: code whose direct children are wanted.

    Returns:
        A set of child codes (empty when nothing has ``parent_code`` as parent).
    """
    return {
        code
        for code, node in nodes.items()
        if any(parent.code == parent_code for parent in node.parents)
    }


def bfs(nodes: dict[str, Node], start_code: str):
    """Breadth-first walk from ``start_code`` down through all descendants.

    The parent -> children index is built once up front, and membership is
    tracked with a set, so the walk is linear in the number of nodes and edges
    instead of rescanning every node for each visited concept.

    Args:
        nodes: mapping of concept code to node; each node exposes ``parents``,
            an iterable of nodes that each have a ``code`` attribute.
        start_code: code to start from; it need not be a key of ``nodes``.

    Returns:
        A ``(visited, tracking_q)`` tuple. ``visited`` is the list of reached
        codes in visit order, starting with ``start_code``. ``tracking_q`` is
        a deque of ``(child_code, parent_code)`` pairs, one per parent/child
        edge encountered, starting with ``(start_code, None)``.
    """
    # Invert the child -> parents links once. The per-node set removes a parent
    # that is listed twice, so every (child, parent) edge is recorded once.
    children_of: dict[str, list[str]] = {}
    for code, node in nodes.items():
        for parent_code in {parent.code for parent in node.parents}:
            children_of.setdefault(parent_code, []).append(code)

    visited = []  # Visit order, returned to the caller.
    seen = set()  # Same codes as `visited`, for O(1) membership checks.
    queue = deque([start_code])
    tracking_q = deque([(start_code, None)])

    while queue:
        current_code = queue.popleft()
        # A code with several parents can be queued more than once.
        if current_code in seen:
            continue
        seen.add(current_code)
        visited.append(current_code)

        # Children come out in `nodes` iteration order, which keeps the
        # traversal order deterministic from run to run.
        for child_code in children_of.get(current_code, ()):
            if child_code not in seen:
                queue.append(child_code)
            # Each code is expanded once and its children are distinct, so
            # this pair cannot already be present.
            tracking_q.append((child_code, current_code))

    return visited, tracking_q

In [None]:
# Root code for the traversal (presumably the NCIt "Neoplasm" concept -- confirm).
neoplasm = "C3262"
# bfs returns a (visited codes, (child, parent) pairs) tuple; despite its name,
# `visited` holds both parts until it is unpacked in the next cell.
visited = bfs(NODES_CACHE, neoplasm)

In [None]:
# Split the traversal result. `children` is the list of visited codes (the
# start code is included); `children_w_parents` holds the (child, parent) pairs.
children, children_w_parents = visited

In [None]:
# Sanity check: no (child, parent) pair is recorded more than once, so
# de-duplicating the pairs must not change how many there are.
combos = set(children_w_parents)
assert len(combos) == len(children_w_parents)

#### Construct the output

In [None]:
# Expand each (child, parent) pair into one record per synonym of the child.
# NOTE(review): the first field is the pair's 1-based position in
# `children_w_parents`, not a tree depth, although it later lands in the
# "Level" column -- confirm this is intended.
data = []
for position, (child_code, parent_code) in enumerate(children_w_parents, start=1):
    if parent_code:
        child_node = NODES_CACHE[child_code]
        parent_node = NODES_CACHE[parent_code]
        data.extend(
            (
                position,
                synonym,
                child_node.pref_name,
                child_node.code,
                parent_node.pref_name,
                parent_node.code,
            )
            for synonym in child_node.synonyms
        )
    else:
        # Only the traversal root is recorded without a parent.
        assert child_code == neoplasm

In [None]:
# Column labels, in the same order as the fields of each record in `data`.
output_columns = [
    "Level",
    "Disease",
    "Preferred Term",
    "Code",
    "Parent Term",
    "Parent Term Code",
]

# Build the result table with every column requested as string dtype.
output = pd.DataFrame(data, columns=output_columns, dtype=str)
output

In [None]:
# Spot-check: show the rows whose parent concept is C5449.
output[output["Parent Term Code"] == "C5449"]

#### Call EVS API to get preferred terms

In [None]:
import time
import requests


# Number of concept codes sent in a single EVS request.
# NOTE(review): presumably chosen to keep the request URL within a length
# limit -- confirm before changing.
num_concepts_per_evs_call = 575
# Codes to look up: every code visited by the traversal.
concept_list = list(children)


def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has ``n`` items except possibly the last; an empty ``lst``
    yields nothing.
    """
    slice_starts = range(0, len(lst), n)
    yield from (lst[start : start + n] for start in slice_starts)


# URL template for the EVS concept lookup; %s receives a comma-separated list
# of concept codes.
concept_url_fstring = (
    "https://api-evsrest.nci.nih.gov/api/v1/concept/ncit?list=%s&include=summary"
)
new_column_vals = []  # (code, name) pairs collected from the API responses
chunk_count = 0  # chunks fetched successfully
record_count = 0  # codes requested so far
retry_limit = 3  # attempts per chunk before giving up
early_break = False  # set when a chunk exhausts its retries

# Results are cached on disk so the API is only called when no cache exists.
pref_termsf = Path("preferred_terms.csv")
if not pref_termsf.exists():
    print("Calling EVS to get preferred terms")
    for ch in chunks(concept_list, num_concepts_per_evs_call):
        c_codes = list(ch)
        record_count += len(c_codes)
        c_codes_string = ",".join(c_codes)
        concept_url_string = concept_url_fstring % (c_codes_string)
        retry_count = 0

        while retry_count < retry_limit:
            try:
                r = requests.get(concept_url_string, timeout=(1.0, 15.0))
                # An HTTP error status or a body that is not JSON counts as a
                # failed attempt, so it is retried instead of crashing later
                # when the payload is iterated.
                r.raise_for_status()
                concept_set = r.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                print("exception -- ", e)
                retry_count += 1
                if retry_count == retry_limit:
                    print("retry max limit hit -- bailing out ")
                    early_break = True
                    break
                print("sleeping")
                time.sleep(15)
            else:
                new_column_vals.extend(
                    (newc["code"], newc["name"]) for newc in concept_set
                )
                chunk_count = chunk_count + 1
                print(
                    "processing chunk ", chunk_count, " record count = ", record_count
                )
                break
        if early_break:
            break

    pref_terms_df = pd.DataFrame(data=new_column_vals, columns=["code", "pref_name"])
    if early_break:
        # Keep the partial result in memory for this run, but do not write it
        # to disk: a later run would treat an incomplete file as complete.
        print(f"EVS download incomplete -- not caching results to {pref_termsf}")
    else:
        pref_terms_df.to_csv(pref_termsf, index=False, encoding="utf-8")
else:
    print(f"Using saved {pref_termsf}")
    pref_terms_df = pd.read_csv(pref_termsf, encoding="utf-8")

# Index by concept code so names can be looked up with .loc[code].
pref_terms_df = pref_terms_df.set_index("code")
display(pref_terms_df.head())

#### Output results

In [None]:
# Wherever a term name is missing, look it up in `pref_terms_df` by the
# matching code column; names that are already present are left untouched.
for term_col, code_col in (
    ("Preferred Term", "Code"),
    ("Parent Term", "Parent Term Code"),
):
    output[term_col] = output.apply(
        # Bind the loop variables as defaults so each lambda keeps its own pair.
        lambda row, term_col=term_col, code_col=code_col: (
            pref_terms_df.loc[row[code_col]]["pref_name"]
            if pd.isna(row[term_col])
            else row[term_col]
        ),
        axis=1,
    )

# Both name columns must be fully populated after the fill.
assert output["Preferred Term"].hasnans is False
assert output["Parent Term"].hasnans is False

In [None]:
# Show rows that exactly repeat an earlier row; an empty frame means none.
output[output.duplicated()]

In [None]:
# Synonym -> code lookup: distinct pairs ordered by synonym text.
disease2code = (
    output[["Disease", "Code"]]
    .drop_duplicates()
    .sort_values(by=["Disease"])
)
disease2code.to_csv("disease-code.tsv", sep="\t", index=False, encoding="utf-8")

# Code -> preferred term lookup: distinct pairs ordered by code.
code2term = (
    output[["Code", "Preferred Term"]]
    .drop_duplicates()
    .sort_values(by=["Code"])
)
code2term.to_csv("code2term.tsv", sep="\t", index=False, encoding="utf-8")

In [None]:
# Write the full result table as a tab-separated file, without the row index.
output.to_csv("neoplasm-concepts.tsv", sep="\t", index=False, encoding="utf-8")