In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

#### Load in the Thesaurus

In [None]:
thesaurusf = Path("Thesaurus.txt")
thesaurus_version = "Thesaurus_24.02d"

if not thesaurusf.exists():
    print(f"Downloading {thesaurus_version}")
    !curl -O https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/{thesaurus_version}.FLAT.zip
    !unzip {thesaurus_version}.FLAT.zip

ncit = pd.read_csv(thesaurusf, sep="\t", header=None, encoding="utf-8", dtype=str)
ncit.columns = [
    "code",
    "concept IRI",
    "parents",
    "synonyms",
    "definition",
    "display name",
    "concept status",
    "semantic type",
    "concept in subset",
]
ncit = ncit.set_index("code")
ncit = ncit.fillna('')
display(ncit.head())
ncit.shape

#### Filter the concepts down to those in the target subset

In [None]:
# Keyword used (case-insensitively) to discover related subset names.
subset_keyword = "biomarker".lower()

# Subset whose member concepts are exported by this notebook.
subset_term = "CTRP Biomarker Terminology"

# Keep every concept whose subset field contains the subset name,
# compared case-insensitively as a plain substring.
needle = subset_term.lower()
in_subset = [needle in subsets.lower() for subsets in ncit["concept in subset"]]
all_concepts_in_subset = ncit[in_subset]
display(all_concepts_in_subset.head(), all_concepts_in_subset.shape)

# Collect every distinct "|"-separated subset name that mentions the keyword,
# to show which other subsets could have been selected instead.
related_subset_terms = {
    term
    for subsets in ncit["concept in subset"].unique()
    for term in subsets.split("|")
    if subset_keyword in term.lower()
}
related_subset_terms

#### Construct the output

In [None]:
# Build one record per (synonym, parent) pair for every concept in the subset:
# (term, preferred term, code, parent term, parent code).
data = []
for code, row in all_concepts_in_subset.iterrows():
    # "".split("|") yields [""], which is truthy, so empty entries must be
    # dropped for the assertion below to actually detect a parent-less concept.
    parents = [parent for parent in row["parents"].split("|") if parent]
    assert parents, f"{code} has no parent concepts"
    assert row["display name"], f"{code} has no display name"

    # The synonym list does not depend on the parent; split it once per concept.
    synonyms = row["synonyms"].split("|")
    for parent in parents:
        # Raises KeyError if the parent code is absent from the thesaurus.
        p_row = ncit.loc[parent]
        data.extend(
            (syn, row["display name"], code, p_row["display name"], parent)
            for syn in synonyms
        )

In [None]:
# Column order matches the tuple layout of each record in `data`.
output_columns = [
    "Term",
    "Preferred Term",
    "Code",
    "Parent Term",
    "Parent Term Code",
]
output = pd.DataFrame(data, columns=output_columns, dtype=str)
output

#### Call EVS API to get preferred terms

In [None]:
import time
import requests


# Maximum number of concept codes sent in a single EVS request.
# NOTE(review): presumably chosen to keep the request URL within the API's
# limits -- confirm before changing.
num_concepts_per_evs_call = 575
# Distinct parent codes whose preferred names will be requested from EVS.
concept_list = output["Parent Term Code"].unique().tolist()


def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has exactly ``n`` items except possibly the last one, which
    holds whatever remains. Nothing is yielded for an empty ``lst``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset : offset + n] for offset in offsets)


# Look up the preferred name of every parent code via the EVS REST API, in
# batches, and cache the result on disk so later runs skip the network calls.
concept_url_fstring = (
    "https://api-evsrest.nci.nih.gov/api/v1/concept/ncit?list=%s&include=summary"
)
new_column_vals = []  # (code, preferred name) pairs collected from EVS
chunk_count = 0  # batches fetched successfully
record_count = 0  # codes requested so far
retry_limit = 3  # attempts per batch before giving up
early_break = False  # set when a batch exhausts its retries

pref_termsf = Path(f"preferred_terms_{subset_keyword}.csv")
if not pref_termsf.exists():
    print("Calling EVS to get preferred terms")
    for ch in chunks(concept_list, num_concepts_per_evs_call):
        c_codes = list(ch)
        record_count += len(c_codes)
        c_codes_string = ",".join(c_codes)
        concept_url_string = concept_url_fstring % (c_codes_string)

        for attempt in range(1, retry_limit + 1):
            try:
                r = requests.get(concept_url_string, timeout=(1.0, 15.0))
                # Treat HTTP error statuses and undecodable bodies like
                # connection failures so they are retried as well.
                r.raise_for_status()
                concept_set = r.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decode errors on requests versions
                # where they are not RequestException subclasses.
                print("exception -- ", e)
                if attempt == retry_limit:
                    print("retry max limit hit -- bailing out ")
                    early_break = True
                    break
                print("sleeping")
                time.sleep(15)
            else:
                for newc in concept_set:
                    new_column_vals.append((newc["code"], newc["name"]))

                chunk_count = chunk_count + 1
                print(
                    "processing chunk ", chunk_count, " record count = ", record_count
                )
                break
        if early_break:
            break

    pref_terms_df = pd.DataFrame(data=new_column_vals, columns=["code", "pref_name"])
    if early_break:
        # Persisting a partial result would make every later run silently
        # reuse an incomplete cache, so keep it in memory only.
        print(f"EVS download incomplete -- not saving {pref_termsf}")
    else:
        pref_terms_df.to_csv(pref_termsf, index=False, encoding="utf-8")
else:
    print(f"Using saved {pref_termsf}")
    # Read everything as text and disable NA parsing so names such as "NA"
    # or "None" survive the round trip through the cache file.
    pref_terms_df = pd.read_csv(
        pref_termsf, encoding="utf-8", dtype=str, keep_default_na=False
    )

pref_terms_df = pref_terms_df.set_index("code")
display(pref_terms_df.head())

#### Output results

In [None]:
# Fill in parent names that were blank in the flat file using the EVS
# preferred-term lookup; rows that already have a parent name are untouched.
needs_parent_term = output["Parent Term"].eq("")
pref_name_by_code = pref_terms_df["pref_name"].to_dict()

# Fail with the offending codes listed rather than a bare KeyError.
unresolved_codes = sorted(
    set(output.loc[needs_parent_term, "Parent Term Code"]) - set(pref_name_by_code)
)
if unresolved_codes:
    raise KeyError(
        f"No EVS preferred term for parent code(s): {', '.join(unresolved_codes)}"
    )

# Assign through a boolean mask instead of a row-wise apply: this also works
# when `output` is empty, where apply(axis=1) returns a DataFrame, not a Series.
output.loc[needs_parent_term, "Parent Term"] = output.loc[
    needs_parent_term, "Parent Term Code"
].map(pref_name_by_code)

# Every row must now carry both names; NaN counts as missing too.
for required_column in ("Preferred Term", "Parent Term"):
    is_blank = output[required_column].isna() | output[required_column].eq("")
    assert not is_blank.any(), f"{is_blank.sum()} row(s) have no {required_column}"

In [None]:
# Show rows that repeat an earlier row in every column (first occurrences are not listed).
output.loc[output.duplicated()]

In [None]:
# Shared writer settings: tab-separated, no index column, UTF-8.
tsv_options = dict(sep="\t", index=False, encoding="utf-8")

# Term -> code mapping, de-duplicated and ordered by term.
term2code = output[["Term", "Code"]].drop_duplicates().sort_values(by=["Term"])
term2code.to_csv("term-code.tsv", **tsv_options)

# Code -> preferred term mapping, de-duplicated and ordered by code.
code2pref_term = (
    output[["Code", "Preferred Term"]].drop_duplicates().sort_values(by=["Code"])
)
code2pref_term.to_csv("code2term.tsv", **tsv_options)

# Full table, one row per (term, parent) pair, named after the subset keyword.
full_table_path = f"ncit-{subset_keyword}-concepts.tsv"
output.to_csv(full_table_path, **tsv_options)