In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import ncit_utils

tqdm.pandas()

#### Load in the Thesaurus

In [None]:
# Load the thesaurus and swap missing values for empty strings in one step,
# so the string operations applied to its columns later never see NaN.
ncit = ncit_utils.load_ncit().fillna("")

#### Flatten the input (code -> parent)

In [None]:
# Split concepts by whether their "parents" field lists several codes. The
# separator is a literal pipe, so it is matched as plain text (regex=False)
# rather than through an escaped regex inside a non-raw string.
has_multiple_parents = ncit["parents"].str.contains("|", regex=False)
multiparents = ncit[has_multiple_parents]
singleparents = ncit[~has_multiple_parents].rename(columns={"parents": "parent"})

# Produce one row per (code, parent) pair by splitting the pipe-delimited list
# and exploding it, instead of copying every row inside a Python loop.
parent_codes_df = (
    multiparents.assign(parent=multiparents["parents"].str.split("|"))
    .explode("parent")
    .drop(columns=["parents"])
    .reset_index(drop=True)
)

# Single-parent rows first, then the flattened multi-parent rows; only the
# code -> parent edge columns are kept.
parents_df = pd.concat([singleparents, parent_codes_df]).loc[:, ["code", "parent"]]
# Peek at codes that occupy more than one row (expected for multi-parent concepts).
display(parents_df[parents_df["code"].duplicated(keep=False)].head())

#### Constrain the starting codes to those in the CTRP Agent/Intervention Terminology subsets

In [None]:
# Rows whose "concept in subset" field mentions either CTRP terminology
# (case-insensitive). Computed once and reused for every preview below.
in_ctrp_subset = ncit["concept in subset"].str.contains(
    "CTRP Agent Terminology", case=False
) | ncit["concept in subset"].str.contains(
    "CTRP Intervention Terminology", case=False
)
ctrp_positions = np.where(in_ctrp_subset)[0]

# First few matching row positions, then the total number of matches.
print(ctrp_positions[:3])
print(len(ctrp_positions))

# Show the rows at those first positions. They are derived from the mask so the
# preview stays valid when the thesaurus changes.
# NOTE(review): this replaces hard-coded positions 77, 92 and 107, which were
# presumably the first three matches for one NCIt release - confirm.
for position in ctrp_positions[:3]:
    display(ncit.iloc[position])

In [None]:
# Codes of every concept listed in either CTRP terminology subset
# (case-insensitive match on the "concept in subset" field).
in_agent_subset = ncit["concept in subset"].str.contains(
    "CTRP Agent Terminology", case=False
)
in_intervention_subset = ncit["concept in subset"].str.contains(
    "CTRP Intervention Terminology", case=False
)
pt_codes = ncit.loc[in_agent_subset | in_intervention_subset, "code"]

# Keep only the code -> parent edges that start at one of those codes.
starts_at_pt_code = parents_df["code"].isin(pt_codes)
pt_codes_w_parent = parents_df.loc[starts_at_pt_code].reset_index(drop=True)

display(pt_codes_w_parent)
print(pt_codes_w_parent.drop_duplicates().shape)

#### Construct the lookup for code -> child

In [None]:
# Index every code's direct children once, so the traversal does a dictionary
# lookup per node instead of re-scanning all of `parents_df`. Children keep the
# row order they have in `parents_df`.
children_by_parent: dict[str, list[str]] = (
    parents_df.groupby("parent", sort=False)["code"].agg(list).to_dict()
)
# Every (code, parent) edge reached by the traversal.
global_visited = set()


def t(code: str, parent: str):
    """Record the edge ``(code, parent)`` and all edges below ``code``.

    Walks depth-first from ``code`` through its children, grandchildren, and
    so on. Each edge seen for the first time is added to ``global_visited``
    and written to ``logfile``; edges already recorded, and empty codes, are
    skipped.

    Args:
        code: Concept code to start from. A falsy value is ignored.
        parent: Code of the concept that ``code`` is being visited under.

    Returns:
        None. Results accumulate in the module-level ``global_visited`` set.
    """
    # Explicit stack instead of recursion through ``Series.apply``. Children
    # are pushed in reverse so they are popped, and therefore logged, in the
    # same pre-order a recursive walk would produce.
    pending = [(code, parent)]
    while pending:
        current, current_parent = pending.pop()
        if not current or (current, current_parent) in global_visited:
            continue
        global_visited.add((current, current_parent))
        logfile.write(f"visiting {(current, current_parent)}\n")
        # The current code becomes the parent of each of its children.
        pending.extend(
            (child, current)
            for child in reversed(children_by_parent.get(current, ()))
        )


# The context manager closes the log even if the traversal raises.
with open("logfile.log", "w") as logfile:
    for pt_code, parent_code in tqdm(list(pt_codes_w_parent.itertuples(index=False))):
        t(pt_code, parent_code)

#### Construct the output

In [None]:
# Resolve display names and synonym strings through dictionaries built in one
# pass over NCIt, instead of filtering the whole frame for each visited code.
# Keeping the first row per code matches a "first match wins" selection.
first_row_per_code = ncit.drop_duplicates(subset="code").set_index("code")
display_name_by_code: dict[str, str] = first_row_per_code["display name"].to_dict()
synonyms_by_code: dict[str, str] = first_row_per_code["synonyms"].to_dict()

data = []
# `global_visited` is a set, so its iteration order is not stable between
# kernel sessions; sorting makes the row order of the written files reproducible.
for child, parent in tqdm(sorted(global_visited)):
    # A code that is absent from NCIt raises KeyError here, so bad input fails loudly.
    child_name = display_name_by_code[child]
    parent_name = display_name_by_code[parent]
    # One output row per pipe-separated synonym of the child concept.
    data.extend(
        (synonym, child_name, child, parent_name, parent)
        for synonym in synonyms_by_code[child].split("|")
    )

output = pd.DataFrame(
    data,
    columns=[
        "Term",
        "Preferred Term",
        "Code",
        "Parent Term",
        "Parent Term Code",
    ],
    dtype=str,
)
output.to_csv("pts_codes.csv", index=False)
output

#### Call the EVS API to get preferred terms


In [None]:
# Every distinct code that appears in the output, as a child or as a parent.
code_columns = [output["Code"], output["Parent Term Code"]]
pref_term_codes_set = pd.concat(code_columns).drop_duplicates()

# Fetch the EVS terms for those codes and discard rows without a source.
# NOTE(review): the second argument is presumably a cache/output CSV path -
# confirm against ncit_utils.
pt_terms_df = ncit_utils.EVSConceptsApi.load_terms_w_synonyms(
    pref_term_codes_set, "ncit_output/preferred_terms_pts.csv"
).dropna(subset=["source"])
pt_terms_df

#### Apply EVS terms to output

In [None]:
from collections import defaultdict


# Maps (code, synonym) -> set of sources that list that synonym for that code.
syn_to_sources = defaultdict(set)

for evs_code, evs_synonym, evs_source in zip(
    pt_terms_df["code"], pt_terms_df["synonym"], pt_terms_df["source"]
):
    syn_to_sources[(evs_code, evs_synonym)].add(evs_source)


def get_sources(row: pd.Series):
    """Return the sources recorded for a row's (Code, Term) pair.

    Args:
        row: Mapping-like row providing the ``"Code"`` and ``"Term"`` values.

    Returns:
        The source names sorted alphabetically and joined with commas, or an
        empty string when no source is recorded for the pair.
    """
    # Use .get() rather than indexing: indexing a defaultdict would insert an
    # empty set for every pair that has no sources, growing the mapping.
    sources = syn_to_sources.get((row["Code"], row["Term"]), ())
    return ",".join(sorted(sources))


# Attach the comma-joined source list for each (Term, Code) pair.
term_code_pairs = output.loc[:, ["Term", "Code"]]
output["Sources"] = term_code_pairs.apply(get_sources, axis=1)
# Every row must carry a (possibly empty) string, never a missing value.
assert output["Sources"].notna().all()
display(output.head())

In [None]:
# Use existing pref term if provided by NCIt
# Else use the EVS preferred name
# Cache of code -> EVS name, filled lazily by code_to_term().
code_to_term_map = {}


def code_to_term(code: str):
    """Return the EVS ``name`` for ``code``, remembering it for later calls.

    The first request for a code takes the first matching ``"name"`` value in
    ``pt_terms_df``; subsequent requests are served from ``code_to_term_map``.

    Raises:
        IndexError: if ``pt_terms_df`` has no row for ``code``.
    """
    if code in code_to_term_map:
        return code_to_term_map[code]
    names_for_code = pt_terms_df.loc[pt_terms_df["code"] == code, "name"]
    first_name = names_for_code.iloc[0]
    code_to_term_map[code] = first_name
    return first_name


def _existing_or_evs_term(row: pd.Series, term_column: str, code_column: str):
    """Return the row's own term when non-empty, else the EVS name for its code."""
    ncit_term = row[term_column]
    return ncit_term if ncit_term else code_to_term(row[code_column])


# Fill empty preferred terms from EVS, keyed by the concept's own code.
output["Preferred Term"] = output.progress_apply(
    lambda row: _existing_or_evs_term(row, "Preferred Term", "Code"),
    axis=1,
)

print("Done with Preferred Term.")

# Same for parent terms, keyed by the parent code.
output["Parent Term"] = output.progress_apply(
    lambda row: _existing_or_evs_term(row, "Parent Term", "Parent Term Code"),
    axis=1,
)

# After filling, neither column may contain an empty string.
assert not (output["Preferred Term"] == "").any()
assert not (output["Parent Term"] == "").any()

In [None]:
# Report the number of exact duplicate rows, then drop them and show the
# frame's shape before and after.
duplicate_count = output.duplicated().sum()
print(duplicate_count)
shape_before = output.shape
print(shape_before)
output = output.drop_duplicates()
print(output.shape)

#### Save the output

In [None]:
# All three files are tab-separated UTF-8 without the index column.
tsv_options = dict(sep="\t", index=False, encoding="utf-8")

# Synonym -> code pairs, ordered by term.
term2code = output[["Term", "Code"]].drop_duplicates().sort_values(by=["Term"])
term2code.to_csv("pts_syn_2_code.tsv", **tsv_options)

# Code -> preferred term pairs, ordered by code.
code2pref_term = (
    output[["Code", "Preferred Term"]].drop_duplicates().sort_values(by=["Code"])
)
code2pref_term.to_csv("pts_code_2_pref_term.tsv", **tsv_options)

# The full concept table.
output.to_csv("pts_ncit_concepts.tsv", **tsv_options)

#### Check the output

In [None]:
# Codes / parent codes whose term uniqueness has already been verified, so each
# is checked against the full frame only once.
codes_checked = set()
parents_checked = set()


def check_output(arr):
    """Validate one row of ``output``.

    Args:
        arr: Positional row values; index 1 is the preferred term, 2 the code,
            3 the parent term and 4 the parent code (the column order of
            ``output``).

    Raises:
        AssertionError: if either term is empty, if the code maps to more than
            one preferred term, or if the parent code maps to more than one
            parent term anywhere in ``output``.
    """
    pref_term = arr[1]
    code = arr[2]
    parent_term = arr[3]
    parent_code = arr[4]

    # These comparisons were previously bare expressions and checked nothing.
    assert len(pref_term) >= 1, f"empty preferred term for code {code!r}"
    assert len(parent_term) >= 1, f"empty parent term for code {parent_code!r}"

    if code not in codes_checked:
        terms = output["Preferred Term"][output["Code"] == code]
        assert len(terms.unique()) == 1
        codes_checked.add(code)
    if parent_code not in parents_checked:
        terms = output["Parent Term"][output["Parent Term Code"] == parent_code]
        assert len(terms.unique()) == 1
        parents_checked.add(parent_code)


# Run the validation over every row; raw=True hands each row to check_output
# as a plain array, which it indexes by position. The result is discarded.
_ = output.progress_apply(
    check_output,
    axis=1,
    raw=True,
)