In [141]:
import pandas as pd
import glob

In [150]:
# Every CSV table in the working directory. glob.glob() returns matches in
# arbitrary (filesystem-dependent) order, so sort to make the processing order
# -- and anything derived from "first file seen" -- reproducible across runs.
table_files = sorted(glob.glob("*.csv"))

# Codes whose ancestor paths are looked up in each table.
# NOTE(review): these look like NCI Thesaurus concept codes -- confirm.
search_codes = {
    "C159685",
    "C132788",
    "C140985",
    "C140997",
    "C141001",
    "C133162",
    "C6005",
    "C133000",
}


In [151]:
def dfs(current, df):
    """Enumerate every ancestor path from ``current`` up to a parentless code.

    Parameters
    ----------
    current
        Code to start from.
    df : pandas.DataFrame
        Edge list with a ``code`` and a ``parent`` column, one row per
        (code, parent) link. A missing ``parent`` (None/NaN) marks a code
        that has no parent.

    Returns
    -------
    list[list]
        One list per path. Each path starts with ``current`` and follows
        parent links until it reaches a code with no known parent. A code that
        is absent from ``df`` or has only missing parents yields
        ``[[current]]``. Paths are ordered by the row order of ``df``.

    Raises
    ------
    ValueError
        If following parent links leads back to a code already on the path
        (the hierarchy contains a cycle).
    """
    # Index the edge list once; a boolean-mask scan of the whole frame on every
    # recursion step is needlessly slow on large tables.
    parents_by_code = {}
    for code, parent in zip(df["code"], df["parent"]):
        if pd.isna(code):
            continue  # a missing code can never be matched by a lookup
        parents_by_code.setdefault(code, []).append(parent)

    def walk(node, trail):
        linked = parents_by_code.get(node, [])
        known = [p for p in linked if not pd.isna(p)]
        if not known:
            return [[node]]

        paths = []
        if len(known) < len(linked):
            # The code is listed both with and without a parent: treat the
            # missing link as "this code is also a root" instead of emitting a
            # path that ends in NaN/None.
            paths.append([node])
        for parent in known:
            if parent in trail:
                chain = " -> ".join(str(c) for c in (*trail, parent))
                raise ValueError(f"Cycle detected in parent hierarchy: {chain}")
            for tail in walk(parent, (*trail, parent)):
                paths.append([node, *tail])
        return paths

    return walk(current, (current,))


def flatten_df(df):
    """Explode the comma-separated ``parents`` column into one row per link.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``code`` column and a ``parents`` column. ``parents`` is
        either missing or a string of one or more comma-separated parent codes.

    Returns
    -------
    pandas.DataFrame
        Columns ``code`` and ``parent``, one row per (code, parent) pair,
        sorted by ``code`` with a fresh 0..n-1 index. The sort is stable, so
        each code's parents keep their original order. A code with no parents
        gets a single row whose ``parent`` is missing.
    """
    code2parent = []
    for code, raw_parents in zip(df["code"], df["parents"]):
        parents = []
        if pd.notna(raw_parents):
            # Strip every entry (including the single-parent case) and drop the
            # empty strings left behind by trailing or doubled commas.
            parents = [p.strip() for p in raw_parents.split(",")]
            parents = [p for p in parents if p]
        if not parents:
            parents = [None]
        code2parent.extend((code, parent) for parent in parents)

    flat = pd.DataFrame(code2parent, columns=["code", "parent"])
    # sort_values returns a new frame; the result has to be kept to take effect.
    return flat.sort_values("code", kind="stable").reset_index(drop=True)

In [None]:
# Ancestor paths recorded per search code. Each value is a set of "|"-joined
# paths, written code first and then its successive parents.
saved_disease_paths = {}
# For each recorded code, the file in which its paths were first found.
saved_disease_paths_origin = {}
# NOTE(review): never read or written in this cell -- confirm whether another
# cell uses it or whether it is a leftover.
visited_codes = set()

for file in table_files:
    # Keep only the columns of interest; selecting them by name also fails fast
    # if a table is missing one of them.
    df = pd.read_csv(file)[["code", "parents", "disease"]]
    # The check rejects any `code` cell that contains a comma.
    # NOTE(review): the message speaks of a code appearing only once per df,
    # but uniqueness is not actually tested here -- confirm the intent.
    assert (
        not df["code"].str.contains(",").any()
    ), "Ensure disease code appears only once per df"
    # From here on `df` is the (code, parent) edge list, not the raw table.
    df = flatten_df(df)

    for code in search_codes:
        trial_disease_paths = dfs(code, df)
        code_disease_paths = set()
        for path in trial_disease_paths:
            # A path of length 1 is just the code itself, i.e. no parent was
            # found for it in this table.
            if len(path) <= 1:
                continue
            code_disease_paths.add("|".join(path))
        # Nothing to record or compare when this table gave no real path.
        if len(code_disease_paths) < 1:
            continue
        if code in saved_disease_paths:
            # Later tables must reproduce exactly the set of paths recorded
            # from the first table that contained this code.
            assert (
                code_disease_paths == saved_disease_paths[code]
            ), "Ensure disease paths are consistent across all trials"
        else:
            saved_disease_paths[code] = code_disease_paths
            saved_disease_paths_origin[code] = file