In [3]:
import os
import sys
# The notebook's working directory and its parent; the parent is put on
# sys.path so that packages living there can be imported
# (presumably the project-local modules imported further down — confirm).
NB_DIR = os.getcwd()
ROOT_DIR = os.path.dirname(NB_DIR)

# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [11]:
import tqdm
import pandas as pd
from core.io_ops import load_pickle
from ontology_src import ARTIFACT_PATH
from core_3asc.dynamodb_ops import DynamoDBClient

# Load the patient collection from the artifact registered under "patients".
# NOTE(review): load_pickle presumably unpickles the file; unpickling can
# execute arbitrary code, so only point this at trusted artifacts.
patients = load_pickle(ARTIFACT_PATH["patients"])

# Client used below to fetch each patient's clinical info, configured from a
# key file given relative to the notebook directory.
# NOTE(review): if the key file holds credentials, keep it out of version control.
client = DynamoDBClient("../data/keyfile.yaml")

# Accumulator for the per-patient records collected by the loop below.
rows = []

# Onset label -> rank, where a lower rank means an earlier onset and
# "Unknown" carries the highest rank (7).
onset_order = {
    label: rank
    for rank, label in enumerate(
        (
            "Antenatal",
            "Neonatal",
            "Infancy",
            "Childhood",
            "Adolescent",
            "Adult",
            "Elderly",
            "Unknown",
        )
    )
}

# Reverse lookup (rank -> label), derived by inverting onset_order.
idx2onset = {rank: label for label, rank in onset_order.items()}
# Build one record per patient: id, number of HPO terms, gender and the
# earliest onset label found among the patient's symptoms.
for patient in tqdm.tqdm(patients):
    demographics = client.get_clinical_info(patient.id)

    # Onset: lowest rank across all symptoms. Labels missing from onset_order
    # are ranked as "Unknown", and a patient with no symptoms at all also
    # falls back to "Unknown" instead of raising ValueError from min() on an
    # empty sequence.
    # NOTE(review): assumes the third field of each symptom tuple is the
    # onset label — confirm against get_clinical_info.
    onset_idx = min(
        (
            onset_order.get(symptom[2], onset_order["Unknown"])
            for symptom in demographics["symptoms"]
        ),
        default=onset_order["Unknown"],
    )

    # N hpo
    n_hpo = len(patient.hpos)

    # Gender
    gender = demographics["gender"]

    rows.append([patient.id, n_hpo, gender, idx2onset[onset_idx]])

df = pd.DataFrame(rows, columns=["id", "n_hpos", "gender", "onset"])

100%|██████████| 14540/14540 [05:00<00:00, 48.33it/s]


In [31]:
from tableone import TableOne

# Patients whose recorded gender is blank or "?" are removed, by id, before
# the cohort is summarised.
gender_is_unknown = df["gender"].isin(["", "?"])
exclude_ids = set(df.loc[gender_is_unknown, "id"].tolist())
keep_row = ~df["id"].isin(exclude_ids)
df = df.loc[keep_row]

# Cohort summary table: gender and onset are declared categorical, n_hpos is
# left as a non-categorical column; values are shown with three decimals.
table1 = TableOne(
    df,
    columns=["n_hpos", "gender", "onset"],
    categorical=["gender", "onset"],
    missing=False,
    decimals=3,
)
table1

Unnamed: 0,Unnamed: 1,Overall
n,,14518
"n_hpos, mean (SD)",,3.961 (4.358)
"gender, n (%)",female,6463 (44.517)
"gender, n (%)",male,8055 (55.483)
"onset, n (%)",Adolescent,617 (4.250)
"onset, n (%)",Adult,2316 (15.953)
"onset, n (%)",Antenatal,676 (4.656)
"onset, n (%)",Childhood,2605 (17.943)
"onset, n (%)",Elderly,33 (0.227)
"onset, n (%)",Infancy,3010 (20.733)


In [83]:
from pronto import Ontology
import warnings
import pronto
from collections import defaultdict
# Ignore pronto's own warning category for the rest of the session.
warnings.filterwarnings("ignore", category=pronto.warnings.ProntoWarning)

# HPO release pinned to v2023-10-09; this is fetched from the URL each time
# the cell runs.
ontology = Ontology("https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-10-09/hp.owl")
# The direct (distance 1) subclasses of HP:0000118 serve as the phenotype
# categories (presumably the "Phenotypic abnormality" root — confirm).
categories = set(ontology["HP:0000118"].subclasses(distance=1, with_self=False).to_set().ids)

# count frequency of each pheno group in dataset
# Categories reported individually; any other category is pooled as "others".
top5categories = {'HP:0000152', 'HP:0000478', 'HP:0000707', 'HP:0001626', 'HP:0033127'}

# Superclass ids memoised per HPO id, so a term shared by many patients
# triggers only one ontology traversal.
superclass_ids_by_hpo = {}

# res[category] = number of patients with at least one HPO term whose
# superclasses include that category.
res = defaultdict(int)
for patient in patients:
    supersets = set()
    for hpo in patient.hpos:
        if hpo.id not in superclass_ids_by_hpo:
            superclass_ids_by_hpo[hpo.id] = ontology[hpo.id].superclasses().to_set().ids
        supersets |= superclass_ids_by_hpo[hpo.id]

    patient_categories = supersets & categories

    # Each top-5 category the patient touches is counted once for that patient.
    for category in patient_categories & top5categories:
        res[category] += 1

    # "others" is counted at most once per patient, no matter how many
    # non-top-5 categories the patient falls into.
    if patient_categories - top5categories:
        res["others"] += 1

  ontology = Ontology("https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-10-09/hp.owl")


In [104]:
# One row per category (the index) with a single "count" column.
# Built directly from the counts mapping; this replaces the
# from_dict([res]).T round trip, which passed a list to an API documented
# to take a dict and then needed a transpose plus a column rename.
df = pd.Series(res, name="count").to_frame()

In [106]:
# Fraction of all loaded patients counted in each category. The denominator
# is the full patient list, the same population the counts were taken over.
df["count"].div(len(patients))

HP:0000707    0.461898
HP:0033127    0.339821
HP:0001626    0.185282
others        0.592366
HP:0000152    0.237208
HP:0000478    0.252201
Name: count, dtype: float64