In [None]:
import os
import sys
import torch

# Directory layout: the parent of the current working directory is treated as
# the project root, and the data files live in its "data" sub-directory.
PUBDIR = os.getcwd()
ROOT_DIR = os.path.dirname(PUBDIR)
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Make packages under the project root importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load ontology
from core.data_model import Patient, Disease, Ontology
from core.data_model import Diseases
from core.io_ops import load_pickle

# NOTE(review): load_pickle presumably unpickles the file; pickle can execute
# arbitrary code, so only load files from a trusted source.
disease_data = load_pickle(
    os.path.join(DATA_DIR, "diseases.pickle")
)
vectorized_hpo = load_pickle(
    os.path.join(DATA_DIR, "hpo_definition.vector.pickle")
)

# Build the ontology from the vectorized HPO definitions.
ontology = Ontology(vectorized_hpo)

# Keep only the diseases whose identifier starts with "OMIM".
omim_diseases = Diseases(
    list(filter(lambda entry: entry.id.startswith("OMIM"), disease_data))
)

### 1. Load benchmark patient dataset

In [None]:
# download public dataset from zeonodo
if not os.path.exists('../data/phenopackets/'):
    ! wget https://zenodo.org/records/3905420/files/phenopackets.zip?download=1
    ! mv phenopackets.zip?download=1 ../data/phenopackets.zip
    ! unzip -o ../data/phenopackets.zip -d ../data/

In [None]:
import glob
from core.benchmark import load_phenopacket_patients
from core.data_model import HPO, HPOs, Patient, Patients

# Read the phenopackets under DATA_DIR/phenopackets into a Patients collection,
# passing the ontology built earlier to the loader.
benchmark_patients: Patients = load_phenopacket_patients(
    ontology=ontology,
    phenopacket_dir=os.path.join(DATA_DIR, "phenopackets"),
)
print(benchmark_patients)

### Model 1: Phen2Disease

Download the prerequisite file (Lin similarity matrix)

In [None]:
if not os.path.exists("../data/lin_similarity_matrix.json"):
    ! pip install gdown
    ! gdown 1CSYfDj5fG9SsosIDlG-hLAoKp9eMHxjH
    ! gunzip -f lin_similarity_matrix.json.gz

In [None]:
import tqdm
import numpy as np
from core.benchmark import get_phen2disease
from core.io_ops import read_json

# Ground-truth matrix: labels[i, j] is 1 when OMIM disease j is listed in
# patient i's disease_ids. Built outside the cache check so that `labels` is
# defined whether or not the scores are loaded from disk.
labels = np.zeros((len(benchmark_patients), len(omim_diseases)))
for patient_idx, patient in enumerate(benchmark_patients):
    true_disease_indices = [
        disease_idx
        for disease_idx, disease in enumerate(omim_diseases)
        if disease.id in patient.disease_ids
    ]
    labels[patient_idx, true_disease_indices] = 1

if not os.path.exists("phen2disease_result.npy"):
    # Cache miss: score every (patient, disease) pair with Phen2Disease.
    phen2disease_result = np.zeros((len(benchmark_patients), len(omim_diseases)))
    pheno2disease_sim_mat: dict = read_json("../data/lin_similarity_matrix.json")

    for patient_idx, patient in tqdm.tqdm(enumerate(benchmark_patients), total=len(benchmark_patients)):
        for disease_idx, disease in enumerate(omim_diseases):
            phen2disease_result[patient_idx, disease_idx] = get_phen2disease(
                patient, disease, pheno2disease_sim_mat
            )

    # np.save appends the ".npy" suffix, producing the file checked above.
    np.save("phen2disease_result", phen2disease_result)

else:
    # Cache hit: reuse the previously computed score matrix.
    phen2disease_result = np.load("phen2disease_result.npy")

### Model 2: LaRA

load model

In [None]:
import torch
import mlflow

from core.datasets import (
    StochasticPairwiseDataset,
    collate_for_stochastic_pairwise_eval,
)
from torch.utils.data import DataLoader
from core.networks import Transformer
from mlflow_settings import TRACKING_URI
# Point MLflow at the tracking server configured in mlflow_settings.
mlflow.set_tracking_uri(TRACKING_URI)

# Load the logged PyTorch model, move it to DEVICE and switch it to eval mode.
model_url = "mlflow-artifacts:/25/8764a3fa4a4f46cbb20c33fc92bccb9f/artifacts/model"
lara = mlflow.pytorch.load_model(model_url)
lara = lara.to(DEVICE)
lara = lara.eval()

Build the LaRA embedding of every OMIM disease

In [None]:
import numpy as np
def cosine_sim(vector1, vector2):
    """Return the cosine similarity between two vectors.

    Args:
        vector1: first vector (array-like).
        vector2: second vector, same length as ``vector1``.

    Returns:
        ``dot(vector1, vector2) / (norm(vector1) * norm(vector2))``, or ``0.0``
        when either vector has zero norm. The similarity is undefined in that
        case; without the guard NumPy returns NaN, which corrupts any ranking
        built from the scores.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator

# Embed every OMIM disease with LaRA, one forward pass per disease, and stack
# the resulting vectors row-wise (row i belongs to omim_diseases[i]).
disease_embeddings = []
with torch.no_grad():
    for disease in tqdm.tqdm(omim_diseases):
        # Add a leading batch dimension of size 1 before calling the model.
        batch = torch.tensor(
            disease.hpos.vector, dtype=torch.float32, device=DEVICE
        ).unsqueeze(dim=0)
        embedding = lara(batch).squeeze().cpu().numpy()
        disease_embeddings.append(embedding)

lara_disease_vectors = np.stack(disease_embeddings, axis=0)

In [None]:
if not os.path.exists("lara_result.npy"):
    # Cache miss: cosine similarity between each patient embedding and every
    # disease embedding (rows of lara_disease_vectors).
    lara_result = np.zeros((len(benchmark_patients), len(omim_diseases)))

    # Disease norms do not depend on the patient, so compute them once instead
    # of once per (patient, disease) pair.
    disease_norms = np.linalg.norm(lara_disease_vectors, axis=1)

    with torch.no_grad():
        for patient_idx, patient in tqdm.tqdm(enumerate(benchmark_patients), total=len(benchmark_patients)):
            input_src = torch.tensor(
                patient.hpos.vector, dtype=torch.float32, device=DEVICE
            ).unsqueeze(dim=0)
            patient_vector = lara(input_src).squeeze().cpu().numpy()

            # One matrix-vector product scores the patient against all diseases;
            # same formula as cosine_sim, applied to a whole row at a time.
            lara_result[patient_idx] = (lara_disease_vectors @ patient_vector) / (
                disease_norms * np.linalg.norm(patient_vector)
            )

    # np.save appends the ".npy" suffix, producing the file checked above.
    np.save("lara_result", lara_result)

else:
    # Cache hit: reuse the previously computed score matrix.
    lara_result = np.load("lara_result.npy")

### Model 3: Node level Semantic similarity

Install SemanticSimilarity (Node level similarity)

In [None]:
if not os.path.exists('../SemanticSimilarity'):
    ! git clone https://github.com/4pygmalion/SemanticSimilarity.git
    ! cd SemanticSimilarity
    ! python3 -m pip install . 

Load calculator

In [None]:
if not os.path.exists("base_result.npy"):

    # Imported here because these are only needed on a cache miss.
    from SemanticSimilarity.calculator import NodeLevelSimilarityCalculator
    from SemanticSimilarity.data_model import Phenotype
    import tqdm

    from omegaconf import OmegaConf

    # Resolve the config relative to the project root instead of a
    # machine-specific absolute path.
    # NOTE(review): assumes the SemanticSimilarity checkout lives at
    # ROOT_DIR/SemanticSimilarity (the "../SemanticSimilarity" path the install
    # cell checks) — confirm.
    conf = OmegaConf.load(os.path.join(ROOT_DIR, "SemanticSimilarity", "config.yaml"))
    tb_cal = NodeLevelSimilarityCalculator(conf)
    tb_cal.set_level()
    tb_cal.set_mica_mat()

    def calculate_score(p, d):
        """Return the node-level semantic similarity between a patient and a disease.

        Args:
            p: patient; its ``hpos.id2hpo`` and ``hpos.name2hpo`` mappings are read.
            d: disease; the same two mappings are read.

        Returns:
            Whatever ``tb_cal.get_semantic_similarity`` returns for the two
            phenotype sets.
        """
        # NOTE(review): ids and names are paired by iteration order of two
        # separate dicts; this presumably relies on both being filled in the
        # same order — confirm.
        p_syms = {Phenotype(id_, name) for id_, name in zip(p.hpos.id2hpo.keys(), p.hpos.name2hpo.keys())}
        d_syms = {Phenotype(id_, name) for id_, name in zip(d.hpos.id2hpo.keys(), d.hpos.name2hpo.keys())}
        return tb_cal.get_semantic_similarity(p_syms, d_syms)

    base_result = np.zeros((len(benchmark_patients), len(omim_diseases)))
    for patient_idx, patient in tqdm.tqdm(enumerate(benchmark_patients), total=len(benchmark_patients)):
        for disease_idx, disease in enumerate(omim_diseases):  # parallelize if needed
            base_result[patient_idx, disease_idx] = calculate_score(patient, disease)

    # np.save appends the ".npy" suffix, producing the file checked above.
    np.save("base_result", base_result)

else:
    # Cache hit: reuse the previously computed score matrix.
    base_result = np.load("base_result.npy")

#### Comparative evaluation


The metrics below are computed from the following np.ndarrays:
- phen2disease_result (np.ndarray), shape = (# of patients, # of OMIM diseases)
- lara_result (np.ndarray), shape = (# of patients, # of OMIM diseases)
- base_result (np.ndarray), shape = (# of patients, # of OMIM diseases)

In [None]:
# Import the module rather than `from tqdm import tqdm`: earlier cells call
# `tqdm.tqdm(...)`, and rebinding the name `tqdm` to the class would break them
# when they are re-run in the same kernel.
import tqdm
from core_3asc.metric import topk_recall

# Cut-offs at which top-k recall is reported.
TOP_KS = (1, 5, 10, 15, 20, 30, 40, 50, 75, 100)

results = []
for patient_idx, patient in tqdm.tqdm(enumerate(benchmark_patients), total=len(benchmark_patients)):

    # Binary ground truth over the OMIM diseases for this patient.
    label = np.zeros((len(omim_diseases), ))
    for disease_idx, disease in enumerate(omim_diseases):
        if disease.id in patient.disease_ids:
            label[disease_idx] = 1

    # Score rows for this patient, one per method (looked up once per patient
    # rather than once per disease).
    scores_base = base_result[patient_idx]
    scores_pd = phen2disease_result[patient_idx]
    scores_lara = lara_result[patient_idx]

    record = {
        "p_id": patient.id,
        "scores_base": scores_base,
        "scores_pd": scores_pd,
        "scores_model": scores_lara,
    }
    # Keys follow the pattern "top_<k>_<method>", which the summary cell parses.
    for k in TOP_KS:
        record[f"top_{k}_base"] = topk_recall(scores_base, label, k=k)
        record[f"top_{k}_pd"] = topk_recall(scores_pd, label, k=k)
        record[f"top_{k}_model"] = topk_recall(scores_lara, label, k=k)

    results.append(record)

In [None]:
import pandas as pd
# One row per patient, one column per "top_<k>_<method>" metric. The raw score
# arrays are dropped; drop() keeps the remaining columns in a stable order.
result_df = pd.DataFrame(results).set_index("p_id")
result_df = result_df.drop(columns=["scores_base", "scores_pd", "scores_model"], errors="ignore")

# Average of every metric over all patients.
data = (result_df.sum(0) / len(result_df)).to_dict()

# One {"top<k>": value} mapping per method, pre-filled so the column order is fixed.
top_ks = [1, 5, 10, 15, 20, 30, 40, 50, 75, 100]
base_data = {f"top{k}": None for k in top_ks}
pd_data = {f"top{k}": None for k in top_ks}
model_data = {f"top{k}": None for k in top_ks}

# Route each "top_<k>_<method>" key to its method's mapping; other keys are ignored.
per_method = {"base": base_data, "pd": pd_data, "model": model_data}
for key, value in data.items():
    parts = key.split("_")
    target = per_method.get(parts[-1])
    if target is not None:
        target[f"top{parts[1]}"] = value

# Methods as rows, "top<k>" cut-offs as columns.
df = pd.DataFrame([base_data, pd_data, model_data], index=['baseline', 'Pheno2Disease', 'LLM-based'])
df.index.name = 'Method'

# Show all three methods (head(2) used to hide the last row).
print(df)

In [None]:
# Figure 3

import matplotlib.pyplot as plt
import pandas as pd

# Same per-method recall values as the summary table, relabelled for the figure.
fig_df = pd.DataFrame(
    [base_data, pd_data, model_data],
    index=['Resnik-based IC', 'Pheno2Disease', 'LaRa'],
)
fig_df.index.name = 'Method'

# One recall curve per method; the x positions are the "top<k>" column labels.
fig, ax = plt.subplots(figsize=(6, 6))
for method, recalls in fig_df.iterrows():
    ax.plot(list(recalls.index), list(recalls.values), marker='o', label=method)

ax.set_title('Real world dataset: rare disease patient data')
ax.set_xlabel('Top-k')
ax.set_ylabel('Top-k Recall')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend()
ax.grid(False)
fig.tight_layout()
plt.show()