In [81]:
import requests
import pandas as pd
from tqdm import tqdm
from utility import *


In [99]:
# Monarch Initiative endpoint templates; "{}" is filled in with a disease id
# via str.format before each request.
DISEASE_GENE_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/genes"
DISEASE_VARIANT_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/variants"

# CSV listing the diseases to query (relative to this notebook's directory).
disease_path = "../../../data/benchmark_datasets/monarch/gwas_diseases.csv"

# Query-string parameters sent with every association request.
# NOTE(review): "rows" presumably caps the number of associations returned per
# disease, and the "direct*" flags presumably restrict results to direct
# annotations — confirm against the Monarch API documentation.
params = {
    "rows": 2,
    "direct": "true",
    "direct_taxon": "true",
}

In [74]:
# Read the disease list and give its two columns fixed names.
disease_df = pd.read_csv(disease_path)
disease_df.columns = ["disease_id", "disease_name"]

# Keep only the text between the first pair of double quotes in each id.
# disease_name is left untouched here; later cells strip its quotes after merging.
disease_df["disease_id"] = disease_df["disease_id"].map(
    lambda quoted_id: quoted_id.split('"')[1]
)
disease_df

Unnamed: 0,disease_id,disease_name
0,DOID:0080630,"""B-lymphoblastic leukemia/lymphoma"""
1,DOID:8893,"""psoriasis"""
2,DOID:1712,"""aortic valve stenosis"""
3,DOID:13189,"""gout"""
4,DOID:5418,"""schizoaffective disorder"""
...,...,...
249,DOID:1380,"""endometrial cancer"""
250,DOID:6039,"""uveal melanoma"""
251,DOID:10538,"""gastric fundus cancer"""
252,DOID:0111910,"""spermatogenic failure"""


In [107]:
def get_api_resp(URI, params=None, timeout=30):
    """Send a GET request to ``URI`` and return the ``requests`` response.

    Args:
        URI: Fully formatted URL to request.
        params: Optional mapping of query-string parameters. A falsy value
            (``None`` or an empty dict) sends the request with no parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang a loop over many diseases.

    Returns:
        The ``requests.Response`` object; the caller inspects ``status_code``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Passing params=None is equivalent to omitting the argument, so the
    # original if/else collapses into a single call.
    return requests.get(URI, params=params or None, timeout=timeout)
    
def get_association(URI, disease_id, params, object_attribute="label"):
    """Fetch the associations of one disease and return them as a DataFrame.

    Args:
        URI: Endpoint template containing one ``{}`` placeholder for the id.
        disease_id: Identifier substituted into ``URI``.
        params: Query parameters forwarded to ``get_api_resp``.
        object_attribute: Key read from each association's ``"object"`` entry
            (``"label"`` by default).

    Returns:
        A DataFrame with an ``object`` column (one row per association) and a
        ``subject`` column holding ``disease_id``; ``None`` when the response
        status is anything other than 200.
    """
    reply = get_api_resp(URI.format(disease_id), params=params)
    if reply.status_code != 200:
        return None

    payload = reply.json()
    objects = [
        association["object"][object_attribute]
        for association in payload["associations"]
    ]
    return pd.DataFrame(objects, columns=["object"]).assign(subject=disease_id)
    

In [71]:
# Query the disease -> gene endpoint once per disease.
# Iterating the id column (a sized iterable) lets tqdm display a total and an
# ETA; wrapping the iterrows() generator only produced a bare "Nit" counter.
edge_df_list = []
failed_gene_lookups = []

for disease_id in tqdm(disease_df["disease_id"], desc="disease-gene"):
    association_df = get_association(DISEASE_GENE_URI, disease_id, params)
    if association_df is None:
        # get_association returns None for non-200 responses. Record the id
        # instead of letting pd.concat drop the None silently.
        failed_gene_lookups.append(disease_id)
    else:
        edge_df_list.append(association_df)

if failed_gene_lookups:
    print("{} disease(s) skipped (non-200 response): {}".format(
        len(failed_gene_lookups), failed_gene_lookups))

# Stack the per-disease frames, then swap the disease id for its name.
edge_df = pd.concat(edge_df_list, ignore_index=True)
edge_df = pd.merge(
    edge_df, disease_df, left_on="subject", right_on="disease_id"
).drop(columns=["subject", "disease_id"])
# disease_name is still wrapped in double quotes at this point; keep the text
# between the first pair of quotes.
edge_df["disease_name"] = edge_df["disease_name"].apply(lambda x: x.split('"')[1])

data_mcq_disease_gene = create_mcq(edge_df, "disease_name", "object", "Gene", "is associated with")

254it [01:46,  2.38it/s]


In [108]:
# Query the disease -> variant endpoint once per disease, collecting each
# associated object's "id" rather than its label.
# Iterating the id column (a sized iterable) lets tqdm display a total and an
# ETA; wrapping the iterrows() generator only produced a bare "Nit" counter.
edge_df_list_2 = []
failed_variant_lookups = []

for disease_id in tqdm(disease_df["disease_id"], desc="disease-variant"):
    association_df = get_association(
        DISEASE_VARIANT_URI, disease_id, params, object_attribute="id"
    )
    if association_df is None:
        # get_association returns None for non-200 responses. Record the id
        # instead of letting pd.concat drop the None silently.
        failed_variant_lookups.append(disease_id)
    else:
        edge_df_list_2.append(association_df)

if failed_variant_lookups:
    print("{} disease(s) skipped (non-200 response): {}".format(
        len(failed_variant_lookups), failed_variant_lookups))

# Stack the per-disease frames, then swap the disease id for its name.
edge_df_2 = pd.concat(edge_df_list_2, ignore_index=True)
edge_df_2 = pd.merge(
    edge_df_2, disease_df, left_on="subject", right_on="disease_id"
).drop(columns=["subject", "disease_id"])
# disease_name is still wrapped in double quotes at this point; keep the text
# between the first pair of quotes.
edge_df_2["disease_name"] = edge_df_2["disease_name"].apply(lambda x: x.split('"')[1])
# Drop any "dbSNP:" prefix from the variant id. Bracket access is used because
# attribute-style assignment to a column called "object" is easy to misread.
edge_df_2["object"] = edge_df_2["object"].apply(lambda x: x.split("dbSNP:")[-1])

254it [01:42,  2.47it/s]


In [114]:
# Build the disease-variant question set with the same helper and argument
# layout used for the gene questions.
# NOTE(review): create_mcq is not defined in this notebook (presumably it comes
# from `utility`); its argument semantics should be confirmed there.
data_mcq_disease_variant = create_mcq(
    edge_df_2,
    "disease_name",
    "object",
    "Variant",
    "is associated with",
)


In [121]:
# Stack the gene and variant question sets into one frame with a fresh 0..n-1 index.
data_mcq_monarch = pd.concat(
    [data_mcq_disease_gene, data_mcq_disease_variant],
    ignore_index=True,
)


In [122]:
# Write the combined question set to disk, with a header row and without the index column.
monarch_mcq_path = "../../../data/benchmark_datasets/test_questions_two_hop_mcq_from_monarch.csv"
data_mcq_monarch.to_csv(monarch_mcq_path, index=False, header=True)
