In [81]:
import requests
import pandas as pd
from tqdm import tqdm
from utility import *


In [73]:
# URL template of the Monarch disease -> genes endpoint; the ``{}`` placeholder
# is filled with a disease identifier via ``str.format``.
DISEASE_GENE_URI = (
    "https://api.monarchinitiative.org/api/bioentity/disease/{}/genes"
)

# Relative path of the CSV file holding the diseases to query.
disease_path = "../../../data/benchmark_datasets/monarch/gwas_diseases.csv"

In [74]:
# Read the disease table and give its two columns explicit names.
disease_df = pd.read_csv(disease_path)
disease_df.columns = ["disease_id", "disease_name"]

# Each raw id is wrapped in double quotes; keep only the text between the
# first pair of quotes.
disease_df["disease_id"] = disease_df["disease_id"].map(
    lambda raw_id: raw_id.split('"')[1]
)
disease_df

Unnamed: 0,disease_id,disease_name
0,DOID:0080630,"""B-lymphoblastic leukemia/lymphoma"""
1,DOID:8893,"""psoriasis"""
2,DOID:1712,"""aortic valve stenosis"""
3,DOID:13189,"""gout"""
4,DOID:5418,"""schizoaffective disorder"""
...,...,...
249,DOID:1380,"""endometrial cancer"""
250,DOID:6039,"""uveal melanoma"""
251,DOID:10538,"""gastric fundus cancer"""
252,DOID:0111910,"""spermatogenic failure"""


In [69]:
def get_api_resp(URI, params=None, timeout=30):
    """Send a GET request and return the raw ``requests`` response.

    Parameters
    ----------
    URI : str
        URL to request.
    params : dict, optional
        Query-string parameters. ``None`` (the default) sends none.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    requests.Response
        The response object; the status code is NOT checked here.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    # Without an explicit timeout ``requests`` waits indefinitely, so a single
    # stalled connection would hang a long batch of calls.
    return requests.get(URI, params=params, timeout=timeout)
    
def get_association(disease_id, query_params=None):
    """Fetch the objects associated with one disease from the Monarch API.

    Parameters
    ----------
    disease_id : str
        Identifier substituted into ``DISEASE_GENE_URI``.
    query_params : dict, optional
        Query-string parameters for the request. When omitted, the
        notebook-level ``params`` dict is used, so existing
        ``get_association(disease_id)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame or None
        One row per returned association with the columns ``object`` (the
        ``label`` of the association's ``object`` entry) and ``subject``
        (``disease_id``). The frame is empty when the response lists no
        associations. ``None`` is returned when the status code is not 200.
    """
    if query_params is None:
        # Backward-compatible default: read the notebook-level ``params`` at
        # call time, exactly like the previous implicit global lookup.
        query_params = params

    resp = get_api_resp(DISEASE_GENE_URI.format(disease_id), params=query_params)
    if resp.status_code != 200:
        return None

    # A 200 payload without (or with a null) "associations" entry is treated
    # as "no associations" instead of raising KeyError/TypeError mid-batch.
    associations = resp.json().get("associations") or []
    labels = [item["object"]["label"] for item in associations]

    df = pd.DataFrame(labels, columns=["object"])
    df["subject"] = disease_id
    return df
    

In [70]:
# Query-string parameters sent with every disease -> genes request.
params = {
    "rows": 2,
    "direct": "true",
    "direct_taxon": "true",
}


In [71]:
# Fetch the associations of every disease, one API request per disease.
edge_df_list = []
failed_disease_ids = []

# Iterating over the column (rather than ``iterrows()``, a generator without a
# length) lets tqdm show the total and an ETA.
for disease_id in tqdm(disease_df["disease_id"], desc="disease-gene associations"):
    association_df = get_association(disease_id)
    if association_df is None:
        # ``get_association`` returns None for a non-200 response; record the
        # id instead of letting ``pd.concat`` drop the entry silently later.
        failed_disease_ids.append(disease_id)
    else:
        edge_df_list.append(association_df)

if failed_disease_ids:
    print(
        "No associations retrieved for {} disease(s): {}".format(
            len(failed_disease_ids), failed_disease_ids
        )
    )



254it [01:46,  2.38it/s]


In [79]:
# Stack the per-disease frames, attach the disease name through the id, and
# drop the two id columns that were only needed as join keys.
edge_df = (
    pd.concat(edge_df_list, ignore_index=True)
    .merge(disease_df, left_on="subject", right_on="disease_id")
    .drop(columns=["subject", "disease_id"])
)

# Disease names carry literal double quotes; keep the text between the first pair.
edge_df["disease_name"] = edge_df["disease_name"].map(
    lambda quoted_name: quoted_name.split('"')[1]
)
edge_df
    

Unnamed: 0,object,disease_name
0,HLA-B,psoriasis
1,ENSEMBL:ENSG00000249738,psoriasis
2,LINC01708,aortic valve stenosis
3,TEX41,aortic valve stenosis
4,ABCG2,gout
...,...,...
386,ATXN2,endometrial cancer
387,CACNA2D1,uveal melanoma
388,STARD4-AS1,uveal melanoma
389,PRKAA1,gastric fundus cancer


In [82]:
# Build the multiple-choice question table from the disease/gene edges.
# ``create_mcq`` comes from the ``utility`` star import; the positional
# arguments are passed in their original order.
data_mcq_disease_gene = create_mcq(
    edge_df,
    "disease_name",
    "object",
    "Gene",
    "is associated with",
)


In [93]:
# Spot-check one generated question (position 100 of the ``text`` column).
data_mcq_disease_gene["text"].values[100]

'Out of the given list, which Gene is associated with oral cavity cancer and nephrotic syndrome. Given list is: HLA-DQB1, TBC1D8, ATP2B1, PNPLA3, PHGDH'

In [87]:
# Persist the question table as CSV, with a header row and without the index.
data_mcq_disease_gene.to_csv(
    "../../../data/benchmark_datasets/test_questions_two_hop_mcq_from_monarch.csv",
    index=False,
    header=True,
)
