In [17]:
import requests
import pandas as pd
import random
import os
from tqdm import tqdm
from utility import *


# MONARCH

In [20]:
def get_api_resp(URI, params=None):
    """Send an HTTP GET request and hand back the response object.

    Args:
        URI: Address to request.
        params: Optional query parameters. When falsy (``None`` or an empty
            mapping) the request is issued without a ``params`` argument.

    Returns:
        The object returned by ``requests.get`` for this call.
    """
    request_kwargs = {"params": params} if params else {}
    return requests.get(URI, **request_kwargs)
    
def get_association(URI, disease_id, params, object_attribute="label"):
    """Fetch the associations of one disease and tabulate their objects.

    Args:
        URI: URL template containing one ``{}`` placeholder for the disease id.
        disease_id: Identifier substituted into ``URI``; it is also written to
            the ``subject`` column of the result.
        params: Query parameters forwarded to ``get_api_resp``.
        object_attribute: Key read from each association's ``"object"`` entry.

    Returns:
        A DataFrame with the columns ``object`` and ``subject`` (one row per
        returned association) when the response status is 200; ``None`` for
        any other status code.
    """
    reply = get_api_resp(URI.format(disease_id), params=params)
    if reply.status_code != 200:
        return None

    payload = reply.json()
    objects = [entry["object"][object_attribute] for entry in payload["associations"]]
    table = pd.DataFrame(objects, columns=["object"])
    table["subject"] = disease_id
    return table
    

In [21]:
# Monarch endpoint templates; the "{}" placeholder is filled with a disease id.
DISEASE_GENE_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/genes"
DISEASE_VARIANT_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/variants"

# CSV listing the diseases to query (path is relative to the working directory).
disease_path = "../../../data/benchmark_datasets/monarch/gwas_diseases.csv"

# Query-string parameters sent with every Monarch request.
# NOTE(review): "rows" presumably caps the associations returned per disease,
# and the "direct*" flags presumably restrict results to direct annotations;
# confirm against the Monarch API documentation.
params = {
    "rows": 2,
    "direct": "true",
    "direct_taxon": "true",
}


In [22]:
# Load the disease list and give its two columns fixed names.
disease_df = pd.read_csv(disease_path)
disease_df.columns = ["disease_id", "disease_name"]
# Keep only the text between the first pair of double quotes in each id.
disease_df["disease_id"] = disease_df["disease_id"].map(lambda raw_id: raw_id.split('"')[1])


## Disease-Gene 

In [23]:
# Query Monarch once per disease for its associated genes.  Iterating over the
# id column (rather than ``iterrows()``) avoids building a Series per row and
# lets tqdm know the total, so the bar shows a percentage and an ETA instead
# of a bare iteration counter.
edge_df_list = [
    get_association(DISEASE_GENE_URI, disease_id, params)
    for disease_id in tqdm(disease_df["disease_id"], total=len(disease_df), desc="disease-gene")
]

# get_association returns None for non-200 responses; drop those explicitly
# instead of relying on pd.concat silently skipping None entries.  If every
# request failed, pd.concat still raises ValueError as before.
edge_df = pd.concat(
    [frame for frame in edge_df_list if frame is not None],
    ignore_index=True,
)

# Attach the disease name through the id, then discard both id columns so that
# only the ("object", "disease_name") pairs remain.
edge_df = pd.merge(edge_df, disease_df, left_on="subject", right_on="disease_id").drop(["subject", "disease_id"], axis=1)
# Keep only the text between the first pair of double quotes in each name.
edge_df["disease_name"] = edge_df["disease_name"].apply(lambda x: x.split('"')[1])

# create_mcq_with_shuffle comes from the project's ``utility`` module.
monarch_disease_gene_mcq = create_mcq_with_shuffle(edge_df, "disease_name", "object", "Gene", "is associated with")



254it [01:47,  2.36it/s]
254it [01:43,  2.46it/s]


## Disease-Variant 

In [51]:
# Query Monarch once per disease for its associated variants, reading the
# variant "id" (not the label).  Iterating over the id column (rather than
# ``iterrows()``) avoids building a Series per row and lets tqdm know the
# total, so the bar shows a percentage and an ETA.
edge_df_list_2 = [
    get_association(DISEASE_VARIANT_URI, disease_id, params, object_attribute="id")
    for disease_id in tqdm(disease_df["disease_id"], total=len(disease_df), desc="disease-variant")
]

# get_association returns None for non-200 responses; drop those explicitly
# instead of relying on pd.concat silently skipping None entries.  If every
# request failed, pd.concat still raises ValueError as before.
edge_df_2 = pd.concat(
    [frame for frame in edge_df_list_2 if frame is not None],
    ignore_index=True,
)

# Attach the disease name through the id, then discard both id columns.
edge_df_2 = pd.merge(edge_df_2, disease_df, left_on="subject", right_on="disease_id").drop(["subject", "disease_id"], axis=1)
# Keep only the text between the first pair of double quotes in each name.
edge_df_2["disease_name"] = edge_df_2["disease_name"].apply(lambda x: x.split('"')[1])
# Keep the text after the last "dbSNP:" marker; values without the marker are
# left unchanged.  Bracket indexing is used because attribute-style access to
# a column called "object" is fragile.
edge_df_2["object"] = edge_df_2["object"].apply(lambda x: x.split("dbSNP:")[-1])

# create_mcq_with_shuffle comes from the project's ``utility`` module.
monarch_disease_variant_mcq = create_mcq_with_shuffle(edge_df_2, "disease_name", "object", "Variant", "is associated with")


254it [01:44,  2.43it/s]


# ROBOKOP

In [52]:
# Directory holding the ROBOKOP CSV files read in the cells below; the path is
# relative, so it is resolved against the current working directory.
ROBOKOP_PATH = "../../../data/benchmark_datasets/robokop"


## Disease-Variant

In [53]:
FILES_LIST_2 = ["disease_variant_1.csv", "disease_variant_2.csv"]

# Read every listed CSV from ROBOKOP_PATH and stack them into a single frame
# with a fresh 0..n-1 index, then give the two columns fixed names.
data_robokop = pd.concat(
    [pd.read_csv(os.path.join(ROBOKOP_PATH, file_name)) for file_name in FILES_LIST_2],
    ignore_index=True,
)
data_robokop.columns = ["source", "target"]

# create_mcq_with_shuffle comes from the project's ``utility`` module.
robokop_disease_variant_mcq = create_mcq_with_shuffle(
    data_robokop, "source", "target", "Variant", "is associated with"
)


## Disease-Organism

In [58]:
FILES_LIST_3 = ["disease_organism_1.csv"]

# Read every listed CSV from ROBOKOP_PATH and stack them into a single frame
# with a fresh 0..n-1 index, then give the two columns fixed names.
data_robokop = pd.concat(
    [pd.read_csv(os.path.join(ROBOKOP_PATH, file_name)) for file_name in FILES_LIST_3],
    ignore_index=True,
)
data_robokop.columns = ["source", "target"]

# create_mcq_with_shuffle comes from the project's ``utility`` module.
robokop_disease_organism_mcq = create_mcq_with_shuffle(
    data_robokop, "source", "target", "Organism", "causes"
)


In [61]:
# Stack the four question sets (Monarch gene and variant, then ROBOKOP variant
# and organism) in that order, renumbering the index from zero.
data_combined = pd.concat(
    [
        monarch_disease_gene_mcq,
        monarch_disease_variant_mcq,
        robokop_disease_variant_mcq,
        robokop_disease_organism_mcq,
    ],
    ignore_index=True,
)



In [66]:
# Persist the combined question set as CSV: header row included, index omitted.
data_combined.to_csv(
    "../../../data/benchmark_datasets/test_questions_two_hop_mcq_from_monarch_and_robokop.csv",
    index=False,
    header=True,
)

