In [1]:
import requests
import pandas as pd
import sqlite3
from tqdm import tqdm
from math import ceil


In [2]:
# Notebook-wide configuration.

# SQLite database file opened with sqlite3.connect() in the loading cell.
DB_PATH = "../../../data/benchmark_datasets/disgenet/disgenet_2020.db"


# HTTP endpoint queried with requests.get(), once per disease ID.
API_URL = "https://biothings.ncats.io/semmeddb/query"

# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
REFERENCE_PREDICATE = "TREATS"
# Sent as the `size` query parameter of every API request.
RETURN_RESULT_COUNT = 1000
# NOTE(review): presumably the minimum pmid_count for a relation to be kept;
# the filtering cell further down uses a literal 25 rather than this constant.
PMID_THRESH = 25


In [3]:
%%time

def _load_table(cursor, table_name, columns):
    """Fetch every row of one table and wrap it in a DataFrame.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Open cursor on the database.
    table_name : str
        Table to read. It is formatted directly into the SQL text, so it must
        be a trusted literal (only the fixed names below are passed in).
    columns : list of str
        Labels applied positionally to the columns returned by ``SELECT *``.

    Returns
    -------
    pandas.DataFrame
        One row per database row, labelled with ``columns``.
    """
    cursor.execute("SELECT * FROM {}".format(table_name))
    return pd.DataFrame(cursor.fetchall(), columns=columns)


conn = sqlite3.connect(DB_PATH)
try:
    c = conn.cursor()

    disease_df = _load_table(
        c,
        "diseaseAttributes",
        ["diseaseNID", "diseaseId", "diseaseName", "type"],
    )

    gene_df = _load_table(
        c,
        "geneAttributes",
        ["geneNID", "geneId", "geneName", "geneDescription", "pLI", "DSI", "DPI"],
    )

    disease_gene_df = _load_table(
        c,
        "geneDiseaseNetwork",
        ["NID", "diseaseNID", "geneNID", "source", "association", "associationType", "sentence", "pmid", "score", "EL", "EI", "year"],
    )
finally:
    # All three tables are fully materialised in memory at this point, so the
    # connection is released even if one of the reads above raised.
    conn.close()

# Selecting association with maximum score (rows whose score equals 1).
disease_gene_df_selected = disease_gene_df[disease_gene_df.score == 1]
# Attach disease attributes, then gene attributes (default inner joins).
disease_gene_df_selected_1 = pd.merge(disease_gene_df_selected, disease_df, on="diseaseNID")
disease_gene_df_selected_2 = pd.merge(disease_gene_df_selected_1, gene_df, on="geneNID")




CPU times: user 8.67 s, sys: 1.48 s, total: 10.2 s
Wall time: 10.7 s


In [4]:
# Distinct disease identifiers among the selected gene-disease associations;
# each one is used as the subject of one API query below.
disease_ids = disease_gene_df_selected_2["diseaseId"].unique()


In [5]:
%%time

# Upper bound (seconds) on each HTTP call; without it a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT_S = 30


def _disease_object_rows(hits):
    """Turn API hits into result tuples, keeping only disease-typed objects.

    Parameters
    ----------
    hits : iterable of dict
        Entries of the ``"hits"`` list in one API response.

    Returns
    -------
    list of tuple
        ``(subject name, predicate, object name, pmid_count,
        subject semantic type, object semantic type)`` for every hit whose
        object semantic type is ``"Disease or Syndrome"``.
    """
    rows = []
    for item in hits:
        try:
            if item["object"]["semantic_type_name"] == "Disease or Syndrome":
                rows.append((item["subject"]["name"], item["predicate"], item["object"]["name"], item["pmid_count"], item["subject"]["semantic_type_name"], item["object"]["semantic_type_name"]))
        except (KeyError, TypeError):
            # Best effort: skip hits that lack an expected field or whose
            # subject/object is not a mapping. Anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            continue
    return rows


result_list = []
for sel_disease_id in tqdm(disease_ids):
    params = {
        "q": "subject.umls:{}".format(sel_disease_id),
        # NOTE(review): only a single page of up to RETURN_RESULT_COUNT hits is
        # requested per disease; presumably larger result sets are truncated —
        # confirm against the API documentation.
        "size": RETURN_RESULT_COUNT,
    }

    response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()
    result = response.json()
    result_list.extend(_disease_object_rows(result["hits"]))
        
        

100%|█████████████████████████████████████████| 353/353 [03:54<00:00,  1.51it/s]

CPU times: user 21.4 s, sys: 1.47 s, total: 22.9 s
Wall time: 3min 54s





In [9]:
# Tabulate the collected relations and order them by supporting PMID count,
# largest first.
result_df = (
    pd.DataFrame(
        result_list,
        columns=[
            "subject",
            "predicate",
            "object",
            "pmid_count",
            "subject_semantic_type",
            "object_semantic_type",
        ],
    )
    .sort_values(by="pmid_count", ascending=False)
)
result_df

Unnamed: 0,subject,predicate,object,pmid_count,subject_semantic_type,object_semantic_type
921,Huntington Disease,ISA,Neurodegenerative Disorders,2136,Disease or Syndrome,Disease or Syndrome
3847,Obesity,ISA,Metabolic Diseases,1243,Disease or Syndrome,Disease or Syndrome
2682,Cystic Fibrosis,ISA,Hereditary Diseases,867,Disease or Syndrome,Disease or Syndrome
3448,Fabry Disease,ISA,Lysosomal Storage Diseases,573,Disease or Syndrome,Disease or Syndrome
3818,Obesity,PREDISPOSES,Hypertensive disease,566,Disease or Syndrome,Disease or Syndrome
...,...,...,...,...,...,...
3584,"Muscular Dystrophy, Duchenne",ISA,Fetal Diseases,1,Disease or Syndrome,Disease or Syndrome
3581,"Muscular Dystrophy, Duchenne",COEXISTS_WITH,Congenital muscular dystrophy,1,Disease or Syndrome,Disease or Syndrome
3580,"Muscular Dystrophy, Duchenne",NEG_COEXISTS_WITH,Amyotrophic Lateral Sclerosis,1,Disease or Syndrome,Disease or Syndrome
3579,"Muscular Dystrophy, Duchenne",COEXISTS_WITH,Autoimmune hemolytic anemia,1,Disease or Syndrome,Disease or Syndrome


In [7]:
# Distinct predicate labels present in the collected relations.
result_df["predicate"].unique()

array(['AFFECTS', 'ISA', 'COEXISTS_WITH', 'CAUSES', 'NEG_COEXISTS_WITH',
       'MANIFESTATION_OF', 'PRECEDES', 'NEG_CAUSES', 'COMPLICATES',
       'NEG_MANIFESTATION_OF', 'NEG_ISA', 'NEG_AFFECTS', 'PREDISPOSES',
       'AUGMENTS', 'ASSOCIATED_WITH', 'NEG_PREDISPOSES', 'NEG_AUGMENTS',
       'NEG_COMPLICATES', 'NEG_PRECEDES', 'compared_with'], dtype=object)

In [18]:
# Keep relations whose pmid_count exceeds the configured threshold. PMID_THRESH
# (config cell) replaces the literal 25 so the cutoff is defined in one place.
result_df_ = result_df[result_df.pmid_count > PMID_THRESH]
# Show only the "ISA" relations among them.
result_df_[result_df_.predicate == "ISA"]

Unnamed: 0,subject,predicate,object,pmid_count,subject_semantic_type,object_semantic_type
921,Huntington Disease,ISA,Neurodegenerative Disorders,2136,Disease or Syndrome,Disease or Syndrome
3847,Obesity,ISA,Metabolic Diseases,1243,Disease or Syndrome,Disease or Syndrome
2682,Cystic Fibrosis,ISA,Hereditary Diseases,867,Disease or Syndrome,Disease or Syndrome
3448,Fabry Disease,ISA,Lysosomal Storage Diseases,573,Disease or Syndrome,Disease or Syndrome
858,Huntington Disease,ISA,Hereditary Diseases,322,Disease or Syndrome,Disease or Syndrome
...,...,...,...,...,...,...
1739,Hemophilia B,ISA,Hereditary Diseases,26,Disease or Syndrome,Disease or Syndrome
3661,Obesity,ISA,Overnutrition,26,Disease or Syndrome,Disease or Syndrome
6487,Hereditary Multiple Exostoses,ISA,Hereditary Diseases,26,Congenital Abnormality,Disease or Syndrome
1723,Factor XI Deficiency,ISA,Bleeding tendency,26,Disease or Syndrome,Disease or Syndrome
