In [75]:
import requests
import pandas as pd
import sqlite3
from tqdm import tqdm
from math import ceil


In [155]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# SQLite file opened with sqlite3.connect() to read the DisGeNET tables.
DB_PATH = "../../../data/benchmark_datasets/disgenet/disgenet_2020.db"

# Endpoint queried once per disease id.
API_URL = "https://biothings.ncats.io/semmeddb/query"
# Sent as the "size" query parameter of every request.
RETURN_RESULT_COUNT = 1000
# Only hits whose "predicate" field equals this value are kept.
REFERENCE_PREDICATE = "TREATS"

# Rows are exported only when pmid_count is strictly greater than this value.
PMID_THRESH = 25


In [40]:
%%time

def _read_table(connection, table_name, columns):
    """Read every row of ``table_name`` into a DataFrame.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open connection to the database.
    table_name : str
        Table to dump with ``SELECT *``. Only fixed, in-notebook names are
        passed here, so plain string formatting is acceptable.
    columns : list of str
        Labels applied to the resulting DataFrame, in table column order.

    Returns
    -------
    pandas.DataFrame
        One row per table row, labelled with ``columns``.
    """
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT * FROM {}".format(table_name))
        return pd.DataFrame(cursor.fetchall(), columns=columns)
    finally:
        cursor.close()


# Everything needed is copied into DataFrames, so the connection is closed as
# soon as the three tables are read (even if one of the reads fails). This
# avoids leaking a database handle on every re-run of the cell.
conn = sqlite3.connect(DB_PATH)
try:
    disease_df = _read_table(
        conn,
        "diseaseAttributes",
        ["diseaseNID", "diseaseId", "diseaseName", "type"],
    )
    gene_df = _read_table(
        conn,
        "geneAttributes",
        ["geneNID", "geneId", "geneName", "geneDescription", "pLI", "DSI", "DPI"],
    )
    disease_gene_df = _read_table(
        conn,
        "geneDiseaseNetwork",
        ["NID", "diseaseNID", "geneNID", "source", "association", "associationType",
         "sentence", "pmid", "score", "EL", "EI", "year"],
    )
finally:
    conn.close()

# Keep only associations with score == 1 (the original author's comment calls
# this the maximum score), then attach the disease and gene attributes.
disease_gene_df_selected = disease_gene_df[disease_gene_df.score == 1]
disease_gene_df_selected_1 = pd.merge(disease_gene_df_selected, disease_df, on="diseaseNID")
disease_gene_df_selected_2 = pd.merge(disease_gene_df_selected_1, gene_df, on="geneNID")




CPU times: user 9.27 s, sys: 444 ms, total: 9.71 s
Wall time: 10.1 s


In [41]:
# Distinct disease identifiers among the selected associations.
disease_ids = disease_gene_df_selected_2["diseaseId"].unique()


In [159]:
%%time

# Query the API once per disease id and collect every hit whose predicate is
# REFERENCE_PREDICATE as a
# (subject name, predicate, object name, pmid_count, subject type, object type)
# tuple.
# NOTE(review): only the first RETURN_RESULT_COUNT hits of each query are
# requested; if a disease can have more hits than that, the remainder is never
# fetched — confirm whether paging is needed.
result_list = []
for sel_disease_id in tqdm(disease_ids):
    params = {
        "q": "object.umls:{}".format(sel_disease_id),
        "size": RETURN_RESULT_COUNT,
    }

    # requests has no default timeout: without one a single stalled connection
    # would hang the whole run. raise_for_status() turns an HTTP error into an
    # explicit exception instead of a confusing KeyError on "hits" below.
    response = requests.get(API_URL, params=params, timeout=60)
    response.raise_for_status()
    result = response.json()
    hits = result["hits"]

    for item in hits:
        if item["predicate"] != REFERENCE_PREDICATE:
            continue

        # Look each semantic type up independently and fall back to None when
        # it is absent. The previous bare `except:` reset only the subject
        # type, so a missing field left the object type undefined (NameError
        # on the first hit) or carried over from the previous hit.
        subject_semantic_type = item["subject"].get("semantic_type_name")
        object_semantic_type = item["object"].get("semantic_type_name")

        result_list.append(
            (
                item["subject"]["name"],
                item["predicate"],
                item["object"]["name"],
                item["pmid_count"],
                subject_semantic_type,
                object_semantic_type,
            )
        )
        

100%|█████████████████████████████████████████| 353/353 [04:01<00:00,  1.46it/s]

CPU times: user 22.2 s, sys: 1.33 s, total: 23.5 s
Wall time: 4min 1s





In [160]:
# Column labels follow the order of the tuples appended to result_list.
result_columns = [
    "subject",
    "predicate",
    "object",
    "pmid_count",
    "subject_semantic_type",
    "object_semantic_type",
]
result_df = pd.DataFrame(result_list, columns=result_columns)
result_df


Unnamed: 0,subject,predicate,object,pmid_count,subject_semantic_type,object_semantic_type
0,Replacement therapy,TREATS,Sandhoff Disease,1,Therapeutic or Preventive Procedure,Disease or Syndrome
1,Bone Marrow Transplantation,TREATS,Sandhoff Disease,1,Therapeutic or Preventive Procedure,Disease or Syndrome
2,Genetic Counseling,TREATS,Sandhoff Disease,1,Therapeutic or Preventive Procedure,Disease or Syndrome
3,Cyclic AMP-Dependent Protein Kinases,TREATS,Sandhoff Disease,1,"Amino Acid, Peptide, or Protein",Disease or Syndrome
4,Miglustat,TREATS,Sandhoff Disease,4,Pharmacologic Substance,Disease or Syndrome
...,...,...,...,...,...,...
10929,Cladribine,TREATS,Infantile myofibromatosis,1,Pharmacologic Substance,Neoplastic Process
10930,Kidney Transplantation,TREATS,Infantile myofibromatosis,1,Therapeutic or Preventive Procedure,Neoplastic Process
10931,Reconstructive Surgical Procedures,TREATS,Infantile myofibromatosis,2,Therapeutic or Preventive Procedure,Neoplastic Process
10932,Excision of mandible,TREATS,Infantile myofibromatosis,1,Therapeutic or Preventive Procedure,Neoplastic Process


In [161]:
# explode() turns every list-like cell into one row per element and leaves
# scalar cells untouched; the original index label is repeated on the new rows.
# The columns are expanded one after another, in this order.
for list_column in ("subject_semantic_type", "object_semantic_type", "subject", "object"):
    result_df = result_df.explode(list_column)

# Display only: the sorted frame is shown but not assigned back to result_df.
result_df.sort_values(by="pmid_count", ascending=False)
        

Unnamed: 0,subject,predicate,object,pmid_count,subject_semantic_type,object_semantic_type
100,Chemoembolization,TREATS,Liver carcinoma,2408,Therapeutic or Preventive Procedure,Neoplastic Process
100,Chemoembolisation,TREATS,Liver carcinoma,2408,Therapeutic or Preventive Procedure,Neoplastic Process
6664,inhibitors,TREATS,"Diabetes Mellitus, Non-Insulin-Dependent",948,Chemical Viewed Functionally,Disease or Syndrome
359,Operative Surgical Procedures,TREATS,sarcoma,508,Therapeutic or Preventive Procedure,Neoplastic Process
359,Operative Surgical Procedures,TREATS,Sarcoma,508,Therapeutic or Preventive Procedure,Neoplastic Process
...,...,...,...,...,...,...
4653,Alkynes,TREATS,Gastrointestinal Stromal Tumors,1,Organic Chemical,Neoplastic Process
4652,Transanal Excision,TREATS,Gastrointestinal Stromal Tumors,1,Therapeutic or Preventive Procedure,Neoplastic Process
4651,tyrosine receptor,TREATS,Gastrointestinal Stromal Tumors,1,"Amino Acid, Peptide, or Protein",Neoplastic Process
4650,Pancreaticojejunostomy,TREATS,Gastrointestinal Stromal Tumors,1,Therapeutic or Preventive Procedure,Neoplastic Process


In [162]:
# Distinct subject semantic types present in the results.
result_df["subject_semantic_type"].unique()

array(['Therapeutic or Preventive Procedure',
       'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance',
       'Health Care Activity', 'Chemical Viewed Functionally',
       'Gene or Genome', 'Hazardous or Poisonous Substance',
       'Organic Chemical', 'Medical Device', 'Antibiotic', 'Hormone',
       'Chemical Viewed Structurally',
       'Nucleic Acid, Nucleoside, or Nucleotide', None,
       'Biologically Active Substance', 'Clinical Drug',
       'Inorganic Chemical', 'Research Activity',
       'Daily or Recreational Activity',
       'Indicator, Reagent, or Diagnostic Aid', 'Diagnostic Procedure',
       'Vitamin', 'Immunologic Factor', 'Laboratory Procedure',
       'Element, Ion, or Isotope'], dtype=object)

In [164]:
# Subject semantic types that are kept; every other type is dropped.
INCLUDE_SUBJECT_TYPE = [
    "Antibiotic",
    "Pharmacologic Substance",
    "Organic Chemical",
    "Clinical Drug",
    "Inorganic Chemical",
]

is_included_subject = result_df["subject_semantic_type"].isin(INCLUDE_SUBJECT_TYPE)

# Pipeline: keep the allowed subject types, order by pmid_count (highest first),
# lower-case both names, then keep the first row of each (subject, object) pair.
# Because of the preceding sort, that first row is one with the pair's highest
# pmid_count.
result_df = (
    result_df[is_included_subject]
    .sort_values(by="pmid_count", ascending=False)
    .assign(
        subject=lambda frame: frame["subject"].str.lower(),
        object=lambda frame: frame["object"].str.lower(),
    )
    .drop_duplicates(subset=["subject", "object"])
)



In [165]:
# Alternative cutoff (not active): derive the threshold from the data instead
# of the configured constant, e.g. ceil(result_df.pmid_count.quantile(.95)).

# Keep only rows whose pmid_count is strictly above PMID_THRESH.
above_threshold = result_df["pmid_count"] > PMID_THRESH
result_df_final_treats = result_df[above_threshold]

# Write the filtered rows to CSV with a header row and without the index.
result_df_final_treats.to_csv(
    "../../../data/benchmark_datasets/semmeddb/compound_treats_disease_from_semmeddb.csv",
    index=False,
    header=True,
)

result_df_final_treats


Unnamed: 0,subject,predicate,object,pmid_count,subject_semantic_type,object_semantic_type
4023,insulin,TREATS,obesity,429,Pharmacologic Substance,Disease or Syndrome
1559,factor viii,TREATS,hemophilia a,389,Pharmacologic Substance,Disease or Syndrome
2714,antibiotics,TREATS,cystic fibrosis,341,Antibiotic,Disease or Syndrome
6658,protease inhibitor,TREATS,"diabetes mellitus, non-insulin-dependent",335,Pharmacologic Substance,Disease or Syndrome
6658,protease inhibitors,TREATS,"diabetes mellitus, non-insulin-dependent",335,Pharmacologic Substance,Disease or Syndrome
...,...,...,...,...,...,...
117,radiopharmaceuticals,TREATS,malignant neoplasm of breast,27,Pharmacologic Substance,Neoplastic Process
7347,nusinersen,TREATS,hmn (hereditary motor neuropathy) proximal type i,27,Pharmacologic Substance,Disease or Syndrome
2598,macrolide antibiotics,TREATS,cystic fibrosis,27,Antibiotic,Disease or Syndrome
521,gonadorelin,TREATS,idiopathic hypogonadotropic hypogonadism,26,Pharmacologic Substance,Pathologic Function


In [166]:
# Distinct object semantic types remaining in the exported rows.
result_df_final_treats["object_semantic_type"].unique()


array(['Disease or Syndrome', 'Neoplastic Process', 'Finding',
       'Pathologic Function'], dtype=object)