In [1]:
import requests
import os.path
import time
import json
import h5py
import pandas as pd
import zipfile
import logging
import multiprocessing as mp
import sys
from time import sleep

In [3]:
# setting path
sys.path.append('../../PYTHON_SCRIPTS')
import normalize_nodes
#import importlib
#importlib.reload(normalize_nodes)

In [22]:
logfile = "ara_querying.log"
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(filename=logfile, filemode='a', level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

In [23]:
logging.info("LOGGING STARTED")

In [33]:
def post_query(url, query_dict):
    try:
        resp = requests.post(url,json=query_dict,timeout=1200)
    except requests.exceptions.ReadTimeout:
        print("Request timed out!")
        logging.warning("Request timed out!")
        return "Error",-1
    except requests.exceptions.ConnectionError:
        print("Request had connection error")
        logging.warning("Request had connection error!")
        return "Error",-1
    print("Query is posted")
    if resp.status_code != 200:
        #return "Error",resp.status_code
        return resp

    return resp

ara_urls = {
    "robokop":"https://aragorn.renci.org/robokop",
    "aragorn":"https://aragorn.renci.org/aragorn",
    "spoke":"https://ia.transltr.io/api/v1.3/",
    "bte":"https://bte.transltr.io/v1",
    "medikanren":"https://medikanren-trapi.transltr.io",
    "arax":"https://arax.ncats.io/api/arax/v1.3"
}

In [25]:
def get_normalized_for_df(df):
    #df = score_parser(resps[key].json(),None)
    df = normalize_nodes.normalize_data_frame(df,"Disease")
    df = normalize_nodes.normalize_data_frame(df,"Drug")
    return df

def getDfFromResponse(resp,ara=None,disease_idx=None):
    if type(resp)==requests.models.Response:
        if(resp.status_code//100==2):
            resp_json = resp.json()
        else:
            raise ValueError("We didn't get a 2XX code from the server",ara,disease_idx,resp.status_code)
    elif type(resp)==dict:
        resp_json = resp
    return score_parser(resp_json)

            

In [26]:
def create_query(callback,disease_id,results=None):
    query_graph = {
            'nodes': {},
            'edges': {}
    }

    query_graph['nodes']['disease'] = {'ids': [disease_id]}
    query_graph['nodes']['chemical'] = {'categories': ['biolink:ChemicalEntity']}
    pnode = 'disease'
    cnode = 'chemical'
    query_graph['edges']['t_edge'] = {'object': pnode, 'subject': cnode, 'predicates':['biolink:treats'], 'knowledge_type': 'inferred'}
    m = {'message':{'query_graph':query_graph}}
    if(callback!=None): m['callback']= callback
    return m

def create_query_for_aragorn(callback,disease_id,results=500):
    m = create_query(callback,disease_id)
    workflow_list = [{"id":"lookup"},
                     {"id":"overlay_connect_knodes"},
                     {"id":"score"},
                     #{"id":"filter_message_top_n","parameters":{"max_results":10}}
                     {"id":"filter_results_top_n","parameters":{"max_results":results}}
                    ]
    m["workflow"] = workflow_list
    return m


def create_query_for_spoke(callback,disease_id,results=10001):
    #query_graph = {
    #        'nodes': {},
    #        'edges': {}
    #}
    #Spoke needs diseases to have a category
    #query_graph['nodes']['disease'] = {'ids': [disease_id], 'categories': ['biolink:Disease']}
    #m['message']['query_graph']['nodes']['disease']['categories'] = ['biolink:Disease']
    #query_graph['nodes']['chemical'] = {'categories': ['biolink:ChemicalEntity']}
    #pnode = 'disease'
    #cnode = 'chemical'
    #query_graph['edges']['t_edge'] = {'object': pnode, 'subject': cnode, 'predicates':['biolink:treats'], 'knowledge_type': 'inferred'}
    #m = {
    #        'message':{'query_graph':query_graph},
    #        'max_results':10}
            
    m = create_query(callback,disease_id)
    #Spoke needs diseases to have a category
    m['message']['query_graph']['nodes']['disease']['categories'] = ['biolink:Disease']
    #query_graph['nodes']['chemical'] = {'categories': ['biolink:ChemicalEntity']}
    m['max_results'] = results
    return m


def create_query_for_arax(callback,disease_id,n_drugs=10):
    query_graph = {
            'nodes': {},
            'edges': {}
    }
    #Spoke needs diseases to have a category
    query_graph['nodes']['disease'] = {'ids': [disease_id], 'categories': ['biolink:Disease']}
    query_graph['nodes']['chemical'] = {'categories': ['biolink:ChemicalEntity']}
    pnode = 'disease'
    cnode = 'chemical'
    query_graph['edges']['t_edge'] = {'object': pnode, 'subject': cnode, 'predicates':['biolink:treats'], 'knowledge_type': 'inferred'}
    
    #https://github.com/RTXteam/RTX/blob/master/code/ARAX/Documentation/DSL_Documentation.md#inferactiondrug_treatment_graph_expansion
    infer_str = f"infer(action=drug_treatment_graph_expansion,node_curie={disease_id},qedge_id=t_edge,n_drugs {n_drugs})"
#    infer_str = "
    operations = {"actions": [infer_str, "return(message=true, store=true)"]}
    query = {
        'message':{'query_graph':query_graph},
        "operations": {"actions": [
            "create_message",
            "infer(action=chemical_gene_regulation_graph_expansion, subject_curie=CHEMBL.COMPOUND:CHEMBL1097205, regulation_type=increase, threshold=0.6, path_len=2, n_result_curies=1, n_paths=1)",
            "return(message=true, store=true)"
        ]}}
    
    query = {
        'message':{'query_graph':query_graph},
        "operations": {"actions": [
            "infer(action=drug_treatment_graph_expansion, node_curie=MONDO:0018874, qedge_id=t_edge, n_drugs=10)",
            "return(message=true, store=true)"
        ]}}
    m = {
            'message':{'query_graph':query_graph},
            'operations':operations
    }
#    return m
#XKCD
    return query


In [27]:
#Get data frame from result.
def score_parser(json_data):
    l = []
    query_node = json_data['message']['query_graph']['nodes']
    if 'on' in query_node:
        disease_idx = query_node['on']['ids'][0]
    elif 'n0' in query_node:
        disease_idx = query_node['n0']['ids'][0]
    elif 'disease' in query_node:
        disease_idx = query_node['disease']['ids'][0]
    else:
        print(query_node)
        raise Exception("Could not get Disease")
            
   # if disease_idx not in output_data:
   #     output_data[disease_idx] = dict()
    for result in json_data['message']['results']:
        node_bindings = result['node_bindings']
        if 'sn' in node_bindings:
            drug_idx = node_bindings['sn'][0]['id']
        elif 'n1' in node_bindings:
            drug_idx = node_bindings['n1'][0]['id']
        elif 'drug' in node_bindings:
            drug_idx = node_bindings['drug'][0]['id']
        elif "chemical" in node_bindings:
            drug_idx = node_bindings['chemical'][0]['id']
        else:
            print(node_bindings)
            raise Exception("Could not get Drug")
            
        #Noticed some Aragorn queries did not have scores in the results
        if 'score' not in result:
            return "error"
        score = result['score']
        if 'normalized_score' in result:
            normalized_score = result['normalized_score']
        else:
            normalized_score = 0
        l.append((disease_idx,drug_idx,score,normalized_score))
    df = pd.DataFrame(l, columns =['Disease', "Drug", 'Score', 'Normalized_Score'])
    return df

In [28]:
query_function_dict = {
    "robokop":create_query_for_aragorn,
    "aragorn":create_query_for_aragorn,
    "spoke":create_query_for_spoke,
    "bte":create_query,
    "medikanren":create_query,
    "arax":None
}

dis_idxs = []
drug_idxs = []
with open("../ARAX/disease_info.txt") as f:
    next(f)
    for line in f: dis_idxs.append(line.strip().split('\t')[0])
with open("../ARAX/drug_info.txt") as f:
    next(f)
    for line in f: drug_idxs.append(line.strip().split('\t')[0])
        
def getARAXResult(disease_name):
    disease_idx = dis_idxs.index(disease_name)
    with h5py.File('../ARAX/arax_10k_ranking_data.hdf5', 'r') as f:
        data = f[str(disease_idx)][()]
    l = []
    for (drug,score) in data: l.append([disease_name,drug_idxs[int(drug)],score,0.0])
    return pd.DataFrame(l, columns =['Disease', "Drug", 'Score', 'Normalized_Score'])

def checkIfAlreadyQueried(ara,disease_idx):
    fpath = os.path.join("ara_tables",ara, disease_idx + ".csv")
    queried = os.path.isfile(fpath) 
    df = None
    if(queried):
        df = pd.read_csv(fpath)
    return (queried,df)

#https://stackoverflow.com/questions/65834655/python-how-to-efficiently-write-read-a-json-file-inside-a-compressed-archive-i
def writeJsonToFile(ara,disease_idx,resp):
    fpath = os.path.join("ara_jsons",ara, disease_idx + ".zip")
#    with open(os.path.join("ara_jsons",ara, disease_idx + ".json.zip"),'w') as outf:
#        json.dump(resp.json(),outf, default=str, indent=2)
    dumped_json =  json.dumps(resp.json(), ensure_ascii=False, indent=2)
    with zipfile.ZipFile(fpath, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
        zip_file.writestr(disease_idx + '.json',data=dumped_json)
        
def writeDFToFile(ara,disease_idx,df):
    outf = os.path.join("ara_tables",ara, disease_idx + ".csv")
    #json.dump(resp.json(),outf, default=str, indent=2)
    df.to_csv(outf,index=False)

def runQuery(ara,disease_idx,query):
    logging.info(f"Beginning run for {disease_idx},{ara}")
    
    ara_resp = post_query(ara_urls[ara] + "/query",query)
    logging.info(f"Completed run for {disease_idx},{ara},{ara_resp.status_code}")
    if(ara_resp.status_code==502):
        raise ValueError("The query for the ara got a 502",ara,disease_idx)
    writeJsonToFile(ara,disease_idx,ara_resp)    
    print(ara,ara_resp)
    if("No ML predictions available for disease equivalent to" in ara_resp.text):
        ara_df = pd.DataFrame([], columns =['Disease', "Drug", 'Score', 'Normalized_Score'])
#        return pd.DataFrame([], columns =['Disease', "Drug", 'Score', 'Normalized_Score'])
    else:
        ara_df = getDfFromResponse(ara_resp,ara,disease_idx)
    writeDFToFile(ara,disease_idx,ara_df)
    print(ara_df.Disease.size)
    logging.info(f"Number results for {disease_idx},{ara},{ara_df.Disease.size}")
    return ara_df
    
def queryARA(ara,disease_idx,cache=True):
    if(ara=='arax'): return getARAXResult(disease_idx)
    if(cache):
        already_queried,ara_df = checkIfAlreadyQueried(ara,disease_idx)
        if(already_queried):
            return ara_df
    query = query_function_dict[ara](None,disease_idx)
    ara_df = runQuery(ara,disease_idx,query)
    return ara_df

In [29]:
def queryARA(ara,disease_idx,cache=True):
    if(ara=='arax'): return getARAXResult(disease_idx)
    if(cache):
        already_queried,ara_df = checkIfAlreadyQueried(ara,disease_idx)
        if(already_queried):
            return ara_df
    query = query_function_dict[ara](None,disease_idx)
    ara_df = runQuery(ara,disease_idx,query)
    return ara_df

In [30]:
#mondo = "MONDO:0019082"
#query = create_query(None,mondo)
#rk_df = runQuery("robokop",mondo,query)
#rk_df

In [31]:
#queryARA('arax',"MONDO:0005148").sort_values("Score")

In [32]:
#most_perscribed

In [94]:
aras = ["robokop","aragorn","spoke","bte","medikanren","arax"]
aras = ["robokop","spoke","bte","medikanren","arax"]

def testARAsForPair(mondo,target_drug):
    frames = {}
    for ara in aras:
        frames[ara] = queryARA(ara,mondo).sort_values("Score")
    drug_list = []
    for f in frames.values(): drug_list.extend(list(f.Drug))
    no_drug_dict = normalize_nodes.normalize_big_list(drug_list)
    ara_target_rank = {}
    for ara in aras:
        ara_drug_list = list(frames[ara].Drug)
        hit_idx = -1
        for i,drug in enumerate(ara_drug_list):
            if(no_drug_dict.get(drug,None)==target_drug):
                hit_idx = i
                break
        ara_target_rank[ara] = (hit_idx,len(ara_drug_list))
    return ara_target_rank
            
    #MONDO:0003019

In [15]:
#mondo = "MONDO:0021187"
#query = query_function_dict['robokop'](None,mondo)
#print(query)

In [97]:
'''
mondo = "MONDO:0003019"
frames = {}
for ara in aras:
    frames[ara] = queryARA(ara,mondo).sort_values("Score")
drug_list = []
for f in frames.values(): drug_list.extend(list(f.Drug))
no_drug_dict = normalize_nodes.normalize_big_list(drug_list)
ara_target_rank = {}
for ara in aras:
    ara_drug_list = list(frames[ara].Drug)
    hit_idx = -1
    for i,drug in enumerate(ara_drug_list):
        if(no_drug_dict.get(drug,None)==target_drug):
            hit_idx = i
            break
    ara_target_rank[ara] = (hit_idx,len(ara_drug_list))
    '''

Query is posted
Query is posted
Query is posted


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [244]:
def queryARAIgnoreErrors(ara,disease_idx,cache=True):
    try:
        queryARA(ara,disease_idx,cache)
    except Exception as e:
        print(e)
    return

def queryAllARAsInParallel(aras,disease_idx):
    pool = mp.Pool()
    for ara in aras:
        pool.apply_async(queryARAIgnoreErrors, args=(ara,disease_idx,False,))
    pool.close()
    pool.join()

#queryAllARAsInParallel(['bte','spoke','medikanren'],"MONDO:0035838")
        
#for disease in most_perscribed.ConditionID.unique():
#    print(disease)
#    queryARAIgnoreErrors('bte',disease)
#    queryARAIgnoreErrors('spoke',disease)
#    queryARAIgnoreErrors('medikanren',disease)
#    queryARAIgnoreErrors('robokop',disease)

Query is posted
spoke <Response [200]>
'query_graph'
Query is posted
medikanren <Response [200]>
0
Query is posted
bte <Response [200]>
0


In [200]:
def buildDFForARA(ara):
    ara_dir = os.path.join("ara_tables",ara)
    dfs = [pd.DataFrame([], columns =['Disease', "Drug", 'Score', 'Normalized_Score'])]
    for fname in os.listdir(ara_dir):
        if(fname.endswith(".csv")):
            df = pd.read_csv(os.path.join(ara_dir,fname))
            dfs.append(df)
    return pd.concat(dfs)
#    return (queried,df)



def buildBigDF():
    big_dfs = []
    aras = ["robokop","spoke","bte","medikanren"]
    for ara in aras:
        ara_df = buildDFForARA(ara)
        ara_df['ara'] = ara
        big_dfs.append(ara_df)
    return pd.concat(big_dfs)

In [155]:
#drug_list_big_df = big_df.sort_values("Score",ascending=False).groupby(["Disease","ara"]).Drug.agg(list).to_frame().reset_index()

In [18]:
#drug_list = list(most_perscribed.DrugID.unique()) + list(big_df.Drug.unique())
#no_drug_dict = normalize_nodes.normalize_big_list(drug_list)


In [183]:
def testARA(disease,target_drug,ara):
    drug_list = list(big_df[(big_df.Disease==disease) & (big_df.ara==ara)].Drug)
    tot_hits = len(drug_list)
    hit_idx = -1
    for i,drug in enumerate(drug_list):
        if(no_drug_dict[drug]==no_drug_dict[target_drug]):
            hit_idx=i
            break
    return hit_idx,tot_hits

In [16]:
def getDataFromMostPerscribed():
    most_perscribed = pd.read_csv("https://raw.githubusercontent.com/TranslatorSRI/Benchmarks/main/config/MostPrescribed_2019/data.tsv",sep='\t') 
    l = []
    for ara in aras:
        for i,row in most_perscribed.iterrows():
            (hit_idx,tot_hits) = testARA(row.ConditionID,row.DrugID,ara)
            l.append([row.ConditionID,row.DrugID,hit_idx,tot_hits,ara])
    #        break
    df = pd.DataFrame(l,columns =["Disease","Drug","Hit_Idx","Num_Results","ara"])
    return df

In [17]:
#start_mondo = "MONDO:0001159"
#with open("../RANKING/mondo_list.csv") as f:
#    next(f)
#    start = False
#    for i,line in enumerate(f):
#        mondo_idx = line.strip().split(',')[0]
#        if(!start):
#            if(mondo_idx==start_mondo):
#                start=True
#            else:
#                continue
#        queryAllARAsInParallel(['bte','spoke','medikanren'],mondo_idx)
#        queryAllARAsInParallel(['spoke'],mondo_idx)
#        time.sleep(3)
#        if((i%100)==99):
#            time.sleep(60)
#        break

In [14]:
if(__name__=="__main__"):
    start_mondo = "MONDO:0001159"
    with open("../RANKING/mondo_list.csv") as f:
        next(f)
        wait = True
        for i,line in enumerate(f):
            mondo_idx = line.strip().split(',')[0]
            if(wait):
                if(mondo_idx==start_mondo):
                    wait=False
                else:
                    continue
            #print(wait,i,mondo_idx)
            queryAllARAsInParallel(['bte','spoke','medikanren'],mondo_idx)


False 727 MONDO:0001159
False 728 MONDO:0001160
False 729 MONDO:0001161
False 730 MONDO:0001162
False 731 MONDO:0001163
False 732 MONDO:0001164
False 733 MONDO:0001165
False 734 MONDO:0001166
False 735 MONDO:0001167
False 736 MONDO:0001168
False 737 MONDO:0001169
False 738 MONDO:0001170
False 739 MONDO:0001171
False 740 MONDO:0001172
False 741 MONDO:0001173
False 742 MONDO:0001174
False 743 MONDO:0001175
False 744 MONDO:0001176
False 745 MONDO:0001177
False 746 MONDO:0001178
False 747 MONDO:0001179
False 748 MONDO:0001180
False 749 MONDO:0001181
False 750 MONDO:0001182
False 751 MONDO:0001183
False 752 MONDO:0001184
False 753 MONDO:0001185
False 754 MONDO:0001186
False 755 MONDO:0001187
False 756 MONDO:0001188
False 757 MONDO:0001190
False 758 MONDO:0001191
False 759 MONDO:0001192
False 760 MONDO:0001195
False 761 MONDO:0001196
False 762 MONDO:0001197
False 763 MONDO:0001198
False 764 MONDO:0001199
False 765 MONDO:0001200
False 766 MONDO:0001202
False 767 MONDO:0001203
False 768 MONDO:

False 12185 MONDO:0013016
False 12186 MONDO:0013017
False 12187 MONDO:0013018
False 12188 MONDO:0013020
False 12189 MONDO:0013021
False 12190 MONDO:0013022
False 12191 MONDO:0013023
False 12192 MONDO:0013024
False 12193 MONDO:0013025
False 12194 MONDO:0013026
False 12195 MONDO:0013027
False 12196 MONDO:0013028
False 12197 MONDO:0013029
False 12198 MONDO:0013030
False 12199 MONDO:0013031
False 12200 MONDO:0013032
False 12201 MONDO:0013033
False 12202 MONDO:0013034
False 12203 MONDO:0013035
False 12204 MONDO:0013036
False 12205 MONDO:0013037
False 12206 MONDO:0013038
False 12207 MONDO:0013039
False 12208 MONDO:0013040
False 12209 MONDO:0013041
False 12210 MONDO:0013042
False 12211 MONDO:0013043
False 12212 MONDO:0013044
False 12213 MONDO:0013045
False 12214 MONDO:0013046
False 12215 MONDO:0013047
False 12216 MONDO:0013048
False 12217 MONDO:0013049
False 12218 MONDO:0013050
False 12219 MONDO:0013051
False 12220 MONDO:0013052
False 12221 MONDO:0013053
False 12222 MONDO:0013054
False 12223 

In [17]:
arax_trapi_url = "https://arax.test.transltr.io/api/arax/v1.4"#
arax_trapi_url = "https://arax.ncats.io/api/arax/v1.3"

In [18]:
query = create_query(None,"MONDO:0015564")

In [19]:
post_query(arax_trapi_url,query)

Query is posted


<Response [404]>