In [1]:
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth


In [2]:
# Load the Neo4j connection settings from ~/.neo4j_config.env into the process
# environment, then read them back. Each value is None if its variable is unset.
neo4j_env_file = os.path.join(os.path.expanduser('~'), '.neo4j_config.env')
load_dotenv(neo4j_env_file)
USER, PSW, URI = (
    os.environ.get(env_key)
    for env_key in ('SPOKE_USER', 'SPOKE_PSW', 'SPOKE_URI')
)


In [5]:
%%time

# Cypher: every distinct disease (identifier + name) that has at least one
# ASSOCIATES_DaG edge to a Gene node.
DISEASE_QUERY = """
    MATCH(d:Disease)-[r:ASSOCIATES_DaG]->(g:Gene)
    RETURN DISTINCT d.identifier AS d_id, d.name AS d_name
"""

# NOTE: this driver is bound to the notebook-level name `sdb` and is not closed
# by this cell; call sdb.close() once it is no longer needed.
auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)

# Run the query inside one explicit transaction and collect (id, name) pairs.
with sdb.session() as session, session.begin_transaction() as tx:
    disease_list = [
        (record["d_id"], record["d_name"]) for record in tx.run(DISEASE_QUERY)
    ]

disease_df = pd.DataFrame(disease_list, columns=["disease_id", "disease_name"])
# Plain Python list of disease identifiers, used to restrict later queries.
disease_id_list = list(disease_df["disease_id"].values)


CPU times: user 209 ms, sys: 29.1 ms, total: 238 ms
Wall time: 12.7 s


In [125]:

def get_two_hop_data_step_1(central_node, predicate, disease_list, outbound=True, limit=100):
    """Sample disease pairs that are linked through a shared intermediate node.

    Runs a two-hop Cypher pattern ``Disease - n - Disease`` in which both edges
    have the relationship type ``predicate`` and ``n`` carries the label
    ``central_node``. Both diseases must have an identifier contained in
    ``disease_list``, and diseases named "malignant teratoma" are excluded.

    Parameters
    ----------
    central_node : str
        Node label of the shared intermediate node (e.g. ``"Symptom"``).
    predicate : str
        Relationship type used for both hops (e.g. ``"PRESENTS_DpS"``).
    disease_list : iterable of str
        Disease identifiers that both ``d1`` and ``d2`` are restricted to.
    outbound : bool, default True
        If True the edges point from the diseases to the intermediate node;
        if False they point from the intermediate node to the diseases.
    limit : int, default 100
        Maximum number of rows returned by the query (Cypher ``LIMIT``).

    Returns
    -------
    pandas.DataFrame
        Columns ``node_1`` (d1.name), ``node_2`` (n.name), ``node_3`` (d2.name).
    """
    # Labels and relationship types cannot be supplied as Cypher parameters,
    # so they are interpolated into the pattern. They must come from trusted
    # code, never from untrusted input.
    if outbound:
        pattern = "(d1:Disease)-[:{rel}]->(n:{label})<-[:{rel}]-(d2:Disease)"
    else:
        pattern = "(d1:Disease)<-[:{rel}]-(n:{label})-[:{rel}]->(d2:Disease)"
    pattern = pattern.format(rel=predicate, label=central_node)

    # The identifier list and the row limit are passed as query parameters
    # instead of being spliced into the query text: this avoids relying on
    # Python's list repr being valid Cypher and keeps the query string small.
    TWO_HOP_QUERY = """
        MATCH{pattern}
        WHERE d1.identifier IN $disease_ids AND d2.identifier IN $disease_ids AND d1.name <> "malignant teratoma" AND d2.name <> "malignant teratoma"
        RETURN d1.name AS d1_name, n.name AS n_name, d2.name AS d2_name LIMIT $limit
    """.format(pattern=pattern)
    query_parameters = {"disease_ids": list(disease_list), "limit": int(limit)}

    auth = basic_auth(USER, PSW)
    # Use the driver as a context manager so its connection pool is released
    # when the function returns or raises.
    with GraphDatabase.driver(URI, auth=auth) as sdb:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(TWO_HOP_QUERY, query_parameters)
                edge_list = [
                    (row["d1_name"], row["n_name"], row["d2_name"]) for row in result
                ]
    return pd.DataFrame(edge_list, columns=["node_1", "node_2", "node_3"])


def run_cypher(QUERY, variable_name, parameters=None):
    """Run a Cypher query and return one field from every result row.

    Parameters
    ----------
    QUERY : str
        Cypher query text.
    variable_name : str
        Name of the returned field to extract from each row.
    parameters : dict, optional
        Query parameters forwarded to the driver. Prefer these over building
        values into the query string.

    Returns
    -------
    list
        The value of ``variable_name`` for each row, in result order.
    """
    auth = basic_auth(USER, PSW)
    # This function is called once per row in long loops; closing the driver
    # on exit prevents a new connection pool from being leaked on every call.
    with GraphDatabase.driver(URI, auth=auth) as sdb:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                output = [row[variable_name] for row in tx.run(QUERY, parameters)]
    return output

    



In [126]:
%%time

# Sample disease pairs that share at least one Symptom (diseases point to the symptom).
disease_symptom_2_hop = get_two_hop_data_step_1(
    "Symptom", "PRESENTS_DpS", disease_id_list
)

# Templates for the natural-language question and its Cypher counterpart;
# both are filled with the two disease names of a pair.
symptom_question_template = "What are the common Symptoms associated with {} and {}?"
symptom_cypher_template = 'MATCH(d1:Disease)-[:PRESENTS_DpS]->(n:Symptom)<-[:PRESENTS_DpS]-(d2:Disease) WHERE d1.name="{}" AND d2.name="{}" RETURN DISTINCT n.name AS n_name'

two_hop_symptom_data = []
for _, pair in tqdm(disease_symptom_2_hop.iterrows()):
    first_disease, second_disease = pair["node_1"], pair["node_3"]
    question = symptom_question_template.format(first_disease, second_disease)
    query = symptom_cypher_template.format(first_disease, second_disease)
    # The ground truth is whatever the graph returns for the generated query.
    two_hop_symptom_data.append((question, query, run_cypher(query, "n_name")))
    

100it [00:17,  5.84it/s]

CPU times: user 1.2 s, sys: 205 ms, total: 1.41 s
Wall time: 18.4 s





In [134]:
symptom_qa_all = pd.DataFrame(two_hop_symptom_data, columns=["text", "cypher", "ground_truth"])
# Drop questions whose text contains a standalone run of digits.
contains_standalone_number = symptom_qa_all['text'].str.contains(r'\b\d+\b')
two_hop_symptom_data_df = symptom_qa_all[~contains_standalone_number]
# two_hop_symptom_data_df = two_hop_symptom_data_df[~two_hop_symptom_data_df['text'].str.contains(r'malignant teratoma', case=False)]

# Preview one question together with its ground-truth answer.
ind = 0
preview_row = two_hop_symptom_data_df.iloc[ind]
print(preview_row.text)
print(preview_row.ground_truth)


What are the common Symptoms associated with rosacea and adenoma?
['Eye Manifestations']


In [None]:
Find the list of Symptoms associated with mixed glioma. Find the list of Symptoms associated with adenoma. Then find the Symptoms that are common to both lists.


In [133]:
# Display the disease table (columns: disease_id, disease_name) built from the DISEASE_QUERY results.
disease_df

Unnamed: 0,disease_id,disease_name
0,DOID:0080936,serum amyloid A amyloidosis
1,DOID:0080916,erythroleukemia
2,DOID:0112298,spondylometaphyseal dysplasia Sedaghatian type
3,DOID:0080912,cerebrooculofacioskeletal syndrome 2
4,DOID:0080922,bilateral frontoparietal polymicrogyria
...,...,...
6190,DOID:0081042,T-cell prolymphocytic leukemia
6191,DOID:0081087,acute myeloid leukemia with maturation
6192,DOID:0081044,frontonasal dysplasia
6193,DOID:0081055,central diabetes insipidus


In [43]:
%%time

# Sample disease pairs that share at least one Variant (variants point to the diseases).
disease_variant_2_hop = get_two_hop_data_step_1(
    "Variant", "ASSOCIATES_VaP", disease_id_list, outbound=False
)

# Templates for the natural-language question and its Cypher counterpart;
# both are filled with the two disease names of a pair.
variant_question_template = "What are the common Variants associated with {} and {}?"
variant_cypher_template = 'MATCH(d1:Disease)<-[:ASSOCIATES_VaP]-(n:Variant)-[:ASSOCIATES_VaP]->(d2:Disease) WHERE d1.name="{}" AND d2.name="{}" RETURN DISTINCT n.identifier AS n_id'

two_hop_variant_data = []
for _, pair in tqdm(disease_variant_2_hop.iterrows()):
    first_disease, second_disease = pair["node_1"], pair["node_3"]
    question = variant_question_template.format(first_disease, second_disease)
    query = variant_cypher_template.format(first_disease, second_disease)
    # The ground truth is whatever the graph returns for the generated query.
    two_hop_variant_data.append((question, query, run_cypher(query, "n_id")))
    

1000it [03:13,  5.16it/s]

CPU times: user 17.1 s, sys: 2.81 s, total: 20 s
Wall time: 3min 14s





In [52]:
%%time

# Sample disease pairs that share at least one Anatomy node (diseases point to the anatomy).
disease_anatomy_2_hop = get_two_hop_data_step_1(
    "Anatomy", "LOCALIZES_DlA", disease_id_list, outbound=True
)

# Templates for the natural-language question and its Cypher counterpart;
# both are filled with the two disease names of a pair.
anatomy_question_template = "Which is the common Anatomy that {} and {} both localize?"
anatomy_cypher_template = 'MATCH(d1:Disease)-[:LOCALIZES_DlA]->(n:Anatomy)<-[:LOCALIZES_DlA]-(d2:Disease) WHERE d1.name="{}" AND d2.name="{}" RETURN DISTINCT n.name AS n_name'

two_hop_anatomy_data = []
for _, pair in tqdm(disease_anatomy_2_hop.iterrows()):
    first_disease, second_disease = pair["node_1"], pair["node_3"]
    question = anatomy_question_template.format(first_disease, second_disease)
    query = anatomy_cypher_template.format(first_disease, second_disease)
    # The ground truth is whatever the graph returns for the generated query.
    two_hop_anatomy_data.append((question, query, run_cypher(query, "n_name")))
    

1000it [02:46,  6.01it/s]

CPU times: user 11.7 s, sys: 2 s, total: 13.7 s
Wall time: 2min 46s





In [101]:
# NOTE(review): disabled scratch cell. `two_hop_gene_data` is not defined in any
# cell shown here; define it before re-enabling this code, or remove the cell.
# two_hop_gene_data_df = pd.DataFrame(two_hop_gene_data, columns=["text", "cypher", "ground_truth"])
# two_hop_gene_data_df = two_hop_gene_data_df[~two_hop_gene_data_df['text'].str.contains(r'\b\d+\b')]
# two_hop_gene_data_df.loc[:,"ground_truth_len"] = two_hop_gene_data_df.ground_truth.apply(lambda x:len(x))
# # two_hop_gene_data_df_ = two_hop_gene_data_df[two_hop_gene_data_df.ground_truth_len == 1]


# # ind = 54
# # print(two_hop_gene_data_df_.iloc[ind].text)
# # print(two_hop_gene_data_df_.iloc[ind].cypher)
# # print(two_hop_gene_data_df_.iloc[ind].ground_truth)
# # two_hop_gene_data_df_.shape
# two_hop_gene_data_df