In [33]:
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth
import random


In [2]:
# Load the Neo4j connection settings stored in ~/.neo4j_config.env.
load_dotenv(
    os.path.join(os.path.expanduser('~'), '.neo4j_config.env')
)
# Credentials and endpoint for the SPOKE graph; each is None when the variable is unset.
USER, PSW, URI = (
    os.getenv(key) for key in ('SPOKE_USER', 'SPOKE_PSW', 'SPOKE_URI')
)


In [4]:
%%time

# Every Disease that has at least one ASSOCIATES_DaG relationship to a Gene.
DISEASE_QUERY = """
    MATCH(d:Disease)-[r:ASSOCIATES_DaG]->(g:Gene)
    RETURN DISTINCT d.identifier AS d_id, d.name AS d_name
"""

auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)

# Close the driver even if the query fails, so its connections are not left
# open for the lifetime of the kernel.
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            disease_list = [
                (row["d_id"], row["d_name"]) for row in tx.run(DISEASE_QUERY)
            ]
finally:
    sdb.close()

disease_df = pd.DataFrame(disease_list, columns=["disease_id", "disease_name"])
# Plain list of identifiers, used later as the population for random sampling.
disease_id_list = list(disease_df.disease_id.values)


CPU times: user 563 ms, sys: 79.5 ms, total: 643 ms
Wall time: 15.6 s


In [125]:

def get_two_hop_data_step_1(central_node, predicate, disease_list, outbound=True):
    """Fetch up to 100 Disease-<central_node>-Disease two-hop paths.

    Both end diseases must have an identifier in ``disease_list`` and neither
    may be named "malignant teratoma".

    Args:
        central_node: Label of the middle node (e.g. "Gene"). It is
            interpolated into the query text because Cypher cannot take labels
            as parameters, so it must come from trusted code.
        predicate: Relationship type used for both hops; interpolated for the
            same reason.
        disease_list: Disease identifiers that both end nodes are restricted
            to. Sent to the server as a query parameter.
        outbound: If True the relationships point from the diseases to the
            central node; otherwise from the central node to the diseases.

    Returns:
        pandas.DataFrame with columns ``node_1``, ``node_2``, ``node_3``
        holding the first disease name, the central node name and the second
        disease name.
    """
    if outbound:
        path_pattern = "(d1:Disease)-[:{rel}]->(n:{label})<-[:{rel}]-(d2:Disease)"
    else:
        path_pattern = "(d1:Disease)<-[:{rel}]-(n:{label})-[:{rel}]->(d2:Disease)"

    # The identifier list is passed as $disease_ids instead of being pasted
    # into the query through Python's list repr.
    TWO_HOP_QUERY = """
        MATCH {path}
        WHERE d1.identifier IN $disease_ids AND d2.identifier IN $disease_ids AND d1.name <> "malignant teratoma" AND d2.name <> "malignant teratoma"
        RETURN d1.name AS d1_name, n.name AS n_name, d2.name AS d2_name LIMIT 100
    """.format(path=path_pattern.format(rel=predicate, label=central_node))

    auth = basic_auth(USER, PSW)
    sdb = GraphDatabase.driver(URI, auth=auth)
    # Always release the driver; previously it was never closed.
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(TWO_HOP_QUERY, {"disease_ids": list(disease_list)})
                edge_list = [
                    (row["d1_name"], row["n_name"], row["d2_name"]) for row in result
                ]
    finally:
        sdb.close()
    return pd.DataFrame(edge_list, columns=["node_1", "node_2", "node_3"])


def run_cypher(QUERY, variable_name):
    """Run a Cypher query and return one field from every result row.

    Args:
        QUERY: Cypher text, passed to the transaction unchanged.
        variable_name: Key of the field to read from each returned record.

    Returns:
        list with the ``variable_name`` value of each row, in the order the
        rows were returned.
    """
    auth = basic_auth(USER, PSW)
    sdb = GraphDatabase.driver(URI, auth=auth)
    # Close the driver on every exit path; previously it was never closed.
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                return [row[variable_name] for row in tx.run(QUERY)]
    finally:
        sdb.close()

    



In [72]:
def get_two_hop_data(central_node, predicate, disease_list, outbound=True):
    """Fetch all distinct Disease-<central_node>-Disease two-hop paths.

    Both end diseases must have an identifier in ``disease_list``. Paths where
    either relationship has ``diseases_sources`` equal to ``["textmining"]``
    are excluded when ``central_node`` is "Gene", and for every central node
    when ``outbound`` is False.

    Args:
        central_node: Label of the middle node (e.g. "Gene", "Symptom"). It is
            interpolated into the query text because Cypher cannot take labels
            as parameters, so it must come from trusted code.
        predicate: Relationship type used for both hops; interpolated for the
            same reason.
        disease_list: Disease identifiers that both end nodes are restricted
            to. Sent to the server as a query parameter.
        outbound: If True the relationships point from the diseases to the
            central node; otherwise from the central node to the diseases.

    Returns:
        pandas.DataFrame with columns ``node_1``, ``node_2``, ``node_3``
        holding the first disease name, the central node name and the second
        disease name.
    """
    if outbound:
        path_pattern = "(d1:Disease)-[r1:{rel}]->(n:{label})<-[r2:{rel}]-(d2:Disease)"
        exclude_textmining = central_node == "Gene"
    else:
        path_pattern = "(d1:Disease)<-[r1:{rel}]-(n:{label})-[r2:{rel}]->(d2:Disease)"
        # NOTE(review): the inbound direction filters text-mined edges for every
        # central node, unlike the outbound direction - confirm this is intended.
        exclude_textmining = True

    conditions = []
    if exclude_textmining:
        conditions.append('r1.diseases_sources <> ["textmining"]')
        conditions.append('r2.diseases_sources <> ["textmining"]')
    # The identifier list is passed as $disease_ids instead of being pasted
    # into the query through Python's list repr.
    conditions.append("d1.identifier IN $disease_ids AND d2.identifier IN $disease_ids")

    TWO_HOP_QUERY = """
        MATCH {path}
        WHERE {where}
        RETURN DISTINCT d1.name AS d1_name, n.name AS n_name, d2.name AS d2_name
    """.format(
        path=path_pattern.format(rel=predicate, label=central_node),
        where="\n        AND ".join(conditions),
    )

    auth = basic_auth(USER, PSW)
    sdb = GraphDatabase.driver(URI, auth=auth)
    # try/finally so the driver is also closed when the query raises.
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(TWO_HOP_QUERY, {"disease_ids": list(disease_list)})
                edge_list = [
                    (row["d1_name"], row["n_name"], row["d2_name"]) for row in result
                ]
    finally:
        sdb.close()
    return pd.DataFrame(edge_list, columns=["node_1", "node_2", "node_3"])


def get_all_two_hop_data(central_node, predicate, node1, node2, disease_list, outbound=True):
    """List every central node that links two named diseases.

    Matches Disease-<central_node>-Disease paths whose end diseases are named
    ``node1`` and ``node2`` and whose identifiers are both in
    ``disease_list``. When ``outbound`` is True and ``central_node`` is
    "Gene", paths where either relationship has ``diseases_sources`` equal to
    ``["textmining"]`` are excluded.

    Args:
        central_node: Label of the middle node (e.g. "Gene", "Symptom"). It is
            interpolated into the query text because Cypher cannot take labels
            as parameters, so it must come from trusted code.
        predicate: Relationship type used for both hops; interpolated for the
            same reason.
        node1: Name of the first disease.
        node2: Name of the second disease.
        disease_list: Disease identifiers that both end nodes are restricted to.
        outbound: If True the relationships point from the diseases to the
            central node; otherwise from the central node to the diseases.

    Returns:
        list of the distinct central node names.
    """
    if outbound:
        path_pattern = "(d1:Disease)-[r1:{rel}]->(n:{label})<-[r2:{rel}]-(d2:Disease)"
    else:
        path_pattern = "(d1:Disease)<-[r1:{rel}]-(n:{label})-[r2:{rel}]->(d2:Disease)"

    conditions = []
    if outbound and central_node == "Gene":
        conditions.append('r1.diseases_sources <> ["textmining"]')
        conditions.append('r2.diseases_sources <> ["textmining"]')
    # Names and identifiers are query parameters: a disease name containing a
    # double quote or backslash would otherwise break or alter the query text.
    conditions.append("d1.name = $node1 AND d2.name = $node2")
    conditions.append("d1.identifier IN $disease_ids AND d2.identifier IN $disease_ids")

    TWO_HOP_QUERY = """
        MATCH {path}
        WHERE {where}
        RETURN DISTINCT n.name AS n_name
    """.format(
        path=path_pattern.format(rel=predicate, label=central_node),
        where="\n        AND ".join(conditions),
    )
    parameters = {
        "node1": node1,
        "node2": node2,
        "disease_ids": list(disease_list),
    }

    auth = basic_auth(USER, PSW)
    sdb = GraphDatabase.driver(URI, auth=auth)
    # try/finally so the driver is also closed when the query raises.
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                central_node_list = [
                    row["n_name"] for row in tx.run(TWO_HOP_QUERY, parameters)
                ]
    finally:
        sdb.close()
    return central_node_list


In [96]:
%%time

# Restrict the search to a random pool of 300 diseases, then keep 30 random
# Disease-Gene-Disease paths found inside that pool.
sample_diseases = random.sample(disease_id_list, 300)
disease_gene_2_hop = get_two_hop_data(
    "Gene", "ASSOCIATES_DaG", sample_diseases, outbound=True
).sample(n=30)

# For each sampled disease pair, gather every Gene that links the two diseases.
disease_gene_2_hop_with_all_central_nodes = [
    (
        disease_1,
        disease_2,
        get_all_two_hop_data("Gene", "ASSOCIATES_DaG", disease_1, disease_2, sample_diseases),
    )
    for disease_1, disease_2 in zip(disease_gene_2_hop["node_1"], disease_gene_2_hop["node_3"])
]

disease_gene_2_hop_with_all_central_nodes_df = pd.DataFrame(
    disease_gene_2_hop_with_all_central_nodes,
    columns=["disease_1", "disease_2", "central_nodes"],
)

# Natural-language question whose expected answer is the central_nodes list.
disease_gene_2_hop_with_all_central_nodes_df["text"] = (
    "What are the Genes that are commonly associated with both "
    + disease_gene_2_hop_with_all_central_nodes_df["disease_1"]
    + " and "
    + disease_gene_2_hop_with_all_central_nodes_df["disease_2"]
    + "?"
)


CPU times: user 886 ms, sys: 122 ms, total: 1.01 s
Wall time: 7.91 s


In [97]:
%%time

# Restrict the search to a random pool of 300 diseases, then keep 30 random
# Disease-Symptom-Disease paths found inside that pool.
sample_diseases = random.sample(disease_id_list, 300)
disease_symptom_2_hop = get_two_hop_data(
    "Symptom", "PRESENTS_DpS", sample_diseases, outbound=True
).sample(n=30)

# For each sampled disease pair, gather every Symptom that links the two diseases.
disease_symptom_2_hop_with_all_central_nodes = [
    (
        disease_1,
        disease_2,
        get_all_two_hop_data("Symptom", "PRESENTS_DpS", disease_1, disease_2, sample_diseases, outbound=True),
    )
    for disease_1, disease_2 in zip(disease_symptom_2_hop["node_1"], disease_symptom_2_hop["node_3"])
]

disease_symptom_2_hop_with_all_central_nodes_df = pd.DataFrame(
    disease_symptom_2_hop_with_all_central_nodes,
    columns=["disease_1", "disease_2", "central_nodes"],
)

# Natural-language question whose expected answer is the central_nodes list.
disease_symptom_2_hop_with_all_central_nodes_df["text"] = (
    "What are the Symptoms that are commonly associated with both "
    + disease_symptom_2_hop_with_all_central_nodes_df["disease_1"]
    + " and "
    + disease_symptom_2_hop_with_all_central_nodes_df["disease_2"]
    + "?"
)


CPU times: user 1.12 s, sys: 144 ms, total: 1.26 s
Wall time: 8.21 s


In [98]:
%%time

# Restrict the search to a random pool of 300 diseases, then keep 30 random
# Disease-Anatomy-Disease paths found inside that pool.
sample_diseases = random.sample(disease_id_list, 300)
disease_anatomy_2_hop = get_two_hop_data(
    "Anatomy", "LOCALIZES_DlA", sample_diseases, outbound=True
).sample(n=30)

# For each sampled disease pair, gather every Anatomy node that links the two diseases.
disease_anatomy_2_hop_with_all_central_nodes = [
    (
        disease_1,
        disease_2,
        get_all_two_hop_data("Anatomy", "LOCALIZES_DlA", disease_1, disease_2, sample_diseases, outbound=True),
    )
    for disease_1, disease_2 in zip(disease_anatomy_2_hop["node_1"], disease_anatomy_2_hop["node_3"])
]

disease_anatomy_2_hop_with_all_central_nodes_df = pd.DataFrame(
    disease_anatomy_2_hop_with_all_central_nodes,
    columns=["disease_1", "disease_2", "central_nodes"],
)

# Natural-language question whose expected answer is the central_nodes list.
disease_anatomy_2_hop_with_all_central_nodes_df["text"] = (
    "What are the Anatomy that are commonly associated with both "
    + disease_anatomy_2_hop_with_all_central_nodes_df["disease_1"]
    + " and "
    + disease_anatomy_2_hop_with_all_central_nodes_df["disease_2"]
    + "?"
)


CPU times: user 539 ms, sys: 72.1 ms, total: 611 ms
Wall time: 6.6 s


In [99]:
# Stack the Gene, Symptom and Anatomy question sets, shuffle the rows, and
# write the result out as the two-hop validation dataset.
disease_node_2_hop_data = pd.concat(
    [
        disease_gene_2_hop_with_all_central_nodes_df,
        disease_symptom_2_hop_with_all_central_nodes_df,
        disease_anatomy_2_hop_with_all_central_nodes_df,
    ],
    ignore_index=True,
).sample(frac=1)
disease_node_2_hop_data.to_csv(
    "../../../../data/benchmark_datasets/disease_two_hop_validation_data.csv",
    index=False,
    header=True,
)


In [111]:
# Spot-check: question text of the row at position 8 of the shuffled frame.
disease_node_2_hop_data.text.values[8]

'What are the Anatomy that are commonly associated with both exotropia and Crouzon syndrome?'

In [112]:
# Spot-check: central node names stored for the row at position 8.
disease_node_2_hop_data.central_nodes.values[8]

['orbit of skull', 'oculomotor muscle']