In [47]:
import os

import pandas as pd
from neo4j import GraphDatabase, basic_auth
from tqdm import tqdm


In [48]:
# Connection settings for the SPOKE Neo4j instance. Each value may be
# overridden with an environment variable (SPOKE_URI, SPOKE_USER, SPOKE_PSW);
# the fallbacks keep the notebook runnable exactly as before.
#
# The host and port are joined directly ("host:7687"). The previous form
# "host/:7687" placed the port in the URI *path*, so the explicit port was
# never actually parsed as a port.
URI = os.environ.get("SPOKE_URI", "bolt://spokedev.cgl.ucsf.edu:7687")
SPOKE_USER = os.environ.get("SPOKE_USER", "neo4j")
# NOTE(review): the fallback password is still committed in the notebook;
# prefer setting SPOKE_PSW in the environment and removing the default.
SPOKE_PSW = os.environ.get("SPOKE_PSW", "SPOKEdev")


# Disease-Gene-Disease

In [110]:
%%time

# Two-hop pattern Disease - Gene - Disease: each returned row is one gene
# together with two diseases it is linked to through ASSOCIATES_DaG edges.
# The LIMIT has no ORDER BY, so which rows come back is up to the server.
query = """
        MATCH(d1:Disease)-[r1:ASSOCIATES_DaG]-(g:Gene)-[r2:ASSOCIATES_DaG]-(d2:Disease)        
        RETURN g.name AS g_name, d1.name AS d1_name, d2.name AS d2_name
        LIMIT 10000
"""

auth = basic_auth(SPOKE_USER, SPOKE_PSW)

# Use the driver as a context manager so its connections are released when
# the cell finishes; previously the driver was created and never closed.
with GraphDatabase.driver(URI, auth=auth) as sdb:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            # Materialise the records while the transaction is still open.
            disease_gene_list = [
                ("Gene", row["g_name"], row["d1_name"], row["d2_name"])
                for row in result
            ]

disease_gene_df = pd.DataFrame(disease_gene_list, columns=["node_1_type", "node_1", "node_2", "node_3"])
# Keep the first row seen for each gene, then the first row seen for each
# ordered (node_2, node_3) disease pair.
disease_gene_df = disease_gene_df.drop_duplicates(subset=["node_1"])
disease_gene_df = disease_gene_df.drop_duplicates(subset=["node_2", "node_3"])


CPU times: user 272 ms, sys: 33 ms, total: 305 ms
Wall time: 650 ms


In [122]:
%%time

# Same Disease - Gene - Disease pattern as before, restricted to genes whose
# name contains the substring "MIR" and with a larger row limit.
# The LIMIT has no ORDER BY, so which rows come back is up to the server.
query = """
        MATCH(d1:Disease)-[r1:ASSOCIATES_DaG]-(g:Gene)-[r2:ASSOCIATES_DaG]-(d2:Disease)        
        WHERE g.name CONTAINS "MIR"
        RETURN g.name AS g_name, d1.name AS d1_name, d2.name AS d2_name
        LIMIT 100000        
"""

auth = basic_auth(SPOKE_USER, SPOKE_PSW)

# Use the driver as a context manager so its connections are released when
# the cell finishes; previously the driver was created and never closed.
with GraphDatabase.driver(URI, auth=auth) as sdb:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            # Materialise the records while the transaction is still open.
            disease_gene_list_2 = [
                ("Gene", row["g_name"], row["d1_name"], row["d2_name"])
                for row in result
            ]

disease_gene_df_2 = pd.DataFrame(disease_gene_list_2, columns=["node_1_type", "node_1", "node_2", "node_3"])
# Keep the first row seen for each gene, then the first row seen for each
# ordered (node_2, node_3) disease pair.
disease_gene_df_2 = disease_gene_df_2.drop_duplicates(subset=["node_1"])
disease_gene_df_2 = disease_gene_df_2.drop_duplicates(subset=["node_2", "node_3"])


CPU times: user 2.09 s, sys: 248 ms, total: 2.34 s
Wall time: 5.76 s


In [123]:
%%time

# Two-hop pattern Disease - Variant - Disease: each returned row is one
# variant identifier together with two diseases it is linked to through
# ASSOCIATES_VaP edges.
# The LIMIT has no ORDER BY, so which rows come back is up to the server.
query = """
        MATCH(d1:Disease)-[r1:ASSOCIATES_VaP]-(v:Variant)-[r2:ASSOCIATES_VaP]-(d2:Disease) 
        RETURN v.identifier AS v_id, d1.name AS d1_name, d2.name AS d2_name
        LIMIT 10000
"""

auth = basic_auth(SPOKE_USER, SPOKE_PSW)

# Use the driver as a context manager so its connections are released when
# the cell finishes; previously the driver was created and never closed.
with GraphDatabase.driver(URI, auth=auth) as sdb:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            # Materialise the records while the transaction is still open.
            disease_variant_list = [
                ("Variant", row["v_id"], row["d1_name"], row["d2_name"])
                for row in result
            ]

disease_variant_df = pd.DataFrame(disease_variant_list, columns=["node_1_type", "node_1", "node_2", "node_3"])
# Keep the first row seen for each variant, then the first row seen for each
# ordered (node_2, node_3) disease pair.
disease_variant_df = disease_variant_df.drop_duplicates(subset=["node_1"])
disease_variant_df = disease_variant_df.drop_duplicates(subset=["node_2", "node_3"])


CPU times: user 301 ms, sys: 37 ms, total: 338 ms
Wall time: 1.25 s


In [124]:
# Fixed seed so the sampled and shuffled benchmark rows are the same on every
# Restart & Run All (both .sample calls were previously unseeded).
sample_seed = 42
# Up to 100 variant rows are mixed in; the cap avoids the ValueError that
# .sample raises when asked for more rows than the frame contains.
n_variant_rows = min(100, len(disease_variant_df))

# Stack the MIR gene rows and the sampled variant rows with a fresh 0..n-1
# index, then shuffle the combined rows.
disease_2_hop_final = pd.concat(
    [
        disease_gene_df_2,
        disease_variant_df.sample(n=n_variant_rows, random_state=sample_seed),
    ],
    ignore_index=True,
).sample(frac=1, random_state=sample_seed)


In [126]:

# Build the natural-language statement for each row in two halves:
# "<type> <name>" for the shared node and "<disease A> and <disease B>".
_shared_node = disease_2_hop_final["node_1_type"] + " " + disease_2_hop_final["node_1"]
_disease_pair = disease_2_hop_final["node_2"] + " and " + disease_2_hop_final["node_3"]
disease_2_hop_final.loc[:, "text"] = _shared_node + " is associated with both " + _disease_pair

# Every row gets the same label, stored as the string "True".
disease_2_hop_final.loc[:, "label"] = "True"

disease_2_hop_final



Unnamed: 0,node_1_type,node_1,node_2,node_3,text,label
126,Variant,rs754656961,dilated cardiomyopathy 1CC,hypertrophic cardiomyopathy 1,Variant rs754656961 is associated with both di...,True
72,Gene,MIR133B,ovarian carcinoma,thyroid gland cancer,Gene MIR133B is associated with both ovarian c...,True
34,Gene,MIR1246,ovarian squamous cell carcinoma,colorectal carcinoma,Gene MIR1246 is associated with both ovarian s...,True
20,Gene,MIR1202,depressive disorder,high grade glioma,Gene MIR1202 is associated with both depressiv...,True
155,Variant,rs886045829,hypertrophic cardiomyopathy,hypertrophic cardiomyopathy 2,Variant rs886045829 is associated with both hy...,True
...,...,...,...,...,...,...
68,Gene,MIR132,malignant mesothelioma,spinal muscular atrophy,Gene MIR132 is associated with both malignant ...,True
170,Variant,rs280519,immunodeficiency 35,psoriasis,Variant rs280519 is associated with both immun...,True
168,Variant,rs28934575,anaplastic astrocytoma,breast carcinoma,Variant rs28934575 is associated with both ana...,True
131,Variant,rs968230475,Wolff-Parkinson-White syndrome,Melnick-Needles syndrome,Variant rs968230475 is associated with both Wo...,True


In [128]:
# Write the labelled statements to the benchmark CSV: column names are kept
# as the header row and the row index is left out of the file.
disease_2_hop_final.to_csv(
    "../../../data/benchmark_datasets/test_questions_two_hop.csv",
    header=True,
    index=False,
)


In [129]:
# Show the exported frame once more as a final visual check.
disease_2_hop_final



Unnamed: 0,node_1_type,node_1,node_2,node_3,text,label
126,Variant,rs754656961,dilated cardiomyopathy 1CC,hypertrophic cardiomyopathy 1,Variant rs754656961 is associated with both di...,True
72,Gene,MIR133B,ovarian carcinoma,thyroid gland cancer,Gene MIR133B is associated with both ovarian c...,True
34,Gene,MIR1246,ovarian squamous cell carcinoma,colorectal carcinoma,Gene MIR1246 is associated with both ovarian s...,True
20,Gene,MIR1202,depressive disorder,high grade glioma,Gene MIR1202 is associated with both depressiv...,True
155,Variant,rs886045829,hypertrophic cardiomyopathy,hypertrophic cardiomyopathy 2,Variant rs886045829 is associated with both hy...,True
...,...,...,...,...,...,...
68,Gene,MIR132,malignant mesothelioma,spinal muscular atrophy,Gene MIR132 is associated with both malignant ...,True
170,Variant,rs280519,immunodeficiency 35,psoriasis,Variant rs280519 is associated with both immun...,True
168,Variant,rs28934575,anaplastic astrocytoma,breast carcinoma,Variant rs28934575 is associated with both ana...,True
131,Variant,rs968230475,Wolff-Parkinson-White syndrome,Melnick-Needles syndrome,Variant rs968230475 is associated with both Wo...,True
