In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase, basic_auth
from tqdm import tqdm


In [2]:
# Connection settings for the SPOKE Neo4j instance.
# Every value can be overridden with an environment variable of the same
# meaning, so the notebook does not have to be edited to point elsewhere.
#
# The authority part of the URI is "host:port"; a "/" before ":7687" would
# move the port into the URI path instead of the authority.
URI = os.environ.get("SPOKE_URI", "bolt://spokedev.cgl.ucsf.edu:7687")
SPOKE_USER = os.environ.get("SPOKE_USER", "neo4j")
# NOTE(review): this default is a credential committed to source control.
# Prefer exporting SPOKE_PSW and removing the literal once every user of the
# notebook has the variable set.
SPOKE_PSW = os.environ.get("SPOKE_PSW", "SPOKEdev")


In [11]:
%%time

# Cypher query producing the raw material for the two-hop questions:
#   1. find pairs of diseases (d1, d2) that share at least one variant and
#      collect the identifiers of the shared variants ("positive" answers);
#   2. keep 10 such pairs;
#   3. for each pair, collect identifiers of variants linked to d1 that are
#      NOT among the shared ones ("negative" distractors).
# The exclusion in step 3 compares `negative_variants.identifier` with the
# collected identifiers. Comparing the node itself against a list of
# identifier strings never matches, so nothing would be excluded and the
# correct variants could show up among the distractors.
# NOTE(review): there is no ORDER BY before LIMIT, so which 10 pairs are
# returned is up to the database — confirm whether that is acceptable.
query = """
    MATCH (d1:Disease)<-[:ASSOCIATES_VaP]-(v:Variant)-[:ASSOCIATES_VaP]->(d2:Disease)
    WITH d1, d2, COLLECT(v.identifier) AS positive_variants
    LIMIT 10
    OPTIONAL MATCH (d1)<-[:ASSOCIATES_VaP]-(negative_variants:Variant)
    WHERE NOT negative_variants.identifier IN positive_variants
    WITH d1, d2, positive_variants, COLLECT(DISTINCT negative_variants.identifier) AS negative_samples
    RETURN d1.name AS d1_name, d2.name AS d2_name, positive_variants AS v_name_correct, negative_samples
    LIMIT 10
"""


auth = basic_auth(SPOKE_USER, SPOKE_PSW)

# One tuple per disease pair:
# (node type, disease 1 name, disease 2 name, shared variant ids, distractor ids)
disease_variant_list = []
# The driver is used as a context manager so its connections are released
# even if the query raises.
with GraphDatabase.driver(URI, auth=auth) as sdb:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            for row in result:
                disease_variant_list.append(
                    (
                        "Variant",
                        row["d1_name"],
                        row["d2_name"],
                        row["v_name_correct"],
                        row["negative_samples"],
                    )
                )
            

CPU times: user 28 ms, sys: 4.63 ms, total: 32.6 ms
Wall time: 13.4 s


In [54]:
# Maximum number of distractor variants listed in each question.
MAX_NEGATIVE_NODES = 4

# Turn the query rows into one question row per correct variant.
#   * rows without distractors are removed: missing values via dropna, and
#     empty lists via the length filter (dropna alone keeps empty lists, which
#     would later yield a question ending in a dangling ", ");
#   * the distractor list is truncated and flattened to a comma-separated string;
#   * `correct_node` (a list of shared variants) is exploded so each shared
#     variant gets its own row, and the index is reset so labels are unique.
disease_variant_df = (
    pd.DataFrame(
        disease_variant_list,
        columns=["correct_type", "disease_1", "disease_2", "correct_node", "negative_nodes"],
    )
    .dropna(subset=["negative_nodes"])
    .loc[lambda frame: frame.negative_nodes.map(len) > 0]
    .assign(
        negative_nodes=lambda frame: frame.negative_nodes.map(
            lambda nodes: ", ".join(nodes[:MAX_NEGATIVE_NODES])
        )
    )
    .explode("correct_node")
    .reset_index(drop=True)
)


In [55]:
# Assemble the multiple-choice prompt for every row: a question stem naming the
# two diseases, followed by the candidate list.
# NOTE(review): the correct node is always the first entry of the list;
# consider shuffling the options if answer position could bias the benchmark.
question_stem = (
    "Out of the given list, which "
    + disease_variant_df.correct_type
    + " is associated with both "
    + disease_variant_df.disease_1
    + " and "
    + disease_variant_df.disease_2
)
candidate_list = disease_variant_df.correct_node + ", " + disease_variant_df.negative_nodes
disease_variant_df["text"] = question_stem + ". Given list is: " + candidate_list


In [56]:
# Write the generated questions to the benchmark dataset folder, with a header
# row and without the DataFrame index.
disease_variant_df.to_csv(
    "../../../data/benchmark_datasets/test_questions_two_hop_mcq.csv",
    header=True,
    index=False,
)


In [57]:
# Show the first generated question as a quick sanity check.
disease_variant_df["text"].iloc[0]


'Out of the given list, which Variant is associated with both cerebrooculofacioskeletal syndrome 1 and cerebrooculofacioskeletal syndrome 2. Given list is: rs200665173, rs1026438103, rs886047027, rs886047033, rs886047024'