In [1]:
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth


In [2]:
load_dotenv(os.path.join(os.path.expanduser('~'), '.neo4j_config.env'))
USER = os.environ.get('SPOKE_USER')
PSW = os.environ.get('SPOKE_PSW')
URI = os.environ.get('SPOKE_URI')


In [3]:
%%time

DISEASE_QUERY = """
    MATCH(d:Disease)-[r:ASSOCIATES_DaG]->(g:Gene)
    WHERE r.diseases_sources <> ["textmining"]
    RETURN DISTINCT d.identifier AS d_id, d.name AS d_name
"""

auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)

disease_list = []
with sdb.session() as session:
    with session.begin_transaction() as tx:
        result = tx.run(DISEASE_QUERY)
        for row in result:
            disease_list.append((row["d_id"], row["d_name"]))

disease_df = pd.DataFrame(disease_list, columns=["disease_id", "disease_name"])
disease_id_list = list(disease_df.disease_id.values)


CPU times: user 140 ms, sys: 28.4 ms, total: 169 ms
Wall time: 8.67 s


In [36]:
%%time

QUERY = """
    MATCH(c:Compound)-[r:TREATS_CtD]->(d1:Disease)-[:RESEMBLES_DrD]-(d2:Disease) 
    WHERE r.phase>=3 AND NOT EXISTS((c)-[:TREATS_CtD]->(d2)) AND d1.identifier IN {}
    RETURN c.name as c_name, d1.name AS d1_name, d2.name AS d2_name LIMIT 20000
""".format(disease_id_list)

auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)
edge_list = []
with sdb.session() as session:
    with session.begin_transaction() as tx:
        result = tx.run(QUERY)
        for row in result:
            edge_list.append((row["c_name"], row["d1_name"], row["d2_name"]))

drug_repurp_df = pd.DataFrame(edge_list, columns=["compound", "disease_1", "disease_2"])



CPU times: user 466 ms, sys: 56.6 ms, total: 523 ms
Wall time: 1.35 s


In [53]:

result_df = drug_repurp_df.groupby("disease_1").agg({
    'compound': list,
    'disease_2': list
}).reset_index()

result_df.rename(columns={'compound': 'Compounds', 'disease_2': 'Diseases'}, inplace=True)
result_df['Compounds'] = result_df['Compounds'].apply(lambda x: list(set(x)))
result_df['Diseases'] = result_df['Diseases'].apply(lambda x: list(set(x)))
result_df.loc[:, "text"] = "What compounds treat '" + result_df.disease_1 + "' and what diseases resemble '" + result_df.disease_1 + "'?"
result_df.shape



(143, 4)

In [60]:
result_df.columns

Index(['disease_1', 'Compounds', 'Diseases', 'text'], dtype='object')

In [59]:
result_df.to_csv("../../../../data/benchmark_datasets/drug_reporposing_questions.csv", index=False, header=True)


In [58]:
ind = 4
print(result_df.text.values[ind])
print(result_df.Compounds.values[ind])
print(result_df.Diseases.values[ind])


What compounds treat 'Fabry disease' and what diseases resemble 'Fabry disease'?
['AGALSIDASE BETA', 'Migalastat', 'AGALSIDASE ALFA', 'Lucerastat', 'Venglustat']
['familial hypertrophic cardiomyopathy', 'anhidrosis', 'lipid storage disease', 'angiokeratoma', 'glycogen storage disease II', 'lysosomal storage disease', 'erythromelalgia', 'fucosidosis', 'hypohidrosis', 'beta-mannosidosis', 'priapism', 'cerebral amyloid angiopathy', 'proteinuria', 'restrictive cardiomyopathy', 'lipid metabolism disorder', 'mucopolysaccharidosis I', 'GM1 gangliosidosis', "Gaucher's disease", 'mucopolysaccharidosis Ih/s', 'MELAS syndrome', 'mucopolysaccharidosis Ih']
