Written by Alex Loia

## Setup Connection to Graph Database

Download Neo4j locally [here](https://neo4j.com/download/?ref=docs-get-started-dropdown). The default database is called `neo4j`.

In [88]:
import neo4j
import os, shutil
import pandas as pd
import tqdm
import pickle
import numpy as np

In [37]:
driver = neo4j.GraphDatabase.driver("neo4j://localhost", auth=('neo4j', 'password'))
driver.verify_connectivity()

## Load CSV of translated DRKG relationships

The data file must be in the Neo4j import directory. See [here](https://neo4j.com/docs/operations-manual/5/configuration/file-locations/) for the import directory's location on your system.

In [11]:
# edit the below according to your Neo4j install
NEO4J_HOME = "/home/alexdloia/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-32f29f82-74c3-4124-8a29-27ad205fb889"
drkg_csv_name = 'drkg_translated_with_relation_labels.csv'
drkg_data_path = os.path.join('..', 'data', drkg_csv_name)

In [None]:
shutil.copy(drkg_data_path, os.path.join(NEO4J_HOME, 'import', drkg_csv_name))

In [15]:
drkg_df = pd.read_csv(drkg_data_path, index_col="id")
drkg_df.head()

Unnamed: 0_level_0,head_entity,tail_entity,drkg_id,relation_label,head_entity_type,tail_entity_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,coagulation factor VIII (F8),coagulation factor VIII (F8),bioarx::HumGenHumGen:Gene:Gene,INTERACTION,GENE,GENE
1,coagulation factor VIII (F8),phytanoyl-CoA 2-hydroxylase (PHYH),bioarx::HumGenHumGen:Gene:Gene,INTERACTION,GENE,GENE
2,coagulation factor VIII (F8),coagulation factor IX (F9),bioarx::HumGenHumGen:Gene:Gene,INTERACTION,GENE,GENE
3,coagulation factor VIII (F8),heat shock protein family A (Hsp70) member 5 (...,bioarx::HumGenHumGen:Gene:Gene,INTERACTION,GENE,GENE
4,coagulation factor VIII (F8),immunoglobulin kappa variable 3-20 (IGKV3-20),bioarx::HumGenHumGen:Gene:Gene,INTERACTION,GENE,GENE


In [18]:
for col_name in ["head_entity_type", "tail_entity_type"]:
    drkg_df[col_name] = drkg_df[col_name].str.replace(" ", "_")

In [47]:
unique_entity_types = pd.concat(
    [drkg_df["head_entity_type"], drkg_df["tail_entity_type"]]
).unique()
unique_relation_labels = drkg_df["relation_label"].unique()
unique_entity_types, unique_relation_labels

(array(['GENE', 'COMPOUND', 'DISEASE', 'ANATOMY', 'PHARMACOLOGIC_CLASS',
        'BIOLOGICAL_PROCESS', 'SYMPTOM', 'MOLECULAR_FUNCTION',
        'CELLULAR_COMPONENT', 'PATHWAY', 'SIDE_EFFECT'], dtype=object),
 array(['INTERACTION', 'INHIBITION', 'OTHER', 'ACTIVATION', 'BINDING',
        'MODULATION', 'BLOCKING', 'ANTAGONISM', 'ANTIBODY',
        'POSITIVE_ALLOSTERIC_MODULATION', 'ALLOSTERIC_MODULATION',
        'PARTIAL_AGONISM', 'DRUGDRUG_INTERACTION', 'TARGET', 'ENZYME',
        'CARRIER', 'COMPOUND_TREATS_THE_DISEASE',
        'AFFECTS_EXPRESSION_ORPRODUCTION_NEUTRAL',
        'METABOLISM_OR_PHARMACOKINETICS',
        'DECREASES_EXPRESSION_ORPRODUCTION',
        'INCREASES_EXPRESSION_ORPRODUCTION', 'TRANSPORT_OR_CHANNELS',
        'INHIBITS_CELL_GROWTH_ESPECIALLY_CANCERS',
        'SIDE_EFFECT_ORADVERSE_EVENT', 'BIOMARKERS_OF_DISEASE_PROGRESSION',
        'ALLEVIATES_OR_REDUCES', 'PREVENTS_OR_SUPPRESSES',
        'ROLE_IN_DISEASE_PATHOGENESIS',
        'IMPROPER_REGULATION_LINKED_TO_

In [30]:
entity_type_pairs = pd.concat(
    [
        drkg_df[["head_entity", "head_entity_type"]].rename(
            columns={"head_entity": "entity", "head_entity_type": "type"}
        ),
        drkg_df[["tail_entity", "tail_entity_type"]].rename(
            columns={"tail_entity": "entity", "tail_entity_type": "type"}
        ),
    ],
    axis=0,
    ignore_index=True,
).groupby(["entity", "type"]).size().reset_index()
entity_type_pairs.drop(columns=0, inplace=True)
entity_type_pairs.rename(columns={'entity': 'name'}, inplace=True)
entity_type_pairs.head()

Unnamed: 0,name,type
0,'de novo' AMP biosynthetic process,BIOLOGICAL_PROCESS
1,'de novo' CTP biosynthetic process,BIOLOGICAL_PROCESS
2,'de novo' GDP-L-fucose biosynthetic process,BIOLOGICAL_PROCESS
3,'de novo' IMP biosynthetic process,BIOLOGICAL_PROCESS
4,'de novo' NAD biosynthetic process,BIOLOGICAL_PROCESS


In [31]:
entity_type_pairs_l = entity_type_pairs.to_dict('records')
entity_type_pairs_l[:5]

[{'name': "'de novo' AMP biosynthetic process", 'type': 'BIOLOGICAL_PROCESS'},
 {'name': "'de novo' CTP biosynthetic process", 'type': 'BIOLOGICAL_PROCESS'},
 {'name': "'de novo' GDP-L-fucose biosynthetic process",
  'type': 'BIOLOGICAL_PROCESS'},
 {'name': "'de novo' IMP biosynthetic process", 'type': 'BIOLOGICAL_PROCESS'},
 {'name': "'de novo' NAD biosynthetic process", 'type': 'BIOLOGICAL_PROCESS'}]

In [32]:
records, summary, keys = driver.execute_query(
    "UNWIND $props AS map\n" + "CREATE (n:Entity)\n" + "SET n = map",
    props=entity_type_pairs_l,
)

In [41]:
summary.counters

{'labels_added': 62374, 'nodes_created': 62374, 'properties_set': 124748}

In [34]:
drkg_df.loc[0]

head_entity           coagulation factor VIII (F8)
tail_entity           coagulation factor VIII (F8)
drkg_id             bioarx::HumGenHumGen:Gene:Gene
relation_label                         INTERACTION
head_entity_type                              GENE
tail_entity_type                              GENE
Name: 0, dtype: object

In [46]:
drkg_df.index[drkg_df["relation_label"] == "ACTIVATION"]

Index([  77274,   77321,   77341,   77344,   77346,   77350,   77360,   77363,
         77370,   77399,
       ...
       1538633, 1538711, 1538797, 1538800, 1538874, 1538888, 1538943, 1538970,
       1539019, 1539030],
      dtype='int64', name='id', length=3234)

In [49]:
driver.execute_query("CREATE INDEX entity_name_type FOR (e:Entity) ON (e.name, e.type)")

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7fe2d6c095d0>, keys=[])

In [50]:
for relation_label in unique_relation_labels:
    # create relationships for each label according to drkg_df
    this_index = drkg_df.index[drkg_df["relation_label"] == relation_label]
    if relation_label == "INTERACTION":
        this_index = this_index[82023:]
    for i in tqdm.tqdm(this_index):
        row = drkg_df.loc[i]
        rel_type = row["relation_label"]
        driver.execute_query(
            f"""
        MATCH 
            (e1:Entity {{name: $head_name, type: $head_type}}),
            (e2:Entity {{name: $tail_name, type: $tail_type}})
        CREATE
            (e1)-[r:{rel_type} {{type: $drkg_id}}]->(e2)
        """,
            head_name=row["head_entity"],
            tail_name=row["tail_entity"],
            head_type=row["head_entity_type"],
            tail_type=row["tail_entity_type"],
            drkg_id=row["drkg_id"],
            database_='neo4j'
        )

100%|██████████| 142410/142410 [08:03<00:00, 294.78it/s]
100%|██████████| 38378/38378 [02:36<00:00, 245.09it/s]
100%|██████████| 321361/321361 [22:05<00:00, 242.40it/s]
100%|██████████| 3234/3234 [00:13<00:00, 243.30it/s]
100%|██████████| 15221/15221 [01:01<00:00, 247.11it/s]
100%|██████████| 239/239 [00:00<00:00, 239.25it/s]
100%|██████████| 1233/1233 [00:05<00:00, 239.59it/s]
100%|██████████| 2664/2664 [00:10<00:00, 245.33it/s]
100%|██████████| 179/179 [00:00<00:00, 220.02it/s]
100%|██████████| 598/598 [00:02<00:00, 245.87it/s]
100%|██████████| 196/196 [00:00<00:00, 229.84it/s]
100%|██████████| 66/66 [00:00<00:00, 200.61it/s]
100%|██████████| 1378439/1378439 [1:19:34<00:00, 288.70it/s]
100%|██████████| 14479/14479 [00:48<00:00, 296.82it/s]
100%|██████████| 5952/5952 [00:22<00:00, 263.17it/s]
100%|██████████| 718/718 [00:02<00:00, 254.15it/s]
100%|██████████| 48554/48554 [02:50<00:00, 284.22it/s]
100%|██████████| 15299/15299 [00:54<00:00, 281.80it/s]
100%|██████████| 5781/5781 [00:21<

## Rare Disease Graph Exploration

In [53]:
rare_disease_df = pd.read_csv('../test_raredisease_links.csv', index_col=0)

In [55]:
rare_disease_df

Unnamed: 0,compound,compound_drkg_id,rare_disease,disease_drkg_id
0,Alpelisib,Compound::DB12015,breast cancer,Disease::DOID:1612
1,Ambenonium,Compound::DB01122,Myasthenia Gravis,Disease::MESH:D009157
2,Ambenonium,Compound::DB01122,Myasthenia gravis,Side Effect::C0026896
3,Amrubicin,Compound::DB06263,Small cell lung cancer,Side Effect::C0149925
4,Apalutamide,Compound::DB11901,prostate cancer,Disease::DOID:10283
5,Artesunate,Compound::DB09274,Malaria,Disease::MESH:D008288
6,Artesunate,Compound::DB09274,malaria,Disease::DOID:12365
7,Artesunate,Compound::DB09274,Malaria,Side Effect::C0024530
8,Darolutamide,Compound::DB12941,prostate cancer,Disease::DOID:10283
9,Delamanid,Compound::DB11637,Tuberculosis,Disease::MESH:D014376


In [58]:
disease_paths_df = rare_disease_df[rare_disease_df["disease_drkg_id"].str.contains("Disease")][["compound", "rare_disease"]].drop_duplicates()
disease_paths_df

Unnamed: 0,compound,rare_disease
0,Alpelisib,breast cancer
1,Ambenonium,Myasthenia Gravis
4,Apalutamide,prostate cancer
5,Artesunate,Malaria
6,Artesunate,malaria
8,Darolutamide,prostate cancer
9,Delamanid,Tuberculosis
11,Didox,breast cancer
12,Enzalutamide,prostate cancer
15,Leucovorin,Osteosarcoma


In [85]:
for idx in disease_paths_df.index:
    row = disease_paths_df.loc[idx]
    print(f'{idx}: {row["compound"]}->{row["rare_disease"]}')
    print(
        f"""
MATCH 
    (e1:Entity {{name: "{row["compound"]}", type: "COMPOUND"}}),
    (e2:Entity {{name: "{row["rare_disease"]}", type: "DISEASE"}}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    """
    )

0: Alpelisib->breast cancer

MATCH 
    (e1:Entity {name: "Alpelisib", type: "COMPOUND"}),
    (e2:Entity {name: "breast cancer", type: "DISEASE"}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    
1: Ambenonium->Myasthenia Gravis

MATCH 
    (e1:Entity {name: "Ambenonium", type: "COMPOUND"}),
    (e2:Entity {name: "Myasthenia Gravis", type: "DISEASE"}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    
4: Apalutamide->prostate cancer

MATCH 
    (e1:Entity {name: "Apalutamide", type: "COMPOUND"}),
    (e2:Entity {name: "prostate cancer", type: "DISEASE"}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    
5: Artesunate->Malaria

MATCH 
    (e1:Entity {name: "Artesunate", type: "COMPOUND"}),
    (e2:Entity {name: "Malaria", type: "DISEASE"}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    
6: Artesunate->malaria

MATCH 
    (e1:Entity {name: "Artesunate", type: "COMPOUND"}),
    (e2:Entity {name: "malaria", type: "DISEASE"}),
p=shortestPath((e1)-[*]->(e2)) RETURN p;
    
8: Darolutamide->prostate c

In [73]:
dictionaries = pickle.load(open("../data2/mapping_dict/data.pkl", mode="rb"))

In [89]:
rare_disease_data = []
indices = []
for row in rare_disease_df.itertuples():
    try:
        this_data = {
            "ID": f"{row.compound}->{row.rare_disease}",
            "drug_name": row.compound,
            "disease_name": row.rare_disease,
            "drug_id": dictionaries["Compound"][row.compound],
            "disease_id": dictionaries["Disease"][row.rare_disease]
        }
        rare_disease_data.append(this_data)
        indices.append(row.Index)
    except Exception as e:
        print(e)
print(rare_disease_data[:2])
indices = np.array(indices)

'Myasthenia gravis'
'Small cell lung cancer'
'Scleroderma'
'Scleroderma'
'Anterior uveitis'
[{'ID': 'Alpelisib->breast cancer', 'drug_name': 'Alpelisib', 'disease_name': 'breast cancer', 'drug_id': 2574, 'disease_id': 483}, {'ID': 'Ambenonium->Myasthenia Gravis', 'drug_name': 'Ambenonium', 'disease_name': 'Myasthenia Gravis', 'drug_id': 11387, 'disease_id': 2205}]


In [79]:
with open("../data/rare_disease_data.pkl", "wb") as f:
    pickle.dump(rare_disease_data, f)

In [80]:
print(len(rare_disease_data))

32


In [94]:
hits = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1])
rare_disease_df.loc[indices[hits == 1]]

Unnamed: 0,compound,compound_drkg_id,rare_disease,disease_drkg_id
15,Leucovorin,Compound::DB00650,Osteosarcoma,Disease::MESH:D012516
16,Leuprolide,Compound::DB00007,prostate cancer,Disease::DOID:10283
22,Raltitrexed,Compound::DB00293,Mesothelioma,Disease::MESH:D008654
28,Abiraterone,Compound::DB05812,prostate cancer,Disease::DOID:10283
29,Bicalutamide,Compound::DB01128,prostate cancer,Disease::DOID:10283
30,Cabazitaxel,Compound::DB06772,prostate cancer,Disease::DOID:10283
31,Degarelix,Compound::DB06699,prostate cancer,Disease::DOID:10283
32,Estramustine,Compound::DB01196,prostate cancer,Disease::DOID:10283
33,Flutamide,Compound::DB00499,prostate cancer,Disease::DOID:10283
34,Fulvestrant,Compound::DB00947,breast cancer,Disease::DOID:1612


In [92]:
rare_disease_df.loc[indices[hits == 0]]

Unnamed: 0,compound,compound_drkg_id,rare_disease,disease_drkg_id
0,Alpelisib,Compound::DB12015,breast cancer,Disease::DOID:1612
1,Ambenonium,Compound::DB01122,Myasthenia Gravis,Disease::MESH:D009157
4,Apalutamide,Compound::DB11901,prostate cancer,Disease::DOID:10283
5,Artesunate,Compound::DB09274,Malaria,Disease::MESH:D008288
6,Artesunate,Compound::DB09274,malaria,Disease::DOID:12365
7,Artesunate,Compound::DB09274,Malaria,Side Effect::C0024530
8,Darolutamide,Compound::DB12941,prostate cancer,Disease::DOID:10283
9,Delamanid,Compound::DB11637,Tuberculosis,Disease::MESH:D014376
10,Delamanid,Compound::DB11637,Tuberculosis,Side Effect::C0041296
11,Didox,Compound::DB12948,breast cancer,Disease::DOID:1612
