In [1]:
from neo4j import GraphDatabase
import numpy as np
import pandas as pd
import random

## Extracting connected nodes and their relationships

In [2]:
# URI of the Neo4j server that every query in this notebook is sent to.
host = "neo4j+s://neo4j.alzkb.ai"
# Module-level driver shared by run_query(); no credentials are supplied (auth=None).
# NOTE(review): no driver.close() appears in the visible cells — confirm it is released elsewhere.
driver = GraphDatabase.driver(host, auth=None)

In [3]:
def run_query(query, params=None):
    """Run a Cypher query and return the result set as a DataFrame.

    Parameters
    ----------
    query : str
        Cypher statement to execute.
    params : dict, optional
        Query parameters forwarded to ``session.run``. ``None`` (the default)
        means "no parameters".

    Returns
    -------
    pandas.DataFrame
        One row per returned record; column names come from ``result.keys()``.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object across every invocation.
    query_params = {} if params is None else params
    with driver.session() as session:
        result = session.run(query, query_params)
        # Materialise the records while the session is still open.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [4]:
def remove_key_value(row, key_to_remove):
    """Remove ``key_to_remove`` from ``row`` in place and return ``row``.

    Parameters
    ----------
    row : dict or any
        Usually a property dict. Values that do not support
        ``pop(key, default)`` (e.g. ``None``, NaN, lists) are returned
        untouched.
    key_to_remove : hashable
        Key to drop; a missing key is not an error.

    Returns
    -------
    The same ``row`` object that was passed in.
    """
    try:
        # The default value makes a missing key a no-op instead of a KeyError.
        row.pop(key_to_remove, None)
    except (AttributeError, TypeError):
        # Non-dict cell: leave it unchanged. Only these two errors are
        # swallowed so that KeyboardInterrupt and genuine bugs still surface.
        pass
    return row

In [None]:
# Fetch every directed relationship (s)-[r]->(t): for both endpoints the first
# label and the full property map, plus the relationship type.
# NOTE(review): labels(...)[0] keeps only the first label of each node —
# confirm nodes carry a single label.
df = run_query("""
MATCH (s)-[r]->(t)
RETURN labels(s)[0] as source_label, properties(s) as source, type(r) as relationship_type, labels(t)[0] as target_label, properties(t) AS target
""")

In [5]:
# Display the extracted edge table.
df

Unnamed: 0,source_label,source,relationship_type,target_label,target
0,Drug,"{'commonName': 'Droperidol', 'xrefDrugbank': '...",CHEMICALBINDSGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1,Drug,"{'commonName': 'Raloxifene', 'xrefDrugbank': '...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
2,Drug,"{'commonName': 'Anagrelide', 'xrefDrugbank': '...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
3,Drug,"{'commonName': 'Amodiaquine', 'xrefDrugbank': ...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
4,Drug,"{'commonName': 'Digoxin', 'xrefDrugbank': 'DB0...",CHEMICALDECREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
...,...,...,...,...,...
1309461,Disease,{'commonName': 'Familial Alzheimer Disease (FA...,DISEASELOCALIZESTOANATOMY,BodyPart,"{'commonName': 'dura mater', 'uri': 'http://jd..."
1309462,Disease,"{'commonName': 'Alzheimer's Disease', 'xrefUml...",DISEASELOCALIZESTOANATOMY,BodyPart,"{'commonName': 'forebrain', 'uri': 'http://jdr..."
1309463,Disease,{'commonName': 'Familial Alzheimer Disease (FA...,DISEASELOCALIZESTOANATOMY,BodyPart,"{'commonName': 'spinal cord', 'uri': 'http://j..."
1309464,Disease,"{'commonName': 'Alzheimer's Disease', 'xrefUml...",DISEASELOCALIZESTOANATOMY,BodyPart,"{'commonName': 'olfactory nerve', 'uri': 'http..."


In [7]:
# Emit each GENEINTERACTSWITHGENE edge in the reverse direction: the RETURN
# clause swaps s and t, so the matched target becomes the "source" columns and
# the matched source becomes the "target" columns.
# TODO: confirm whether the inverse direction needs to be extracted explicitly.
gene_gene_df = run_query("""
MATCH (s)-[r:GENEINTERACTSWITHGENE]->(t)
RETURN labels(t)[0] as source_label, properties(t) as source, type(r) as relationship_type, labels(s)[0] as target_label, properties(s) AS target
""")

In [8]:
# Display the reversed gene-gene edge table.
gene_gene_df

Unnamed: 0,source_label,source,relationship_type,target_label,target
0,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
2,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
3,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
4,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
...,...,...,...,...,...
146996,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
146997,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
146998,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
146999,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."


In [9]:
# Stack the full edge table and the reversed gene-gene edges into one frame
# with a fresh 0..n-1 index.
# NOTE(review): if the graph already stores gene-gene edges in both
# directions this would introduce duplicate rows — verify.
combined_df = pd.concat([df, gene_gene_df], ignore_index=True)

In [10]:
# Drop the 'uri' entry from every property dict in the two endpoint columns.
# Series.apply forwards the extra keyword argument to remove_key_value.
combined_df['source'] = combined_df['source'].apply(remove_key_value, key_to_remove='uri')
combined_df['target'] = combined_df['target'].apply(remove_key_value, key_to_remove='uri')

In [11]:
# Display the combined edge table after the 'uri' entries were removed.
combined_df

Unnamed: 0,source_label,source,relationship_type,target_label,target
0,Drug,"{'commonName': 'Droperidol', 'xrefDrugbank': '...",CHEMICALBINDSGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1,Drug,"{'commonName': 'Raloxifene', 'xrefDrugbank': '...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
2,Drug,"{'commonName': 'Anagrelide', 'xrefDrugbank': '...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
3,Drug,"{'commonName': 'Amodiaquine', 'xrefDrugbank': ...",CHEMICALINCREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
4,Drug,"{'commonName': 'Digoxin', 'xrefDrugbank': 'DB0...",CHEMICALDECREASESEXPRESSION,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
...,...,...,...,...,...
1456462,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456463,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456464,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456465,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",GENEINTERACTSWITHGENE,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."


In [12]:
# List the distinct relationship types present in the combined table.
combined_df['relationship_type'].unique()

array(['CHEMICALBINDSGENE', 'CHEMICALINCREASESEXPRESSION',
       'CHEMICALDECREASESEXPRESSION', 'DRUGINCLASS', 'DRUGTREATSDISEASE',
       'DRUGCAUSESEFFECT', 'GENEPARTICIPATESINBIOLOGICALPROCESS',
       'GENEINPATHWAY', 'GENEINTERACTSWITHGENE',
       'GENEHASMOLECULARFUNCTION', 'GENEASSOCIATEDWITHCELLULARCOMPONENT',
       'GENEASSOCIATESWITHDISEASE', 'BODYPARTOVEREXPRESSESGENE',
       'BODYPARTUNDEREXPRESSESGENE', 'SYMPTOMMANIFESTATIONOFDISEASE',
       'DISEASELOCALIZESTOANATOMY', 'DISEASEASSOCIATESWITHDISEASE'],
      dtype=object)

In [13]:
# Natural-language phrase for each raw relationship type code.
rels_type = {
    'GENEINPATHWAY': 'gene in the pathway',
    'GENEINTERACTSWITHGENE': 'gene interacts with the gene',
    'GENEPARTICIPATESINBIOLOGICALPROCESS': 'gene participates in the biological process',
    'GENEASSOCIATEDWITHCELLULARCOMPONENT': 'gene associated with the cellular component',
    'GENEHASMOLECULARFUNCTION': 'gene has a molecular function',
    'GENEASSOCIATESWITHDISEASE': 'gene associates with the disease',
    'CHEMICALBINDSGENE': 'chemical or drug binds the gene',
    'CHEMICALINCREASESEXPRESSION': 'chemical or drug increases the gene expression',
    'CHEMICALDECREASESEXPRESSION': 'chemical or drug decreases the gene expression',
    'DRUGINCLASS': 'chemical or drug in the drug class',
    'DISEASELOCALIZESTOANATOMY': 'disease localizes to anatomy or body part',
    'DISEASEASSOCIATESWITHDISEASE': 'disease associates with another disease',
    'DRUGCAUSESEFFECT': 'chemical or drug causes effect to the disease',
    'DRUGTREATSDISEASE': 'chemical or drug treats the disease',
    'SYMPTOMMANIFESTATIONOFDISEASE': 'symptom manifestation of the disease',
    'BODYPARTOVEREXPRESSESGENE': 'body part over-expresses the gene',
    'BODYPARTUNDEREXPRESSESGENE': 'body part under-expresses the gene',
}

In [14]:
# Swap each raw relationship code for its natural-language phrase; any value
# without an entry in rels_type is kept unchanged.
combined_df['relationship_type'] = combined_df['relationship_type'].map(
    lambda rel_code: rels_type.get(rel_code, rel_code)
)
combined_df

Unnamed: 0,source_label,source,relationship_type,target_label,target
0,Drug,"{'commonName': 'Droperidol', 'xrefDrugbank': '...",chemical or drug binds the gene,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1,Drug,"{'commonName': 'Raloxifene', 'xrefDrugbank': '...",chemical or drug increases the gene expression,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
2,Drug,"{'commonName': 'Anagrelide', 'xrefDrugbank': '...",chemical or drug increases the gene expression,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
3,Drug,"{'commonName': 'Amodiaquine', 'xrefDrugbank': ...",chemical or drug increases the gene expression,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
4,Drug,"{'commonName': 'Digoxin', 'xrefDrugbank': 'DB0...",chemical or drug decreases the gene expression,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
...,...,...,...,...,...
1456462,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",gene interacts with the gene,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456463,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",gene interacts with the gene,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456464,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",gene interacts with the gene,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."
1456465,Gene,"{'typeOfGene': 'protein-coding', 'commonName':...",gene interacts with the gene,Gene,"{'typeOfGene': 'protein-coding', 'commonName':..."


In [15]:
# Write the combined edge table to CSV without the index column.
# NOTE(review): 'source'/'target' hold dicts, which presumably end up as their
# string form in the CSV — confirm downstream readers parse them back.
combined_df.to_csv('alzkb_relationships_v2.csv', index=False)

In [14]:
# Write the same table as JSON using the to_json defaults.
# NOTE(review): this file name has no `_v2` suffix, unlike the CSV export
# ('alzkb_relationships_v2.csv') — confirm the naming is intentional.
combined_df.to_json('alzkb_relationships.json')