## Build Oxidative Stress - CVD Knowledge Graph

In [5]:
import json 
import pandas as pd

**How many nodes are there?**

1. Ion Channel Protein nodes:
2. MeSH terms CVD tree nodes:
3. PMID nodes:
4. Oxidative stress chemical tree nodes

----------------

**How many types of edges are there?**

1. Protein to PMID edge:
2. MeSH to PMID edge
3. Protein to Pathways
4. Oxidative-stress chemical to PMID 

    



In [6]:
import json
import os

import pandas as pd
from neo4j import GraphDatabase

In [3]:
# Uniqueness constraints on the `id` property of every node label used below.
# A driver `session.run()` call executes one Cypher statement, so the
# constraints are kept as separate statements instead of one ';'-joined string.
constraint_queries = [
    "CREATE CONSTRAINT ON (p:Protein) ASSERT p.id IS UNIQUE",
    "CREATE CONSTRAINT ON (pw:Pathway) ASSERT pw.id IS UNIQUE",
    "CREATE CONSTRAINT ON (d:Document) ASSERT d.id IS UNIQUE",
    "CREATE CONSTRAINT ON (m:MeSH) ASSERT m.id IS UNIQUE",
    "CREATE CONSTRAINT ON (dg:Drug) ASSERT dg.id IS UNIQUE",
    "CREATE CONSTRAINT ON (c:Compound) ASSERT c.id IS UNIQUE",
]

# Kept so that any existing reference to `query` still resolves.
query = "; ".join(constraint_queries)

'''UNCOMMENT AND RUN THIS CELL ONLY ONCE'''

# NOTE: `driver` is created in the following cell; run that cell first.
#with driver.session() as session:
#    for constraint_query in constraint_queries:
#        session.run(constraint_query)

'UNCOMMENT AND RUN THIS CELL ONLY ONCE'

In [4]:
# Connection settings are read from the environment so the password is not
# stored in the notebook. NEO4J_URI and NEO4J_USER fall back to the values this
# notebook was written against; NEO4J_PASSWORD deliberately has no fallback.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://52.40.20.135:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
if "NEO4J_PASSWORD" not in os.environ:
    raise RuntimeError(
        "Set the NEO4J_PASSWORD environment variable before connecting to Neo4j."
    )

driver = GraphDatabase.driver(
    uri=NEO4J_URI,
    auth=(NEO4J_USER, os.environ["NEO4J_PASSWORD"]),
)

### Neo4j Implementation

#### N1. Protein Nodes

In [18]:
# Protein node table; the loading loop below reads its "ID", "name" and
# "synonyms" columns.
df1 = pd.read_csv(filepath_or_buffer="Interface/merged-proteins.csv")
df1.head(n=1)

Unnamed: 0,ID,name,synonyms,CM,ARR,CHD,VD,IHD,CCS,VOO,OHD,IOS,ROS,OOS
0,P17302,Gap junction alpha-1 protein,"['Gap junction alpha-1 protein', 'Connexin-43'...",0.108147,0.154496,0.089738,0.034155,0.100026,0.098568,0.04216,0.103816,0.179578,0.349316,0.165879


In [19]:
def create_protein_nodes(tx, uid, name, syn):
    """Create or update the :Protein node identified by ``uid``.

    The node is matched on ``id`` alone and ``name`` / ``syn`` are then set on
    it. With every property inside the MERGE pattern, a node that already
    exists with only an ``id`` (for example one written by
    ``create_drug_target_nodes``) would not match, and Neo4j would try to
    create a second node with the same ``id``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        uid: Value stored as the node's ``id``.
        name: Value stored as the node's ``name``.
        syn: Value stored as the node's ``syn``.
    """
    query = (
        "MERGE (p:Protein {id: $uid}) "
        "SET p.name = $name, p.syn = $syn"
    )
    tx.run(query, uid=uid, name=name, syn=syn)

In [20]:
# One write transaction per row of df1, each merging a single :Protein node.
with driver.session() as session:
    for uid, name, syn in df1[["ID", "name", "synonyms"]].itertuples(index=False, name=None):
        session.write_transaction(create_protein_nodes, uid, name, syn)

#### N2. Document Node

In [14]:
# Document (PMID) node table; the loading loop below reads its "pmid",
# "title" and "date" columns.
df2 = pd.read_csv(filepath_or_buffer="kgdata/allpmid-nodes.csv")
df2.head(n=1)

Unnamed: 0.1,Unnamed: 0,pmid,title,abstract,mesh,date
0,0,20091048,Cardiac sodium channelopathies.,cardiac sodium channel are protein complexes t...,"['Animals', 'Arrhythmias, Cardiac', 'genetics'...","{'Year': '2010', 'Month': 'Jul', 'Day': '', 'S..."


In [15]:
def create_document_nodes(tx, pmid, title, date):
    """Create or update the :Document node identified by ``pmid``.

    The node is matched on ``id`` only and ``title`` / ``date`` are set
    afterwards, so re-loading a document whose title or date text has changed
    updates the existing node instead of attempting to create a second node
    with the same ``id``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        pmid: Value stored as the node's ``id``.
        title: Value stored as the node's ``title``.
        date: Value stored as the node's ``date``.
    """
    query = (
        "MERGE (d:Document {id: $pmid}) "
        "SET d.title = $title, d.date = $date"
    )
    tx.run(query, pmid=pmid, title=title, date=date)

In [16]:
# One write transaction per row of df2, each merging a single :Document node.
with driver.session() as session:
    for pmid, title, date in df2[["pmid", "title", "date"]].itertuples(index=False, name=None):
        session.write_transaction(create_document_nodes, pmid, title, date)

#### N3.  MeSH Node

In [71]:
# MeSH node table; the loading loop below reads its "mid" and "name" columns.
df3 = pd.read_csv(filepath_or_buffer="kgdata/merged-mesh-nodes.csv")
df3.head(n=1)

Unnamed: 0.1,Unnamed: 0,mid,name
0,0,C14.280.238,cardiomyopathies


In [73]:
# (rows, columns) of the MeSH node table.
df3.shape

(254, 3)

In [74]:
def create_mesh_nodes(tx, mid, name):
    """Create or update the :MeSH node identified by ``mid``.

    The node is matched on ``id`` only and ``name`` is set afterwards, so a
    re-load with a changed name updates the existing node rather than trying
    to create a second node with the same ``id``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        mid: Value stored as the node's ``id``.
        name: Value stored as the node's ``name``.
    """
    query = "MERGE (m:MeSH {id: $mid}) SET m.name = $name"
    tx.run(query, mid=mid, name=name)

In [76]:
# One write transaction per row of df3, each merging a single :MeSH node.
with driver.session() as session:
    for mid, name in df3[["mid", "name"]].itertuples(index=False, name=None):
        session.write_transaction(create_mesh_nodes, mid, name)

#### N4. Pathways Nodes

In [11]:
# Pathway-to-protein table; later cells read its "ID", "Pathway" and
# "Protein" columns.
df4 = pd.read_csv(filepath_or_buffer="Pathways/pw2protein-edge.csv")
df4.head(n=1)

Unnamed: 0.1,Unnamed: 0,ID,Pathway,Protein,Relation
0,0,R-HSA-5578775,Ion homeostasis,Q96D31,INVOLVED_IN


In [14]:
def create_pathway_nodes(tx, pwid, name):
    """Create or update the :Pathway node identified by ``pwid``.

    The node is matched on ``id`` only and ``name`` is set afterwards. The
    same pathway id can be passed many times; matching on ``id`` alone keeps
    that repeatable even if the accompanying name text varies.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        pwid: Value stored as the node's ``id``.
        name: Value stored as the node's ``name``.
    """
    query = "MERGE (pw:Pathway {id: $pwid}) SET pw.name = $name"
    tx.run(query, pwid=pwid, name=name)

In [15]:
# One write transaction per row of df4, each merging a single :Pathway node.
with driver.session() as session:
    for pwid, name in df4[["ID", "Pathway"]].itertuples(index=False, name=None):
        session.write_transaction(create_pathway_nodes, pwid, name)

#### N5. Drug Nodes

In [58]:
# Drug node table; the loading loop below reads its "name" and "category"
# columns.
df5 = pd.read_csv(filepath_or_buffer="kgdata/drug-nodes.csv")
df5.head(n=1)

Unnamed: 0.1,Unnamed: 0,name,category
0,0,heparin,Anticoagulants


In [59]:
# (rows, columns) of the drug node table.
df5.shape

(156, 3)

In [60]:
def create_drug_nodes(tx, dgid, cat):
    """Create or update the :Drug node identified by ``dgid``.

    The node is matched on ``id`` only and ``cat`` is set afterwards. With
    ``cat`` inside the MERGE pattern, a drug passed a second time with a
    different category would not match the existing node and Neo4j would try
    to create another node with the same ``id``. If a drug is passed with
    several categories, the last one written is the one kept.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        dgid: Value stored as the node's ``id``.
        cat: Value stored as the node's ``cat``.
    """
    query = "MERGE (dg:Drug {id: $dgid}) SET dg.cat = $cat"
    tx.run(query, dgid=dgid, cat=cat)

In [61]:
# One write transaction per row of df5, each merging a single :Drug node.
with driver.session() as session:
    for dgid, cat in df5[["name", "category"]].itertuples(index=False, name=None):
        session.write_transaction(create_drug_nodes, dgid, cat)

#### N6. Drug-target Protein Nodes

In [62]:
# Drug-target table; the loading loop below reads its "Protein" column.
df6 = pd.read_csv(filepath_or_buffer="kgdata/drug-target-nodes.csv")
df6.head(n=1)

Unnamed: 0.1,Unnamed: 0,Protein
0,0,Q01064


In [63]:
def create_drug_target_nodes(tx, uid):
    """Merge a :Protein node that carries only the ``id`` property ``uid``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        uid: Value used as the node's ``id``.
    """
    tx.run("MERGE (p:Protein{id:$uid})", uid=uid)

In [64]:
# One write transaction per drug-target accession in df6["Protein"].
with driver.session() as session:
    for uid in df6["Protein"].tolist():
        session.write_transaction(create_drug_target_nodes, uid)

#### N7. Compound Nodes

#### E1. Document to Protein Edge

In [21]:
# PMID-to-protein edge table; the loop below reads its "pmid" and "protein"
# columns.
# NOTE: this rebinds `df4`, which held the pathway table loaded in section N4.
df4 = pd.read_csv(filepath_or_buffer="kgdata/pmid2protein-edge.csv")
df4.head(n=1)

Unnamed: 0.1,Unnamed: 0,pmid,protein,edge
0,0,20091048,P56539,MENTIONS


In [23]:
def create_document2protein_edge(tx, pmid, uid):
    """Merge a MENTIONS relationship from a :Document to a :Protein.

    Both endpoints are looked up by their ``id`` property.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        pmid: ``id`` of the :Document node.
        uid: ``id`` of the :Protein node.
    """
    cypher = """
    MATCH (d:Document{id:$pmid})
    MATCH (p:Protein{id:$uid})
    MERGE (d)-[:MENTIONS]->(p)
    """
    params = {"pmid": pmid, "uid": uid}
    tx.run(cypher, **params)

In [24]:
# One write transaction per row of df4, each merging a single MENTIONS edge.
with driver.session() as session:
    for pmid, uid in df4[["pmid", "protein"]].itertuples(index=False, name=None):
        session.write_transaction(create_document2protein_edge, pmid, uid)

#### E2.  Document to MeSH Edge

In [77]:
# PMID-to-MeSH edge table; the loop below reads its "pmid" and "mid" columns.
dfe2 = pd.read_csv(filepath_or_buffer="kgdata/pmid2mesh-merged-edge.csv")
dfe2.head(n=1)

Unnamed: 0.1,Unnamed: 0,pmid,name,mid
0,0,20091048,"arrhythmias, cardiac",C23.550.073


In [78]:
# (rows, columns) of the PMID-to-MeSH edge table.
dfe2.shape

(59306, 4)

In [79]:
def create_document2mesh_edge(tx, pmid, mid):
    """Merge an ASSIGNS relationship from a :Document to a :MeSH node.

    Both endpoints are looked up by their ``id`` property.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        pmid: ``id`` of the :Document node.
        mid: ``id`` of the :MeSH node.
    """
    cypher = """
    MATCH (d:Document{id:$pmid})
    MATCH (m:MeSH{id:$mid})
    MERGE (d)-[:ASSIGNS]->(m)
    """
    params = {"pmid": pmid, "mid": mid}
    tx.run(cypher, **params)

In [81]:
# One write transaction per row of dfe2, each merging a single ASSIGNS edge.
with driver.session() as session:
    for pmid, mid in dfe2[["pmid", "mid"]].itertuples(index=False, name=None):
        session.write_transaction(create_document2mesh_edge, pmid, mid)

#### E3. Pathways to Proteins Edges

In [16]:
def create_pw2p_edge(tx, pwid, uid):
    """Merge a CANDIDATE relationship from a :Pathway to a :Protein.

    Both endpoints are looked up by their ``id`` property.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        pwid: ``id`` of the :Pathway node.
        uid: ``id`` of the :Protein node.
    """
    cypher = """
    MATCH (pw:Pathway{id:$pwid})
    MATCH (p:Protein{id:$uid})
    MERGE (pw)-[:CANDIDATE]->(p)
    """
    params = {"pwid": pwid, "uid": uid}
    tx.run(cypher, **params)

In [18]:
# This step needs the pathway-to-protein table (columns "ID" and "Protein").
# The name `df4` is rebound to the PMID-to-protein table in section E1, so
# reading df4["ID"] here raises KeyError on a top-to-bottom run. Reload the
# pathway table under its own name instead of depending on `df4`.
pw2protein_edges = pd.read_csv("Pathways/pw2protein-edge.csv")

with driver.session() as session:
    for pwid, uid in zip(pw2protein_edges["ID"], pw2protein_edges["Protein"]):
        session.write_transaction(create_pw2p_edge, pwid, uid)

#### E4. Drug to Drug Target Edges

In [65]:
# Drug-to-target edge table with "name", "cat" and "protein" columns.
dfe5 = pd.read_csv(filepath_or_buffer="kgdata/drug2targets-edge.csv")
dfe5.head(n=1)

Unnamed: 0.1,Unnamed: 0,name,cat,protein
0,0,heparin,Anticoagulants,P01008


In [66]:
def create_dg2t_edge(tx, dgid, uid):
    """Merge a TARGET relationship from a :Drug to a :Protein.

    Both endpoints are looked up by their ``id`` property.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        dgid: ``id`` of the :Drug node.
        uid: ``id`` of the :Protein node.
    """
    cypher = """
    MATCH (dg:Drug{id:$dgid})
    MATCH (p:Protein{id:$uid})
    MERGE (dg)-[:TARGET]->(p)
    """
    params = {"dgid": dgid, "uid": uid}
    tx.run(cypher, **params)

In [67]:
# The drug-to-target table is loaded above as `dfe5`; the previous reference
# to `dfe4` pointed at a name that is never defined in this notebook.
with driver.session() as session:
    for dgid, uid in zip(dfe5["name"], dfe5["protein"]):
        session.write_transaction(create_dg2t_edge, dgid, uid)

--------

### Clear Graph Database (When Needed!)

- Confirm APOC is available

In [None]:
// List the names of all registered procedures whose name starts with "apoc".
// An empty result means no APOC procedures are available.
CALL dbms.procedures()
YIELD name WHERE name STARTS WITH "apoc"
RETURN name

- Clear database

In [None]:
// WARNING: destructive. Removes the schema and every node and relationship.
// Delete all constraints and indexes
CALL apoc.schema.assert({},{},true);
// Delete all nodes and relationships
// (processed in batches of 500 nodes per the batchSize setting)
CALL apoc.periodic.iterate(
  'MATCH (n) RETURN n',
  'DETACH DELETE n',
  { batchSize:500 }
)

### Perform the task

- Visualize the schema

In [None]:
// Show the node labels and relationship types currently in the database.
CALL db.schema.visualization

- Find the protein list for Pathways

In [None]:
// Up to 100 Protein - Pathway - Target - Drug paths (any relationship direction).
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (p:Protein)--(pw:Pathway)--(t:Target)--(d:Drug) RETURN path LIMIT 100

In [None]:
// For each protein name, collect the distinct names of targets that share a pathway with it.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (p:Protein)--(pw:Pathway)--(t:Target) RETURN p.name, collect(DISTINCT t.name)

In [None]:
// For each protein name, count the distinct names of targets that share a pathway with it.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (p:Protein)--(pw:Pathway)--(t:Target)  RETURN p.name, count(DISTINCT t.name)

In [None]:
// Up to 100 paths joining a protein and a target through any single intermediate node.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (p:Protein)--()--(t:Target)  RETURN path LIMIT 100

In [None]:
// Up to 100 paths joining a drug and a protein through any single intermediate node.
MATCH path = (d:Drug)--()--(p:Protein)  RETURN path LIMIT 100

In [None]:
// Up to 100 Drug - Target - Pathway - Protein paths whose target id is Q92769 or P26358.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (d:Drug)--(t:Target)--(pw:Pathway)--(p:Protein)  
WHERE t.id in ['Q92769','P26358'] RETURN path LIMIT 100

In [None]:
// Up to 100 Drug - Target - Pathway - Protein paths whose target id is Q92769.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (d:Drug)--(t:Target)--(pw:Pathway)--(p:Protein)  
WHERE t.id in ['Q92769'] RETURN path LIMIT 100

In [None]:
// Up to 100 Drug - Target - Pathway - Protein paths whose target id is P26358.
// NOTE(review): no loading cell in this notebook creates a :Target label
// (drug targets are merged as :Protein) -- confirm the label before use.
MATCH path = (d:Drug)--(t:Target)--(pw:Pathway)--(p:Protein)  
WHERE t.id in ['P26358'] RETURN path LIMIT 100

In [None]:
P19634

In [None]:
// Up to 100 Drug - Protein - Pathway paths through the protein whose id is P19634.
MATCH path = (d:Drug)--(p:Protein)--(pw:Pathway)  
WHERE p.id in ['P19634'] RETURN path LIMIT 100