## Protein Pathways Network (CVD)

In [1]:
import json as json
import pandas as pd

### Load Data

Put the data file in the data folder and load it into the Python environment.

In [2]:
# Read the Reactome result table for the CVD protein set; the path is
# relative, so the CSV must be reachable from the notebook's working directory.
df_cvd = pd.read_csv("reactome-result-cvd.csv")
# Preview the first row to sanity-check how the columns were parsed.
df_cvd.head(1)

Unnamed: 0,Pathway identifier,Pathway name,#Entities found,#Entities total,#Interactors found,#Interactors total,Entities ratio,Entities pValue,Entities FDR,#Reactions found,#Reactions total,Reactions ratio,Species identifier,Species name,Submitted entities found,Mapped entities,Submitted entities hit interactor,Interacts with,Found reaction identifiers
0,R-HSA-5578775,Ion homeostasis,20,64,3,98,0.004401,1.110223e-16,4.718448e-14,13,16,0.001184,9606,Homo sapiens,Q96D31;Q14643;Q92736;Q13586;P30626;P32418;P0DP...,Q96D31;Q14643;Q92736;Q13586;P30626;P32418;P0DP...,P30626;P49810;P05067,P17612;P30626,R-HSA-427910;R-HSA-418309;R-HSA-2855020;R-HSA-...


In [3]:
# (number of rows, number of columns) of the loaded table.
df_cvd.shape

(1188, 19)

- What are the columns in the data?

In [4]:
# Column labels of the loaded table.
df_cvd.columns

Index(['Pathway identifier', 'Pathway name', '#Entities found',
       '#Entities total', '#Interactors found', '#Interactors total',
       'Entities ratio', 'Entities pValue', 'Entities FDR', '#Reactions found',
       '#Reactions total', 'Reactions ratio', 'Species identifier',
       'Species name', 'Submitted entities found', 'Mapped entities',
       'Submitted entities hit interactor', 'Interacts with',
       'Found reaction identifiers'],
      dtype='object')

- Let's set the pathway name as the index of the data

In [5]:
# Index the table by pathway name.
# Guarded so the cell can be re-run safely: after the first run
# "Pathway name" is the index and no longer a column, so an unguarded
# set_index would raise a KeyError.
if df_cvd.index.name != "Pathway name":
    df_cvd = df_cvd.set_index("Pathway name")

- What are the top 30 pathways involved?

In [6]:
# First 30 pathway names in the order they appear in the table
# (presumably the export is already ranked by significance — TODO confirm).
df_cvd.index[0:30]

Index(['Ion homeostasis', 'Cardiac conduction', 'Muscle contraction',
       'Ion channel transport', 'Platelet calcium homeostasis',
       'Stimuli-sensing channels', 'Phase 0 - rapid depolarisation',
       'Platelet homeostasis', 'Elevation of cytosolic Ca2+ levels',
       'Reduction of cytosolic Ca++ levels',
       'Negative regulation of NMDA receptor-mediated neuronal transmission',
       'Ion transport by P-type ATPases',
       'Unblocking of NMDA receptors, glutamate binding and activation',
       'Presynaptic depolarization and calcium channel opening',
       'Phase 2 - plateau phase', 'Transport of small molecules',
       'CLEC7A (Dectin-1) induces NFAT activation', 'NCAM1 interactions',
       'Loss of phosphorylation of MECP2 at T308', 'Sodium/Calcium exchangers',
       'Regulation of insulin secretion', 'TRP channels',
       'Striated Muscle Contraction',
       'Response to elevated platelet cytosolic Ca2+',
       'NOTCH4 Activation and Transmission of Signal t

#### Protein Node Data

In [7]:
# Collect every submitted protein identifier across all pathways, then
# reduce the collection to the unique set used for the Protein nodes.
allproteins = []

for item in df_cvd['Submitted entities found']:
    # The protein list for a pathway may be vacant; pandas then yields a
    # non-string missing value. Skip those explicitly instead of hiding
    # every possible error behind a bare `except`.
    if not isinstance(item, str):
        continue

    allproteins += item.split(";")

# Sort the unique proteins so the node list (and the CSV written from it)
# comes out in the same order on every run; raw set ordering is arbitrary.
unique_proteins = sorted(set(allproteins))

In [8]:
# Node table for proteins: a single "Protein" column, one row per unique id.
protein_nodes = pd.DataFrame({"Protein": unique_proteins})

In [9]:
# Save the protein node table for Neo4j's LOAD CSV.
# The file name uses a hyphen so that it matches 'file:///protein-nodes.csv'
# in the Cypher import cell and the naming of the other exported files
# (pathway-nodes.csv, pw2protein-edge.csv).
protein_nodes.to_csv("protein-nodes.csv")

#### Pathway Node Data

In [10]:
# One record per pathway: its identifier plus its name (the frame's index).
Pathways = [
    {"ID": pathway_id, "Pathway": pathway_name}
    for pathway_id, pathway_name in zip(df_cvd['Pathway identifier'], df_cvd.index)
]

In [11]:
'''create a dataframe'''
# Node table for pathways, built from the list of {"ID", "Pathway"} records.
pathway_nodes = pd.DataFrame(Pathways)

In [12]:
'''save the datafile'''
# Written with the default index column; the Cypher import reads the
# "ID" and "Pathway" columns from this file.
pathway_nodes.to_csv("pathway-nodes.csv")

#### Pathways to Protein Edge data

In [13]:
# Build the pathway -> protein relation in two shapes:
#   pw2proteins_dict : {pathway name: [protein, ...]}
#   pw2proteins_list : one record per (pathway, protein) pair, ready to be
#                      turned into an edge DataFrame
pw2proteins_dict = {}
pw2proteins_list = []

for pw_ID, pw_name, ps in zip(df_cvd['Pathway identifier'], df_cvd.index, df_cvd['Submitted entities found']):

    # The protein list for a pathway may be vacant (a non-string missing
    # value); skip those explicitly rather than through a bare `except`,
    # which would also hide unrelated errors.
    if not isinstance(ps, str):
        continue

    proteins = ps.split(";")
    pw2proteins_dict[pw_name] = proteins

    for p in proteins:
        pw2proteins_list.append({"ID": pw_ID,
                                 "Pathway": pw_name,
                                 "Protein": p,
                                 "Relation": "INVOLVED_IN"})

In [14]:
'''create a dataframe'''
# Edge table: one row per (pathway, protein) pair with columns
# ID, Pathway, Protein and Relation.
df_pw2ps = pd.DataFrame(pw2proteins_list)

In [15]:
# Preview the first rows of the edge table.
df_pw2ps.head()

Unnamed: 0,ID,Pathway,Protein,Relation
0,R-HSA-5578775,Ion homeostasis,Q96D31,INVOLVED_IN
1,R-HSA-5578775,Ion homeostasis,Q14643,INVOLVED_IN
2,R-HSA-5578775,Ion homeostasis,Q92736,INVOLVED_IN
3,R-HSA-5578775,Ion homeostasis,Q13586,INVOLVED_IN
4,R-HSA-5578775,Ion homeostasis,P30626,INVOLVED_IN


In [16]:
'''save the datafile'''
# Written with the default index column; the Cypher import reads this file
# as 'file:///pw2protein-edge.csv'.
df_pw2ps.to_csv("pw2protein-edge.csv")

### Construct Knowledgegraph - From Neo4j Browser

0. Install Neo4j Desktop, create a local database, start the database, and open Neo4j Browser
1. Open database folder and load datafiles into import folder
2. Copy and paste these Cypher commands one by one directly into the Neo4j Browser query cell
3. Explore the knowledgegraph

- Create Protein Nodes

In [None]:
*** DO NOT RUN IN THE CELL HERE, RUN AT NEO4J BROWSER CELL***

// Require Protein.id to be unique across all Protein nodes.
CREATE CONSTRAINT UniqueProteinIdConstraint ON (p:Protein) ASSERT p.id IS UNIQUE;

// Read protein-nodes.csv (placed in the database import folder) and
// MERGE a Protein node for each value of its "Protein" column, so ids
// that already exist are not created again.
LOAD CSV WITH HEADERS FROM
  'file:///protein-nodes.csv' as row
MERGE (p:Protein {id:row.Protein})

- Create Pathway Nodes

In [None]:
*** DO NOT RUN IN THE CELL HERE, RUN AT NEO4J BROWSER CELL***

// Require Pathway.id to be unique across all Pathway nodes.
CREATE CONSTRAINT UniquePathwayIdConstraint ON (pw:Pathway) ASSERT pw.id IS UNIQUE;

// MERGE a Pathway node for each row of pathway-nodes.csv, keyed by the
// pathway identifier; the name is set only when the node is first created.
// Note: there must be no trailing comma after the last SET item, otherwise
// the statement does not parse.
LOAD CSV WITH HEADERS FROM
  'file:///pathway-nodes.csv' as row
MERGE (pw:Pathway {id:row.ID})
    ON CREATE SET
        pw.name = row.Pathway

- Create Edges between Protein and Pathways

In [None]:
*** DO NOT RUN IN THE CELL HERE, RUN AT NEO4J BROWSER CELL***

// For each row of pw2protein-edge.csv, look up the Pathway and Protein
// nodes and MERGE an INVOLVED_IN relationship from the pathway to the protein.
// Pathway nodes are keyed by the pathway identifier (column "ID"), not by
// the pathway name (column "Pathway"), so the lookup must use row.ID;
// matching on row.Pathway finds no node and creates no edges.
LOAD CSV WITH HEADERS FROM
  'file:///pw2protein-edge.csv' as row
MATCH (pw:Pathway{id:row.ID})
MATCH (p:Protein{id:row.Protein})
MERGE (pw)-[:INVOLVED_IN]->(p)

### Create Knowledgegraph - From Jupyter Notebook and  Neo4j Database API

In [26]:
#!pip install neo4j

In [17]:
import pandas as pd
import json
from neo4j import GraphDatabase

In [20]:
import os
from getpass import getpass

# Connection settings are taken from the environment so that no credentials
# are stored in the notebook. The URI and user fall back to the values this
# notebook used before; the password is never hard-coded and is prompted
# for interactively when NEO4J_PASSWORD is not set.
# NOTE(review): the password that was previously committed in this cell
# should be considered exposed and rotated on the server.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://52.40.20.135:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = GraphDatabase.driver(uri=NEO4J_URI,
                              auth=(NEO4J_USER, NEO4J_PASSWORD))

- Create Protein Nodes

In [31]:
'''UNCOMMENT AND RUN THIS CELL ONLY ONCE'''
# Creates the uniqueness constraint on Protein.id. Left commented out so a
# full re-run of the notebook does not try to create it a second time
# (presumably re-creating an existing constraint fails — confirm for the
# Neo4j version in use).
#query = "CREATE CONSTRAINT UniqueProteinIdConstraint ON (p:Protein) ASSERT p.id IS UNIQUE;"
#with driver.session() as session:
#    info = session.run(query)

'UNCOMMENT AND RUN THIS CELL ONLY ONCE'

In [24]:
def create_protein_nodes(tx, name):
    """Ensure a Protein node whose ``id`` equals ``name`` exists.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        name: protein identifier, bound to the ``$name`` query parameter.
    """
    # MERGE only creates the node when no Protein with this id is present.
    tx.run("MERGE (p:Protein{id:$name})", name=name)

In [25]:
# Push the protein nodes to the database, one write transaction per id.
with driver.session() as session:
    for protein_id in protein_nodes["Protein"]:
        session.write_transaction(create_protein_nodes, protein_id)

- Create Pathways Nodes

In [26]:
'''UNCOMMENT and RUN THIS CELL ONLY ONCE'''
# Creates the uniqueness constraint on Pathway.id. Left commented out so a
# full re-run of the notebook does not try to create it a second time
# (presumably re-creating an existing constraint fails — confirm for the
# Neo4j version in use).
#query = "CREATE CONSTRAINT UniquePathwayIdConstraint ON (pw:Pathway) ASSERT pw.id IS UNIQUE;"
#with driver.session() as session:
#    info = session.run(query)

'UNCOMMENT and RUN THIS CELL ONLY ONCE'

In [27]:
def create_pathway_nodes(tx, ID, name):
    """Ensure a Pathway node with id ``ID`` exists, naming it on creation.

    Uses MERGE keyed on the id instead of CREATE so that re-running the
    load neither duplicates pathways nor violates a uniqueness constraint
    on ``Pathway.id``. This mirrors the MERGE ... ON CREATE SET form of the
    browser import and the MERGE used for Protein nodes.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        ID: pathway identifier, bound to the ``$ID`` query parameter.
        name: pathway name, bound to ``$name``; only written when the node
            is first created.
    """
    query = (
        "MERGE (pw:Pathway {id:$ID}) "
        "ON CREATE SET pw.name = $name"
    )
    tx.run(query, ID=ID, name=name)

In [28]:
# Push the pathway nodes to the database, one write transaction per pathway.
with driver.session() as session:
    id_name_pairs = pathway_nodes[["ID", "Pathway"]].itertuples(index=False, name=None)
    for pathway_id, pathway_name in id_name_pairs:
        session.write_transaction(create_pathway_nodes, pathway_id, pathway_name)

- Create Pathways to Protein Edges

In [29]:
def create_pw2p_edge(tx, pw_ID, p_ID):
    """Link an existing Pathway node to an existing Protein node.

    Both nodes are looked up by ``id``; when either is missing the MATCH
    yields no rows and nothing is created. The INVOLVED_IN relationship is
    merged, pointing from the pathway to the protein.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        pw_ID: pathway identifier, bound to ``$pw_ID``.
        p_ID: protein identifier, bound to ``$p_ID``.
    """
    cypher = '''
    MATCH (pw:Pathway{id:$pw_ID})
    MATCH (p:Protein{id:$p_ID})
    MERGE (pw)-[:INVOLVED_IN]->(p)
    '''
    tx.run(cypher, pw_ID=pw_ID, p_ID=p_ID)

In [30]:
# Push the pathway -> protein edges, one write transaction per edge.
with driver.session() as session:
    edge_pairs = df_pw2ps[["ID", "Protein"]].itertuples(index=False, name=None)
    for pathway_id, protein_id in edge_pairs:
        session.write_transaction(create_pw2p_edge, pathway_id, protein_id)

### Restricted Zone

- Confirm APOC is available

In [None]:
// List the registered procedures whose name starts with "apoc";
// an empty result means no such procedures are available.
CALL dbms.procedures()
YIELD name WHERE name STARTS WITH "apoc"
RETURN name

- Delete All content

In [None]:
// Delete all constraints and indexes
// (called with empty index and constraint maps and the third argument set
// to true — presumably "drop everything not listed"; confirm in the APOC docs)
CALL apoc.schema.assert({},{},true);
// Delete all nodes and relationships
// The first query selects every node, the second detaches and deletes it;
// work is done in batches of 500 nodes.
CALL apoc.periodic.iterate(
  'MATCH (n) RETURN n',
  'DETACH DELETE n',
  { batchSize:500 }
)

In [None]:
// Remove every node together with its relationships in a single statement
// (no batching and no APOC required).
MATCH (n)
DETACH DELETE n