## Protein Pathways Network (CVD)

In [22]:
import json as json
import pandas as pd

### Load Data

Put the data file into the `data` folder and load it into the Python environment.

In [23]:
# Read the analysis result table and show its first row as a quick sanity check.
reactome_csv_path = "data/reactome-result-os.csv"
df_cvd = pd.read_csv(reactome_csv_path)
df_cvd.head(n=1)

Unnamed: 0,Pathway identifier,Pathway name,#Entities found,#Entities total,#Interactors found,#Interactors total,Entities ratio,Entities pValue,Entities FDR,#Reactions found,#Reactions total,Reactions ratio,Species identifier,Species name,Submitted entities found,Mapped entities,Submitted entities hit interactor,Interacts with,Found reaction identifiers
0,R-HSA-5578775,Ion homeostasis,20,64,3,98,0.004401,1.110223e-16,4.718448e-14,13,16,0.001184,9606,Homo sapiens,Q96D31;Q14643;Q92736;Q13586;P30626;P32418;P0DP...,Q96D31;Q14643;Q92736;Q13586;P30626;P32418;P0DP...,P30626;P49810;P05067,P17612;P30626,R-HSA-427910;R-HSA-418309;R-HSA-2855020;R-HSA-...


In [24]:
# (number of rows, number of columns) of the loaded table
df_cvd.shape

(1188, 19)

- What are the columns in the data?

In [25]:
# column labels of the loaded table
df_cvd.columns

Index(['Pathway identifier', 'Pathway name', '#Entities found',
       '#Entities total', '#Interactors found', '#Interactors total',
       'Entities ratio', 'Entities pValue', 'Entities FDR', '#Reactions found',
       '#Reactions total', 'Reactions ratio', 'Species identifier',
       'Species name', 'Submitted entities found', 'Mapped entities',
       'Submitted entities hit interactor', 'Interacts with',
       'Found reaction identifiers'],
      dtype='object')

- Let's set the pathway name as the index of the data

In [29]:
# Guarded so the cell can be re-run safely: once "Pathway name" has become the
# index it is no longer a column, and a second set_index call raises KeyError.
if df_cvd.index.name != "Pathway name":
    df_cvd = df_cvd.set_index("Pathway name")

- What are the top 30 pathways involved?

In [30]:
# first 30 index labels (pathway names), in the order they appear in the table
df_cvd.index[:30]

Index(['Ion homeostasis', 'Cardiac conduction', 'Muscle contraction',
       'Ion channel transport', 'Platelet calcium homeostasis',
       'Stimuli-sensing channels', 'Phase 0 - rapid depolarisation',
       'Platelet homeostasis', 'Elevation of cytosolic Ca2+ levels',
       'Reduction of cytosolic Ca++ levels',
       'Negative regulation of NMDA receptor-mediated neuronal transmission',
       'Ion transport by P-type ATPases',
       'Unblocking of NMDA receptors, glutamate binding and activation',
       'Presynaptic depolarization and calcium channel opening',
       'Phase 2 - plateau phase', 'Transport of small molecules',
       'CLEC7A (Dectin-1) induces NFAT activation', 'NCAM1 interactions',
       'Loss of phosphorylation of MECP2 at T308', 'Sodium/Calcium exchangers',
       'Regulation of insulin secretion', 'TRP channels',
       'Striated Muscle Contraction',
       'Response to elevated platelet cytosolic Ca2+',
       'NOTCH4 Activation and Transmission of Signal t

#### Protein Node Data

In [55]:
# Collect every protein listed for any pathway, then reduce to the unique set.

allproteins = []

# A pathway may have no submitted entities; such cells are missing values rather
# than strings, so they are dropped explicitly instead of being hidden behind a
# bare `except:` (which would also swallow unrelated errors).
for item in df_cvd['Submitted entities found'].dropna():
    # each cell is a ';'-separated list of protein identifiers
    allproteins += item.split(";")

# Sorted so the node table has the same row order on every run; iterating a
# set of strings directly gives an order that can change between kernels.
unique_proteins = sorted(set(allproteins))

In [59]:
'''Create a dataframe'''
# single-column node table: one row per unique protein identifier
protein_nodes = pd.DataFrame({"Protein": unique_proteins})

In [61]:
'''save the datafile'''
# Written with a hyphen so the name matches the 'file:///protein-nodes.csv'
# that the LOAD CSV statements in this notebook read (and the naming used for
# the other kgdata files).
protein_nodes.to_csv("kgdata/protein-nodes.csv")

#### Pathway Node Data

In [62]:
'''collect all pathways'''
# One record per table row: the pathway identifier column paired with the
# pathway name taken from the index.
pathway_ids = df_cvd['Pathway identifier']
Pathways = [
    {"ID": identifier, "Pathway": name}
    for identifier, name in zip(pathway_ids, df_cvd.index)
]

In [63]:
'''create a dataframe'''
# one row per pathway record, with columns "ID" and "Pathway"
pathway_nodes = pd.DataFrame(Pathways)

In [69]:
'''save the datafile'''
# the file name matches the 'file:///pathway-nodes.csv' read by the LOAD CSV statements below
pathway_nodes.to_csv("kgdata/pathway-nodes.csv")

#### Pathways to Protein Edge data

In [44]:
# Build the pathway -> protein edges in two shapes:
#   pw2proteins_dict: pathway name -> list of its protein identifiers
#   pw2proteins_list: one record per (pathway, protein) pair, ready for a DataFrame
pw2proteins_dict = {}
pw2proteins_list = []

for pathway, ps in zip(df_cvd.index, df_cvd['Submitted entities found']):

    # A pathway may have no submitted entities; such cells are not strings.
    # Skip them explicitly rather than with a bare `except:`, which would also
    # hide unrelated errors.
    if not isinstance(ps, str):
        continue

    # each cell is a ';'-separated list of protein identifiers
    proteins = ps.split(";")
    pw2proteins_dict[pathway] = proteins

    pw2proteins_list.extend(
        {"Pathway": pathway, "Protein": p, "Relation": "INVOLVED_IN"}
        for p in proteins
    )

In [45]:
'''create a dataframe'''
# one row per (pathway, protein) pair, with columns "Pathway", "Protein" and "Relation"
df_pw2ps = pd.DataFrame(pw2proteins_list)

In [46]:
# preview the first rows of the edge table
df_pw2ps.head()

Unnamed: 0,Pathway,Protein,Relation
0,Ion homeostasis,Q96D31,INVOLVED_IN
1,Ion homeostasis,Q14643,INVOLVED_IN
2,Ion homeostasis,Q92736,INVOLVED_IN
3,Ion homeostasis,Q13586,INVOLVED_IN
4,Ion homeostasis,P30626,INVOLVED_IN


In [70]:
'''save the datafile'''
# the file name matches the 'file:///pw2protein-edge.csv' read by the LOAD CSV statements below
df_pw2ps.to_csv("kgdata/pw2protein-edge.csv")

### Construct Knowledgegraph

0. Install Neo4j Desktop, create a local database, start the database, and open the Neo4j Browser
1. Open the database folder and copy the CSV files from `kgdata/` into its `import` folder
2. Copy and paste these Cypher commands one by one directly into the query editor of the Neo4j Browser
3. Explore the knowledgegraph

- Create Protein Nodes

In [None]:
// Enforce at most one Protein node per id.
// NOTE(review): ON ... ASSERT is the older constraint syntax; newer Neo4j
// releases use FOR ... REQUIRE — confirm against the installed version.
CREATE CONSTRAINT UniqueProteinIdConstraint ON (p:Protein) ASSERT p.id IS UNIQUE;

// Create one Protein node per CSV row, keyed by the "Protein" column.
// The file is resolved relative to the database's import folder.
LOAD CSV WITH HEADERS FROM
  'file:///protein-nodes.csv' as row
MERGE (p:Protein {id:row.Protein})

- Create Pathway Nodes

In [None]:
// Enforce at most one Pathway node per id.
CREATE CONSTRAINT UniquePathwayIdConstraint ON (pw:Pathway) ASSERT pw.id IS UNIQUE;

// Create one Pathway node per CSV row: id comes from the "ID" column and the
// name from the "Pathway" column. The SET list must not end with a comma —
// a trailing comma is a Cypher syntax error.
LOAD CSV WITH HEADERS FROM
  'file:///pathway-nodes.csv' as row
MERGE (pw:Pathway {id:row.ID})
    ON CREATE SET
        pw.name = row.Pathway

- Create Edges between Protein and Pathways

In [None]:
// The "Pathway" column of the edge file holds the pathway *name*, whereas
// Pathway nodes use the pathway identifier as their id. Pathways are therefore
// looked up by name; matching on id would find nothing and create no edges.
LOAD CSV WITH HEADERS FROM
  'file:///pw2protein-edge.csv' as row
MATCH (pw:Pathway {name: row.Pathway})
MATCH (p:Protein {id: row.Protein})
MERGE (pw)-[:INVOLVED_IN]->(p)

### Implementation of API

In [None]:
import pandas as pd
import json
from neo4j import GraphDatabase

In [None]:
import os
from getpass import getpass

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. URI and user fall back to the local defaults; the
# password is prompted for when NEO4J_PASSWORD is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = GraphDatabase.driver(uri = NEO4J_URI, auth = (NEO4J_USER, NEO4J_PASSWORD))

In [None]:
# session.run() executes a single Cypher statement, so the constraint and the
# CSV load are sent as two separate calls rather than one ';'-joined string.
constraint_query = '''CREATE CONSTRAINT UniqueProteinIdConstraint ON (p:Protein) ASSERT p.id IS UNIQUE'''

query = '''LOAD CSV WITH HEADERS FROM
'file:///protein-nodes.csv' as row
MERGE (p:Protein {id:row.Protein})'''

with driver.session() as session:
    session.run(constraint_query).consume()
    info = session.run(query)
    # consume inside the session so any server-side error surfaces in this cell
    info.consume()

In [None]:
# session.run() executes a single Cypher statement, so the constraint and the
# CSV load are sent as two separate calls rather than one ';'-joined string.
constraint_query = '''CREATE CONSTRAINT UniquePathwayIdConstraint ON (pw:Pathway) ASSERT pw.id IS UNIQUE'''

query = '''LOAD CSV WITH HEADERS FROM
  'file:///pathway-nodes.csv' as row
MERGE (pw:Pathway {id:row.ID})
    ON CREATE SET
        pw.name = row.Pathway'''

with driver.session() as session:
    session.run(constraint_query).consume()
    info = session.run(query)
    # consume inside the session so any server-side error surfaces in this cell
    info.consume()

In [None]:
# The "Pathway" column of the edge file holds the pathway *name*, whereas
# Pathway nodes use the pathway identifier as their id. Pathways are therefore
# looked up by name; matching on id would find nothing and create no edges.
query = '''LOAD CSV WITH HEADERS FROM
  'file:///pw2protein-edge.csv' as row
MATCH (pw:Pathway {name: row.Pathway})
MATCH (p:Protein {id: row.Protein})
MERGE (pw)-[:INVOLVED_IN]->(p)'''

with driver.session() as session:
    info = session.run(query)
    # consume inside the session so any server-side error surfaces in this cell
    info.consume()