## Uniprot KG Builder

In [1]:
import pandas as pd
import numpy as np
import os
import time
import neo4j as neo4j
from neo4j import GraphDatabase

### Connect to the graph database

In [2]:

# Connection settings are taken from the environment when present
# (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD) so credentials do not have to be
# committed with the notebook. The fallbacks are the local development values
# this cell used previously, so an unconfigured run behaves as before.
driver=GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI","bolt://127.0.0.1:7687"),
    auth=(os.environ.get("NEO4J_USER","neo4j"),os.environ.get("NEO4J_PASSWORD","12345678")),
)
# NOTE(review): this module-level session is never closed in the visible cells,
# and the loader functions below open their own sessions; kept in case a later
# cell relies on the name.
session=driver.session()

In [28]:
# Echo the driver object so its repr is shown as the cell output.
driver

<neo4j._sync.driver.BoltDriver at 0x7fa13076e940>

### Create constraints

In [3]:
def create_constraints(driver):
    '''
    Create the uniqueness constraints for every node label in the graph.

    Each statement carries IF NOT EXISTS, so calling this function again on a
    database that already has the constraints is a no-op instead of an error.

    args:
        driver: neo4j driver
    returns:
        None
    '''
    # (constraint name, node label, property that must be unique)
    unique_keys = [
        ("UniqueOrganismNameConstraint", "Organism", "name"),
        ("UniqueProteinIdConstraint", "Protein", "UID"),
        ("UniqueModifiedResidueIdConstraint", "Modified_residue", "MRID"),
        ("UniquePTMTypeIdConstraint", "PTM_Type", "PTMID"),
        ("UniqueReferenceIdConstraint", "Reference", "PMID"),
    ]
    # The variable names used in the original statements, keyed by label.
    aliases = {
        "Organism": "O",
        "Protein": "P",
        "Modified_residue": "MR",
        "PTM_Type": "PTM",
        "Reference": "R",
    }
    query = [
        "CREATE CONSTRAINT {name} IF NOT EXISTS FOR ({var}:{label}) "
        "REQUIRE {var}.{prop} IS UNIQUE".format(
            name=name, var=aliases[label], label=label, prop=prop
        )
        for name, label, prop in unique_keys
    ]

    # One auto-commit statement per constraint, all on a single session.
    with driver.session() as session:
        for constraint in query:
            session.run(constraint)

In [4]:
# The call below is left commented out on purpose: per the note in the string,
# it is meant to be executed a single time, not on every run of the notebook.
'''UNCOMMENT AND RUN THIS CELL ONLY ONCE'''
#create_constraints(driver)

'UNCOMMENT AND RUN THIS CELL ONLY ONCE'

### Populate Node

#### 1. Protein Node

- Columns in PTM flat file:
``` 
       Index(['accession', 'gene', 'name', 'organism', 'sequence', 'uniprotId',
       'position', 'description', 'evidence', 'Identifier', 'ptm_accession',
       'mod_res_id'],
      dtype='object')
```

In [4]:
# Create protein node
def create_protein_node(data):
    '''
    Create Protein nodes in the graph from a JSON file loaded through APOC.

    Every record yielded by apoc.load.json is merged on its uniprotId; the
    sequence, name and gene properties are written only when the node is
    newly created (ON CREATE SET), so existing nodes are left untouched.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MERGE (P:Protein {UID: value.uniprotId}) "
        "ON CREATE SET P.sequence = value.sequence, "
        "P.name = value.name, P.gene = value.gene"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

     

In [5]:
# Load Protein nodes from the flat PTM file and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_protein_node(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  5.597813129425049


### Create Modified Residue Node

In [8]:
# Create modified residue node
def create_modified_residue_node(data):
    '''
    Create Modified_residue nodes in the graph from a JSON file loaded
    through APOC.

    Every record is merged on its mod_res_id; the remaining properties are
    written only when the node is newly created (ON CREATE SET).

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None

    Record fields read by the query: mod_res_id, position, uniprotId,
    description, ptm_accession, Identifier, evidence.
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MERGE (MR:Modified_residue {MRID: value.mod_res_id}) "
        "ON CREATE SET MR.position = value.position, "
        "MR.parent_protein = value.uniprotId, "
        "MR.description = value.description, "
        "MR.ptm_accession = value.ptm_accession, "
        "MR.identifier = value.Identifier, "
        "MR.evidence = value.evidence"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

     

In [9]:
# Load Modified_residue nodes from the flat PTM file and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_modified_residue_node(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  4.602682828903198


#### 2. Create PTM Node

- Columns in PTM vocab file:
``` 
       Index(['Identifier', 'Accession', 'Feature_key', 'Target',
       'Position_amino_acid', 'Position_polypeptide', 'Correction_formula',
       'Monoisotopic_mass_difference', 'Average_mass_difference',
       'Cellular_location', 'Taxonomic_range', 'Reference', 'Keyword'],
      dtype='object')
```

In [6]:
# create ptm node
def create_ptmtype_node(data):
    '''
    Create PTM_Type nodes in the graph from a JSON file loaded through APOC.

    Every record is merged on its Accession; the remaining properties are
    written only when the node is newly created (ON CREATE SET).

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    # feature_key reads value.Feature_key: the vocabulary column is spelled
    # with a lower-case "k", and map keys are case-sensitive, so the previous
    # value.Feature_Key lookup never found a value.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MERGE (PTM:PTM_Type {PTMID: value.Accession}) "
        "ON CREATE SET PTM.identifier = value.Identifier, "
        "PTM.feature_key = value.Feature_key, "
        "PTM.target = value.Target, "
        "PTM.position_amino_acid = value.Position_amino_acid, "
        "PTM.position_polypeptide = value.Position_polypeptide, "
        "PTM.correction_formula = value.Correction_formula, "
        "PTM.monoisotopic_mass_difference = value.Monoisotopic_mass_difference, "
        "PTM.average_mass_difference = value.Average_mass_difference, "
        "PTM.keyword = value.Keyword, "
        "PTM.cellular_location = value.Cellular_location, "
        "PTM.taxonomic_range = value.Taxonomic_range, "
        "PTM.reference = value.Reference"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)
    

In [7]:
# Load PTM_Type nodes from the PTM vocabulary file and report the elapsed seconds.
data = "ptm-vocab.json"
t1 = time.time()
create_ptmtype_node(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  0.3713102340698242


#### 3. Organism Node

In [4]:
# Create organism node
def create_organism_node(data):
    '''
    Create Organism nodes in the graph from a JSON file loaded through APOC.

    Every record is merged on its organism value, so each distinct organism
    name ends up as a single node.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MERGE (O:Organism {name: value.organism})"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)


In [6]:
# Load Organism nodes from the flat PTM file and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_organism_node(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  11.228230953216553


### Create reference node *****

```
Index(['gene', 'name', 'organism', 'uniprotId', 'position', 'description',
       'evidence', 'pubmedId', 'journal', 'title', 'date', 'Identifier',
       'ptm_accession'],
      dtype='object')

```

In [11]:

# Create reference node
def create_reference_node(data):
    '''
    Create Reference nodes in the graph from a JSON file loaded through APOC.

    Every record is merged on its pubmedId; title, journal and date are
    written only when the node is newly created (ON CREATE SET).

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MERGE (R:Reference {PMID: value.pubmedId}) "
        "ON CREATE SET R.title = value.title, "
        "R.journal = value.journal, "
        "R.date = value.date"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)



In [12]:
# Load Reference nodes from the modified-residue reference file and report the elapsed seconds.
data = "df_modres_ref.json"
t1 = time.time()
create_reference_node(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  6.920697927474976


## Edge creation

### Create relationship between Protein and Modified Residue

In [9]:
def create_edge_protein2modified_residue(data):
    '''
    Create HAS_MODIFIED_RESIDUE relationships from Protein to
    Modified_residue nodes.

    For each record in the JSON file, the Protein (by uniprotId) and the
    Modified_residue (by mod_res_id) are matched and the relationship is
    merged between them. Records for which either node is not matched
    produce no relationship.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MATCH (P:Protein {UID: value.uniprotId}) "
        "MATCH (MR:Modified_residue {MRID: value.mod_res_id}) "
        "MERGE (P)-[:HAS_MODIFIED_RESIDUE]->(MR)"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

In [10]:
# Link Proteins to their Modified_residue nodes and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_edge_protein2modified_residue(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  12.504657745361328


### Create relationship between Modified Residue and PTM Type

In [11]:
def create_edge_modified_residue2ptm_type(data):
    '''
    Create HAS_PTM_TYPE relationships from Modified_residue to PTM_Type nodes.

    For each record in the JSON file, the Modified_residue (by mod_res_id)
    and the PTM_Type (by ptm_accession) are matched and the relationship is
    merged between them. Records for which either node is not matched
    produce no relationship.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MATCH (MR:Modified_residue {MRID: value.mod_res_id}) "
        "MATCH (PTM:PTM_Type {PTMID: value.ptm_accession}) "
        "MERGE (MR)-[:HAS_PTM_TYPE]->(PTM)"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

In [12]:
# Link Modified_residue nodes to their PTM_Type and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_edge_modified_residue2ptm_type(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  14.789127826690674


### Create relationship between Protein and Organism

In [17]:
def create_edge_protein2organism(data):
    '''
    Create BELONGS_TO relationships from Protein to Organism nodes.

    For each record in the JSON file, the Protein (by uniprotId) and the
    Organism (by organism name) are matched and the relationship is merged
    between them. Records for which either node is not matched produce no
    relationship.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MATCH (P:Protein {UID: value.uniprotId}) "
        "MATCH (O:Organism {name: value.organism}) "
        "MERGE (P)-[:BELONGS_TO]->(O)"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

In [18]:
# Link Proteins to their Organism and report the elapsed seconds.
data = "ptm_flat_dropna.json"
t1 = time.time()
create_edge_protein2organism(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  19.017669916152954


### Create relationship between Modified Residue and Reference

In [15]:
def create_edge_modres2ref(data):
    '''
    Create HAS_REFERENCE relationships from Modified_residue to Reference
    nodes.

    For each record in the JSON file, the Modified_residue (by mod_res_id)
    and the Reference (by pubmedId) are matched and the relationship is
    merged between them. Records for which either node is not matched
    produce no relationship.

    args:
        data: path or URL (string) of the JSON file passed to apoc.load.json
    returns:
        None
    '''
    # The file location is bound as the $data parameter instead of being
    # spliced into the Cypher text, so a quote in the path can neither break
    # nor alter the statement.
    query = (
        "WITH $data AS url "
        "CALL apoc.load.json(url) YIELD value "
        "MATCH (MR:Modified_residue {MRID: value.mod_res_id}) "
        "MATCH (R:Reference {PMID: value.pubmedId}) "
        "MERGE (MR)-[:HAS_REFERENCE]->(R)"
    )

    def tx_function(tx, data):
        tx.run(query, data=data)

    # Relies on the notebook-level `driver`; the session closes with the block.
    with driver.session() as session:
        session.execute_write(tx_function, data)

In [16]:
# Link Modified_residue nodes to their References and report the elapsed seconds.
data = "df_modres_ref.json"
t1 = time.time()
create_edge_modres2ref(data)
t2 = time.time()
print("success! total time: ", t2 - t1)

success! total time:  10.495097875595093


### Rough