### Notebook for loading data to Neo4j to construct Knowledge Graph

In [1]:
import json
import pandas as pd
import os
import _pickle
from neo4j import GraphDatabase

In [18]:
driver = GraphDatabase.driver(uri = "bolt://localhost:7687",\
                                          auth = ("neo4j", "pinglab"))

In [19]:
class LoadKG():    
    def __init__(self,import_file_path,reactome_file_path,driver):
        self.session = driver.session()
        self.import_extension = os.path.basename(os.path.normpath(import_file_path))
        self.import_data = import_file_path
        self.pathway_data = reactome_file_path
    
    def parse_reactome_file(self):
        ppw_df = pd.read_csv(self.pathway_data)
        for i,plist in enumerate(ppw_df['Submitted entities found']):
            ppw_df['Submitted entities found'][i] = plist.split(';')
        return ppw_df
    
    def create_constraints(self):
        query = ["CREATE CONSTRAINT UniqueProteinIdConstraint ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",\
                 "CREATE CONSTRAINT UniqueDrugIdConstraint ON (d:Drug) ASSERT d.drugbank_id IS UNIQUE",\
                 "CREATE CONSTRAINT UniqueDrugPwConstraint ON (p:Pathway) ASSERT p.smpdb_id IS UNIQUE",\
                 "CREATE CONSTRAINT UniqueProteinPwConstraint ON (p:Pathway) ASSERT p.reactome_id IS UNIQUE",\
                "CREATE CONSTRAINT UniqueMeSHConstraint ON (m:MeSH) ASSERT m.name IS UNIQUE",\
                "CREATE CONSTRAINT UniqueDocConstraint ON (d:Document) ASSERT d.pmid IS UNIQUE"]
        for constraint in query:
            self.session.run(constraint)
    
    def create_protein_node(self,entity):
        import_data = self.import_data
        entity=entity
        def tx_function(tx,import_data,entity):
            if entity=='UniProt':
                query = "WITH '" + import_data  + "' as url \
                CALL apoc.load.json(url) YIELD value \
                UNWIND value." + entity + " as p \
                MERGE (pn:Protein {uniprot_id:p})"
            else:
                query = "WITH '" + import_data  + "' as url \
                CALL apoc.load.json(url) YIELD value \
                UNWIND value." + entity + " as p \
                MERGE (pn:Protein {uniprot_id:p.uniprot_id})"
            tx.run(query,import_data=import_data,entity=entity)
        self.session.write_transaction(tx_function,import_data,entity)
                    
    def update_protein_node(self,entity=None):
        import_data = self.import_data
        entity = entity
        def tx_function(tx,import_data,entity):
            if self.import_extension.lower().endswith('.json'):
                query = "WITH '" + import_data  + "' as url \
                CALL apoc.load.json(url) YIELD value \
                UNWIND value." + entity + " as p \
                MATCH (pn:Protein {uniprot_id:p.uniprot_id}) \
                SET pn.name=p.name,\
                pn.drugbank_id=p.drugbank_id,\
                pn.group_name=p.group_name"
                
            if self.import_extension.lower().endswith('.csv'):
                query = "LOAD CSV WITH HEADERS FROM '" + import_data  + "' AS p \
                MATCH (pn:Protein {uniprot_id:p.ID}) \
                SET pn.CM=p.CM,\
                pn.ARR=p.ARR,\
                pn.CHD=p.CHD,\
                pn.VD=p.VD,\
                pn.IHD=p.IHD,\
                pn.CCS=p.CCS,\
                pn.VOO=p.VOO,\
                pn.OHD=p.OHD"
            tx.run(query,import_data=import_data,entity=entity)
        self.session.write_transaction(tx_function,import_data,entity)
    
    def create_drug_node(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            MERGE (d:Drug {drugbank_id:value.drugbank_id}) \
            ON CREATE SET d.name=value.name, \
            d.type='CVD', \
            d.synonyms=value.synonyms, \
            d.description=value.descriptions, \
            d.categories=value.categories, \
            d.atc_code=value.`ATC code`, \
            d.indication=value.indication"
            
            tx.run(query,import_data=import_data)
        
        self.session.write_transaction(tx_function,import_data)
    
    def create_drugpw_node(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            UNWIND value.pathways as pway \
            MERGE (pw:Pathway {smpdb_id:pway.smpdb_id})\
            ON CREATE SET pw.name=pway.name, \
            pw.category=pway.category"
            
            tx.run(query,import_data=import_data)
        
        self.session.write_transaction(tx_function,import_data)
    
    def create_proteinpw_node(self):
        ppw_df = self.parse_reactome_file()
        
        def tx_function(tx,pw_id,pw_name):
            query = "MERGE (pw:Pathway{reactome_id:$pw_id}) \
            ON CREATE SET pw.name=$pw_name"
            
            tx.run(query,pw_id=pw_id,pw_name=pw_name)
            
        for pw_id,pw_name in zip(ppw_df["Pathway identifier"],ppw_df["Pathway name"]):
            self.session.write_transaction(tx_function,pw_id,pw_name)
            
    def create_mesh_node(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            UNWIND value.MeSH as m \
            MERGE (:MeSH {name:m})"
            tx.run(query,import_data=import_data)
        self.session.write_transaction(tx_function,import_data)
        
    def create_doc_node(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            MERGE (:Document {pmid:value.PMID})"
            tx.run(query,import_data=import_data)
        self.session.write_transaction(tx_function,import_data)
    
    def create_protein2doc_edge(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            UNWIND value.PMIDs as pmid \
            WITH value, pmid \
            MATCH (p:Protein {uniprot_id:value.UniProt}) \
            MATCH (d:Document {pmid:pmid}) \
            MERGE (p)-[:STUDIED_IN]->(d)"
            tx.run(query,import_data=import_data)
        self.session.write_transaction(tx_function,import_data)
    
    def create_doc2mesh_edge(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            UNWIND value.MeSH as mesh \
            WITH value, mesh \
            MATCH (d:Document {pmid:value.PMID}) \
            MATCH (m:MeSH {name:mesh}) \
            MERGE (d)-[:STUDIES]->(m)"
            tx.run(query,import_data=import_data)
        self.session.write_transaction(tx_function,import_data)
        
    def create_drug2protein_edge(self, entity):
        entity = entity
        import_data = self.import_data
        def tx_function(tx,import_data,entity):
            if entity == 'targets':
                query = "WITH '" + import_data  + "' as url \
                CALL apoc.load.json(url) YIELD value \
                UNWIND value." + entity + " as ent \
                WITH ent, value \
                MATCH (p:Protein {uniprot_id:ent.uniprot_id}) \
                MATCH (d:Drug {drugbank_id:value.drugbank_id}) \
                MERGE (d)-[t:TARGETS]->(p) \
                SET t.actions=ent.actions, \
                t.group_actions=ent.actions_of_group"
            elif entity in ['carriers','transporters','enzymes']:
                ent = entity[:len(entity)-1].upper()
                query = "WITH '" + import_data  + "' as url \
                CALL apoc.load.json(url) YIELD value \
                UNWIND value." + entity + " as ent \
                WITH ent, value \
                MATCH (p:Protein {uniprot_id:ent.uniprot_id}) \
                MATCH (d:Drug {drugbank_id:value.drugbank_id}) \
                MERGE (d)-[e:RELATED_" + ent +"]->(p) \
                SET e.actions=ent.actions, \
                e.group_actions=ent.actions_of_group"
                
            else:
                raise Exception('entity must be one of the following:\n' +
                                'targets\n' + 
                                'carriers\n' +
                                'transporters\n' +
                               'enzymes')

            tx.run(query,import_data=import_data,entity=entity)
        
        self.session.write_transaction(tx_function,import_data,entity)
    
    def create_drug2pw_edge(self):
        import_data = self.import_data
        def tx_function(tx,import_data):
            query = "WITH '" + import_data  + "' as url \
            CALL apoc.load.json(url) YIELD value \
            UNWIND value.pathways as pway \
            WITH pway, value \
            MATCH (pw:Pathway {smpdb_id:pway.smpdb_id}) \
            MATCH (d:Drug {drugbank_id:value.drugbank_id}) \
            MERGE (d)-[:INVOLVED_IN]->(pw)"
            
            tx.run(query,import_data=import_data)
        
        self.session.write_transaction(tx_function,import_data)

    def create_pw2protein_edge(self):
        ppw_df = self.parse_reactome_file()
        
        def tx_function(tx,p,pw_id,pw_name):
            query = "MATCH (pw:Pathway{reactome_id:$pw_id}) \
            MATCH (p:Protein{uniprot_id:$p}) \
            MERGE (pw)-[:CANDIDATE]->(p)"
            tx.run(query,p=p,pw_id=pw_id,pw_name=pw_name)
        
        for pw_id,pw_name,plist in zip(ppw_df["Pathway identifier"],ppw_df["Pathway name"],ppw_df["Submitted entities found"]):
            for p in plist:
                self.session.write_transaction(tx_function,p,pw_id,pw_name)

In [20]:
oKG = LoadKG(import_file_path='file://cvdrug_ent_drugpw.json',\
             reactome_file_path='C:\\Users\\ttran\\OneDrive\\Desktop\\COVID-CDV-DATA\\covidii_KG\\reactome\\result.csv',\
             driver=driver)

In [None]:
oKG.create_constraints()

In [21]:
oKG.create_protein_node('targets')
oKG.update_protein_node('targets')
oKG.create_drug_node()
oKG.create_drug2protein_edge('targets')
oKG.create_protein_node('enzymes')
oKG.update_protein_node('enzymes')
oKG.create_drug2protein_edge('enzymes')
oKG.create_protein_node('carriers')
oKG.update_protein_node('carriers')
oKG.create_drug2protein_edge('carriers')
oKG.create_protein_node('transporters')
oKG.update_protein_node('transporters')
oKG.create_drug2protein_edge('transporters')
oKG.create_drugpw_node()
oKG.create_drug2pw_edge()
oKG.create_proteinpw_node()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppw_df['Submitted entities found'][i] = plist.split(';')


In [22]:
oKG = LoadKG(import_file_path='file://pmid_to_mesh.json',\
             reactome_file_path='C:\\Users\\ttran\\OneDrive\\Desktop\\COVID-CDV-DATA\\covidii_KG\\reactome\\result.csv',\
             driver=driver)
oKG.create_mesh_node()
oKG.create_doc_node()
oKG.create_doc2mesh_edge()
oKG = LoadKG(import_file_path='file://uniprot_mesh_pub.json',\
             reactome_file_path='C:\\Users\\ttran\\OneDrive\\Desktop\\COVID-CDV-DATA\\covidii_KG\\reactome\\result.csv',\
             driver=driver)
oKG.create_protein_node('UniProt')
oKG.create_protein2doc_edge()
oKG = LoadKG(import_file_path='file:///protein-nodes.csv',\
             reactome_file_path='C:\\Users\\ttran\\OneDrive\\Desktop\\COVID-CDV-DATA\\covidii_KG\\reactome\\result.csv',\
             driver=driver)
oKG.update_protein_node()
oKG.create_pw2protein_edge()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppw_df['Submitted entities found'][i] = plist.split(';')


In [23]:
driver.close()