In [2]:
import sys
import logging

from neo4j.exceptions import ClientError as Neo4jClientError

from util.base_importer import BaseImporter

In [3]:
# Root logger setup for the notebook: INFO and above, each line prefixed with
# a wall-clock time (hours:minutes:seconds) and the level name.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s - %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


In [None]:

class HPOImporter(BaseImporter):
    """Load the Human Phenotype Ontology (HPO) and its disease annotations into Neo4j.

    Each public method performs one import step by sending Cypher to the
    database named in ``self._database`` through ``self._driver``. The steps
    rely on the Neosemantics (n10s) and APOC procedures referenced in the
    queries, and are meant to be called in the order they are defined.
    """

    def __init__(self, argv=None):
        """Create the importer.

        Args:
            argv: Command-line style arguments forwarded to ``BaseImporter``.
                Defaults to an empty list, which suits notebook usage.
        """
        # Build a fresh list per call instead of sharing one mutable default
        # object between every instantiation.
        argv = [] if argv is None else argv
        # A fixed command name is used because __file__ doesn't exist in notebooks.
        super().__init__(command="HPOImporter_Notebook", argv=argv)
        # Neo4j Community Edition only enables the single default database, so
        # no "CREATE DATABASE ... IF NOT EXISTS" (Listing 3.13) is issued here.
        self._database = "neo4j"

    def _run_cypher(self, query):
        """Run one Cypher statement in its own session and wait for it to finish."""
        with self._driver.session(database=self._database) as session:
            # consume() drains the result so that a failure reported while the
            # statement executes is raised here instead of being deferred to
            # session close.
            session.run(query).consume()

    def set_constraints(self):
        """Create the uniqueness constraints and lookup indexes used by the import.

        Raises:
            Neo4jClientError: for any client error other than "an equivalent
                schema rule already exists", which is ignored.
        """
        # Listing 14
        queries = ["CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS FOR (r:Resource) REQUIRE r.uri IS UNIQUE;",
                   "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Resource) REQUIRE (n.id) IS UNIQUE;",
                   "CREATE INDEX disease_id IF NOT EXISTS FOR (n:HpoDisease) ON (n.id);",
                   "CREATE INDEX phenotype_id IF NOT EXISTS FOR (n:HpoPhenotype) ON (n.id);"]
        with self._driver.session(database=self._database) as session:
            for q in queries:
                try:
                    session.run(q).consume()
                except Neo4jClientError as e:
                    # ignore if we already have the rule in place
                    if e.code != "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists":
                        raise

    def check_neo_semantics(self):
        """Verify that the ``n10s.graphconfig.init`` procedure is available.

        Raises:
            RuntimeError: if the procedure is not listed by ``SHOW PROCEDURES``.
        """
        query = 'SHOW PROCEDURES YIELD name WHERE name ="n10s.graphconfig.init"'
        with self._driver.session(database=self._database) as session:
            found = session.run(query).data()
        if not found:
            raise RuntimeError(
                "Can not find `n10s.graphconfig.init`.Please make sure that Neosemantics is installed.\n"
                "https://neo4j.com/labs/neosemantics/installation/")

    def initialize_neo_semantics(self):
        """Initialise the Neosemantics graph configuration on a graph without Resource nodes.

        Does nothing when at least one ``Resource`` node already exists, which
        is taken to mean the RDF data has already been imported.
        """
        # Only existence matters, so fetch at most one node instead of
        # materialising every Resource node on the client.
        test_query = "MATCH (n:Resource) RETURN n LIMIT 1"
        # Listing 15
        queries = ["CALL n10s.graphconfig.init();",
                   "CALL n10s.graphconfig.set({ handleVocabUris: 'IGNORE' });", # to ignore namespaces, e.g. http://xmlns.com/foaf/0.1/name, take this as name
                   "CALL n10s.graphconfig.set({ applyNeo4jNaming: True });"] # to encode relationships with Uppercase
        with self._driver.session(database=self._database) as session:
            if session.run(test_query).data():
                return
            for q in queries:
                session.run(q).consume()

    def load_HPO_ontology(self):
        """Fetch the HPO OWL file (RDF/XML) from purl.obolibrary.org and import it via n10s."""
        # Listing 16
        query = """
                CALL n10s.rdf.import.fetch("http://purl.obolibrary.org/obo/hp.owl","RDF/XML");
                """
        self._run_cypher(query)

    def label_HPO_entities(self):
        """Label HP resources as ``HpoPhenotype`` and give them an ``id`` when missing.

        Applies to every ``Resource`` whose URI starts with
        ``http://purl.obolibrary.org/obo/HP``. A missing ``id`` is derived from
        the URI by dropping everything up to ``obo/`` and replacing ``_`` with
        ``:`` (e.g. ``.../obo/HP_0000118`` becomes ``HP:0000118``).
        """
        # Listing 17
        query = """
                MATCH (n:Resource) 
                WHERE n.uri STARTS WITH "http://purl.obolibrary.org/obo/HP" 
                SET n:HpoPhenotype, 
                    n.id = coalesce(n.id, replace(apoc.text.replace(n.uri,'(.*)obo/',''),'_', ':'));
                """
        self._run_cypher(query)

    def create_disease_entities(self):
        """Create one ``Resource:HpoDisease`` node per disease id in ``phenotype.hpoa``.

        The tab-separated annotation file is read from the latest HPO GitHub
        release; the first five rows are skipped. Column 0 becomes the node
        ``id`` and column 1 its ``label`` (set only when the node is created).
        """
        # Listing 19
        query = """
                LOAD CSV FROM 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/phenotype.hpoa' AS row 
                FIELDTERMINATOR '\t'
                WITH row
                SKIP 5 
                MERGE (dis:Resource:HpoDisease {id: row[0]}) 
                ON CREATE SET dis.label = row[1]; 
                """
        self._run_cypher(query)

    def create_rels_features_diseases(self):
        """Link diseases to phenotypes with ``HAS_PHENOTYPIC_FEATURE`` relationships.

        For every row of ``phenotype.hpoa`` (first five rows skipped), the
        disease with id column 0 is connected to the phenotype with id column 3.
        Rows whose disease or phenotype node is missing produce no relationship.
        """
        # Listing 20
        query = """
                LOAD CSV FROM 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/phenotype.hpoa' AS row
                FIELDTERMINATOR '\t'
                WITH row
                SKIP 5
                MATCH (dis:HpoDisease)
                WHERE dis.id = row[0]
                MATCH (phe:HpoPhenotype)
                WHERE phe.id = row[3]
                MERGE (dis)-[:HAS_PHENOTYPIC_FEATURE]->(phe)
                """
        self._run_cypher(query)

    def add_base_properties_to_rels(self):
        """Copy annotation columns 4-11 of ``phenotype.hpoa`` onto the relationships.

        Columns map, in order, to ``source``, ``evidence``, ``onset``,
        ``frequency``, ``sex``, ``modifier``, ``aspect`` and ``biocuration``.
        """
        # Listing 22
        # Each FOREACH iterates over the list produced by its CASE expression:
        # an empty list when the column is null (so the SET is skipped), or a
        # one-element dummy list when it is not (so the SET runs exactly once
        # for that row).
        query = """
                LOAD CSV FROM 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/phenotype.hpoa' AS row 
                FIELDTERMINATOR '\t' 
                WITH row 
                SKIP 5 
                MATCH (dis:HpoDisease)-[rel:HAS_PHENOTYPIC_FEATURE]->(phe:HpoPhenotype)
                WHERE phe.id = row[3] and dis.id = row[0] 
                FOREACH(ignoreMe IN CASE WHEN row[4] is not null THEN [1] ELSE [] END| 
                    SET rel.source = row[4]) 
                FOREACH(ignoreMe IN CASE WHEN row[5] is not null THEN [1] ELSE [] END| 
                    SET rel.evidence = row[5]) 
                FOREACH(ignoreMe IN CASE WHEN row[6] is not null THEN [1] ELSE [] END| 
                    SET rel.onset = row[6]) 
                FOREACH(ignoreMe IN CASE WHEN row[7] is not null THEN [1] ELSE [] END| 
                    SET rel.frequency = row[7]) 
                FOREACH(ignoreMe IN CASE WHEN row[8] is not null THEN [1] ELSE [] END| 
                    SET rel.sex = row[8]) 
                FOREACH(ignoreMe IN CASE WHEN row[9] is not null THEN [1] ELSE [] END| 
                    SET rel.modifier = row[9]) 
                FOREACH(ignoreMe IN CASE WHEN row[10] is not null THEN [1] ELSE [] END| 
                    SET rel.aspect = row[10])
                FOREACH(ignoreMe IN CASE WHEN row[11] is not null THEN [1] ELSE [] END| 
                    SET rel.biocuration = row[11])
                """
        self._run_cypher(query)

    def enrich_with_descriptive_properties(self):
        """Derive human-readable properties on every ``HAS_PHENOTYPIC_FEATURE`` relationship.

        Runs in batches of 1000 through ``apoc.periodic.iterate`` and sets:

        * ``createdBy`` / ``creationDate``: split out of ``biocuration``
          (e.g. ``HPO:probinson[2021-06-21]``).
        * ``aspectName`` / ``aspectDescription``: from aspect code ``P`` or ``I``.
        * ``evidenceName`` / ``evidenceDescription``: from evidence code
          ``IEA``, ``PCS`` or ``TAS``.
        * ``url``: a clickable link for ``PMID:`` and ``OMIM:`` sources.
        """
        # Listing 23
        query = """
                CALL apoc.periodic.iterate(
                    "MATCH (dis:HpoDisease)-[rel:HAS_PHENOTYPIC_FEATURE]->(phe:HpoPhenotype) RETURN rel",
                    "SET rel.createdBy = apoc.text.regexGroups(rel.biocuration, 'HPO:(\\w+)\\[')[0][1],
                    rel.creationDate = apoc.text.regexGroups(rel.biocuration, '\\[(\\d{4}-\\d{2}-\\d{2})\\]')[0][1],
                    rel.aspectName = 
                    CASE  
                        WHEN rel.aspect = 'P' THEN 'Phenotypic abnormality' 
                        WHEN rel.aspect = 'I' THEN 'Inheritance' 
                    END, 
                    rel.aspectDescription = 
                    CASE 
                        WHEN rel.aspect = 'P' THEN 'Terms with the P aspect are located in the Phenotypic abnormality subontology' 
                        WHEN rel.aspect = 'I' THEN 'Terms with the I aspect are from the Inheritance subontology' 
                    END, 
                    rel.evidenceName = 
                    CASE  
                        WHEN rel.evidence = 'IEA' THEN 'Inferred from electronic annotation' 
                        WHEN rel.evidence = 'PCS' THEN 'Published clinical study' 
                        WHEN rel.evidence = 'TAS' THEN 'Traceable author statement' 
                    END, 
                    rel.evidenceDescription = 
                    CASE 
                        WHEN rel.evidence = 'IEA' THEN 'Annotations extracted by parsing the Clinical Features sections of the Online Mendelian Inheritance in Man resource are assigned the evidence code IEA.' 
                        WHEN rel.evidence = 'PCS' THEN 'PCS is used for information extracted from articles in the medical literature. Generally, annotations of this type will include the pubmed id of the published study in the DB_Reference field.' 
                        WHEN rel.evidence = 'TAS' THEN 'TAS is used for information gleaned from knowledge bases such as OMIM or Orphanet that have derived the information from a published source.' 
                    END, 
                    rel.url = 
                    CASE 
                        WHEN rel.source STARTS WITH 'PMID:' THEN 'https://pubmed.ncbi.nlm.nih.gov/' + apoc.text.replace(rel.source, '(.*)PMID:', '') 
                        WHEN rel.source STARTS WITH 'OMIM:' THEN 'https://omim.org/entry/' + apoc.text.replace(rel.source, '(.*)OMIM:', '') 
                    END",
                {batchSize: 1000})
                """
        self._run_cypher(query)

    def remove_unused_node(self):
        """Delete every ``Resource`` node that is neither an ``HpoPhenotype`` nor an ``HpoDisease``.

        Nodes are detached and deleted in batches of 10000 through
        ``apoc.periodic.iterate``.
        """
        # Listing 27
        query = """
                CALL apoc.periodic.iterate(
                    "MATCH (n:Resource) RETURN id(n) as node_id",
                    "MATCH (n)
                     WHERE id(n) = node_id AND
                           NOT 'HpoPhenotype' in labels(n) AND
                           NOT 'HpoDisease' in labels(n)
                     DETACH DELETE n",
                     {batchSize:10000})
                YIELD batches, total return batches, total
                """
        self._run_cypher(query)



In [6]:
# Instantiate the importer with an empty argument list (no command line in a notebook).
importing = HPOImporter(argv=[])


In [7]:
# Create the uniqueness constraints on Resource and the id indexes on
# HpoDisease / HpoPhenotype before any data is loaded.
logging.info('Setting Constraints')
importing.set_constraints()

[08:18:20] INFO - Setting Constraints


In [9]:
# Fail early if the n10s procedures are missing, then initialise the n10s
# graph configuration (skipped when Resource nodes already exist).
logging.info('Initializing Neosemantics')
importing.check_neo_semantics()
importing.initialize_neo_semantics()


[08:29:43] INFO - Initializing Neosemantics


In [10]:
# Import the HPO OWL file (RDF/XML) from purl.obolibrary.org through n10s.
logging.info('Loading HPO Ontology')
importing.load_HPO_ontology()


[08:34:09] INFO - Loading HPO Ontology


In [11]:
# Add the HpoPhenotype label to the imported HP resources and fill in missing ids.
logging.info('Loading HPO Entities')
importing.label_HPO_entities()

[08:46:33] INFO - Loading HPO Entities


In [12]:
# Create the HpoDisease nodes from the phenotype.hpoa annotation file.
logging.info('Creating Disease Entities')
importing.create_disease_entities()

[08:54:34] INFO - Creating Disease Entities


In [13]:
# Connect each disease to its phenotypes with HAS_PHENOTYPIC_FEATURE relationships.
logging.info('Creating Phenotype Relationships')
importing.create_rels_features_diseases()

[09:06:41] INFO - Creating Phenotype Relationships


In [14]:
# Copy the raw annotation columns (source, evidence, onset, ...) onto the relationships.
logging.info('Base Relationship Enriching')
importing.add_base_properties_to_rels()

[09:28:36] INFO - Base Relationship Enriching


In [15]:
# Derive readable names, descriptions, curator/date and source URLs from the raw properties.
logging.info('Descriptive Relationship Enriching')
importing.enrich_with_descriptive_properties()

[09:41:03] INFO - Descriptive Relationship Enriching


In [16]:
# Delete Resource nodes that are neither HpoPhenotype nor HpoDisease.
logging.info('Cleaning the Knowledge Graph...')
importing.remove_unused_node()

[09:44:31] INFO - Cleaning the Knowledge Graph...


In [None]:
## The resulting graph can be checked with the queries below, e.g. from the Neo4j Browser at http://localhost:7474/browser/
# Find the diseases that best match a person's symptoms
"""MATCH (phe:HpoPhenotype)
WHERE phe.label IN [
  "Growth delay", 
  "Large knee", 
  "Sensorineural hearing impairment", 
  "Pruritus", 
  "Type I diabetes mellitus"
]
WITH phe
MATCH path=(dis:HpoDisease)-[:HAS_PHENOTYPIC_FEATURE]->(phe)
UNWIND dis as nodes
RETURN
  dis.id as disease_id, 
  dis.label as disease_name,
  collect(phe.label) as features,
  count(nodes) as num_of_features
ORDER BY num_of_features DESC, disease_name
LIMIT 5"""

# Show the symptoms of a disease
"""MATCH path=(dis:HpoDisease)-[:HAS_PHENOTYPIC_FEATURE]->(phe:HpoPhenotype)
WHERE dis.id = "OMIM:222100"
RETURN path"""

# Show the subclasses of the endocrine-system phenotype (HP:0000818), up to depth 3
"""MATCH (p:HpoPhenotype)<-[:SUBCLASSOF*1..3]-(n:HpoPhenotype)  
WHERE p.id = "HP:0000818"
RETURN p,n"""


#Subset of the results of annotations implicitly connected to the “Abnormality of the
#endocrine system” phenotypic feature. Phenotypic features that are direct or inferred subclasses of this
#phenotypic feature are highlighted in bold.
"""MATCH (cat:HpoPhenotype {label: "Abnormality of the endocrine system"})  
CALL n10s.inference.nodesInCategory(cat, { 
  inCatRel: "HAS_PHENOTYPIC_FEATURE", 
  subCatRel: "SUBCLASSOF"})  
YIELD node as dis
WHERE dis.label IN [
  "Congenital atransferrinemia",
  "Deafness, autosomal recessive 4, with enlarged vestibular aqueduct",
  "Diabetes mellitus, transient neonatal, 1",
  "Edema, familial idiopathic, prepubertal",
  "Familial dysalbuminemic hyperthyroxinemia"
]   
MATCH (dis)-[:HAS_PHENOTYPIC_FEATURE]->(phe:HpoPhenotype)  
RETURN dis.label as disease, collect(DISTINCT phe.label) as features
ORDER BY size(features) ASC, disease"""