Identifiers dict

PathwayMerger · Oct 3, 2018 · 5e7b0cf · 5e7b0cf
1 parent 57dda21
commit 5e7b0cf
Show file tree

Hide file tree

Showing 5 changed files with 533 additions and 42 deletions.
diff --git a/src/pathme/utils.py b/src/pathme/utils.py
@@ -110,7 +110,7 @@ def entry_result_to_dict(entry, **kwargs):
     attributes_dict = {
         str(label): str(entry[label])
         for label in entry.labels
-        if label and entry[label] is not None
+        if label is not None and entry[label] is not None
     }
 
     if 'directed_interaction' in kwargs:
@@ -131,6 +131,19 @@ def entry_result_to_dict(entry, **kwargs):
 
     return attributes_dict
 
+def entries_dict_ids_argument(entries_dict):
+    """Nest the ``bdb*`` attributes of every entry under an ``identifiers`` key.
+
+    :param dict entries_dict: entry id -> flat dict of attribute label -> value
+    :rtype: dict
+    :return: entry id -> remaining attributes plus an ``identifiers`` sub-dict
+    """
+    entries_dict_ids = defaultdict(dict)
+    for entry_id, entry_att in entries_dict.items():
+        attributes = {k: v for k, v in entry_att.items() if 'bdb' not in k}
+        attributes['identifiers'] = {k: v for k, v in entry_att.items() if 'bdb' in k}
+        entries_dict_ids[entry_id] = attributes
+    return entries_dict_ids
 
 def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str]]]:
     """Export to a dictionary a SPARQL query result data structure.
@@ -144,10 +157,11 @@ def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str
     for rdf_entry in entries:
         dict_rdf_entry = entry_result_to_dict(rdf_entry, **kwargs)
 
-        if 'identifier' in rdf_entry.labels:
+        if 'identifier' in rdf_entry.labels and rdf_entry.identifier is not None:
+            print (str(rdf_entry.identifier))
             id_key = str(rdf_entry.identifier)
 
-        elif 'uri_id' in rdf_entry.labels:
+        elif 'uri_id' in rdf_entry.labels and rdf_entry.uri_id is not None:
             id_key = rdf_entry.uri_id
 
         else:
@@ -184,7 +198,11 @@ def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str
             for attr in attr_empty
         }
 
-    return entries_dict
+    if kwargs.get('ids_argument') == True:
+        return entries_dict_ids_argument(entries_dict)
+
+    else:
+        return entries_dict
 
 
 """Statistics functions"""

diff --git a/src/pathme/wikipathways/convert_to_bel.py b/src/pathme/wikipathways/convert_to_bel.py
@@ -31,6 +31,7 @@ def convert_to_bel(nodes: Dict[str, Dict], complexes: Dict[str, Dict], interacti
         authors="Sarah Mubeen, Daniel Domingo-Fernández & Josep Marín-Llaó",
         contact='daniel.domingo.fernandez@scai.fraunhofer.de',
     )
+    print(pathway_info['pathway_id'])
 
     nodes = nodes_to_bel(nodes, hgnc_manager)
     nodes.update(complexes_to_bel(complexes, nodes, graph))
@@ -57,39 +58,52 @@ def node_to_bel(node: Dict, hgnc_manager: Manager) -> BaseEntity:
 
     if 'identifier' in node.keys():
         identifier = node['identifier']
-
     else:
         identifier = uri_id
 
+    if isinstance(identifier, set):
+        print('Multiple identifier {}'.format(node['identifier']))
+        # TODO: print the wikipathways bps that return a set because they are probably wrong.
+        identifier = list(identifier)[0]
+
+
+    if 'identifiers' in node.keys():
+        node_ids_dict = node['identifiers']
+    else:
+        node_ids_dict = node
+
     _, _, namespace, _ = parse_id_uri(uri_id)
 
-    if isinstance(node['name'], set):
-        print('{}'.format(node['name']))
+    name = node['name']
+
+    if isinstance(name, set):
+        print('Multiple name {}'.format(node['name']))
         # TODO: print the wikipathways bps that return a set because they are probably wrong.
-        name = list(node['name'])[0]
+        name = list(name)[0]
+
 
     if 'Protein' in node_types:
-        namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
+        namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
         return protein(namespace=namespace, name=name, identifier=identifier)
 
     elif 'Rna' in node_types:
-        namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
+        namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
         return rna(namespace=namespace, name=name, identifier=identifier)
 
     elif 'GeneProduct' in node_types:
-        namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
+        namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
         return gene(namespace=HGNC, name=name, identifier=identifier)
 
     elif 'Metabolite' in node_types:
         # FIX node[name]
-        return abundance(namespace=namespace, name=node['name'], identifier=identifier)
+        return abundance(namespace=namespace, name=name, identifier=identifier)
 
     elif 'Pathway' in node_types:
-        return bioprocess(namespace=namespace, name=node['name'], identifier=identifier)
+        return bioprocess(namespace=namespace, name=name, identifier=identifier)
 
 
     elif 'DataNode' in node_types:
-        return abundance(namespace=namespace, name=node['name'], identifier=identifier)
+        return abundance(namespace=namespace, name=name, identifier=identifier)
 
     else:
         log.warning('Unknown %s', node_types)
@@ -108,6 +122,7 @@ def complex_to_bel(complex, nodes, graph: BELGraph):
     members = {
         nodes[member_id]
         for member_id in complex['participants']
+        if member_id in nodes.keys()
     }
 
     _, _, _, identifier = parse_id_uri(complex['uri_id'])
@@ -147,9 +162,15 @@ def add_edges(graph: BELGraph, participants, nodes, att: Dict):
 
     else:
         for source, target in participants:
-            u = nodes[source]
-            v = nodes[target]
-            add_simple_edge(graph, u, v, edge_types, uri_id)
+            if source in nodes.keys():
+                u = nodes[source]
+
+                if target in nodes.keys():
+                    v = nodes[target]
+                    add_simple_edge(graph, u, v, edge_types, uri_id)
+
+                else:
+                    log.warning('No valid target id %s', target)
 
 
 def add_simple_edge(graph: BELGraph, u, v, edge_types, uri_id):

diff --git a/src/pathme/wikipathways/rdf_sparql.py b/src/pathme/wikipathways/rdf_sparql.py
@@ -24,7 +24,15 @@
     'rdf': RDF,
     'dcterms': DCTERMS,
     'dc': DC,
-    'hgnc': Namespace('http://identifiers.org/hgnc.symbol/')
+    'hgnc': Namespace('http://identifiers.org/hgnc.symbol/'),
+    'ensembl': Namespace('http://identifiers.org/ensembl/'),
+    'ncbigene': Namespace('http://identifiers.org/ncbigene/'),
+    'uniprot': Namespace('http://identifiers.org/uniprot/'),
+    'chebi': Namespace('http://identifiers.org/chebi/'),
+    'chemspider': Namespace('http://identifiers.org/chemspider/'),
+    'pubchem': Namespace('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/'),
+    'wikidata': Namespace('http://www.wikidata.org/entity/'),
+    'hmdb': Namespace('http://identifiers.org/hmdb/')
 }
 
 #: SPARQL query to get all the subtypes for a specific primary {type} (DataNode or Interaction) in a pathway network.
@@ -40,46 +48,60 @@
 
 #: SPARQL query to get all data nodes in a pathway network with some arguments.
 GET_ALL_DATA_NODES_SPARQL = """
-    SELECT DISTINCT ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) ?identifier (STRAFTER(STR(?hgnc_uri), str(hgnc:)) AS ?hgnc_symbol) ?hgnc_uri ?name
+    SELECT DISTINCT (?uri_id AS ?identifier) (?dc_identifier AS ?identifier) ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) (STRAFTER(STR(?ncbigene_uri), str(ncbigene:)) AS ?identifier ) (STRAFTER(STR(?hgnc_uri), str(hgnc:)) AS ?bdb_hgncsymbol) (STRAFTER(STR(?ensembl_uri), str(ensembl:)) AS ?bdb_ensembl) (STRAFTER(STR(?ncbigene_uri), str(ncbigene:)) AS ?bdb_ncbigene) (STRAFTER(STR(?uniprot_uri), str(uniprot:)) AS ?bdb_uniprot) (STRAFTER(STR(?chebi_uri), str(chebi:)) AS ?bdb_chebi) (STRAFTER(STR(?chemspider_uri), str(chemspider:)) AS ?bdb_chemspider) (STRAFTER(STR(?pubchem_uri), str(pubchem:)) AS ?bdb_pubchem) (STRAFTER(STR(?wikidata_uri), str(wikidata:)) AS ?bdb_wikidata) (STRAFTER(STR(?hmdb_uri), str(hmdb:)) AS ?bdb_hmdb) ?name
     WHERE {
        ?pathway a wp:Pathway .
        ?uri_id dcterms:isPartOf ?pathway .
+
        ?uri_id a wp:DataNode .
        ?uri_id rdf:type ?uri_type .
-       ?uri_id dcterms:identifier ?identifier .
+    
+       optional {?uri_id dcterms:identifier ?dc_identifier .}
+       
        optional {?uri_id wp:bdbHgncSymbol ?hgnc_uri .}
+       optional {?uri_id wp:bdbEnsembl ?ensembl_uri .}
+       optional {?uri_id wp:bdbEntrezGene ?ncbigene_uri .}
+       optional {?uri_id wp:bdbUniprot ?uniprot_uri .}
+       
+       optional {?uri_id wp:bdbChEBI ?chebi_uri .}
+       optional {?uri_id wp:bdbChemspider ?chemspider_uri .}
+       optional {?uri_id wp:bdbPubChem ?pubchem_uri .}
+       optional {?uri_id wp:bdbWikidata ?wikidata_uri .}
+       optional {?uri_id wp:bdbHmdb ?hmdb_uri .}
+
        ?uri_id rdfs:label ?name .
     }
     """
 
 #: SPARQL query to get all data nodes in a pathway network with some arguments.
 GET_ALL_COMPLEXES_SPARQL = """
-    SELECT DISTINCT ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) ?participants ?name
+    SELECT DISTINCT ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) (?participants_entry AS ?participants) (?participants_id AS ?participants) ?name (STRAFTER(STR(?ncbigene_participants), str(ncbigene:)) AS ?participants )
     WHERE {
        ?pathway a wp:Pathway .
        ?uri_id dcterms:isPartOf ?pathway .
        ?uri_id a wp:Complex .
        ?uri_id rdf:type ?uri_type .
-       ?uri_id wp:participants ?participants .
-       ?participants dcterms:identifier ?participants .
+       ?uri_id wp:participants ?participants_entry .
+       optional {?participants_entry dcterms:identifier ?participants_id .}
+       optional {?participants_entry wp:bdbEntrezGene ?ncbigene_participants .}
     }
     """
 
 # TODO: Check interaction complexes.
 #: SPARQL query to get all directed interactions in a pathway network with source and target.
 GET_ALL_DIRECTED_INTERACTIONS_SPARQL = """
-    SELECT DISTINCT ?source ?target ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?interaction_types)
+    SELECT DISTINCT (?source_entry AS ?source) (?dc_source AS ?source) (?target_entry AS ?target) (?dc_target AS ?target) ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?interaction_types) (STRAFTER(STR(?ncbigene_source), str(ncbigene:)) AS ?source ) (STRAFTER(STR(?ncbigene_target), str(ncbigene:)) AS ?target )
     WHERE {
        ?pathway a wp:Pathway .
        ?uri_id dcterms:isPartOf ?pathway .
        ?uri_id a wp:DirectedInteraction .
        ?uri_id rdf:type ?uri_type .
-       ?uri_id wp:source ?sourceUri .
-       ?uri_id wp:target ?targetUri .
-       ?sourceEntry dc:identifier ?sourceUri .
-       ?targetEntry dc:identifier ?targetUri .
-       ?sourceEntry dcterms:identifier ?source .
-       ?targetEntry dcterms:identifier ?target .
+       ?uri_id wp:source ?source_entry .
+       ?uri_id wp:target ?target_entry .
+       optional {?source_entry dcterms:identifier ?dc_source .}
+       optional {?target_entry dcterms:identifier ?dc_target .}
+       optional {?source_entry wp:bdbEntrezGene ?ncbigene_source .}
+       optional {?target_entry wp:bdbEntrezGene ?ncbigene_target .}
     }
     """
 
@@ -117,7 +139,8 @@ def _get_nodes(rdf_graph) -> Dict[str, Dict[str, Dict[str, str]]]:
     :returns: Nodes dict with nodes ids as keys and their metadata as values
     """
     return query_result_to_dict(
-        rdf_graph.query(GET_ALL_DATA_NODES_SPARQL, initNs=PREFIXES)
+        rdf_graph.query(GET_ALL_DATA_NODES_SPARQL, initNs=PREFIXES),
+        ids_argument=True
     )
 
 

diff --git a/src/pathme/wikipathways/utils.py b/src/pathme/wikipathways/utils.py
@@ -30,55 +30,90 @@ def evaluate_wikipathways_metadata(metadata):
     return metadata
 
 
-def get_valid_gene_identifier(node_dict, hgnc_manager):
+def get_valid_gene_identifier(node_ids_dict, hgnc_manager):
     """Return protein/gene identifier for a given RDF node.
 
-    :param dict node_dict: node dictionary
+    :param dict node_ids_dict: node identifiers dictionary (``bdb_*`` label -> value) or the flat node dict
     :param bio2bel_hgnc.Manager hgnc_manager: hgnc manager
     :rtype: tuple[str,str,str]
     :return: namespace, name, identifier
     """
+
     # Try to get hgnc symbol
-    if 'hgnc_symbol' in node_dict:
 
-        hgnc_symbol = node_dict['hgnc_symbol']
+    if 'bdb_hgncsymbol' in node_ids_dict:
+
+        hgnc_symbol = node_ids_dict['bdb_hgncsymbol']
         hgnc_entry = hgnc_manager.get_gene_by_hgnc_symbol(hgnc_symbol)
 
+        if isinstance(hgnc_entry, list):
+            log.warning('Manager returning list %s', hgnc_entry)
+            if hgnc_entry:
+                hgnc_entry = hgnc_entry[0]
+
         if not hgnc_entry:
             log.warning('No valid HGNC Symbol %s', hgnc_symbol)
+            return 'HGNC_SYMBOL', hgnc_symbol, hgnc_symbol
 
         return HGNC, hgnc_symbol, hgnc_entry.identifier
 
+    # Try to get ENTREZ id
+    elif 'bdb_ncbigene' in node_ids_dict:
+        entrez_id = node_ids_dict['bdb_ncbigene']
+        hgnc_entry = hgnc_manager.get_gene_by_entrez_id(entrez_id)
+
+        if isinstance(hgnc_entry, list):
+            log.warning('Manager returning list %s', hgnc_entry)
+            if hgnc_entry:
+                hgnc_entry = hgnc_entry[0]
+
+        if not hgnc_entry:
+            log.warning('No valid ENTREZ %s', entrez_id)
+            return 'ENTREZ', entrez_id, entrez_id
+
+        return HGNC, hgnc_entry.symbol, hgnc_entry.identifier
+
     # Try to get UniProt id
-    elif 'uniprot' in node_dict:
-        uniprot_id = node_dict['uniprot']
+    elif 'bdb_uniprot' in node_ids_dict:
+        uniprot_id = node_ids_dict['bdb_uniprot']
         hgnc_entry = hgnc_manager.get_gene_by_uniprot_id(uniprot_id)
 
+        if isinstance(hgnc_entry, list):
+            log.warning('Manager returning list %s', hgnc_entry)
+            if hgnc_entry:
+                hgnc_entry = hgnc_entry[0]
+
         if not hgnc_entry:
             log.warning('No valid Uniprot %s', uniprot_id)
             return 'UNIPROT', uniprot_id, uniprot_id
 
         return HGNC, hgnc_entry.symbol, hgnc_entry.identifier
 
     # Try to get ENSEMBL id
-    elif 'ensembl' in node_dict:
-        ensembl_id = node_dict['ensembl']
+    elif 'bdb_ensembl' in node_ids_dict:
+        ensembl_id = node_ids_dict['bdb_ensembl']
         hgnc_entry = hgnc_manager.get_gene_by_uniprot_id(ensembl_id)
 
+        if isinstance(hgnc_entry, list):
+            log.warning('Manager returning list %s', hgnc_entry)
+            if hgnc_entry:
+                hgnc_entry = hgnc_entry[0]
+
         if not hgnc_entry:
             log.warning('No valid ENSEMBL %s', ensembl_id)
             return 'ENSEMBL', ensembl_id, ensembl_id
 
         return HGNC, hgnc_entry.symbol, hgnc_entry.identifier
 
-    elif 'ec-code' in node_dict:
-        enzyme = node_dict['ec-code']
+    # The identifiers sub-dict only carries ``bdb*`` keys, so read 'uri_id'/'name' defensively.
+    elif 'ec-code' in node_ids_dict.get('uri_id', ''):
+        enzyme = node_ids_dict.get('name')
         # TODO: Fix and get enzyme
         # hgnc_entry = hgnc_manager.get_enzymes(enzyme)
 
         return HGNC, 'PASS', 'PASAS'
 
-    raise Exception('Unknown identifier for node %s', node_dict)
+    raise Exception('Unknown identifier for node {}'.format(node_ids_dict))
 
 
 def merge_two_dicts(dict1, dict2):