Skip to content

Commit

Permalink
Identifiers dict
Browse files Browse the repository at this point in the history
  • Loading branch information
josep.marin.llao committed Oct 3, 2018
1 parent 57dda21 commit 5e7b0cf
Show file tree
Hide file tree
Showing 5 changed files with 533 additions and 42 deletions.
26 changes: 22 additions & 4 deletions src/pathme/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def entry_result_to_dict(entry, **kwargs):
attributes_dict = {
str(label): str(entry[label])
for label in entry.labels
if label and entry[label] is not None
if label is not None and entry[label] is not None
}

if 'directed_interaction' in kwargs:
Expand All @@ -131,6 +131,19 @@ def entry_result_to_dict(entry, **kwargs):

return attributes_dict

def entries_dict_ids_argument(entries_dict):
    """Regroup the cross-reference attributes of every entry under an ``identifiers`` key.

    Every attribute whose label contains ``'bdb'`` is moved into a nested
    ``identifiers`` dictionary; every other attribute is copied unchanged.
    The input dictionary is not modified.

    :param dict entries_dict: mapping of entry id to its attribute dictionary
    :rtype: dict
    :return: mapping of entry id to its attributes, with the ``bdb`` labelled
        attributes collected under the ``identifiers`` key
    """
    entries_dict_ids = defaultdict(dict)

    for entry_id, entry_att in entries_dict.items():
        entry_identifiers = {}

        for label, value in entry_att.items():
            if 'bdb' in label:
                entry_identifiers[label] = value
            else:
                entries_dict_ids[entry_id][label] = value

        # Always set the key, even when empty, so consumers can rely on its presence.
        entries_dict_ids[entry_id]['identifiers'] = entry_identifiers

    # Return the regrouped mapping (the input used to be returned by mistake,
    # which silently discarded all the work done above).
    return entries_dict_ids

def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str]]]:
"""Export to a dictionary a SPARQL query result data structure.
Expand All @@ -144,10 +157,11 @@ def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str
for rdf_entry in entries:
dict_rdf_entry = entry_result_to_dict(rdf_entry, **kwargs)

if 'identifier' in rdf_entry.labels:
if 'identifier' in rdf_entry.labels and rdf_entry.identifier is not None:
print (str(rdf_entry.identifier))
id_key = str(rdf_entry.identifier)

elif 'uri_id' in rdf_entry.labels:
elif 'uri_id' in rdf_entry.labels and rdf_entry.uri_id is not None:
id_key = rdf_entry.uri_id

else:
Expand Down Expand Up @@ -184,7 +198,11 @@ def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str
for attr in attr_empty
}

return entries_dict
if kwargs.get('ids_argument') == True:
return entries_dict_ids_argument(entries_dict)

else:
return entries_dict


"""Statistics functions"""
Expand Down
47 changes: 34 additions & 13 deletions src/pathme/wikipathways/convert_to_bel.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def convert_to_bel(nodes: Dict[str, Dict], complexes: Dict[str, Dict], interacti
authors="Sarah Mubeen, Daniel Domingo-Fernández & Josep Marín-Llaó",
contact='daniel.domingo.fernandez@scai.fraunhofer.de',
)
print(pathway_info['pathway_id'])

nodes = nodes_to_bel(nodes, hgnc_manager)
nodes.update(complexes_to_bel(complexes, nodes, graph))
Expand All @@ -57,39 +58,52 @@ def node_to_bel(node: Dict, hgnc_manager: Manager) -> BaseEntity:

if 'identifier' in node.keys():
identifier = node['identifier']

else:
identifier = uri_id

if isinstance(identifier, set):
print('Multiple identifier {}'.format(node['identifier']))
# TODO: print the wikipathways bps that return a set because they are probably wrong.
identifier = list(identifier)[0]


if 'identifiers' in node.keys():
node_ids_dict = node['identifiers']
else:
node_ids_dict = node

_, _, namespace, _ = parse_id_uri(uri_id)

if isinstance(node['name'], set):
print('{}'.format(node['name']))
name = node['name']

if isinstance(name, set):
print('Multiple name {}'.format(node['name']))
# TODO: print the wikipathways bps that return a set because they are probably wrong.
name = list(node['name'])[0]
name = list(name)[0]


if 'Protein' in node_types:
namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
return protein(namespace=namespace, name=name, identifier=identifier)

elif 'Rna' in node_types:
namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
return rna(namespace=namespace, name=name, identifier=identifier)

elif 'GeneProduct' in node_types:
namespace, name, identifier = get_valid_gene_identifier(node, hgnc_manager)
namespace, name, identifier = get_valid_gene_identifier(node_ids_dict, hgnc_manager)
return gene(namespace=HGNC, name=name, identifier=identifier)

elif 'Metabolite' in node_types:
# FIX node[name]
return abundance(namespace=namespace, name=node['name'], identifier=identifier)
return abundance(namespace=namespace, name=name, identifier=identifier)

elif 'Pathway' in node_types:
return bioprocess(namespace=namespace, name=node['name'], identifier=identifier)
return bioprocess(namespace=namespace, name=name, identifier=identifier)


elif 'DataNode' in node_types:
return abundance(namespace=namespace, name=node['name'], identifier=identifier)
return abundance(namespace=namespace, name=name, identifier=identifier)

else:
log.warning('Unknown %s', node_types)
Expand All @@ -108,6 +122,7 @@ def complex_to_bel(complex, nodes, graph: BELGraph):
members = {
nodes[member_id]
for member_id in complex['participants']
if member_id in nodes.keys()
}

_, _, _, identifier = parse_id_uri(complex['uri_id'])
Expand Down Expand Up @@ -147,9 +162,15 @@ def add_edges(graph: BELGraph, participants, nodes, att: Dict):

else:
for source, target in participants:
u = nodes[source]
v = nodes[target]
add_simple_edge(graph, u, v, edge_types, uri_id)
if source in nodes.keys():
u = nodes[source]

if target in nodes.keys():
v = nodes[target]
add_simple_edge(graph, u, v, edge_types, uri_id)

else:
log.warning('No valid target id %s', target)


def add_simple_edge(graph: BELGraph, u, v, edge_types, uri_id):
Expand Down
51 changes: 37 additions & 14 deletions src/pathme/wikipathways/rdf_sparql.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,15 @@
'rdf': RDF,
'dcterms': DCTERMS,
'dc': DC,
'hgnc': Namespace('http://identifiers.org/hgnc.symbol/')
'hgnc': Namespace('http://identifiers.org/hgnc.symbol/'),
'ensembl': Namespace('http://identifiers.org/ensembl/'),
'ncbigene': Namespace('http://identifiers.org/ncbigene/'),
'uniprot': Namespace('http://identifiers.org/uniprot/'),
'chebi': Namespace('http://identifiers.org/chebi/'),
'chemspider': Namespace('http://identifiers.org/chemspider/'),
'pubchem': Namespace('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/'),
'wikidata': Namespace('http://www.wikidata.org/entity/'),
'hmdb': Namespace('http://identifiers.org/hmdb/')
}

#: SPARQL query to get all the subtypes for a specific primary {type} (DataNode or Interaction) in a pathway network.
Expand All @@ -40,46 +48,60 @@

#: SPARQL query to get all data nodes in a pathway network with some arguments.
#: Every ``bdb_*`` column is the local part of the matching cross-reference URI.
#: NOTE(review): ``?identifier`` is projected three times (node URI, dcterms identifier, Entrez id);
#: presumably the later projections are meant to override the earlier ones when bound - confirm
#: against the SPARQL engine in use.
GET_ALL_DATA_NODES_SPARQL = """
SELECT DISTINCT (?uri_id AS ?identifier) (?dc_identifier AS ?identifier) ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) (STRAFTER(STR(?ncbigene_uri), str(ncbigene:)) AS ?identifier ) (STRAFTER(STR(?hgnc_uri), str(hgnc:)) AS ?bdb_hgncsymbol) (STRAFTER(STR(?ensembl_uri), str(ensembl:)) AS ?bdb_ensembl) (STRAFTER(STR(?ncbigene_uri), str(ncbigene:)) AS ?bdb_ncbigene) (STRAFTER(STR(?uniprot_uri), str(uniprot:)) AS ?bdb_uniprot) (STRAFTER(STR(?chebi_uri), str(chebi:)) AS ?bdb_chebi) (STRAFTER(STR(?chemspider_uri), str(chemspider:)) AS ?bdb_chemspider) (STRAFTER(STR(?pubchem_uri), str(pubchem:)) AS ?bdb_pubchem) (STRAFTER(STR(?wikidata_uri), str(wikidata:)) AS ?bdb_wikidata) (STRAFTER(STR(?hmdb_uri), str(hmdb:)) AS ?bdb_hmdb) ?name
WHERE {
    ?pathway a wp:Pathway .
    ?uri_id dcterms:isPartOf ?pathway .
    ?uri_id a wp:DataNode .
    ?uri_id rdf:type ?uri_type .
    optional {?uri_id dcterms:identifier ?dc_identifier .}
    optional {?uri_id wp:bdbHgncSymbol ?hgnc_uri .}
    optional {?uri_id wp:bdbEnsembl ?ensembl_uri .}
    optional {?uri_id wp:bdbEntrezGene ?ncbigene_uri .}
    optional {?uri_id wp:bdbUniprot ?uniprot_uri .}
    optional {?uri_id wp:bdbChEBI ?chebi_uri .}
    optional {?uri_id wp:bdbChemspider ?chemspider_uri .}
    optional {?uri_id wp:bdbPubChem ?pubchem_uri .}
    optional {?uri_id wp:bdbWikidata ?wikidata_uri .}
    optional {?uri_id wp:bdbHmdb ?hmdb_uri .}
    ?uri_id rdfs:label ?name .
}
"""

#: SPARQL query to get all complexes in a pathway network together with their participants.
#: The Entrez cross-reference uses the ``wp:`` vocabulary, like the data node query does, so that
#: participant ids can line up with the data node identifiers.
#: NOTE(review): ``?name`` is projected but never bound in the WHERE clause - confirm whether a
#: label triple is missing.
GET_ALL_COMPLEXES_SPARQL = """
SELECT DISTINCT ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?node_types) (?participants_entry AS ?participants) (?participants_id AS ?participants) ?name (STRAFTER(STR(?ncbigene_participants), str(ncbigene:)) AS ?participants )
WHERE {
    ?pathway a wp:Pathway .
    ?uri_id dcterms:isPartOf ?pathway .
    ?uri_id a wp:Complex .
    ?uri_id rdf:type ?uri_type .
    ?uri_id wp:participants ?participants_entry .
    optional {?participants_entry dcterms:identifier ?participants_id .}
    optional {?participants_entry wp:bdbEntrezGene ?ncbigene_participants .}
}
"""

# TODO: Check interaction complexes.
#: SPARQL query to get all directed interactions in a pathway network with source and target.
#: The Entrez cross-references use the ``wp:`` vocabulary, like the data node query does, so that
#: source/target ids can line up with the data node identifiers.
GET_ALL_DIRECTED_INTERACTIONS_SPARQL = """
SELECT DISTINCT (?source_entry AS ?source) (?dc_source AS ?source) (?target_entry AS ?target) (?dc_target AS ?target) ?uri_id (STRAFTER(STR(?uri_type), str(wp:)) AS ?interaction_types) (STRAFTER(STR(?ncbigene_source), str(ncbigene:)) AS ?source ) (STRAFTER(STR(?ncbigene_target), str(ncbigene:)) AS ?target )
WHERE {
    ?pathway a wp:Pathway .
    ?uri_id dcterms:isPartOf ?pathway .
    ?uri_id a wp:DirectedInteraction .
    ?uri_id rdf:type ?uri_type .
    ?uri_id wp:source ?source_entry .
    ?uri_id wp:target ?target_entry .
    optional {?source_entry dcterms:identifier ?dc_source .}
    optional {?target_entry dcterms:identifier ?dc_target .}
    optional {?source_entry wp:bdbEntrezGene ?ncbigene_source .}
    optional {?target_entry wp:bdbEntrezGene ?ncbigene_target .}
}
"""

Expand Down Expand Up @@ -117,7 +139,8 @@ def _get_nodes(rdf_graph) -> Dict[str, Dict[str, Dict[str, str]]]:
:returns: Nodes dict with nodes ids as keys and their metadata as values
"""
return query_result_to_dict(
rdf_graph.query(GET_ALL_DATA_NODES_SPARQL, initNs=PREFIXES)
rdf_graph.query(GET_ALL_DATA_NODES_SPARQL, initNs=PREFIXES),
ids_argument=True
)


Expand Down
57 changes: 46 additions & 11 deletions src/pathme/wikipathways/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,55 +30,90 @@ def evaluate_wikipathways_metadata(metadata):
return metadata


def _single_hgnc_entry(hgnc_entry):
    """Return one entry from a manager lookup that may yield a list, or None when it is empty."""
    if isinstance(hgnc_entry, list):
        log.warning('Manager returning list %s', hgnc_entry)
        return hgnc_entry[0] if hgnc_entry else None

    return hgnc_entry


def get_valid_gene_identifier(node_ids_dict, hgnc_manager):
    """Return protein/gene identifier for a given RDF node.

    The cross-references are tried in order: HGNC symbol, Entrez gene,
    UniProt, Ensembl. When the matching lookup finds nothing, the raw
    cross-reference is returned under its own namespace label instead.

    :param dict node_ids_dict: node dictionary
    :param bio2bel_hgnc.Manager hgnc_manager: hgnc manager
    :rtype: tuple[str,str,str]
    :return: namespace, name, identifier
    :raises Exception: if the node carries none of the supported identifiers
    """
    # Try to get hgnc symbol
    if 'bdb_hgncsymbol' in node_ids_dict:
        hgnc_symbol = node_ids_dict['bdb_hgncsymbol']
        hgnc_entry = _single_hgnc_entry(hgnc_manager.get_gene_by_hgnc_symbol(hgnc_symbol))

        if not hgnc_entry:
            log.warning('No valid HGNC Symbol %s', hgnc_symbol)
            return 'HGNC_SYMBOL', hgnc_symbol, hgnc_symbol

        return HGNC, hgnc_symbol, hgnc_entry.identifier

    # Try to get ENTREZ id
    if 'bdb_ncbigene' in node_ids_dict:
        entrez_id = node_ids_dict['bdb_ncbigene']
        hgnc_entry = _single_hgnc_entry(hgnc_manager.get_gene_by_entrez_id(entrez_id))

        if not hgnc_entry:
            log.warning('No valid ENTREZ %s', entrez_id)
            return 'ENTREZ', entrez_id, entrez_id

        return HGNC, hgnc_entry.symbol, hgnc_entry.identifier

    # Try to get UniProt id
    if 'bdb_uniprot' in node_ids_dict:
        uniprot_id = node_ids_dict['bdb_uniprot']
        hgnc_entry = _single_hgnc_entry(hgnc_manager.get_gene_by_uniprot_id(uniprot_id))

        if not hgnc_entry:
            log.warning('No valid Uniprot %s', uniprot_id)
            return 'UNIPROT', uniprot_id, uniprot_id

        return HGNC, hgnc_entry.symbol, hgnc_entry.identifier

    # Try to get ENSEMBL id. This branch used to re-test 'bdb_ncbigene' (so it was
    # unreachable) and to query the UniProt lookup with the value.
    # NOTE(review): assumes the manager exposes get_gene_by_ensembl_id - confirm.
    if 'bdb_ensembl' in node_ids_dict:
        ensembl_id = node_ids_dict['bdb_ensembl']
        hgnc_entry = _single_hgnc_entry(hgnc_manager.get_gene_by_ensembl_id(ensembl_id))

        if not hgnc_entry:
            log.warning('No valid ENSEMBL %s', ensembl_id)
            return 'ENSEMBL', ensembl_id, ensembl_id

        return HGNC, hgnc_entry.symbol, hgnc_entry.identifier

    # 'uri_id' is absent when only the cross-reference sub-dictionary is passed in,
    # so look it up defensively instead of raising KeyError.
    if 'ec-code' in str(node_ids_dict.get('uri_id', '')):
        # TODO: Fix and get enzyme
        # hgnc_entry = hgnc_manager.get_enzymes(enzyme)
        return HGNC, 'PASS', 'PASAS'

    # Exception() does not interpolate logging-style arguments; format explicitly.
    raise Exception('Unknown identifier for node {}'.format(node_ids_dict))


def merge_two_dicts(dict1, dict2):
Expand Down

0 comments on commit 5e7b0cf

Please sign in to comment.