Skip to content

Commit

Permalink
Code cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Apr 26, 2019
1 parent 379d084 commit 7714f7f
Show file tree
Hide file tree
Showing 6 changed files with 238 additions and 245 deletions.
4 changes: 2 additions & 2 deletions src/pathme/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def download():
@click.option('-c', '--connection', help="Defaults to {}".format(DEFAULT_CACHE_CONNECTION))
@click.option('-d', '--debug', is_flag=True, default=False, help='Debug mode')
@click.option('-x', '--only-canonical', default=True, help='Parse only canonical pathways')
def bel(connection, debug, only_canonical):
def bel(connection: str, debug: bool, only_canonical: bool):
"""Convert WikiPathways to BEL."""
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

Expand All @@ -166,7 +166,7 @@ def bel(connection, debug, only_canonical):
wikipathways_to_pickles(resource_files, resource_folder, hgnc_manager)

logger.info(
'WikiPathways exported in %.2f seconds. A total of {} warnings regarding entities that could not be converted '
'WikiPathways exported in %.2f seconds. A total of %d warnings regarding entities that could not be converted '
'to standard identifiers were found.',
time.time() - t, logging.debug.counter
)
Expand Down
76 changes: 39 additions & 37 deletions src/pathme/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,41 @@

from bio2bel.utils import get_connection

MODULE_NAME = 'pathme'
DEFAULT_PATHME_DIR = os.path.join(os.path.expanduser('~'), '.pathme')
PATHME_DIR = os.environ.get('PATHME_DIRECTORY', DEFAULT_PATHME_DIR)

def get_data_dir():
"""Ensures the appropriate PathMe data directory exists for the given module, then returns the file path.

:return: The module's data directory
:rtype: str
"""
def get_data_dir() -> str:
"""Ensure the appropriate PathMe data directory exists for the given module, then return the file path."""
os.makedirs(PATHME_DIR, exist_ok=True)
return PATHME_DIR


def ensure_pathme_folders():
DATA_DIR = get_data_dir()
DEFAULT_CACHE_CONNECTION = get_connection(MODULE_NAME)

KEGG = 'kegg'
KEGG_DIR = os.path.join(DATA_DIR, KEGG)
KEGG_BEL = os.path.join(KEGG_DIR, 'bel')
KEGG_FILES = os.path.join(KEGG_DIR, 'xml')
KEGG_CACHE = os.path.join(KEGG_DIR, 'cache')

REACTOME = 'reactome'
REACTOME_DIR = os.path.join(DATA_DIR, REACTOME)
REACTOME_BEL = os.path.join(REACTOME_DIR, 'bel')
REACTOME_FILES = os.path.join(REACTOME_DIR, 'rdf')

WIKIPATHWAYS = 'wikipathways'
WIKIPATHWAYS_DIR = os.path.join(DATA_DIR, WIKIPATHWAYS)
WIKIPATHWAYS_BEL = os.path.join(WIKIPATHWAYS_DIR, 'bel')
WIKIPATHWAYS_FILES = os.path.join(WIKIPATHWAYS_DIR, 'rdf')

SPIA_DIR = os.path.join(DATA_DIR, 'spia')
UNIVERSE_DIR = os.path.join(DATA_DIR, 'universe')


def ensure_pathme_folders(): # TODO why is this a function?
"""Ensure data folders are created."""
os.makedirs(KEGG_DIR, exist_ok=True)
os.makedirs(REACTOME_DIR, exist_ok=True)
Expand All @@ -35,38 +58,16 @@ def ensure_pathme_folders():
os.makedirs(WIKIPATHWAYS_FILES, exist_ok=True)


MODULE_NAME = 'pathme'
PATHME_DIR = os.environ.get('PATHME_DIRECTORY', os.path.join(os.path.expanduser('~'), '.pathme'))
DATA_DIR = get_data_dir()
DEFAULT_CACHE_CONNECTION = get_connection(MODULE_NAME)
ensure_pathme_folders()

KEGG = 'kegg'
UNKNOWN = 'unknown'

# Other namespaces
INTERPRO = 'interpro'
PFAM = 'pfam'
BRENDA = 'brenda'

REACTOME = 'reactome'
WIKIPATHWAYS = 'wikipathways'

KEGG_DIR = os.path.join(DATA_DIR, KEGG)
REACTOME_DIR = os.path.join(DATA_DIR, REACTOME)
WIKIPATHWAYS_DIR = os.path.join(DATA_DIR, WIKIPATHWAYS)
SPIA_DIR = os.path.join(DATA_DIR, 'spia')
UNIVERSE_DIR = os.path.join(DATA_DIR, 'universe')

KEGG_BEL = os.path.join(KEGG_DIR, 'bel')
REACTOME_BEL = os.path.join(REACTOME_DIR, 'bel')
WIKIPATHWAYS_BEL = os.path.join(WIKIPATHWAYS_DIR, 'bel')

KEGG_FILES = os.path.join(KEGG_DIR, 'xml')
REACTOME_FILES = os.path.join(REACTOME_DIR, 'rdf')
WIKIPATHWAYS_FILES = os.path.join(WIKIPATHWAYS_DIR, 'rdf')

KEGG_CACHE = os.path.join(DATA_DIR, KEGG, 'cache')

ensure_pathme_folders()

UNKNOWN = 'unknown'
CHEMBL = 'chembl'
MIRBASE = 'mirbase'

KEGG_ID = 'kegg_id'
KEGG_NAME = 'kegg_name'
Expand All @@ -86,10 +87,11 @@ def ensure_pathme_folders():
'phosphorylation': 'Ph',
'glycosylation': 'Glyco',
'ubiquitination': 'Ub',
'methylation': 'Me'
'methylation': 'Me',
}
KEGG_CITATION = '10592173'

# FIXME why doesn't this just import the compath_resources package?
KEGG_WIKIPATHWAYS_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/kegg_wikipathways.xlsx'
KEGG_REACTOME_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/kegg_reactome.xlsx'
WIKIPATHWAYS_REACTOME_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/wikipathways_reactome.xlsx'
Expand Down Expand Up @@ -140,7 +142,7 @@ def ensure_pathme_folders():
'hidden compound': 'XML Hidden Compound Relations',
'missing interaction': 'XML Missing Interaction Relations',
'state change': 'XML State Change Relations',
'brite': 'XML Brite Hierarchy'
'brite': 'XML Brite Hierarchy',
}

BEL_STATS_COLUMN_NAMES = {
Expand All @@ -162,5 +164,5 @@ def ensure_pathme_folders():
'hasVariant': 'BEL Variant Edges',
'hasReactant': 'BEL Reactants Edges',
'hasProduct': 'BEL Products Edges',
'translatedTo': 'BEL Translation Edges'
'translatedTo': 'BEL Translation Edges',
}
91 changes: 30 additions & 61 deletions src/pathme/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,20 @@
import itertools as itt
import logging
import os
import re
import pickle
from typing import Dict, List, Optional, Set
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urlparse
from urllib.request import urlretrieve

import click
import pandas as pd
import pybel
import rdflib
from pybel import BELGraph, from_pickle
from pybel.struct.summary import count_functions, count_relations

from pathme.constants import UNKNOWN, BEL_STATS_COLUMN_NAMES, PATHME_DIR

import pybel
from pathme.constants import BEL_STATS_COLUMN_NAMES, UNKNOWN
from pathme.export_utils import get_files_in_folder
from pybel import from_pickle
from pybel import union
from pybel.struct.summary import count_functions, edge_summary
from pybel_tools import summary

log = logging.getLogger(__name__)

Expand All @@ -40,38 +36,14 @@ def __call__(self, *args, **kwargs):
return self.method(*args, **kwargs)


def check_multiple(element, element_name):
"""Check whether element is iterable.
:param element: variable to check
:param element_name: name to print
:return:
"""
if isinstance(element, set) or isinstance(element, list):
log.warning('Multiple {}: {}'.format(element_name, element))
# TODO: print the wikipathways bps that return a set because they are probably wrong.
if len(element) == 1:
return list(element)[0]
elif len(element) > 1:
for subelement in element:
if bool(re.match('^[A-Z0-9]+$', subelement)):
return subelement

return list(element)[0]
else:
log.warning('Empty list/set %s', element)

return element

def parse_id_uri(uri):
"""Get the components of a given uri (with identifier at the last position).
:param str uri: URI
:param uri: URI
:returns: prefix (ex: http://rdf.wikipathways.org/...)
:returns: prefix_namespaces: if there are many namespaces, until the penultimate (ex: .../Pathway/WP22_r97775/...)
:returns: namespace: if there are many namespaces, the last (ex: .../Interaction/)
:returns: identifier (ex: .../c562c/)
:rtype: tuple[str,str,str,str]
"""
parsed_url = urlparse(uri)
uri_suffix = parsed_url.path.split('/')
Expand All @@ -82,18 +54,21 @@ def parse_id_uri(uri):
# namespace (Interaction),
# identifier (id61b0d9c7) in the given example ->
# (http://rdf.wikipathways.org/Pathway/WP2118_r97625/WP/Interaction/id61b0d9c7)
return parsed_url.netloc, '/'.join(uri_suffix[0:-2]), uri_suffix[-2], uri_suffix[-1]
return (
parsed_url.netloc,
'/'.join(uri_suffix[0:-2]),
uri_suffix[-2],
uri_suffix[-1],
)


def parse_namespace_uri(uri):
def parse_namespace_uri(uri: str) -> Tuple[str, str, str]:
"""Get the prefix and namespace of a given URI (without identifier, only with a namespace at last position).
:param str uri: URI
:param uri: URI
:returns: prefix (ex: http://purl.org/dc/terms/...)
:returns: namespace (ex: .../isPartOf)
:rtype: tuple[str,str]
"""

# Split the uri str by '/'.
splited_uri = uri.split('/')

Expand Down Expand Up @@ -137,7 +112,6 @@ def parse_rdf(path: str, format: Optional[str] = None) -> rdflib.Graph:
def entry_result_to_dict(entry, **kwargs):
"""Export to a dictionary a SPARQL query result data structure.
:param str rdflib.plugins.sparql.processor.SPARQLResult: SPARQL query result data structure, with all the arguments queried for all entries of a certain primary type.
:returns: entries_dict: Dictionary with all the entries id as keys and the entries arguments as values.
:rtype: dict
"""
Expand Down Expand Up @@ -184,7 +158,6 @@ def entries_dict_ids_argument(entries_dict):
def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str]]]:
"""Export to a dictionary a SPARQL query result data structure.
:param str rdflib.plugins.sparql.processor.SPARQLResult: SPARQL query result data structure, with all the arguments queried for all entries of a certain primary type.
:returns: entries_dict: Dictionary with all the entries id as keys and the entries arguments as values.
:rtype: dict
"""
Expand Down Expand Up @@ -276,8 +249,8 @@ def get_pathway_statitics(nodes_types, edges_types, bel_graph, **kwargs):
pathway_statistics = {
'RDF nodes': rdf_nodes_statistics,
'RDF interactions': rdf_edges_statistics,
'BEL imported nodes': pybel.struct.summary.count_functions(bel_graph),
'BEL imported edges': summary.edge_summary.count_relations(bel_graph),
'BEL imported nodes': count_functions(bel_graph),
'BEL imported edges': count_relations(bel_graph),
'bel_vs_rdf': {
'RDF nodes': rdf_total_nodes,
'RDF interactions': rdf_total_edges,
Expand Down Expand Up @@ -338,10 +311,10 @@ def statistics_to_df(all_pathways_statistics):
return df


def get_bel_types(path):
def get_bel_types(path: str):
"""Get BEL node and edge type statistics.
:param str path: path to pickle
:param path: path to pickle
:return: count of all nodes and edges in a BEL graph
:rtype: dict
"""
Expand All @@ -357,16 +330,16 @@ def get_bel_types(path):
bel_stats.update(bel_functions_dict)

# Get count of all BEL edge types
bel_edges_dict = edge_summary.count_relations(bel_graph)
bel_edges_dict = count_relations(bel_graph)
bel_stats.update(bel_edges_dict)

return bel_stats


def get_bel_stats(resource_folder):
def get_bel_stats(resource_folder: str):
"""Get all BEL node and edge type statistics.
:param str resource_folder: path to BEL pickles folder
:param resource_folder: path to BEL pickles folder
:return: count of all nodes and edges in all BEL graphs from one resource
:rtype: dict
"""
Expand Down Expand Up @@ -401,7 +374,7 @@ def get_bel_stats(resource_folder):
def get_genes_from_pickles(resource_folder: str, files: List[str], manager) -> Dict[str, set]:
"""Get BEL graph gene set for all pathways in resource.
:param str resource_folder: path to resource folder
:param resource_folder: path to resource folder
:param list files: list of BEL graph pickles
:param bio2bel Manager manager: Manager
:return: BEL graph gene sets for each pathway in resource
Expand Down Expand Up @@ -452,9 +425,8 @@ def get_kegg_genes_from_pickles(resource_folder, files: List[str], manager) -> D
def get_genes_in_graph(graph: pybel.BELGraph) -> Set[str]:
"""Get BEL graph gene set for a pathway.
:param pybel.BELGraph graph: BEL Graph
:param graph: BEL Graph
:return: BEL graph gene set
:rtype: set
"""
gene_set = set()

Expand Down Expand Up @@ -495,10 +467,10 @@ def jaccard_similarity(database_gene_set, bel_genes_set):
count_no_similarity += 1

print('Jaccard index for gene sets in database vs gene sets in BEL:')
print('{} of {} gene sets in the database and BEL graphs have a similarity of 100%.'.format(count, len(
jaccard_similarities)))
print('{} of {} gene sets in the database and BEL graphs have a similarity of 0%.'.format(count_no_similarity, len(
jaccard_similarities)))
print('{} of {} gene sets in the database and BEL '
'graphs have a similarity of 100%.'.format(count, len(jaccard_similarities)))
print('{} of {} gene sets in the database and '
'BEL graphs have a similarity of 0%.'.format(count_no_similarity, len(jaccard_similarities)))

return jaccard_similarities

Expand Down Expand Up @@ -537,13 +509,10 @@ def download_data(force_download=False):
decompress_file(data, export_path)


def summarize_helper(graphs):
"""Print in console summary of graphs.
:param iter[graphs] graphs: BEL Graphs
"""
def summarize_helper(graphs: Iterable[BELGraph]):
"""Print in console summary of graphs."""
click.echo('joining graphs')
graph = union(graphs)
graph = pybel.union(graphs)

click.echo('generating summary')
summary_str = graph.summary_str()
Expand Down

0 comments on commit 7714f7f

Please sign in to comment.