Code cleanup

PathwayMerger · Apr 26, 2019 · 7714f7f · 7714f7f
1 parent 379d084
commit 7714f7f
Show file tree

Hide file tree

Showing 6 changed files with 238 additions and 245 deletions.
diff --git a/src/pathme/cli.py b/src/pathme/cli.py
@@ -141,7 +141,7 @@ def download():
 @click.option('-c', '--connection', help="Defaults to {}".format(DEFAULT_CACHE_CONNECTION))
 @click.option('-d', '--debug', is_flag=True, default=False, help='Debug mode')
 @click.option('-x', '--only-canonical', default=True, help='Parse only canonical pathways')
-def bel(connection, debug, only_canonical):
+def bel(connection: str, debug: bool, only_canonical: bool):
     """Convert WikiPathways to BEL."""
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
@@ -166,7 +166,7 @@ def bel(connection, debug, only_canonical):
     wikipathways_to_pickles(resource_files, resource_folder, hgnc_manager)
 
     logger.info(
-        'WikiPathways exported in %.2f seconds. A total of {} warnings regarding entities that could not be converted '
+        'WikiPathways exported in %.2f seconds. A total of %d warnings regarding entities that could not be converted '
         'to standard identifiers were found.',
         time.time() - t, logging.debug.counter
     )

diff --git a/src/pathme/constants.py b/src/pathme/constants.py
@@ -6,18 +6,41 @@
 
 from bio2bel.utils import get_connection
 
+MODULE_NAME = 'pathme'
+DEFAULT_PATHME_DIR = os.path.join(os.path.expanduser('~'), '.pathme')
+PATHME_DIR = os.environ.get('PATHME_DIRECTORY', DEFAULT_PATHME_DIR)
 
-def get_data_dir():
-    """Ensures the appropriate PathMe data directory exists for the given module, then returns the file path.
 
-    :return: The module's data directory
-    :rtype: str
-    """
+def get_data_dir() -> str:
+    """Ensure the appropriate PathMe data directory exists for the given module, then return the file path."""
     os.makedirs(PATHME_DIR, exist_ok=True)
     return PATHME_DIR
 
 
-def ensure_pathme_folders():
+DATA_DIR = get_data_dir()
+DEFAULT_CACHE_CONNECTION = get_connection(MODULE_NAME)
+
+KEGG = 'kegg'
+KEGG_DIR = os.path.join(DATA_DIR, KEGG)
+KEGG_BEL = os.path.join(KEGG_DIR, 'bel')
+KEGG_FILES = os.path.join(KEGG_DIR, 'xml')
+KEGG_CACHE = os.path.join(KEGG_DIR, 'cache')
+
+REACTOME = 'reactome'
+REACTOME_DIR = os.path.join(DATA_DIR, REACTOME)
+REACTOME_BEL = os.path.join(REACTOME_DIR, 'bel')
+REACTOME_FILES = os.path.join(REACTOME_DIR, 'rdf')
+
+WIKIPATHWAYS = 'wikipathways'
+WIKIPATHWAYS_DIR = os.path.join(DATA_DIR, WIKIPATHWAYS)
+WIKIPATHWAYS_BEL = os.path.join(WIKIPATHWAYS_DIR, 'bel')
+WIKIPATHWAYS_FILES = os.path.join(WIKIPATHWAYS_DIR, 'rdf')
+
+SPIA_DIR = os.path.join(DATA_DIR, 'spia')
+UNIVERSE_DIR = os.path.join(DATA_DIR, 'universe')
+
+
+def ensure_pathme_folders(): # TODO why is this a function?
     """Ensure data folders are created."""
     os.makedirs(KEGG_DIR, exist_ok=True)
     os.makedirs(REACTOME_DIR, exist_ok=True)
@@ -35,38 +58,16 @@ def ensure_pathme_folders():
     os.makedirs(WIKIPATHWAYS_FILES, exist_ok=True)
 
 
-MODULE_NAME = 'pathme'
-PATHME_DIR = os.environ.get('PATHME_DIRECTORY', os.path.join(os.path.expanduser('~'), '.pathme'))
-DATA_DIR = get_data_dir()
-DEFAULT_CACHE_CONNECTION = get_connection(MODULE_NAME)
+ensure_pathme_folders()
 
-KEGG = 'kegg'
+UNKNOWN = 'unknown'
+
+# Other namespaces
 INTERPRO = 'interpro'
 PFAM = 'pfam'
 BRENDA = 'brenda'
-
-REACTOME = 'reactome'
-WIKIPATHWAYS = 'wikipathways'
-
-KEGG_DIR = os.path.join(DATA_DIR, KEGG)
-REACTOME_DIR = os.path.join(DATA_DIR, REACTOME)
-WIKIPATHWAYS_DIR = os.path.join(DATA_DIR, WIKIPATHWAYS)
-SPIA_DIR = os.path.join(DATA_DIR, 'spia')
-UNIVERSE_DIR = os.path.join(DATA_DIR, 'universe')
-
-KEGG_BEL = os.path.join(KEGG_DIR, 'bel')
-REACTOME_BEL = os.path.join(REACTOME_DIR, 'bel')
-WIKIPATHWAYS_BEL = os.path.join(WIKIPATHWAYS_DIR, 'bel')
-
-KEGG_FILES = os.path.join(KEGG_DIR, 'xml')
-REACTOME_FILES = os.path.join(REACTOME_DIR, 'rdf')
-WIKIPATHWAYS_FILES = os.path.join(WIKIPATHWAYS_DIR, 'rdf')
-
-KEGG_CACHE = os.path.join(DATA_DIR, KEGG, 'cache')
-
-ensure_pathme_folders()
-
-UNKNOWN = 'unknown'
+CHEMBL = 'chembl'
+MIRBASE = 'mirbase'
 
 KEGG_ID = 'kegg_id'
 KEGG_NAME = 'kegg_name'
@@ -86,10 +87,11 @@ def ensure_pathme_folders():
     'phosphorylation': 'Ph',
     'glycosylation': 'Glyco',
     'ubiquitination': 'Ub',
-    'methylation': 'Me'
+    'methylation': 'Me',
 }
 KEGG_CITATION = '10592173'
 
+# FIXME why doesn't this just import the compath_resources package?
 KEGG_WIKIPATHWAYS_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/kegg_wikipathways.xlsx'
 KEGG_REACTOME_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/kegg_reactome.xlsx'
 WIKIPATHWAYS_REACTOME_MAPPINGS = 'https://github.com/ComPath/curation/raw/master/mappings/wikipathways_reactome.xlsx'
@@ -140,7 +142,7 @@ def ensure_pathme_folders():
     'hidden compound': 'XML Hidden Compound Relations',
     'missing interaction': 'XML Missing Interaction Relations',
     'state change': 'XML State Change Relations',
-    'brite': 'XML Brite Hierarchy'
+    'brite': 'XML Brite Hierarchy',
 }
 
 BEL_STATS_COLUMN_NAMES = {
@@ -162,5 +164,5 @@ def ensure_pathme_folders():
     'hasVariant': 'BEL Variant Edges',
     'hasReactant': 'BEL Reactants Edges',
     'hasProduct': 'BEL Products Edges',
-    'translatedTo': 'BEL Translation Edges'
+    'translatedTo': 'BEL Translation Edges',
 }
diff --git a/src/pathme/utils.py b/src/pathme/utils.py
@@ -6,24 +6,20 @@
 import itertools as itt
 import logging
 import os
-import re
 import pickle
-from typing import Dict, List, Optional, Set
+from typing import Dict, Iterable, List, Optional, Set, Tuple
 from urllib.parse import urlparse
 from urllib.request import urlretrieve
 
 import click
 import pandas as pd
+import pybel
 import rdflib
+from pybel import BELGraph, from_pickle
+from pybel.struct.summary import count_functions, count_relations
 
-from pathme.constants import UNKNOWN, BEL_STATS_COLUMN_NAMES, PATHME_DIR
-
-import pybel
+from pathme.constants import BEL_STATS_COLUMN_NAMES, UNKNOWN
 from pathme.export_utils import get_files_in_folder
-from pybel import from_pickle
-from pybel import union
-from pybel.struct.summary import count_functions, edge_summary
-from pybel_tools import summary
 
 log = logging.getLogger(__name__)
 
@@ -40,38 +36,14 @@ def __call__(self, *args, **kwargs):
         return self.method(*args, **kwargs)
 
 
-def check_multiple(element, element_name):
-    """Check whether element is iterable.
-
-    :param element: variable to check
-    :param element_name: name to print
-    :return:
-    """
-    if isinstance(element, set) or isinstance(element, list):
-        log.warning('Multiple {}: {}'.format(element_name, element))
-        # TODO: print the wikipathways bps that return a set because they are probably wrong.
-        if len(element) == 1:
-            return list(element)[0]
-        elif len(element) > 1:
-            for subelement in element:
-                if bool(re.match('^[A-Z0-9]+$', subelement)):
-                    return subelement
-
-            return list(element)[0]
-        else:
-            log.warning('Empty list/set %s', element)
-
-    return element
-
 def parse_id_uri(uri):
     """Get the components of a given uri (with identifier at the last position).
 
-    :param str uri: URI
+    :param uri: URI
     :returns: prefix (ex: http://rdf.wikipathways.org/...)
     :returns: prefix_namespaces: if there are many namespaces, until the penultimate (ex: .../Pathway/WP22_r97775/...)
     :returns: namespace: if there are many namespaces, the last (ex: .../Interaction/)
     :returns: identifier (ex: .../c562c/)
-    :rtype: tuple[str,str,str,str]
     """
     parsed_url = urlparse(uri)
     uri_suffix = parsed_url.path.split('/')
@@ -82,18 +54,21 @@ def parse_id_uri(uri):
     # namespace (Interaction),
     # identifier (id61b0d9c7) in the given example ->
     # (http://rdf.wikipathways.org/Pathway/WP2118_r97625/WP/Interaction/id61b0d9c7)
-    return parsed_url.netloc, '/'.join(uri_suffix[0:-2]), uri_suffix[-2], uri_suffix[-1]
+    return (
+        parsed_url.netloc,
+        '/'.join(uri_suffix[0:-2]),
+        uri_suffix[-2],
+        uri_suffix[-1],
+    )
 
 
-def parse_namespace_uri(uri):
+def parse_namespace_uri(uri: str) -> Tuple[str, str, str]:
     """Get the prefix and namespace of a given URI (without identifier, only with a namespace at the last position).
 
-    :param str uri: URI
+    :param uri: URI
     :returns: prefix (ex: http://purl.org/dc/terms/...)
     :returns: namespace (ex: .../isPartOf)
-    :rtype: tuple[str,str]
     """
-
     # Split the uri str by '/'.
     splited_uri = uri.split('/')
 
@@ -137,7 +112,6 @@ def parse_rdf(path: str, format: Optional[str] = None) -> rdflib.Graph:
 def entry_result_to_dict(entry, **kwargs):
     """Export to a dictionary a SPARQL query result data structure.
 
-    :param str rdflib.plugins.sparql.processor.SPARQLResult: SPARQL query result data structure, with all the arguments queried for all entries of a certain primary type.
     :returns: entries_dict: Dictionary with all the entries id as keys and the entries arguments as values.
     :rtype: dict
     """
@@ -184,7 +158,6 @@ def entries_dict_ids_argument(entries_dict):
 def query_result_to_dict(entries, **kwargs) -> Dict[str, Dict[str, Dict[str, str]]]:
     """Export to a dictionary a SPARQL query result data structure.
 
-    :param str rdflib.plugins.sparql.processor.SPARQLResult: SPARQL query result data structure, with all the arguments queried for all entries of a certain primary type.
     :returns: entries_dict: Dictionary with all the entries id as keys and the entries arguments as values.
     :rtype: dict
     """
@@ -276,8 +249,8 @@ def get_pathway_statitics(nodes_types, edges_types, bel_graph, **kwargs):
     pathway_statistics = {
         'RDF nodes': rdf_nodes_statistics,
         'RDF interactions': rdf_edges_statistics,
-        'BEL imported nodes': pybel.struct.summary.count_functions(bel_graph),
-        'BEL imported edges': summary.edge_summary.count_relations(bel_graph),
+        'BEL imported nodes': count_functions(bel_graph),
+        'BEL imported edges': count_relations(bel_graph),
         'bel_vs_rdf': {
             'RDF nodes': rdf_total_nodes,
             'RDF interactions': rdf_total_edges,
@@ -338,10 +311,10 @@ def statistics_to_df(all_pathways_statistics):
     return df
 
 
-def get_bel_types(path):
+def get_bel_types(path: str):
     """Get BEL node and edge type statistics.
 
-    :param str path: path to pickle
+    :param path: path to pickle
     :return: count of all nodes and edges in a BEL graph
     :rtype: dict
     """
@@ -357,16 +330,16 @@ def get_bel_types(path):
     bel_stats.update(bel_functions_dict)
 
     # Get count of all BEL edge types
-    bel_edges_dict = edge_summary.count_relations(bel_graph)
+    bel_edges_dict = count_relations(bel_graph)
     bel_stats.update(bel_edges_dict)
 
     return bel_stats
 
 
-def get_bel_stats(resource_folder):
+def get_bel_stats(resource_folder: str):
     """Get all BEL node and edge type statistics.
 
-    :param str resource_folder: path to BEL pickles folder
+    :param resource_folder: path to BEL pickles folder
     :return: count of all nodes and edges in all BEL graphs from one resource
     :rtype: dict
     """
@@ -401,7 +374,7 @@ def get_bel_stats(resource_folder):
 def get_genes_from_pickles(resource_folder: str, files: List[str], manager) -> Dict[str, set]:
     """Get BEL graph gene set for all pathways in resource.
 
-    :param str resource_folder: path to resource folder
+    :param resource_folder: path to resource folder
     :param list files: list of BEL graph pickles
     :param bio2bel Manager manager: Manager
     :return: BEL graph gene sets for each pathway in resource
@@ -452,9 +425,8 @@ def get_kegg_genes_from_pickles(resource_folder, files: List[str], manager) -> D
 def get_genes_in_graph(graph: pybel.BELGraph) -> Set[str]:
     """Get BEL graph gene set for a pathway.
 
-    :param pybel.BELGraph graph: BEL Graph
+    :param graph: BEL Graph
     :return: BEL graph gene set
-    :rtype: set
     """
     gene_set = set()
 
@@ -495,10 +467,10 @@ def jaccard_similarity(database_gene_set, bel_genes_set):
             count_no_similarity += 1
 
     print('Jaccard index for gene sets in database vs gene sets in BEL:')
-    print('{} of {} gene sets in the database and BEL graphs have a similarity of 100%.'.format(count, len(
-        jaccard_similarities)))
-    print('{} of {} gene sets in the database and BEL graphs have a similarity of 0%.'.format(count_no_similarity, len(
-        jaccard_similarities)))
+    print('{} of {} gene sets in the database and BEL '
+          'graphs have a similarity of 100%.'.format(count, len(jaccard_similarities)))
+    print('{} of {} gene sets in the database and '
+          'BEL graphs have a similarity of 0%.'.format(count_no_similarity, len(jaccard_similarities)))
 
     return jaccard_similarities
 
@@ -537,13 +509,10 @@ def download_data(force_download=False):
     decompress_file(data, export_path)
 
 
-def summarize_helper(graphs):
-    """Print in console summary of graphs.
-
-    :param iter[graphs] graphs: BEL Graphs
-    """
+def summarize_helper(graphs: Iterable[BELGraph]):
+    """Print in console summary of graphs."""
     click.echo('joining graphs')
-    graph = union(graphs)
+    graph = pybel.union(graphs)
 
     click.echo('generating summary')
     summary_str = graph.summary_str()