Add export universe module

PathwayMerger · Mar 5, 2019 · a725346 · a725346
1 parent 21dfffd
commit a725346
Show file tree

Hide file tree

Showing 2 changed files with 241 additions and 5 deletions.
diff --git a/src/pathme/cli.py b/src/pathme/cli.py
@@ -6,20 +6,20 @@
 import time
 
 import click
-from bio2bel_chebi import Manager as ChebiManager
-from bio2bel_hgnc import Manager as HgncManager
-from pybel import from_pickle
 from tqdm import tqdm
 
+from bio2bel_chebi import Manager as ChebiManager
+from bio2bel_hgnc import Manager as HgncManager
 from pathme.constants import *
-from pathme.constants import DEFAULT_CACHE_CONNECTION
 from pathme.kegg.convert_to_bel import kegg_to_pickles
 from pathme.kegg.utils import download_kgml_files, get_kegg_pathway_ids
 from pathme.reactome.rdf_sparql import get_reactome_statistics, reactome_to_bel
 from pathme.reactome.utils import untar_file
 from pathme.utils import CallCounted, get_files_in_folder, make_downloader, statistics_to_df, summarize_helper
 from pathme.wikipathways.rdf_sparql import get_wp_statistics, wikipathways_to_pickles
 from pathme.wikipathways.utils import get_file_name_from_url, get_wikipathways_files, unzip_file
+from pybel import from_pickle
+from pybel.struct.mutation import collapse_to_genes, collapse_all_variants
 
 log = logging.getLogger(__name__)
 
@@ -359,7 +359,22 @@ def export_to_spia(kegg_path, reactome_path, wikipathways_path, output):
 @main.command()
 def get_harmonize_universe():
     """Return harmonized universe of all the databases included in PathMe."""
-    NotImplemented
+
+    def get_universe_graph():
+        """Placeholder for building the merged universe graph; not implemented yet."""
+        # NotImplementedError is the conventional signal for an unfinished stub
+        # (NotADirectoryError is an OSError reserved for filesystem operations).
+        raise NotImplementedError
+
+    universe = get_universe_graph()
+
+    # Step 1: Flatten complexes and composites
+    # NOTE(review): pybel_tools is imported at function scope, presumably to keep it optional
+    # for the other commands -- confirm before moving it to the top of the module.
+    from pybel_tools.node_utils import list_abundance_cartesian_expansion, reaction_cartesian_expansion
+    list_abundance_cartesian_expansion(universe)
+    reaction_cartesian_expansion(universe)
+
+    # Step 2 (TODO): Harmonize entity names
+
+    # Step 3: Merge to genes and variants
+    collapse_all_variants(universe)
+    collapse_to_genes(universe)
 
 
 if __name__ == '__main__':

diff --git a/src/pathme/export_universe.py b/src/pathme/export_universe.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+"""Export harmonized universe."""
+
+from typing import List
+from pybel.dsl import Abundance, BiologicalProcess, CentralDogma, ListAbundance, Reaction
+
+# Lower-case names that calculate_database_sets() files under biological processes even though
+# the node carrying them is a gene (CentralDogma) or metabolite (Abundance) node.
+# Membership is tested by exact string match against the lower-cased, stripped node name, so the
+# irregular inner spacing of some entries is significant.
+# NOTE(review): the repeated spaces presumably mirror the source labels verbatim -- confirm
+# before normalizing any entry.
+WIKIPATHWAYS_BIOL_PROCESS = {
+    "lipid biosynthesis", "hsc survival", "glycolysis & gluconeogenesis",
+    "triacylglyceride  synthesis", "wnt canonical signaling", "regulation of actin skeleton",
+    "fatty acid metabolism", "mrna processing major splicing pathway", "senescence",
+    "monocyte differentiation", "pentose phosphate pathway", "ethanolamine  phosphate",
+    "hsc differentiation", "actin, stress fibers and adhesion",
+    "regulation of actin cytoskeleton", "s-phase progression", "g1-s transition",
+    "toll-like receptor signaling pathway", "regulation of  actin cytoskeleton",
+    "proteasome degradation", "apoptosis", "bmp pathway", "ampk activation",
+    "g1/s checkpoint arrest", "mapk signaling pathway",
+    "chromatin remodeling and  epigenetic modifications", "wnt signaling pathway",
+    "ros production", "erbb signaling pathway", "shh pathway", "inflammation",
+    "dna replication", "mrna translation", "oxidative stress",
+    "cell cycle checkpoint activation", "gi/go pathway", "wnt pathway",
+    "g1/s transition of mitotic cell cycle", "modulation of estrogen receptor signalling",
+    "dna repair", "bmp canonical signaling", "igf and insuline signaling", "unfolded protein response", "cell death",
+    "p38/mapk  pathway", "glycogen metabolism", "gnrh signal pathway",
+    "the intra-s-phase checkpoint mediated arrest of cell cycle progression", "tca cycle",
+    "mtor protein kinase signaling pathway", "proteasome  degradation pathway", "morphine metabolism", "hsc aging",
+    "gastric pepsin release", "parietal cell production", "prostaglandin pathway", "cell cycle (g1/s)  progression",
+    "notch pathway", "g2/m progression", "wnt signaling", "cell adhesion", "cell cycle progression", "egfr pathway",
+    "cell cycle", "angiogenesis", "g2/m-phase checkpoint", "hsc self renewal", "26s proteasome  degradation",
+    "mapk signaling", "immune system up or down regulation", "m-phase progression", "insulin signaling",
+    "nf kappa b pathway", "cell cycle  progression", "gi pathway",
+    "cd45+ hematopoietic-    derived cell    proliferation",
+    "kreb's cycle", "glycogen synthesis", "apoptosis pathway",
+    "g1/s progression", "inflammasome activation", "melanin biosynthesis", "proteasomal degradation",
+    "g2/m checkpoint arrest",
+    "g1/s cell cycle transition", "dna damage response", "gastric histamine release"
+}
+
+# Lower-case names of WikiPathways abundances that stay metabolites: calculate_database_sets()
+# treats an abundance in the WIKIDATA, WIKIPATHWAYS or REACTOME namespace as a biological
+# process unless its name is listed here.
+WIKIPATHWAYS_METAB = {
+    "2,8-dihydroxyadenine", "8,11-dihydroxy-delta-9-thc", "adp-ribosyl", "cocaethylene", "dhcer1p",
+    "ecgonidine", "f2-isoprostane", "fumonisins b1", "iodine", "l-glutamate", "lactosylceramide",
+    "methylecgonidine", "n-acetyl-l-aspartate", "nad+", "nadph oxidase", "neuromelanin",
+    "nicotinic acid (na)", "nmn", "pip2", "sphingomyelin", "thf"
+}
+# Maps alternative spellings of WikiPathways metabolite names to the spelling that is stored.
+# calculate_database_sets() looks names up here after lower-casing and stripping them.
+# NOTE(review): the "Ca 2+" key contains an upper-case letter, so it cannot match a lower-cased
+# name, and its value is simply the lower-cased key -- presumably a leftover; confirm intent.
+WIKIPATHWAYS_NAME_NORMALIZATION = {
+    "Ca 2+": "ca 2+", "acetyl coa": "acetyl-coa", "acetyl-coa(mit)": "acetyl-coa",
+    "h20": "h2o"
+}
+
+# Entities in Reactome that required manual curation
+
+# Fragments that calculate_database_sets() discards when a Reactome gene name holding several
+# identifiers is split into its parts.
+BLACK_LIST_REACTOME = {"5'"}
+# Lower-case names of Reactome abundances that calculate_database_sets() counts in the gene set
+# instead of the metabolite set.
+REACTOME_PROT = {
+    "phospho-g2/m transition proteins", "integrin alpha5beta1, integrin alphavbeta3, cd47",
+    "food proteins", "activated fgfr2", "adherens junction-associated proteins",
+    "pi3k mutants,activator:pi3k", "prolyl 3-hydroxylases", "gpi-anchored proteins", "c3d, c3dg, ic3b",
+    "c4s/c6s chains", "activated fgfr1 mutants and fusions", "activated fgfr3 mutants", "protein",
+    "cyclin a2:cdk2 phosphorylated g2/m transition protein", "c4c, c3f", "activated raf/ksr1",
+    "activated fgfr1 mutants", "g2/m transition proteins", "lman family receptors", "cyclin",
+    "usp12:wdr48:wdr20,usp26", "proteins with cleaved gpi-anchors", "activated fgfr2 mutants", "c4d, ic3b",
+    "c5b:c6:c7, c8, c9", "cyclin a1:cdk2 phosphorylated g2/m transition protein",
+    "genetically or chemically inactive braf", "il13-downregulated proteins", "activated fgfr4 mutants",
+    "rna-binding protein in rnp (ribonucleoprotein) complexes", "effector proteins", "usp3, saga complex",
+    'dephosphorylated "receiver" raf/ksr1'
+}
+
+
+def _strip_gene_suffix(name: str) -> str:
+    """Return *name* without surrounding whitespace or a trailing ' gene' / ' genes' word."""
+    name = name.strip()
+    # str.strip(' gene') would remove any of the characters ' ', 'g', 'e', 'n' from both ends
+    # (turning 'egfr gene' into 'fr'), so the suffix is matched explicitly instead.
+    for suffix in (' genes', ' gene'):
+        if name.endswith(suffix):
+            return name[:-len(suffix)].strip()
+    return name
+
+
+def process_reactome_multiple_genes(genes: List[str]) -> List[str]:
+    """Expand the pieces of a Reactome name that packs several identifiers into full symbols.
+
+    :param genes: pieces obtained by splitting one Reactome name on ',' or '/'
+        (e.g. ``['abc1', ' 2', ' 3']`` or ``['htr1a', 'b', 'd', 'htr5a']``)
+    :return: one symbol per usable piece; pieces matching none of the rules below are dropped
+
+    Rules, applied to each piece after removing whitespace and a trailing ' gene'/' genes':
+
+    - the first piece is always kept;
+    - a piece sharing its first two characters with the first symbol, or longer than two
+      characters, is kept as a symbol of its own;
+    - a piece made only of digits, or a single character, is an abbreviation: it replaces the
+      last character of the first symbol ('abc1' + '2' -> 'abc2', 'htr1a' + 'b' -> 'htr1b').
+    """
+    gene_list = []
+    first = None  # cleaned first symbol, the reference for prefix checks and expansions
+
+    for gene in genes:
+        gene = _strip_gene_suffix(gene)
+
+        # First element is always OK
+        if first is None:
+            first = gene
+            gene_list.append(gene)
+
+        # Same start as the first symbol, or long enough to be a 'valid' HGNC symbol on its own
+        elif gene[:2] == first[:2] or len(gene) > 2:
+            gene_list.append(gene)
+
+        # Only a number (e.g., 'ABC1, 2, 3') or a single letter (e.g., HTR1A,B,D,E,F,HTR5A):
+        # graft it onto the first symbol in place of its last character
+        elif gene.isdigit() or len(gene) == 1:
+            gene_list.append(first[:-1] + gene)
+
+    return gene_list
+
+
+def munge_reactome_gene(gene):
+    """Split a Reactome gene name that bundles several identifiers.
+
+    A name containing ',' is split on commas; otherwise a name containing '/' is split on
+    slashes. The pieces are handed to ``process_reactome_multiple_genes`` and its list is
+    returned. A name with neither separator is returned unchanged as a string.
+    """
+    # The comma is tried first, so a name holding both separators is split on commas only.
+    for separator in (",", "/"):
+        if separator in gene:
+            pieces = gene.split(separator)
+            return process_reactome_multiple_genes(pieces)
+
+    return gene
+
+
+def _drop_trailing_gene_word(name):
+    """Remove a trailing ' gene' / ' genes' word from *name* (suffix match, not a character set)."""
+    for suffix in (' genes', ' gene'):
+        if name.endswith(suffix):
+            return name[:-len(suffix)].rstrip()
+    return name
+
+
+def calculate_database_sets(nodes, database):
+    """Calculate node sets for each modality in the database.
+
+    :param nodes: iterable of BEL nodes; each is classified with ``isinstance`` checks against
+        the pybel DSL classes and must expose ``name`` (and ``namespace`` for abundances)
+    :param str database: 'reactome' and 'wikipathways' enable database-specific clean-up;
+        any other value gets only the generic handling
+    :return: tuple ``(gene_nodes, mirna_nodes, metabolite_nodes, bp_nodes)`` of sets holding
+        lower-cased names
+
+    List abundances, reactions and nodes without a name are skipped.
+    """
+    gene_nodes = set()
+    mirna_nodes = set()
+    metabolite_nodes = set()
+    bp_nodes = set()
+
+    for node in nodes:
+
+        if isinstance(node, (ListAbundance, Reaction)) or not node.name:
+            continue
+
+        # Lower case name and strip quotes or white spaces
+        name = node.name.lower().strip('"').strip()
+
+        # Dealing with Genes/miRNAs
+        if isinstance(node, CentralDogma):
+
+            ##################
+            # miRNA entities #
+            ##################
+
+            if name.startswith("mir"):
+
+                # Reactome preprocessing to flatten multiple identifiers
+                if database == 'reactome':
+                    reactome_cell = munge_reactome_gene(name)
+                    if isinstance(reactome_cell, list):
+                        for mirna in reactome_cell:
+                            mirna_nodes.add(mirna.replace("mir-", "mir"))
+                    else:
+                        # Suffix removal instead of str.strip(' genes'), which strips a character
+                        # set and would also eat trailing letters such as the 'g' in 'mirlet7g'.
+                        mirna_nodes.add(_drop_trailing_gene_word(name).replace("mir-", "mir"))
+
+                    continue
+
+                mirna_nodes.add(name.replace("mir-", "mir"))
+
+            ##################
+            # Genes entities #
+            ##################
+
+            else:
+                # Reactome preprocessing to flatten multiple identifiers
+                if database == 'reactome':
+                    reactome_cell = munge_reactome_gene(name)
+                    if isinstance(reactome_cell, list):
+                        for gene in reactome_cell:
+                            if gene in BLACK_LIST_REACTOME:  # Filter entities in black list
+                                continue
+                            if gene.startswith("("):  # remove redundant parentheses
+                                gene = gene.strip("(").strip(")")
+
+                            gene_nodes.add(gene)
+                    else:
+                        gene_nodes.add(name)
+                    continue
+
+                # Other databases need no gene clean-up; only names known to be mislabelled
+                # biological processes are rerouted
+                if name in WIKIPATHWAYS_BIOL_PROCESS:
+                    bp_nodes.add(name)
+                    continue
+                gene_nodes.add(name)
+
+        #######################
+        # Metabolite entities #
+        #######################
+
+        elif isinstance(node, Abundance):
+
+            if database == 'wikipathways':
+                # Biological processes that are captured as abundance in BEL since they were
+                # characterized wrong in WikiPathways
+                if name in WIKIPATHWAYS_BIOL_PROCESS:
+                    bp_nodes.add(name)
+                    continue
+
+                elif node.namespace in {'WIKIDATA', 'WIKIPATHWAYS', 'REACTOME'} and name not in WIKIPATHWAYS_METAB:
+                    bp_nodes.add(name)
+                    continue
+
+                # Fix naming in duplicate entity
+                if name in WIKIPATHWAYS_NAME_NORMALIZATION:
+                    name = WIKIPATHWAYS_NAME_NORMALIZATION[name]
+
+            elif database == 'reactome':
+                # Curated proteins that were coded as metabolites
+                if name in REACTOME_PROT:
+                    gene_nodes.add(name)
+                    continue
+
+                # Flatten multiple identifiers (this is not trivial because most ChEBI names
+                # contain commas, so the split is only trusted when every piece is purely
+                # alphabetic)
+                parts = name.split(",")
+                if len(parts) > 1 and all(part.isalpha() for part in parts):
+                    # Add each identifier on its own (the unsplit name used to be re-added
+                    # once per piece, so the split had no effect)
+                    metabolite_nodes.update(parts)
+                    continue
+
+            metabolite_nodes.add(name)
+
+        #################################
+        # Biological Processes entities #
+        #################################
+
+        elif isinstance(node, BiologicalProcess):
+            if name.startswith('title:'):
+                name = name[6:]  # KEGG normalize
+
+            bp_nodes.add(name)
+
+    return gene_nodes, mirna_nodes, metabolite_nodes, bp_nodes
+