Skip to content

Commit

Permalink
Add export universe module
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Domingo-Fernandez committed Mar 5, 2019
1 parent 21dfffd commit a725346
Show file tree
Hide file tree
Showing 2 changed files with 241 additions and 5 deletions.
25 changes: 20 additions & 5 deletions src/pathme/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
import time

import click
from bio2bel_chebi import Manager as ChebiManager
from bio2bel_hgnc import Manager as HgncManager
from pybel import from_pickle
from tqdm import tqdm

from bio2bel_chebi import Manager as ChebiManager
from bio2bel_hgnc import Manager as HgncManager
from pathme.constants import *
from pathme.constants import DEFAULT_CACHE_CONNECTION
from pathme.kegg.convert_to_bel import kegg_to_pickles
from pathme.kegg.utils import download_kgml_files, get_kegg_pathway_ids
from pathme.reactome.rdf_sparql import get_reactome_statistics, reactome_to_bel
from pathme.reactome.utils import untar_file
from pathme.utils import CallCounted, get_files_in_folder, make_downloader, statistics_to_df, summarize_helper
from pathme.wikipathways.rdf_sparql import get_wp_statistics, wikipathways_to_pickles
from pathme.wikipathways.utils import get_file_name_from_url, get_wikipathways_files, unzip_file
from pybel import from_pickle
from pybel.struct.mutation import collapse_to_genes, collapse_all_variants

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -359,7 +359,22 @@ def export_to_spia(kegg_path, reactome_path, wikipathways_path, output):
@main.command()
def get_harmonize_universe():
    """Return harmonized universe of all the databases included in PathMe."""

    def get_universe_graph():
        """Return the merged graph of all databases (placeholder, not implemented yet)."""
        # NotImplementedError is the built-in meant for unfinished code paths;
        # NotADirectoryError is a filesystem (OSError) exception and would mislead callers.
        raise NotImplementedError('Building the universe graph is not implemented yet')

    universe = get_universe_graph()

    # Step 1: Flatten complexes and composites
    from pybel_tools.node_utils import list_abundance_cartesian_expansion, reaction_cartesian_expansion
    list_abundance_cartesian_expansion(universe)
    reaction_cartesian_expansion(universe)

    # Step 2: TODO: Harmonize entity names

    # Step 3: Merge to genes and variants
    collapse_all_variants(universe)
    collapse_to_genes(universe)


if __name__ == '__main__':
Expand Down
221 changes: 221 additions & 0 deletions src/pathme/export_universe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-

"""Export harmonized universe."""

from typing import List
from pybel.dsl import Abundance, BiologicalProcess, CentralDogma, ListAbundance, Reaction

# Lower-cased entity names that ``calculate_database_sets`` files under biological
# processes instead of genes or metabolites when it finds them on gene-like or
# abundance nodes.
# NOTE: "regulation of actin cytoskeleton" and "cell cycle progression" each appear
# twice in the literal below; the set collapses the duplicates, so this is harmless.
WIKIPATHWAYS_BIOL_PROCESS = {
"lipid biosynthesis", "hsc survival", "glycolysis & gluconeogenesis",
"triacylglyceride synthesis", "wnt canonical signaling", "regulation of actin skeleton",
"fatty acid metabolism", "mrna processing major splicing pathway", "senescence",
"monocyte differentiation", "pentose phosphate pathway", "ethanolamine phosphate",
"hsc differentiation", "actin, stress fibers and adhesion",
"regulation of actin cytoskeleton", "s-phase progression", "g1-s transition",
"toll-like receptor signaling pathway", "regulation of actin cytoskeleton",
"proteasome degradation", "apoptosis", "bmp pathway", "ampk activation",
"g1/s checkpoint arrest", "mapk signaling pathway",
"chromatin remodeling and epigenetic modifications", "wnt signaling pathway",
"ros production", "erbb signaling pathway", "shh pathway", "inflammation",
"dna replication", "mrna translation", "oxidative stress",
"cell cycle checkpoint activation", "gi/go pathway", "wnt pathway",
"g1/s transition of mitotic cell cycle", "modulation of estrogen receptor signalling",
"dna repair", "bmp canonical signaling", "igf and insuline signaling", "unfolded protein response", "cell death",
"p38/mapk pathway", "glycogen metabolism", "gnrh signal pathway",
"the intra-s-phase checkpoint mediated arrest of cell cycle progression", "tca cycle",
"mtor protein kinase signaling pathway", "proteasome degradation pathway", "morphine metabolism", "hsc aging",
"gastric pepsin release", "parietal cell production", "prostaglandin pathway", "cell cycle (g1/s) progression",
"notch pathway", "g2/m progression", "wnt signaling", "cell adhesion", "cell cycle progression", "egfr pathway",
"cell cycle", "angiogenesis", "g2/m-phase checkpoint", "hsc self renewal", "26s proteasome degradation",
"mapk signaling", "immune system up or down regulation", "m-phase progression", "insulin signaling",
"nf kappa b pathway", "cell cycle progression", "gi pathway",
"cd45+ hematopoietic- derived cell proliferation",
"kreb's cycle", "glycogen synthesis", "apoptosis pathway",
"g1/s progression", "inflammasome activation", "melanin biosynthesis", "proteasomal degradation",
"g2/m checkpoint arrest",
"g1/s cell cycle transition", "dna damage response", "gastric histamine release"
}

# WikiPathways abundance names that are kept as metabolites even though their
# namespace is WIKIDATA / WIKIPATHWAYS / REACTOME (every other abundance in those
# namespaces is classified as a biological process by ``calculate_database_sets``).
WIKIPATHWAYS_METAB = {
    "2,8-dihydroxyadenine",
    "8,11-dihydroxy-delta-9-thc",
    "adp-ribosyl",
    "cocaethylene",
    "dhcer1p",
    "ecgonidine",
    "f2-isoprostane",
    "fumonisins b1",
    "iodine",
    "l-glutamate",
    "lactosylceramide",
    "methylecgonidine",
    "n-acetyl-l-aspartate",
    "nad+",
    "nadph oxidase",
    "neuromelanin",
    "nicotinic acid (na)",
    "nmn",
    "pip2",
    "sphingomyelin",
    "thf",
}
# Maps alternative spellings of a WikiPathways metabolite name to the single
# spelling used in the exported metabolite set.
# NOTE(review): ``calculate_database_sets`` lower-cases a name before looking it up
# here, so the mixed-case key "Ca 2+" looks unreachable - confirm the intended key/value.
WIKIPATHWAYS_NAME_NORMALIZATION = {
    "Ca 2+": "ca 2+",
    "acetyl coa": "acetyl-coa",
    "acetyl-coa(mit)": "acetyl-coa",
    "h20": "h2o",
}

# Reactome entities that needed manual curation.

# Fragments dropped when a multi-identifier Reactome gene name is flattened.
BLACK_LIST_REACTOME = {
    "5'",
}

# Reactome abundance names that are counted as genes/proteins, not metabolites.
REACTOME_PROT = {
    "phospho-g2/m transition proteins",
    "integrin alpha5beta1, integrin alphavbeta3, cd47",
    "food proteins",
    "activated fgfr2",
    "adherens junction-associated proteins",
    "pi3k mutants,activator:pi3k",
    "prolyl 3-hydroxylases",
    "gpi-anchored proteins",
    "c3d, c3dg, ic3b",
    "c4s/c6s chains",
    "activated fgfr1 mutants and fusions",
    "activated fgfr3 mutants",
    "protein",
    "cyclin a2:cdk2 phosphorylated g2/m transition protein",
    "c4c, c3f",
    "activated raf/ksr1",
    "activated fgfr1 mutants",
    "g2/m transition proteins",
    "lman family receptors",
    "cyclin",
    "usp12:wdr48:wdr20,usp26",
    "proteins with cleaved gpi-anchors",
    "activated fgfr2 mutants",
    "c4d, ic3b",
    "c5b:c6:c7, c8, c9",
    "cyclin a1:cdk2 phosphorylated g2/m transition protein",
    "genetically or chemically inactive braf",
    "il13-downregulated proteins",
    "activated fgfr4 mutants",
    "rna-binding protein in rnp (ribonucleoprotein) complexes",
    "effector proteins",
    "usp3, saga complex",
    'dephosphorylated "receiver" raf/ksr1',
}


def _remove_gene_suffix(name: str) -> str:
    """Return *name* without surrounding whitespace or a trailing ' gene' / ' genes' word."""
    name = name.strip()
    # str.strip(' gene') would remove any of the characters ' ', 'g', 'e', 'n' from both
    # ends (turning e.g. 'egfr' into 'fr'), so the suffix is removed explicitly instead.
    for suffix in (' genes', ' gene'):
        if name.endswith(suffix):
            return name[:-len(suffix)].strip()
    return name


def process_reactome_multiple_genes(genes: List[str]) -> List[str]:
    """Expand a Reactome label that packs several gene identifiers into one string.

    :param genes: the pieces of the label after splitting on its delimiter
        (e.g. ``['HTR1A', 'B', 'D']`` or ``['ABC1', ' 2', ' 3']``)
    :return: the list of individual symbols; abbreviated pieces (a lone number or a
        lone letter) are completed using the first symbol, and pieces that match none
        of the rules below are dropped
    """
    gene_list = []
    for counter, raw_gene in enumerate(genes):

        # Remove whitespace and the ' gene' / ' genes' suffix
        gene = _remove_gene_suffix(raw_gene)

        # First element is always OK
        if counter == 0:
            gene_list.append(gene)
            continue

        # Compare against the *cleaned* first symbol so stray whitespace or a
        # ' gene' suffix on the first piece cannot leak into the completed symbols
        first_gene = gene_list[0]

        # If the identifier starts the same as the first one, it is right
        if gene[:2] == first_gene[:2]:
            gene_list.append(gene)

        # If the identifier is longer than 2 it is a 'valid' HGNC symbol
        elif len(gene) > 2:
            gene_list.append(gene)

        # Only a number (e.g., 'ABC1, 2, 3') or only one letter (e.g., 'HTR1A,B,D,E,F'):
        # replace the last character of the first symbol with it
        elif gene.isdigit() or len(gene) == 1:
            gene_list.append(first_gene[:-1] + gene)

    return gene_list


def munge_reactome_gene(gene):
    """Split a Reactome gene label that bundles several identifiers.

    A label containing a comma is split on commas; otherwise one containing a slash
    is split on slashes. The pieces are handed to ``process_reactome_multiple_genes``
    and its list is returned. A label with neither delimiter is returned unchanged.
    """
    # The comma takes precedence over the slash, so it is tried first
    for delimiter in (",", "/"):
        if delimiter in gene:
            return process_reactome_multiple_genes(gene.split(delimiter))

    return gene


def calculate_database_sets(nodes, database):
    """Calculate node sets for each modality in the database.

    Every named node is lower-cased, stripped of surrounding quotes/whitespace and
    sorted into one of four sets. List abundances, reactions and unnamed nodes are
    skipped.

    :param nodes: iterable of PyBEL nodes
    :param database: name of the source database; ``'reactome'`` and ``'wikipathways'``
        trigger database-specific clean-up, any other value gets none
    :return: a tuple ``(gene_nodes, mirna_nodes, metabolite_nodes, bp_nodes)`` of name sets
    """
    gene_nodes = set()
    mirna_nodes = set()
    metabolite_nodes = set()
    bp_nodes = set()

    for node in nodes:

        if isinstance(node, (ListAbundance, Reaction)) or not node.name:
            continue

        # Lower case name and strip quotes or white spaces
        name = node.name.lower().strip('"').strip()

        # Dealing with Genes/miRNAs
        if isinstance(node, CentralDogma):

            ##################
            # miRNA entities #
            ##################

            if name.startswith("mir"):

                # Reactome preprocessing to flat multiple identifiers
                if database == 'reactome':
                    reactome_cell = munge_reactome_gene(name)
                    if isinstance(reactome_cell, list):
                        for mirna in reactome_cell:
                            mirna_nodes.add(mirna.replace("mir-", "mir"))
                    else:
                        # Remove the trailing ' gene(s)' word. str.strip(' genes') would
                        # instead eat any trailing 'g', 'e', 'n', 's' characters of the
                        # identifier itself (e.g. 'mirlet7e' -> 'mirlet7').
                        for suffix in (' genes', ' gene'):
                            if name.endswith(suffix):
                                name = name[:-len(suffix)].strip()
                                break
                        mirna_nodes.add(name.replace("mir-", "mir"))

                    continue

                mirna_nodes.add(name.replace("mir-", "mir"))

            ##################
            # Genes entities #
            ##################

            else:
                # Reactome preprocessing to flat multiple identifiers
                if database == 'reactome':
                    reactome_cell = munge_reactome_gene(name)
                    if isinstance(reactome_cell, list):
                        for gene in reactome_cell:
                            if gene in BLACK_LIST_REACTOME:  # Filter entities in black list
                                continue
                            elif gene.startswith("("):  # remove redundant parentheses
                                gene = gene.strip("(").strip(")")

                            gene_nodes.add(gene)
                    else:
                        gene_nodes.add(name)
                    continue

                # WikiPathways and KEGG do not require any processing of genes
                if name in WIKIPATHWAYS_BIOL_PROCESS:
                    bp_nodes.add(name)
                    continue
                gene_nodes.add(name)

        #######################
        # Metabolite entities #
        #######################

        elif isinstance(node, Abundance):

            if database == 'wikipathways':
                # Biological processes that are captured as abundance in BEL since they were
                # characterized wrong in WikiPathways
                if name in WIKIPATHWAYS_BIOL_PROCESS:
                    bp_nodes.add(name)
                    continue

                elif node.namespace in {'WIKIDATA', 'WIKIPATHWAYS', 'REACTOME'} and name not in WIKIPATHWAYS_METAB:
                    bp_nodes.add(name)
                    continue

                # Fix naming in duplicate entity
                name = WIKIPATHWAYS_NAME_NORMALIZATION.get(name, name)

            elif database == 'reactome':
                # Curated proteins that were coded as metabolites
                if name in REACTOME_PROT:
                    gene_nodes.add(name)
                    continue

                # Flat multiple identifiers (this is not trivial because most of ChEBI names contain commas,
                # so a clever way to fix some of the entities is to check that all identifiers contain letters)
                identifiers = name.split(",")
                if len(identifiers) > 1 and all(identifier.isalpha() for identifier in identifiers):
                    # Add each individual identifier (the joined name itself is not a metabolite)
                    metabolite_nodes.update(identifiers)
                    continue

            metabolite_nodes.add(name)

        #################################
        # Biological Processes entities #
        #################################

        elif isinstance(node, BiologicalProcess):
            if name.startswith('title:'):
                name = name[6:]  # KEGG normalize

            bp_nodes.add(name)

    return gene_nodes, mirna_nodes, metabolite_nodes, bp_nodes

0 comments on commit a725346

Please sign in to comment.