In [62]:
import sys

if ".." not in sys.path:
    sys.path.append("..")

In [63]:
import pandas as pd
import numpy as np
from cobra.io import read_sbml_model

GENES1 = "../model_building/genes/Ruegeria-pomeroyi-DSS-3-representative-genome-gene-to-reaction-mapping.tsv"
GENES2 = "../model_building/genes/Ruegeria-pomeroyi-DSS-3-gene-to-reaction-mapping.tsv"
MODEL_FILE = "../model/Rpom_05.xml"

# Load gene to reaction mapping
gene_to_reaction_1 = pd.read_csv(GENES1, sep="\t")
gene_to_reaction_2 = pd.read_csv(GENES2, sep="\t")

# Load model
model = read_sbml_model(MODEL_FILE)

In [64]:
# Some genes catalyze multiple reactions. Split the Reaction column into lists, broken by " // "
gene_to_reaction_1["Reaction"] = gene_to_reaction_1["Reaction"].str.split(" // ")
gene_to_reaction_2["Reaction"] = gene_to_reaction_2["Reaction"].str.split(" // ")


gene_to_reaction_2.head()

Unnamed: 0,Gene Name,Accession-1,Left-End-Position,Right-End-Position,Product,Reaction,Left,EC-Number,Right,ARALIP,KEGG reaction,METANETX,RHEA,Common-Name
0,G1RHL-3273,,4572154,4572399,G1RHL-3273-MONOMER,,,,,,,,,
1,G1RHL-378,,557329,558357,G1RHL-378-MONOMER,,,,,,,,,
2,G1RHL-3020,,4215298,4216131,G1RHL-3020-MONOMER,,,,,,,,,
3,SPO1136,SPO1136,1191288,1192661,SPO1136-MONOMER,[R101-RXN],CPD-470 // 2-KETOGLUTARATE,EC-2.6.1.76,L-ASPARTATE-SEMIALDEHYDE // GLT,,,,,
4,G1RHL-1195,,1693553,1694386,G1RHL-1195-MONOMER,,,,,,,,,


In [65]:
from collections import Counter

for i, mapping in enumerate([gene_to_reaction_1, gene_to_reaction_2]):
    print(f"Gene to reaction mapping {i + 1} =================================================")
        
    # For each reaction in the model, check if its stem is in the gene to reaction mapping
    has_match = []
    has_no_match = []
    for reaction in model.reactions:
        reaction_stem = reaction.annotation.get("stem", reaction.id)
        
        in_mapping = any(reaction_stem in reactions
                for reactions in mapping["Reaction"]
                if isinstance(reactions, list))
        
        if in_mapping:
            has_match.append(reaction)
        else:
            has_no_match.append(reaction)

    # Count how many reactions are in the mapping
    print(len(has_match), "reactions are in the gene to reaction mapping")
    print(len(has_no_match), "reactions are not in the gene to reaction mapping")

    # Of the model reactions that are not in the model, count how many have genes in the model
    # and how many do not
    has_genes = []
    for reaction in has_no_match:
        has_genes.append(len(reaction.genes) > 0)

    print(f"\t- of missing reactions, {Counter(has_genes)[True]} have genes in the model and {Counter(has_genes)[False]} do not")
        

768 reactions are in the gene to reaction mapping
1200 reactions are not in the gene to reaction mapping
	- of missing reactions, 805 have genes in the model and 395 do not
882 reactions are in the gene to reaction mapping
1086 reactions are not in the gene to reaction mapping
	- of missing reactions, 694 have genes in the model and 392 do not


In [75]:
len(model.reactions)

1968

In [73]:
has_no_match[24]

0,1
Reaction identifier,2.7.8.23-RXN
Name,2.7.8.23-RXN
Memory address,0x7f312c4d59a0
Stoichiometry,1-CARBOXYVINYL-CARBOXYPHOSPHONATE[c] + PROTON[c] <=> 3-HYDROHYDROXYPHOSPHORYLPYRUVATE[c] + CARBON-DIOXIDE[c]  1-carboxyvinyl carboxyphosphonate + H+ <=> 3-(hydroxyphosphinoyl)pyruvate + CO2
GPR,G1G48-1501
Lower bound,-1000.0
Upper bound,1000.0


In [60]:
# Create a Venn diagram of the genes that are in one mapping or the other

# For each reaction in the model, check if its stem is in the gene to reaction mapping
reactions_in_mapping_1 = set()
reactions_in_mapping_2 = set()

# Include metabolic reactions only
metabolic_reactions = {
    reaction
    for reaction in model.reactions
    if reaction not in model.boundary
    and not reaction.id.endswith("tex")
    and not reaction.id.endswith("tpp")
    and not reaction.id.endswith("-transport")
    and not "biomass" in reaction.id.lower()
    and not reaction.id == "ATPM"
    # and not "TRANS-RXN1G48" in reaction.id
}

for i, mapping in enumerate([gene_to_reaction_1, gene_to_reaction_2]):
    has_match = []
    has_no_match = []
    for reaction in metabolic_reactions:
        reaction_stem = reaction.annotation.get("stem", reaction.id)
        
        in_mapping = any(reaction_stem in reactions
                for reactions in mapping["Reaction"]
                if isinstance(reactions, list))
        
        if in_mapping:
            [reactions_in_mapping_1, reactions_in_mapping_2][i].add(reaction)

unique_to_1 = reactions_in_mapping_1 - reactions_in_mapping_2
unique_to_2 = reactions_in_mapping_2 - reactions_in_mapping_1
shared = reactions_in_mapping_1 & reactions_in_mapping_2
absent = metabolic_reactions - reactions_in_mapping_1 - reactions_in_mapping_2

print("(1 (1&2) 2) X")
print(f"({len(unique_to_1)}({len(shared)}){len(unique_to_2)}) {len(absent)}")


(1 (1&2) 2) X
(130(638)244) 810


In [77]:
len(model.reactions.query("TRANS-RXN1G48"))

332

In [61]:
# How many of the absent reactions have genes in the model?
# How many unique stems are there?

Counter(len(reaction.genes) > 0 for reaction in absent)

absent_and_has_genes = [reaction for reaction in absent if len(reaction.genes) > 0]

print(f"{len(absent_and_has_genes)} reactions are absent from both mappings and have genes in the model")

unique_stems = set()
for reaction in absent:
    unique_stems.add(reaction.annotation.get("stem", reaction.id))
print(f"Of these, there are {len(unique_stems)} unique stems")

unique_stems

557 reactions are absent from both mappings and have genes in the model
Of these, there are 444 unique stems


{'1.1.1.178-RXN',
 '1.1.1.262-RXN',
 '1.1.1.8-RXN',
 '1.13.11.25-RXN',
 '1.2.1.25-RXN',
 '1.3.99.15-RXN',
 '1.5.1.15-RXN',
 '1.5.5.1-RXN',
 '2-METHYLACYL-COA-DEHYDROGENASE-RXN',
 '2-METHYLCITRATE-DEHYDRATASE-RXN',
 '2-METHYLCITRATE-SYNTHASE-RXN',
 '2.1.3.1-RXN',
 '2.3.1.154-RXN',
 '2.3.1.176-RXN',
 '2.5.1.32-RXN',
 '2.6.1.22-RXN',
 '2.7.8.23-RXN',
 '25-DIOXOVALERATE-DEHYDROGENASE-RXN',
 '2KETO-3METHYLVALERATE-RXN',
 '2KETO-4METHYL-PENTANOATE-DEHYDROG-RXN',
 '3-HYDROXBUTYRYL-COA-DEHYDRATASE-RXN',
 '3-ISOPROPYLMALISOM-RXN',
 '3-OCTAPRENYL-4-OHBENZOATE-DECARBOX-RXN',
 '3-OXOADIPATE-COA-TRANSFERASE-RXN',
 '3.1.2.21-RXN',
 '3.1.2.23-RXN',
 '3.5.2.18-RXN',
 '3.5.5.7-RXN',
 '3.6.3.1-RXN',
 '3.6.3.1-RXN-2',
 '3.6.3.16-RXN',
 '3.6.3.20-RXN',
 '3.6.3.4-RXN',
 '3.8.1.8-RXN',
 '4-HYDROXYBENZOATE--COA-LIGASE-RXN',
 '4.1.99.4-RXN',
 '4.2.1.41-RXN',
 '4.2.1.93-RXN',
 '4.2.1.99-RXN',
 '5-OXOPROLINASE-ATP-HYDROLYSING-RXN',
 '6-OXOHEXANOATE-OXIDATION-RXN',
 '6PFRUCTPHOS-RXN',
 'AACPS4',
 'ABC-24-RXN',
 

In [49]:
model.genes

[<Gene G1G48-883 at 0x7f313116f7a0>,
 <Gene G1G48-3028 at 0x7f313116f7d0>,
 <Gene G1G48-2752 at 0x7f313116f860>,
 <Gene G1G48-3496 at 0x7f313116f8c0>,
 <Gene G1G48-2935 at 0x7f313116f920>,
 <Gene G1G48-1374 at 0x7f313116f8f0>,
 <Gene G1G48-719 at 0x7f313116f950>,
 <Gene G1G48-974 at 0x7f313116f980>,
 <Gene G1G48-3002 at 0x7f313116f9b0>,
 <Gene G1G48-462 at 0x7f313116f9e0>,
 <Gene G1G48-116 at 0x7f313116fa10>,
 <Gene G1G48-3616 at 0x7f313116fa40>,
 <Gene G1G48-3718 at 0x7f313116fa70>,
 <Gene G1G48-2914 at 0x7f313116faa0>,
 <Gene G1G48-934 at 0x7f313116fad0>,
 <Gene G1G48-341 at 0x7f313116fb00>,
 <Gene G1G48-3799 at 0x7f313116fb30>,
 <Gene G1G48-14 at 0x7f313116fb60>,
 <Gene G1G48-1357 at 0x7f313116fb90>,
 <Gene G1G48-1106 at 0x7f313116fbc0>,
 <Gene G1G48-3706 at 0x7f313116fbf0>,
 <Gene G1G48-2912 at 0x7f313116fc20>,
 <Gene G1G48-2904 at 0x7f313116fc50>,
 <Gene G1G48-2902 at 0x7f313116fc80>,
 <Gene G1G48-2535 at 0x7f313116fcb0>,
 <Gene G1G48-2235 at 0x7f313116fce0>,
 <Gene G1G48-315 at 0

In [50]:
# Are all reactions of the form TRANS-RXN1G48-... transporters from p->c?
for rxn in model.reactions.query("TRANS-RXN1G48"):
    # assert len(rxn.metabolites) == 2, rxn
    # assert "c" in rxn.compartments
    # assert "p" in rxn.compartments
    # if len(rxn.metabolites) != 2:
    print(rxn, rxn.genes)
    


TRANS-RXN1G48-1: SULFATE[p] --> SULFATE[c] frozenset({<Gene G1G48-1994 at 0x7f313102c980>, <Gene G1G48-1827 at 0x7f3131191610>, <Gene G1G48-3107 at 0x7f313102c9b0>})
TRANS-RXN1G48-10: THR[p] --> THR[c] frozenset({<Gene G1G48-1098 at 0x7f313102d700>, <Gene G1G48-2668 at 0x7f313102d730>, <Gene G1G48-3454 at 0x7f313102d760>, <Gene G1G48-295 at 0x7f313116f890>, <Gene G1G48-156 at 0x7f313102d6a0>, <Gene G1G48-3259 at 0x7f313102d6d0>, <Gene G1G48-2025 at 0x7f313102d790>})
TRANS-RXN1G48-11-LEU//LEU.9.: LEU[p] --> LEU[c] frozenset({<Gene G1G48-1098 at 0x7f313102d700>, <Gene G1G48-2668 at 0x7f313102d730>, <Gene G1G48-3454 at 0x7f313102d760>, <Gene G1G48-2025 at 0x7f313102d790>, <Gene G1G48-156 at 0x7f313102d6a0>, <Gene G1G48-3259 at 0x7f313102d6d0>, <Gene G1G48-311 at 0x7f3130ff5ee0>})
TRANS-RXN1G48-12: HCO3[p] + NA+[p] --> HCO3[c] + NA+[c] frozenset({<Gene G1G48-3366 at 0x7f3130fc1940>})
TRANS-RXN1G48-13-3-HYDROXY-L-KYNURENINE//3-HYDROXY-L-KYNURENINE.47.: 3-HYDROXY-L-KYNURENINE[p] --> 3-HYDROX

In [55]:
[gene.name for gene in model.genes if "G1G48" not in gene.name]

['G_SPO2573',
 'G_SPO0591',
 'G_SPO3469',
 'G_SPO2995',
 'G_SPO2996',
 'G_SPO1087',
 'G_SPO0184',
 'G_SPO3041',
 'G_SPO3040',
 'G_SPO2630',
 'G_SPO2628',
 'G_SPO0676',
 'G_SPO0674',
 'G_SPO0378',
 'G_SPO0379',
 'G_SPO0595',
 'G_SPO0596',
 'G_SPO0594',
 'G_slcD',
 'G_SPO1562',
 'G_SPO0997',
 'G_SPO0330',
 'G_SPO2241',
 'G_SPO2240',
 'G_SPO2242',
 'G_SPO3790',
 'G_SPO0340',
 'G_SPO3073',
 'G_SPOA0189',
 'G_SPOA0190',
 'G_SPO3525',
 'G_SPOA0188',
 'G_SPO3076',
 'G_SPO3526',
 'G_SPO1383',
 'G_SPOA0187',
 'G_G1RHL__45__427',
 'G_G1RHL__45__200',
 'G_SPO3524',
 'G_SPO3523',
 'G_SPO0273',
 'G_SPO0271',
 'G_SPO0272']

In [None]:
# Has genes on biocyc?? (representative genome)
gene_to_reaction_2["HOMOCYSMET-RXN"]

KeyError: 'HOMOCYSMET-RXN'

In [28]:
model.reactions.get_by_id("GLUCONOKIN-RXN").genes

frozenset({<Gene G1G48-865 at 0x7f3130ff75c0>})

In [7]:
# Create a combined mapping by taking the union of the two mappings
combined_mapping = gene_to_reaction_1[["Gene Name", "Reaction"]].merge(gene_to_reaction_2[["Gene Name", "Reaction"]], how="outer", on="Gene Name", suffixes=("_1", "_2"))

# Combine the reactions
reactions_combined = []
for reactions_1, reactions_2 in zip(combined_mapping["Reaction_1"], combined_mapping["Reaction_2"]):
    if isinstance(reactions_1, list) and isinstance(reactions_2, list):
        reactions_combined.append(list(set(reactions_1 + reactions_2)))
    elif isinstance(reactions_1, list):
        reactions_combined.append(reactions_1)
    elif isinstance(reactions_2, list):
        reactions_combined.append(reactions_2)
    else:
        reactions_combined.append([])

combined_mapping["Reactions combined"] = reactions_combined
combined_mapping

Unnamed: 0,Gene Name,Reaction_1,Reaction_2,Reactions combined
0,G1RHL-1,,,[]
1,G1RHL-10,,,[]
2,G1RHL-100,,,[]
3,G1RHL-1000,,,[]
4,G1RHL-1001,,,[]
...,...,...,...,...
8805,SPO_SP23SB,,,[]
8806,SPO_SP23SC,,,[]
8807,SPO_SP5SD,,,[]
8808,SPO_SP5SE,,,[]


In [8]:
# For each reaction in the model, check if its stem is in the gene to reaction mapping
has_match = []
has_no_match = []
for reaction in model.reactions:
    reaction_stem = reaction.annotation.get("stem", reaction.id)

    in_mapping = any(reaction_stem in reactions
            for reactions in combined_mapping["Reactions combined"]
            if isinstance(reactions, list))

    if in_mapping:
        has_match.append(reaction)
    else:
        has_no_match.append(reaction)

# Count how many reactions are in the mapping
print(len(has_match), "reactions are in the gene to reaction mapping")
print(len(has_no_match), "reactions are not in the gene to reaction mapping")


1012 reactions are in the gene to reaction mapping
956 reactions are not in the gene to reaction mapping


In [9]:
# Of the reactions that are not in the mapping, count how many have non-zero flux on glucose

with model:
    # Set maintenance and glucose uptake
    model.reactions.get_by_id("EX_glc").lower_bound = -10
    model.reactions.get_by_id("ATPM").bounds = 25, 25

    sol = model.optimize()
    print(sol.objective_value)

    missing_reaction_fluxes = {}
    for reaction in has_no_match:
        if len(reaction.genes) > 0:
            missing_reaction_fluxes[reaction.id] = sol.fluxes[reaction.id]
    
print(f"Of the {len(has_no_match)} reactions that are not in the gene to reaction mapping\n" \
      "and have some associated genes, "\
      f"{len([flux for flux in missing_reaction_fluxes.values() if abs(flux) > 0])} have non-zero flux on glucose")
print()

# Print reactions with non-zero flux
for reaction, flux in missing_reaction_fluxes.items():
    if abs(flux) > 0:
        print(reaction, flux)

0.7189837856059783
Of the 956 reactions that are not in the gene to reaction mapping
and have some associated genes, 30 have non-zero flux on glucose

1.5.5.1-RXN-ETF-Reduced/UBIQUINONE-10//ETF-Oxidized/CPD-9958/PROTON.56. 0.924038549558733
2-METHYLACYL-COA-DEHYDROGENASE-RXN 6.064791611132614e-19
2KETO-3METHYLVALERATE-RXN 6.064791611132614e-19
3-HYDROXBUTYRYL-COA-DEHYDRATASE-RXN -0.6694789062808455
ADCLY-RXN 0.0004810001263071747
ERYTH4PDEHYDROG-RXN 0.00016033337543576632
HOMOCYSMET-RXN 0.09197986189716041
ORNDECARBOX-RXN 0.3074294783187237
RXN-11667 1.4853621635263388
RXN-12745 8.203105729480625e-15
RXN-14014-DELTA1-PIPERIDEINE-2-6-DICARBOXYLATE/NAD/WATER//CPD-14443/NADH/PROTON.70. 0.20554771568230945
RXN-14049 1.0265119190148647e-16
RXN-15149 -2.0619801545231825e-12
RXN-16032-PALMITYL-COA/1-PALMITOYLGLYCEROL-3-PHOSPHATE//CPD0-1422/CO-A.61. -0.10815529231323955
RXN-17018 0.0458812310395082
RXN-7968-SHIKIMATE/NAD//3-DEHYDRO-SHIKIMATE/NADH/PROTON.47. 0.4866717579364221
RXN-8960 0.815883