In [1]:
import sys
# Register the parent directory on the module search path, at most once.
# Presumably this is what lets the `experiments` package imported in a later
# cell resolve when the notebook runs from its own folder -- confirm.
if not any(entry == ".." for entry in sys.path):
    sys.path += [".."]

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from cobra.io import read_sbml_model

from experiments.fast_dFBA import setup_drawdown, MichaelisMentenBounds, dFBA

# Path to the SBML model file (relative path, resolved against the working directory).
MODEL_FILE = "../model/Rpom_05.xml"
# Identifier string for the biomass reaction; it is not referenced in the
# cells visible here -- presumably consumed further down the notebook.
BIOMASS_ID = "RPOM_provisional_biomass"

# Read the SBML model with COBRApy, then pass it to the project helper
# `setup_drawdown` (imported from experiments.fast_dFBA). The helper's return
# value is discarded, so it presumably configures `model` in place -- its
# actual effect is not visible from this notebook.
model = read_sbml_model(MODEL_FILE)
setup_drawdown(model)

In [21]:
# Read the genome-annotation sheet of the metabolic inventory workbook.
# The `usecols` predicate keeps only columns whose header does not contain
# the text "Unnamed".
gene_annotated = pd.read_excel(
    io="../data/Metabolic Inventory.xlsx",
    sheet_name="RPOM_genome_annotations",
    usecols=lambda header: "Unnamed" not in header,
)
gene_annotated

Unnamed: 0,gene_callers_id,contig,start,stop,direction,Locus_Tag,Locus_Tag (ACCESSION),COG20_PATHWAY,COG20_PATHWAY (ACCESSION),KEGG_BRITE,KEGG_BRITE (ACCESSION),COG20_CATEGORY,COG20_CATEGORY (ACCESSION),SPO_ID,SPO_ID (ACCESSION),Gene_ID,Gene_ID (ACCESSION),KEGG_Class,KEGG_Class (ACCESSION),COG20_FUNCTION,COG20_FUNCTION (ACCESSION),KOfam,KOfam (ACCESSION),KEGG_Module,KEGG_Module (ACCESSION),dna_sequence
0,1,chromosome,0,1868,f,,SPO_RS00005,tRNA modification,COG0445,Transfer RNA biogenesis>>>Eukaryotic type>>>tR...,ko03016!!!ko03016!!!ko03036,"Translation, ribosomal structure and biogenesis",J,,SPO0001,glucose inhibited division protein A,gidA,,,tRNA U34 5-carboxymethylaminomethyl modifying ...,COG0445,tRNA uridine 5-carboxymethylaminomethyl modifi...,K03495,,,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...
1,2,chromosome,1865,2479,f,,SPO_RS22355,16S rRNA modification,COG0357,Enzymes>>>2. Transferases>>>2.1 Transferring ...,ko01000!!!ko03009!!!ko03036,"Translation, ribosomal structure and biogenesis",J,,SPO0002,glucose-inhibited division protein B,gidB,,,16S rRNA G527 N7-methylase RsmG (former glucos...,COG0357,16S rRNA (guanine527-N7)-methyltransferase [EC...,K03501,,,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...
2,3,chromosome,2472,3275,f,,SPO_RS00015,,,Chromosome and associated proteins>>>Prokaryot...,ko03036!!!ko04812,"Cell cycle control, cell division, chromosome ...",D!!!N,,SPO0003,chromosome partitioning protein ParA,parA,,,ParA-like ATPase involved in chromosome/plasmi...,COG1192,chromosome partitioning protein,K03496,,,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...
3,4,chromosome,3298,4188,f,,SPO_RS00020,,,Transcription factors>>>Prokaryotic type>>>Oth...,ko03000!!!ko03036!!!ko04812,"Cell cycle control, cell division, chromosome ...",D,,SPO0004,chromosome partitioning protein parB,parB,,,"Chromosome segregation protein Spo0J, contains...",COG1475,"ParB family transcriptional regulator, chromos...",K03497,,,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...
4,5,chromosome,4178,4531,r,,SPO_RS00025,,,KEGG Orthology (KO)>>>09190 Not Included in Pa...,ko00001,Function unknown,S,,SPO0005,hypothetical protein,SPO0005,,,"Uncharacterized membrane protein YbaN, DUF454 ...",COG2832,uncharacterized protein,K09790,,,TGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCGC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,4338,megaplasmid,489040,489135,f,,,,,,,,,,SPOA0442,hypothetical protein,SPOA0442,,,,,,,,,ATGCTCGACTCTGTGGGGCGTTTTTGCTATCATGCCGCCACTTTTC...
4338,4339,megaplasmid,489175,489438,f,,SPO_RS22065,,,,,,,,SPOA0443,hypothetical protein,SPOA0443,,,,,,,,,ATGCCAGAAGACGGAATCGGTTTTGAATTCTTCAGAGACGATCAAA...
4339,4340,megaplasmid,489505,490014,r,,SPO_RS22070,Fatty acid biosynthesis,COG0764,KEGG Orthology (KO)>>>09100 Metabolism>>>09103...,ko00001!!!ko01000!!!ko01000!!!ko01004,Lipid transport and metabolism,I,,SPOA0444,3-hydroxydecanoyl-ACP dehydratase,fabA,Pathway modules; Lipid metabolism; Fatty acid ...,M00083,3-hydroxymyristoyl/3-hydroxydecanoyl-(acyl car...,COG0764,3-hydroxyacyl-[acyl-carrier protein] dehydrata...,K01716,"Fatty acid biosynthesis, elongation",M00083,TGGCCCAATACCCGAGCAGCTTTGACAAGGAAGACCTGCTGAAATG...
4340,4341,megaplasmid,490203,490622,f,,SPO_RS22075,,,Transcription factors>>>Prokaryotic type>>>Hel...,ko03000,Inorganic ion transport and metabolism,P,,SPOA0445,FUR family transcriptional regulator,SPOA0445,,,Fe2+ or Zn2+ uptake regulation protein Fur/Zur...,COG0735,"Fur family transcriptional regulator, iron res...",K09826,,,ATGACGCCAAATTCTCAGGAAATCGCCACCGATTGGCTGGTCGATG...


In [6]:
# Read the external gene-call table; its `aa_sequence` column carries the
# amino-acid sequence of each call.
gene_calls = pd.read_excel(io="../data/DSS3_external_gene_calls.xlsx")
gene_calls

Unnamed: 0,gene_callers_id,contig,start,stop,direction,partial,call_type,source,version,aa_sequence
0,SPO0001,chromosome,0,1868,f,0,1,Glimmer,2,VKHSDFDIVVIGAGHAGAEAAHAAARMGMRTALVSLSERDIGVMSC...
1,SPO0002,chromosome,1865,2479,f,0,1,Glimmer,2,MMVPDANTLNVSRETFERLKIFADLVHKWNPRINLVSKRSLEDLWT...
2,SPO0003,chromosome,2472,3275,f,0,1,Glimmer,2,VSDLSRPAGPRIIAVANQKGGVGKTTTAINLAAALVESGQRVLVVD...
3,SPO0004,chromosome,3298,4188,f,0,1,Glimmer,2,MVSNKPRGLGRGLSALMADVTQPAEAAASEAARRPDRTVPIEKLRA...
4,SPO0005,chromosome,4178,4531,r,0,1,Glimmer,2,MQFIWAALGLVCVALALIGVALPLLPTVPFLLLAAFFFARSSERLH...
...,...,...,...,...,...,...,...,...,...,...
4337,SPOA0442,megaplasmid,489040,489135,f,0,1,Glimmer,2,MLDSVGRFCYHAATFLNFVTCFGAIEDSRVF
4338,SPOA0443,megaplasmid,489175,489438,f,0,1,Glimmer,2,MPEDGIGFEFFRDDQNPRLTIVNEAALQNASNQLFNEIAHLRAFDH...
4339,SPOA0444,megaplasmid,489505,490014,r,0,1,Glimmer,2,MAQYPSSFDKEDLLKCARGELFGPGNAQLPAPPMLMMDRITEVSAD...
4340,SPOA0445,megaplasmid,490203,490622,f,0,1,Glimmer,2,MTPNSQEIATDWLVDAGLRPTRQRVALAELLVGDGRHRHVTAESLF...


In [8]:
# Read the per-gene mean-abundance / fold-change table from the "DSS-3_prot"
# sheet of the expression workbook.
# NOTE(review): the sheet name and the fold-change columns are suffixed
# "prot" while this variable is called `transcript_data` -- confirm whether
# these are proteomic rather than transcriptomic measurements.
transcript_data = pd.read_excel(
    io="../data/DSS3_MIT1002_expression_data_updated-20230915.xlsx",
    sheet_name="DSS-3_prot",
)
transcript_data

Unnamed: 0,Rank,gene_callers_id,DSS3_ac_mean_abund,DSS3_glc_mean_abund,DSS3_late_mean_abund,DSS3_early_mean_abund,DSS3_ac_v_DSS3_glc_FOLD_CHANGE_prot,DSS3_ac_v_DSS3_late_FOLD_CHANGE_prot,DSS3_ac_v_DSS3_early_FOLD_CHANGE_prot,DSS3_glc_v_DSS3_late_FOLD_CHANGE_prot,DSS3_glc_v_DSS3_early_FOLD_CHANGE_prot,DSS3_late_v_DSS3_early_FOLD_CHANGE_prot,ANOVA p-value,BH_crit,BH_valid,DSS3_ac_v_DSS3_glc_Tukey_HSD_adjusted_p,DSS3_ac_v_DSS3_late_Tukey_HSD_adjusted_p,DSS3_ac_v_DSS3_early_Tukey_HSD_adjusted_p,DSS3_glc_v_DSS3_late_Tukey_HSD_adjusted_p,DSS3_glc_v_DSS3_early_Tukey_HSD_adjusted_p,DSS3_late_v_DSS3_early_Tukey_HSD_adjusted_p,SPO_ID (ACCESSION),KOfam (ACCESSION),KEGG_Module,COG20_FUNCTION
0,1,476,0.000023,0.000040,0.000006,0.000006,-0.788498,1.999999,1.999999,2.788497,2.788497,0.000000,0.0,0.000012,True,0.0,0.0,0.0,0.0,,,SPO0476,K06162,,Alpha-D-ribose 1-methylphosphonate 5-triphosph...
1,2,231,0.000035,0.000001,0.000015,0.000004,4.696130,1.169927,3.169923,-3.526203,-1.526206,1.999996,0.0,0.000023,True,0.0,0.0,0.0,0.0,,,SPO0233,K03719,,"DNA-binding transcriptional regulator, Lrp fam..."
2,3,384,0.000011,0.000006,0.000003,0.000003,0.877139,1.999997,1.999997,1.122859,1.122859,0.000000,0.0,0.000035,True,0.0,0.0,0.0,0.0,,,SPO0384,,,Uncharacterized conserved protein containing a...
3,4,3705,0.000007,0.000014,0.000007,0.000007,-0.921748,0.000000,0.000000,0.921748,0.921748,0.000000,0.0,0.000046,True,0.0,,,0.0,,,SPO3704,K15546,,Shikimate kinase (AroK) (PDB:1E6C)!!!Archaeal ...
4,5,1365,0.000008,0.000012,0.000008,0.000008,-0.544543,0.000000,0.000000,0.544543,0.544543,0.000000,0.0,0.000058,True,0.0,,,0.0,,,SPO1363,K00796,"Tetrahydrofolate biosynthesis, GTP => THF",Dihydropteroate synthase (FolP) (PDB:1AD1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,4338,4337,,,,,,,,,,,,0.049954,False,,,,,,,SPOA0441,,,
4338,4339,4338,,,,,,,,,,,,0.049965,False,,,,,,,SPOA0442,,,
4339,4340,4339,,,,,,,,,,,,0.049977,False,,,,,,,SPOA0443,,,
4340,4341,4341,,,,0.000004,,,,,,,,0.049988,False,,,,,,,SPOA0445,K09826,,Fe2+ or Zn2+ uptake regulation protein Fur/Zur...


In [30]:
# Build one table keyed by SPO accession that combines:
#   * the genome annotations (ids, locations, functional labels, DNA sequence),
#   * rank, acetate/glucose mean abundance and extra ids/modules from the
#     expression table,
#   * the amino-acid sequence from the gene calls (whose `gene_callers_id`
#     column holds SPO accessions).
# Both joins are left joins on the index, so every annotated gene is kept and
# genes without a match get NaN in the added columns.
expression_columns = [
    "Rank",
    "gene_callers_id",
    "DSS3_ac_mean_abund",
    "DSS3_glc_mean_abund",
    "SPO_ID (ACCESSION)",
    "KOfam (ACCESSION)",
    "KEGG_Module",
    "COG20_FUNCTION",
]
expression_by_spo = (
    transcript_data
    .set_index("SPO_ID (ACCESSION)")
    .filter(items=expression_columns)
)
aa_sequence_by_spo = (
    gene_calls
    .set_index("gene_callers_id")
    .filter(items=["aa_sequence"])
)

genes = (
    gene_annotated
    .set_index("SPO_ID (ACCESSION)")
    # Columns present in both tables keep their name on the annotation side
    # and get the "_transcriptomic" suffix on the expression side.
    .join(expression_by_spo, rsuffix="_transcriptomic")
    .join(aa_sequence_by_spo, rsuffix="annotated")
)
genes

Unnamed: 0_level_0,gene_callers_id,contig,start,stop,direction,Locus_Tag,Locus_Tag (ACCESSION),COG20_PATHWAY,COG20_PATHWAY (ACCESSION),KEGG_BRITE,KEGG_BRITE (ACCESSION),COG20_CATEGORY,COG20_CATEGORY (ACCESSION),SPO_ID,Gene_ID,Gene_ID (ACCESSION),KEGG_Class,KEGG_Class (ACCESSION),COG20_FUNCTION,COG20_FUNCTION (ACCESSION),KOfam,KOfam (ACCESSION),KEGG_Module,KEGG_Module (ACCESSION),dna_sequence,Rank,gene_callers_id_transcriptomic,DSS3_ac_mean_abund,DSS3_glc_mean_abund,KOfam (ACCESSION)_transcriptomic,KEGG_Module_transcriptomic,COG20_FUNCTION_transcriptomic,aa_sequence
SPO_ID (ACCESSION),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
SPO0001,1,chromosome,0,1868,f,,SPO_RS00005,tRNA modification,COG0445,Transfer RNA biogenesis>>>Eukaryotic type>>>tR...,ko03016!!!ko03016!!!ko03036,"Translation, ribosomal structure and biogenesis",J,,glucose inhibited division protein A,gidA,,,tRNA U34 5-carboxymethylaminomethyl modifying ...,COG0445,tRNA uridine 5-carboxymethylaminomethyl modifi...,K03495,,,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...,1855,1,0.000064,0.000097,K03495,,tRNA U34 5-carboxymethylaminomethyl modifying ...,VKHSDFDIVVIGAGHAGAEAAHAAARMGMRTALVSLSERDIGVMSC...
SPO0002,2,chromosome,1865,2479,f,,SPO_RS22355,16S rRNA modification,COG0357,Enzymes>>>2. Transferases>>>2.1 Transferring ...,ko01000!!!ko03009!!!ko03036,"Translation, ribosomal structure and biogenesis",J,,glucose-inhibited division protein B,gidB,,,16S rRNA G527 N7-methylase RsmG (former glucos...,COG0357,16S rRNA (guanine527-N7)-methyltransferase [EC...,K03501,,,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...,1708,2,0.000003,0.000032,K03501,,16S rRNA G527 N7-methylase RsmG (former glucos...,MMVPDANTLNVSRETFERLKIFADLVHKWNPRINLVSKRSLEDLWT...
SPO0003,3,chromosome,2472,3275,f,,SPO_RS00015,,,Chromosome and associated proteins>>>Prokaryot...,ko03036!!!ko04812,"Cell cycle control, cell division, chromosome ...",D!!!N,,chromosome partitioning protein ParA,parA,,,ParA-like ATPase involved in chromosome/plasmi...,COG1192,chromosome partitioning protein,K03496,,,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...,1839,3,0.000366,0.000195,K03496,,ParA-like ATPase involved in chromosome/plasmi...,VSDLSRPAGPRIIAVANQKGGVGKTTTAINLAAALVESGQRVLVVD...
SPO0004,4,chromosome,3298,4188,f,,SPO_RS00020,,,Transcription factors>>>Prokaryotic type>>>Oth...,ko03000!!!ko03036!!!ko04812,"Cell cycle control, cell division, chromosome ...",D,,chromosome partitioning protein parB,parB,,,"Chromosome segregation protein Spo0J, contains...",COG1475,"ParB family transcriptional regulator, chromos...",K03497,,,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...,1852,4,0.002236,0.002169,K03497,,"Chromosome segregation protein Spo0J, contains...",MVSNKPRGLGRGLSALMADVTQPAEAAASEAARRPDRTVPIEKLRA...
SPO0005,5,chromosome,4178,4531,r,,SPO_RS00025,,,KEGG Orthology (KO)>>>09190 Not Included in Pa...,ko00001,Function unknown,S,,hypothetical protein,SPO0005,,,"Uncharacterized membrane protein YbaN, DUF454 ...",COG2832,uncharacterized protein,K09790,,,TGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCGC...,1995,5,,,K09790,,"Uncharacterized membrane protein YbaN, DUF454 ...",MQFIWAALGLVCVALALIGVALPLLPTVPFLLLAAFFFARSSERLH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SPOA0442,4338,megaplasmid,489040,489135,f,,,,,,,,,,hypothetical protein,SPOA0442,,,,,,,,,ATGCTCGACTCTGTGGGGCGTTTTTGCTATCATGCCGCCACTTTTC...,4339,4338,,,,,,MLDSVGRFCYHAATFLNFVTCFGAIEDSRVF
SPOA0443,4339,megaplasmid,489175,489438,f,,SPO_RS22065,,,,,,,,hypothetical protein,SPOA0443,,,,,,,,,ATGCCAGAAGACGGAATCGGTTTTGAATTCTTCAGAGACGATCAAA...,4340,4339,,,,,,MPEDGIGFEFFRDDQNPRLTIVNEAALQNASNQLFNEIAHLRAFDH...
SPOA0444,4340,megaplasmid,489505,490014,r,,SPO_RS22070,Fatty acid biosynthesis,COG0764,KEGG Orthology (KO)>>>09100 Metabolism>>>09103...,ko00001!!!ko01000!!!ko01000!!!ko01004,Lipid transport and metabolism,I,,3-hydroxydecanoyl-ACP dehydratase,fabA,Pathway modules; Lipid metabolism; Fatty acid ...,M00083,3-hydroxymyristoyl/3-hydroxydecanoyl-(acyl car...,COG0764,3-hydroxyacyl-[acyl-carrier protein] dehydrata...,K01716,"Fatty acid biosynthesis, elongation",M00083,TGGCCCAATACCCGAGCAGCTTTGACAAGGAAGACCTGCTGAAATG...,232,4340,0.000437,0.000089,K01716,"Fatty acid biosynthesis, elongation",3-hydroxymyristoyl/3-hydroxydecanoyl-(acyl car...,MAQYPSSFDKEDLLKCARGELFGPGNAQLPAPPMLMMDRITEVSAD...
SPOA0445,4341,megaplasmid,490203,490622,f,,SPO_RS22075,,,Transcription factors>>>Prokaryotic type>>>Hel...,ko03000,Inorganic ion transport and metabolism,P,,FUR family transcriptional regulator,SPOA0445,,,Fe2+ or Zn2+ uptake regulation protein Fur/Zur...,COG0735,"Fur family transcriptional regulator, iron res...",K09826,,,ATGACGCCAAATTCTCAGGAAATCGCCACCGATTGGCTGGTCGATG...,4341,4341,,,K09826,,Fe2+ or Zn2+ uptake regulation protein Fur/Zur...,MTPNSQEIATDWLVDAGLRPTRQRVALAELLVGDGRHRHVTAESLF...


In [33]:
# Share of the summed glucose-condition mean abundance carried by each gene
# (missing abundances are skipped in the sum and stay NaN in the result).
# NOTE(review): the output saved with this cell appears to be a single scalar,
# which a Series divided by a scalar cannot produce -- it looks stale; re-run.
genes["DSS3_glc_mean_abund"].div(genes["DSS3_glc_mean_abund"].sum(skipna=True))

7.04462639544399

In [68]:
from collections import Counter

def seq_to_vec(seq, aas="GHFPWCIYMKLRATEVDSQN"):
    """Count how often each amino acid occurs in a sequence.

    Parameters
    ----------
    seq : str-like, or a missing value
        The sequence. It is converted with ``str`` before counting, so any
        object with a meaningful string form is accepted. ``None``, float
        NaN and ``pd.NA`` are treated as an empty sequence.
    aas : str, optional
        One-letter codes to count; the output follows this order. Defaults
        to the 20-letter ordering this notebook has always used.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(aas)``; element ``i`` is the number of
        occurrences of ``aas[i]`` in ``seq``. Characters that are not in
        ``aas`` are ignored. Matching is case-sensitive.
    """
    # Missing values must not be stringified: "None" would be counted as one
    # "N" and "<NA>" as one "N" plus one "A". (NaN only produced zeros before
    # because "nan" happens to be lowercase.)
    is_missing = (
        seq is None
        or seq is pd.NA
        or (isinstance(seq, float) and np.isnan(seq))
    )
    if is_missing:
        return np.zeros(len(aas), dtype=int)

    counts = Counter(str(seq))
    return np.array([counts[aa] for aa in aas], dtype=int)

# Per-gene residue counts, stored as one count vector per row.
genes["aa_counts"] = genes["aa_sequence"].apply(seq_to_vec)

# Each gene's share of the summed glucose-condition mean abundance.
glc_share = genes["DSS3_glc_mean_abund"] / genes["DSS3_glc_mean_abund"].sum(skipna=True)

# Scale every count vector by its gene's share and stack the results into a
# (genes x amino acids) matrix. Genes with a missing abundance become all-NaN
# rows, which `nansum` ignores when summing over genes.
weighted_counts = np.stack((genes["aa_counts"] * glc_share).values)
aa_contributions_glc = np.nansum(weighted_counts, axis=0)
aa_contributions_glc

array([49.43476566,  8.65709876, 17.86061784, 22.51310572,  4.54624429,
        3.82884719, 28.95856145, 12.73373764, 15.9532848 , 25.62622435,
       41.36892365, 30.36187711, 58.52801903, 29.37543013, 38.25807863,
       41.27723622, 35.21251956, 25.90224378, 17.6484405 , 18.36501644])