In [1]:
import pandas as pd
import cobra

In [2]:
def read_metabolomics_data(file_path):
    """
    Reads metabolomics data from an Excel file and preprocesses it.

    The function reads data from the 'intracellular_metabolite' sheet of the specified Excel file,
    skips the second row, and ensures the first 9 columns are read as strings. It then filters out
    rows where the 'Metabolite' column contains 'unknown_' and renames the 'Kegg', 'ChEBI' and
    'PubChem' columns to 'kegg.compound', 'chebi' and 'pubchem'.

    Rows whose 'Metabolite' cell is empty are kept: a missing name is treated as
    "does not contain 'unknown_'" instead of raising an error.

    Parameters:
        file_path: Location of the Excel workbook; passed unchanged to ``pd.read_excel``.

    Returns:
        pd.DataFrame: A DataFrame containing the preprocessed metabolomics data. The original
        row index of the sheet is preserved (it is not reset after filtering).
    """
    raw_df = pd.read_excel(
        file_path,
        sheet_name='intracellular_metabolite',
        skiprows=[1],
        # Integer keys select columns by position: the first 9 columns are read as strings.
        dtype={col: str for col in range(9)}
    )
    # na=False makes blank 'Metabolite' cells evaluate to False; without it the mask
    # contains NaN and the `~` negation below fails. regex=False matches the literal text.
    is_unknown = raw_df['Metabolite'].str.contains('unknown_', regex=False, na=False)
    # rename() without inplace returns a new frame, so the filtered slice is never mutated.
    return raw_df.loc[~is_unknown].rename(
        columns={'Kegg': 'kegg.compound', 'ChEBI': 'chebi', 'PubChem': 'pubchem'}
    )

def create_metabolite_dataframe(model):
    """
    Create a pandas DataFrame from the metabolites in a given model.

    This function iterates over ``model.metabolites`` and builds one row per
    metabolite from its annotation mapping plus its id, name and compartment.

    Parameters:
    model (cobra.Model): A COBRA model object containing metabolites. Only
        ``model.metabolites`` and, for each metabolite, ``annotation``, ``id``,
        ``name`` and ``compartment`` are read.

    Returns:
    pd.DataFrame: A DataFrame with a fresh 0..n-1 index, object dtype, and the
    following columns:
        - 'metabolite_id': The ID of the metabolite.
        - 'name': The name of the metabolite.
        - 'compartment': The compartment where the metabolite is located.
        - One additional column per annotation key found on any metabolite
          (NaN where a metabolite lacks that annotation).
    Columns appear in order of first occurrence while iterating the metabolites.
    For a model without metabolites, an empty frame with just the three fixed
    columns is returned.
    """
    core_columns = ['metabolite_id', 'name', 'compartment']

    # Collect plain dict records and build the frame once; concatenating inside
    # the loop would re-copy the growing frame on every iteration.
    records = []
    for metabolite in model.metabolites:
        # Copy so the metabolite's own annotation mapping is never modified.
        record = dict(metabolite.annotation or {})
        # The fixed fields are written last, so they win over same-named annotation keys.
        record['metabolite_id'] = metabolite.id
        record['name'] = metabolite.name
        record['compartment'] = metabolite.compartment
        records.append(record)

    if not records:
        return pd.DataFrame(columns=core_columns, dtype=object)

    # dtype=object keeps every value exactly as stored in the model (no type inference).
    return pd.DataFrame(records, dtype=object)

In [3]:
# Load and preprocess the intracellular metabolomics sheet of the round-2 workbook.
metabolomics_path = '../../data/round2/ABF_Aniger_BMCA_2_DATA.xlsx'
metabolomics_df = read_metabolomics_data(metabolomics_path)
metabolomics_df

Unnamed: 0,Metabolite,Standardized name,Super class,Main class,Sub class,BioCyc Common-Name,kegg.compound,chebi,pubchem,SF ABF180_1_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
0,"1,2,3-butanetriol","1,2,3-Butanetriol",Organic oxygen compounds,Alcohols and polyols,"1,2-diols","1,2,3-butanetriol",,131388,20497,-2.445485,...,-1.291797,-1.375166,-1.407763,-1.226282,-0.978914,-0.899033,-1.124178,-1.190706,-1.098620,-1.546280
1,1-deoxypentitol*,1-Deoxy-D-ribitol,Carbohydrates,Monosaccharides,Sugar alcohols,,,,,0.115958,...,0.570524,0.589129,0.734742,0.105245,0.885369,0.731026,0.759539,0.572408,1.171626,0.136986
2,1-monopalmitin*,MG 16:0/0:0/0:0,Glycerolipids,Monoradylglycerols,MAG,1-palmitoyl-sn-glycerol,,75542,3084463,1.298375,...,0.203128,0.239202,0.650708,0.234493,-0.288281,-0.141926,0.141707,0.879022,1.319148,0.561944
3,"2,3-butanediol*","2,3-Butanediol",Organic oxygen compounds,Alcohols and polyols,"1,2-diols","(R,R)-2,3-butanediol",C03044,16982,225936,0.717390,...,-2.777561,,-2.683422,0.657694,-5.035999,-6.387894,-0.445764,-4.175891,-2.843080,-2.730805
4,"2,3-dihydroxy-2-methylbutanoic acid*","2,3-Dihydroxy-2-methylbutanoic acid",Fatty Acyls,Fatty acids,Hydroxy FA,,,,,-2.430510,...,-2.222345,-1.703465,-1.699862,-1.478821,-1.843763,-1.880669,-1.926263,-2.389715,-1.886921,-2.546209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,"tris-(2,4-di-t-butylphenyl) phosphite*",-,,,,,,,,0.516430,...,-0.542506,-0.621173,-0.443398,-1.234835,-2.178700,-1.066177,-0.805851,-3.160368,-4.415549,-1.927679
103,uracil,Uracil,Nucleic acids,Pyrimidines,Other pyrimidines,uracil,C00106,17568,1174,-1.337721,...,-4.334031,-3.338625,-4.120592,-4.976497,-2.937724,-2.529395,-2.754646,-3.883221,-5.316434,-4.294470
104,urea,Urea,Organic acids,Organic carbonic acids,Ureas,urea,C00086,16199,1176,-4.161275,...,-3.962153,-2.131759,-4.236807,-1.842454,-4.765326,-4.670694,-2.764875,-1.817525,-6.424332,-1.648427
105,uridine 5'-monophosphate,UMP,Nucleic acids,Pyrimidines,Pyrimidine rNMP,UMP,C00105,57865,1778309,-5.391405,...,-3.080726,-3.210002,-4.180463,-3.753425,-3.359665,-4.040750,-4.718656,-3.132637,-3.350972,-3.073482


In [4]:
# Load the model and tabulate its metabolites together with their annotations.
model = cobra.io.load_json_model(
    "../../models/iJB1325_HP.nonnative_genes.pubchem.flipped.nonzero.reduced.json"
)

metabolite_in_model_df = create_metabolite_dataframe(model)
# Keep only the digits that follow the 'CHEBI:' prefix; values that do not match become NaN.
metabolite_in_model_df['chebi'] = metabolite_in_model_df['chebi'].str.extract(r'CHEBI:(\d+)')[0]
# Cast PubChem IDs to strings for comparison with the string-typed metabolomics IDs,
# but leave missing annotations as NaN: a plain astype(str) would turn them into the
# literal text 'nan', which isna()/notna() no longer recognise as missing.
pubchem_ids = metabolite_in_model_df['pubchem']
metabolite_in_model_df['pubchem'] = pubchem_ids.astype(str).where(pubchem_ids.notna())
metabolite_in_model_df

Restricted license - for non-production use only - expires 2025-11-24


Unnamed: 0,chebi,inchi,kegg.compound,metabolite_id,name,compartment,pubchem
0,16001,"InChI=1S/C3H8O10P2/c4-2(1-12-14(6,7)8)3(5)13-1...",C00236,13PDG,"1,3-Bisphospho-D-glycerate",c,
1,17835,"InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,4...",C00631,2PG,2-Phospho-D-glycerate,c,
2,17794,"InChI=1S/C3H7O7P/c4-2(3(5)6)1-10-11(7,8)9/h2,4...",C00197,3PG,3-Phospho-D-glycerate,c,
3,15811,"InChI=1S/C3H8NO6P/c4-2(3(5)6)1-10-11(7,8)9/h2H...",C01005,3PSER,3-Phosphoserine,c,
4,15366,"InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)",C00033,AC,Acetate,c,
...,...,...,...,...,...,...,...
166,133863,,,COPROB,Coprogen B with iron bound,c,
167,17960,,C00222,3KPROP,3-Oxopropanoic acid,c,
168,133789,,,SMPYRKe,Pyranonigrin K,e,
169,,,,3hpp_c,3-Hydroxypropanoate,c,


In [5]:
# Flag the metabolomics rows that can be linked to the model. For each identifier
# column ('kegg.compound', 'chebi', 'pubchem') a row qualifies when its value is
# non-null and also occurs in the same-named column of metabolite_in_model_df.
def _shared_with_model(id_column):
    """Boolean mask over metabolomics_df rows whose `id_column` value also appears in the model table."""
    measured_ids = metabolomics_df[id_column]
    return measured_ids.notna() & measured_ids.isin(metabolite_in_model_df[id_column])


common_kegg_rows = _shared_with_model('kegg.compound')
common_chebi_rows = _shared_with_model('chebi')
common_pubchem_rows = _shared_with_model('pubchem')

print(f"The number of metabolites with common KEGG IDs: {common_kegg_rows.sum()}")
print(f"The number of metabolites with common CHEBI IDs: {common_chebi_rows.sum()}")
print(f"The number of metabolites with common PUBCHEM IDs: {common_pubchem_rows.sum()}")

# Show every row that matches on at least one of the three identifier types.
any_common_rows = common_kegg_rows | common_chebi_rows | common_pubchem_rows
metabolomics_df[any_common_rows]

The number of metabolites with common KEGG IDs: 19
The number of metabolites with common CHEBI IDs: 5
The number of metabolites with common PUBCHEM IDs: 9


Unnamed: 0,Metabolite,Standardized name,Super class,Main class,Sub class,BioCyc Common-Name,kegg.compound,chebi,pubchem,SF ABF180_1_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
18,3-phosphoglycerate,3-Phosphoglyceric acid,Organic acids,Short-chain acids,Short-chain acids,3-phospho-D-glycerate,C00197,58272,25245548,-1.105768,...,-1.757085,-4.384559,-6.43557,-4.707666,-1.49173,-1.996384,-2.461834,-3.985091,-4.124776,-3.392602
25,adenosine-5-monophosphate,AMP,Nucleic acids,Purines,Purine rNMP,AMP,C00020,456215,15938965,2.227529,...,3.669073,2.527443,1.065925,1.9729,3.136623,2.946398,2.675493,2.8571,2.721162,2.759309
27,beta-alanine,beta-Alanine,Organic acids,Amino acids and peptides,Amino acids,beta-alanine,C00099,57966,4755801,-2.726661,...,8.835588,8.375526,8.571052,8.630487,8.311451,8.368349,8.327763,8.848099,8.966264,8.678906
32,D-fructose-6-phosphate,Fructose 6-phosphate,Carbohydrates,Monosaccharides,Hexose phosphates,beta-D-fructofuranose 6-phosphate,C05345,57634,21604863,-1.545273,...,-3.653907,-4.739498,-7.489143,-4.076364,-2.801591,-3.38955,-3.300864,-4.515599,-4.670263,-3.918425
33,D-gluconic acid,Gluconic acid,Carbohydrates,Monosaccharides,Sugar acids,D-gluconate,C00257,18391,6419706,-2.608108,...,-2.305781,-1.588473,-2.046569,-1.527973,-1.401424,-1.446561,-1.275839,-1.764923,-1.317162,-1.687653
38,D-mannitol,Mannitol,Carbohydrates,Monosaccharides,Sugar alcohols,D-mannitol,C00392,16899,6251,9.933515,...,8.174625,7.459863,8.226259,8.040223,8.324906,8.558492,8.755085,8.150915,8.77825,8.555077
44,D-xylitol,Xylitol,Carbohydrates,Monosaccharides,Sugar alcohols,xylitol,C00379,17151,6912,-0.244439,...,-0.083382,-0.215809,-0.466773,-0.320738,0.056761,-0.013152,0.032887,-0.636214,-0.601203,-0.970096
47,fumaric acid,Fumaric acid,Organic acids,TCA acids,TCA acids,fumarate,C00122,29806,5460307,4.158512,...,3.803346,4.085758,3.70833,4.012434,4.269996,4.161765,4.391759,4.326294,4.203172,3.767543
51,glycerol,Glycerol,Organic oxygen compounds,Alcohols and polyols,"1,2-diols",glycerol,C00116,17754,753,3.273246,...,3.387102,2.894342,3.354089,2.484736,3.426868,3.272069,2.970941,2.97966,2.721803,3.289785
55,glycine,Glycine,Organic acids,Amino acids and peptides,Amino acids,glycine,C00037,57305,5257127,4.596392,...,3.917267,3.76248,3.712548,3.652548,4.653865,4.59801,4.594875,4.12654,4.368322,4.191531


KEGG IDs give the best coverage for the ID mapping (19 matches, versus 5 for ChEBI and 9 for PubChem).

19 metabolite species can be mapped via their KEGG IDs.

In [6]:
# Inspect the measured metabolites that carry no KEGG ID and therefore cannot be
# mapped through the 'kegg.compound' column.
missing_kegg_rows = metabolomics_df['kegg.compound'].isna()
metabolomics_df.loc[missing_kegg_rows]

Unnamed: 0,Metabolite,Standardized name,Super class,Main class,Sub class,BioCyc Common-Name,kegg.compound,chebi,pubchem,SF ABF180_1_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
0,"1,2,3-butanetriol","1,2,3-Butanetriol",Organic oxygen compounds,Alcohols and polyols,"1,2-diols","1,2,3-butanetriol",,131388.0,20497.0,-2.445485,...,-1.291797,-1.375166,-1.407763,-1.226282,-0.978914,-0.899033,-1.124178,-1.190706,-1.09862,-1.54628
1,1-deoxypentitol*,1-Deoxy-D-ribitol,Carbohydrates,Monosaccharides,Sugar alcohols,,,,,0.115958,...,0.570524,0.589129,0.734742,0.105245,0.885369,0.731026,0.759539,0.572408,1.171626,0.136986
2,1-monopalmitin*,MG 16:0/0:0/0:0,Glycerolipids,Monoradylglycerols,MAG,1-palmitoyl-sn-glycerol,,75542.0,3084463.0,1.298375,...,0.203128,0.239202,0.650708,0.234493,-0.288281,-0.141926,0.141707,0.879022,1.319148,0.561944
4,"2,3-dihydroxy-2-methylbutanoic acid*","2,3-Dihydroxy-2-methylbutanoic acid",Fatty Acyls,Fatty acids,Hydroxy FA,,,,,-2.43051,...,-2.222345,-1.703465,-1.699862,-1.478821,-1.843763,-1.880669,-1.926263,-2.389715,-1.886921,-2.546209
5,"2,5-dihydroxypyrazine*","2,5-Dihydroxypyrazine",Organoheterocyclic compounds,Pyrazines,Pyrazines,,,,,-3.711633,...,-2.070249,-2.124013,-2.350119,-2.491771,-1.862655,-1.936294,-2.06696,-2.358508,-2.083446,-2.393518
7,2-aminoadipic acid,alpha-Aminoadipic acid,Organic acids,Amino acids and peptides,Amino acids,,,,,1.463259,...,1.822324,1.889014,1.438255,1.621138,1.637175,1.516587,1.556401,1.806636,1.800055,1.534222
8,"2-deoxy-1,3,4,5-pentitol*",-,,,,,,,,-0.342584,...,2.261848,2.324753,1.990649,2.04382,2.161732,2.072054,1.966341,1.868973,1.726972,1.580129
10,2-ketoacetic acid*,-,,,,,,,,0.337246,...,1.343098,1.462846,1.128779,1.180407,2.445007,2.615303,2.807723,1.779956,1.346024,1.341392
11,2-nitroethan-1-ol*,-,,,,,,,,,...,-6.136888,-7.966338,-7.296756,-6.409848,-9.982923,-7.688856,-5.613405,-4.140861,-6.222273,-5.020563
12,"3,9-dihydro-1H-purine-2,6-dione*",-,,,,,,,,-2.810468,...,-1.788187,-1.599182,-1.679318,-1.542009,-2.467688,-2.00749,-1.772358,-1.273276,-1.33268,-1.233286


In [7]:
# Manual KEGG ID overrides, keyed by the 'Standardized name' of the measured metabolite.
# CAUTION: these assignments were curated by hand and still need to be reviewed.
kegg_annotation_dict = {
    # TODO(review): should GLC be the sum of alpha-D-Glucose and beta-D-Glucose
    # in the metabolomics measurements?
    'Glucose': 'C00267',               # GLC
    'Fructose': 'C00095',              # FRU
    'Ribose 5-phosphate': 'C00117',    # R5P
    'Glucose 6-phosphate': 'C00668',   # G6P
    'Ribulose 5-phosphate': 'C00199',  # RL5P
    'Lactic acid': 'C00256',           # LAC
    # Open question: not sure if iso-erythritol measures the E4P level in the metabolomics.
}

# Write each override into the rows whose standardized name matches; names that do
# not occur in the table leave it untouched.
standardized_names = metabolomics_df['Standardized name']
for standardized_name in kegg_annotation_dict:
    matching_rows = standardized_names == standardized_name
    metabolomics_df.loc[matching_rows, 'kegg.compound'] = kegg_annotation_dict[standardized_name]

# This one is matched on the raw 'Metabolite' label instead of the standardized name.
is_myo_inositol_phosphate = metabolomics_df['Metabolite'] == 'myo-inositol phosphate*'
metabolomics_df.loc[is_myo_inositol_phosphate, 'kegg.compound'] = 'C04006'  # MI1P

In [8]:
# Re-check the KEGG overlap with the model now that the manual overrides are in place.
measured_kegg_ids = metabolomics_df['kegg.compound']
kegg_mapped_rows = measured_kegg_ids.notna() & measured_kegg_ids.isin(metabolite_in_model_df['kegg.compound'])
metabolomics_df[kegg_mapped_rows]

Unnamed: 0,Metabolite,Standardized name,Super class,Main class,Sub class,BioCyc Common-Name,kegg.compound,chebi,pubchem,SF ABF180_1_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
18,3-phosphoglycerate,3-Phosphoglyceric acid,Organic acids,Short-chain acids,Short-chain acids,3-phospho-D-glycerate,C00197,58272.0,25245548.0,-1.105768,...,-1.757085,-4.384559,-6.43557,-4.707666,-1.49173,-1.996384,-2.461834,-3.985091,-4.124776,-3.392602
25,adenosine-5-monophosphate,AMP,Nucleic acids,Purines,Purine rNMP,AMP,C00020,456215.0,15938965.0,2.227529,...,3.669073,2.527443,1.065925,1.9729,3.136623,2.946398,2.675493,2.8571,2.721162,2.759309
27,beta-alanine,beta-Alanine,Organic acids,Amino acids and peptides,Amino acids,beta-alanine,C00099,57966.0,4755801.0,-2.726661,...,8.835588,8.375526,8.571052,8.630487,8.311451,8.368349,8.327763,8.848099,8.966264,8.678906
31,D-fructose,Fructose,Carbohydrates,Monosaccharides,Hexoses,,C00095,,,1.469726,...,0.303705,0.396132,0.075655,0.120939,0.857758,0.89275,0.74951,0.725502,0.644262,0.537598
32,D-fructose-6-phosphate,Fructose 6-phosphate,Carbohydrates,Monosaccharides,Hexose phosphates,beta-D-fructofuranose 6-phosphate,C05345,57634.0,21604863.0,-1.545273,...,-3.653907,-4.739498,-7.489143,-4.076364,-2.801591,-3.38955,-3.300864,-4.515599,-4.670263,-3.918425
33,D-gluconic acid,Gluconic acid,Carbohydrates,Monosaccharides,Sugar acids,D-gluconate,C00257,18391.0,6419706.0,-2.608108,...,-2.305781,-1.588473,-2.046569,-1.527973,-1.401424,-1.446561,-1.275839,-1.764923,-1.317162,-1.687653
35,D-glucose,Glucose,Carbohydrates,Monosaccharides,Hexoses,glucose,C00267,17234.0,,7.74603,...,7.61748,7.870766,8.593282,8.403691,7.771935,8.354817,8.238355,8.5868,8.572645,8.348746
36,D-glucose-6-phosphate,Glucose 6-phosphate,Carbohydrates,Monosaccharides,Hexose phosphates,D-glucose 6-phosphate,C00668,,,1.355529,...,-1.063286,-2.498352,-3.738051,-2.359753,0.11338,-1.05547,-1.478895,-2.012862,-2.427888,-1.644842
38,D-mannitol,Mannitol,Carbohydrates,Monosaccharides,Sugar alcohols,D-mannitol,C00392,16899.0,6251.0,9.933515,...,8.174625,7.459863,8.226259,8.040223,8.324906,8.558492,8.755085,8.150915,8.77825,8.555077
41,D-ribose-5-phosphate,Ribose 5-phosphate,Carbohydrates,Monosaccharides,Pentose phosphates,,C00117,,,-2.304639,...,1.111518,-1.389675,-2.333759,-2.002918,-0.416963,-0.793615,-1.316437,-0.532733,-1.80236,0.197834


In [9]:
shared_col = 'kegg.compound'

# Inner-join the model metabolites with the measurements on the KEGG ID, then
# discard any joined rows whose key is missing.
merged_df = (
    metabolite_in_model_df
    .merge(metabolomics_df, how='inner', on=shared_col)
    .dropna(subset=[shared_col])
)

# NOTE(review): the trailing 72 columns are presumably the per-sample measurement
# columns ('SF ABF180_<n>_R<k>') — confirm the count if the input sheet changes.
sample_columns = list(merged_df.columns[-72:])
columns_to_keep = ['kegg.compound', 'compartment'] + sample_columns

# Index by model metabolite ID and order by KEGG ID, then compartment.
filtered_merged_df = (
    merged_df
    .set_index('metabolite_id')
    .sort_values(by=[shared_col, 'compartment'])
    .loc[:, columns_to_keep]
)
filtered_merged_df

Unnamed: 0_level_0,kegg.compound,compartment,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
metabolite_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PI,C00009,c,8.451162,8.326329,7.655081,7.421478,7.549741,7.826892,8.290493,7.991878,...,7.639649,7.115088,7.388039,7.297237,7.310933,7.419702,7.420593,7.562193,8.007685,7.937854
PIm,C00009,m,8.451162,8.326329,7.655081,7.421478,7.549741,7.826892,8.290493,7.991878,...,7.639649,7.115088,7.388039,7.297237,7.310933,7.419702,7.420593,7.562193,8.007685,7.937854
AMP,C00020,c,2.227529,1.840778,-0.50687,-0.080334,1.996884,2.206182,3.440845,3.245868,...,3.669073,2.527443,1.065925,1.9729,3.136623,2.946398,2.675493,2.8571,2.721162,2.759309
AMPm,C00020,m,2.227529,1.840778,-0.50687,-0.080334,1.996884,2.206182,3.440845,3.245868,...,3.669073,2.527443,1.065925,1.9729,3.136623,2.946398,2.675493,2.8571,2.721162,2.759309
PYR,C00022,c,0.119687,0.677789,0.718626,0.658507,0.889973,0.85109,0.0,-0.13461,...,3.199094,3.002547,3.140019,2.987279,1.397883,1.558587,2.016732,2.638018,2.788837,2.498705
PYRm,C00022,m,0.119687,0.677789,0.718626,0.658507,0.889973,0.85109,0.0,-0.13461,...,3.199094,3.002547,3.140019,2.987279,1.397883,1.558587,2.016732,2.638018,2.788837,2.498705
GLU,C00025,c,6.295579,6.354702,6.274243,5.602697,5.165127,4.806137,5.353329,4.03078,...,2.550131,2.06302,1.607098,1.479614,5.297929,4.423511,3.880943,2.456019,1.962965,2.521303
GLUm,C00025,m,6.295579,6.354702,6.274243,5.602697,5.165127,4.806137,5.353329,4.03078,...,2.550131,2.06302,1.607098,1.479614,5.297929,4.423511,3.880943,2.456019,1.962965,2.521303
GLY,C00037,c,4.596392,4.639347,4.550689,4.955307,5.082095,5.298135,5.064827,4.883926,...,3.917267,3.76248,3.712548,3.652548,4.653865,4.59801,4.594875,4.12654,4.368322,4.191531
ALA,C00041,c,5.499417,5.566166,5.409542,4.908103,4.979516,5.069535,4.254179,4.285516,...,3.946308,3.468724,3.686778,3.84084,4.373924,4.394616,4.487415,2.159536,3.663575,3.31138


In [10]:
# Build the output dataset: keep only the rows whose 'compartment' is 'c', remove the
# 'kegg.compound' and 'compartment' helper columns, and write the result to a CSV file
# (the metabolite_id index is written as the first column).
compartment_c_rows = filtered_merged_df['compartment'] == 'c'
internal_metabolite_conc = filtered_merged_df.loc[compartment_c_rows].drop(
    columns=['kegg.compound', 'compartment']
)
internal_metabolite_conc.to_csv('../../data/round2/internal_metabolite_conc.csv')