In [288]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import json

pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

s = requests.Session() # create session
# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'cellulararchitect@protonmail.com', 'password':'Cellman0451'})


<Response [200]>

In [2]:
# get a set of all monomers with an associated uniprot id
proteins_df = pd.read_csv('reconstruction/ecoli/flat/proteins.tsv', sep='\t').loc[:, ["id", "common_name"]]

for column in ["enzyme_reaction", "cofactors", "metal_features", "other_features"]:
    proteins_df[column] = 0
    proteins_df[column] = proteins_df[column].astype(object)

proteins_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,0,0,0,0
1,1-PFK-MONOMER,1-phosphofructokinase,0,0,0,0
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,0,0,0,0
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,0,0,0,0
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",0,0,0,0
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,0,0,0,0
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,0,0,0,0
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,0,0,0,0
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,0,0,0,0


## Get relevant EcoCyc data and annotate protein monomer table

In [3]:
# example entry
protein = 'EG11333-MONOMER'
req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'

r = s.get(req_str)
if r.status_code != 200:
    print(protein, r.status_code)

o = xmltodict.parse(r.content)['ptools-xml']
pp.pprint(o)

{'@ptools-version': '27.0',
 '@xml:base': 'http://BioCyc.org/getxml?ECOLI:EG11333-MONOMER',
 'Protein': {'@ID': 'ECOLI:EG11333-MONOMER',
             '@detail': 'full',
             '@frameid': 'EG11333-MONOMER',
             '@orgid': 'ECOLI',
             'catalyzes': {'Enzymatic-Reaction': {'@ID': 'ECOLI:ENZRXN0-7774',
                                                  '@detail': 'low',
                                                  '@frameid': 'ENZRXN0-7774',
                                                  '@orgid': 'ECOLI',
                                                  'common-name': {'#text': '2-octaprenylphenol '
                                                                           '6-hydroxylase',
                                                                  '@datatype': 'string'},
                                                  'enzyme': {'Protein': {'@frameid': 'EG11333-MONOMER',
                                                                         '@org

In [7]:
for i in range(len(proteins_df.index)):
    if i % 100 == 0:
        print(i)

    protein = proteins_df.loc[i, 'id']
    proteins_df.at[i, 'other_features'] = set()
    proteins_df.at[i, 'metal_features'] = set()
    proteins_df.at[i, 'enzyme_reaction'] = set()
    proteins_df.at[i, 'cofactors'] = set()

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'



    r = s.get(req_str)
    if r.status_code != 200:
        print(protein, r.status_code)
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    metal_set = set()
    other_feature_set = set()
    if 'Protein' in o and 'has-feature' in o['Protein']:
        features = o['Protein']['has-feature']

        if type(features) is dict:
            features = [features]

        for feature in features:
            if 'parent' not in feature['Feature']:
                continue

            category = feature['Feature']['parent']['Feature']['@frameid']
            if category == 'Metal-Binding-Sites' and 'comment' in feature['Feature']:

                # Detect match to any of the allowed metal names and allowed cofactor names and add to list
                comment = feature['Feature']['comment']['#text']
                metal_set.add(comment)

                proteins_df.at[i, 'metal_features'] = list(metal_set)

            if category == 'Nucleotide-Phosphate-Binding-Regions' and 'attached-group' in feature['Feature'] and 'Compound' in feature['Feature']['attached-group']:
                attached_group = feature['Feature']['attached-group']['Compound']['@frameid']
                other_feature_set.add(attached_group)

            if category == 'N6-pyridoxal-phosphate-Lys-Modifications':
                other_feature_set.add('PYRIDOXAL_PHOSPHATE')

            if category == 'Protein-Segments' and 'comment' in feature['Feature'] and 'Thiamine' in feature['Feature']['comment']['#text']:
                other_feature_set.add('THIAMINE-PYROPHOSPHATE')

            if category == 'Selenocysteine-site':
                metal_set.add('L-SELENOCYSTEINE')

            proteins_df.at[i, 'metal_features'] = list(metal_set)
            proteins_df.at[i, 'other_features'] = list(other_feature_set)

        if 'catalyzes' in o['Protein']:
            oc = o['Protein']['catalyzes']['Enzymatic-Reaction']

            if type(oc) is dict:
                oc = [oc]

            cofactor_set = set()
            enz_rxn_set = set()

            for enzrxn in oc:
                enz_id = enzrxn['@frameid']

                enz_rxn_set.add(enz_id)

                enz_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{enz_id}&detail=high'

                rz = s.get(enz_req_str)
                oe = xmltodict.parse(rz.content)['ptools-xml']['Enzymatic-Reaction']

                if "cofactor" in oe:
                    oe = oe['cofactor']

                    if type(oe) is dict:
                        oe = [oe]

                    for cofactor in oe:
                        cof = cofactor['Compound']['@frameid']
                        cofactor_set.add(cof)

            proteins_df.at[i, 'enzyme_reaction'] = enz_rxn_set
            proteins_df.at[i, 'cofactors'] = cofactor_set




0
100
200
300
400
500
600
700
800
900
1000
1100
EG11708-MONOMER 404
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400


In [8]:
proteins_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"{ENZRXN0-7992, ENZRXN0-7993, ENZRXN0-8629, ENZ...",{},[],[]
1,1-PFK-MONOMER,1-phosphofructokinase,{},{},[],[ATP]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[],[NADP]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{ENZRXN0-6250},{},[],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},[],[]
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{},[],[ATP]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{},[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{},{},"[UniProt: Zinc., implicated in a second zinc b...",[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{},[],[]


In [9]:
proteins_df.to_parquet('notebooks/fbagd/data/raw_protein_features.parquet', index=False)

## Reload protein feature table

In [16]:
proteins_df = pd.read_parquet('notebooks/fbagd/data/raw_protein_features.parquet')
# proteins_df['metal_features'] = proteins_df['metal_features'].apply(ast.literal_eval)
# # for rows of proteins with where other_features is set, convert from string to set with literal_eval
# proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'] = \
#     proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'].apply(ast.literal_eval)

filter_protein_df = proteins_df.copy().loc[:, ['id', 'common_name', 'metal_features', 'other_features', 'enzyme_reaction', 'cofactors']]
filter_protein_df

Unnamed: 0,id,common_name,metal_features,other_features,enzyme_reaction,cofactors
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[],"[ENZRXN0-7992, ENZRXN0-7993, ENZRXN0-8629, ENZ...",[]
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP],[],[]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP],[2-DEHYDROPANTOATE-REDUCT-ENZRXN],[]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[],[ENZRXN0-6250],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[],"[ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",[]
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP],[],[]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[],[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[UniProt: Zinc., implicated in a second zinc b...",[],[],[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[],[],[]


## Process raw EcoCyc annotations into standard EcoCyc names

In [17]:
ALLOWED_METAL_NAMES =   {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'CPD-8123', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                        'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                        'Iron-sulfur \(4Fe-4S\)': 'CPD-7', 'Iron-sulfur \(2Fe-2S\)': 'CPD-6',
                         'Iron-sulfur \(4Fe-4S-S-AdoMet\)': 'CPD-7', 'Iron-sulfur \(3Fe-4S\)': '3FE-4S', 'Iron-oxo-sulfur \(4Fe-2O-2S\)': 'CPD-7',
                        'heme': 'Heme-b', 'Molybdate': 'CPD-3', 'heme B': 'Heme-b',
                         'L-SELENOCYSTEINE': 'L-SELENOCYSTEINE',
                        'Divalent metal cation': 'Any+2'}

# remove all \ characters from keys in ALLOWED_METAL_NAMES
NON_REGEX_METAL = {key.replace('\\', ''): value for key, value in ALLOWED_METAL_NAMES.items()}

filter_protein_df['metal_features_processed'] = 0
filter_protein_df['metal_features_processed'] = filter_protein_df['metal_features_processed'].astype(object)

metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(f'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(filter_protein_df.index)):

    metal_binding = filter_protein_df.loc[i, 'metal_features']

    metal_count_dict = {}
    existing_matches = set()

    for feature in metal_binding:
        matches = metal_regex.search(feature)
        if matches:
            metal = matches.group(0)[:-1]

            # eliminate duplicates
            if metal not in existing_matches:

                existing_matches.add(metal)

                if 'heme' in feature:
                    metal = metal.replace('Iron', 'heme')

                # check if last char of metal is a number, then crop
                if metal[-1].isdigit():
                    metal = metal[:-2]

                metal = metal.strip()

                # replace metal name with allowed metal name
                metal = NON_REGEX_METAL[metal]

                if metal in metal_count_dict:
                    metal_count_dict[metal] += 1
                else:
                    metal_count_dict[metal] = 1

        else:
            print(f'No match for {feature} in {filter_protein_df.loc[i, "id"]}')




    filter_protein_df.at[i, 'metal_features_processed'] = metal_count_dict

No match for UniProt: Magnesium or manganese. in 3-ISOPROPYLMALDEHYDROG-MONOMER
No match for conserved, Fe(III) binding motif in BASS-MONOMER
No match for predicted heme d ligand in CYDA-MONOMER
No match for UniProt: Zn(2+); catalytic. in CYTDEAM-MONOMER
No match for UniProt: Fe(2+); catalytic. in CYTDEAM-MONOMER
No match for The amino-terminus of ClpA contains a Zinc binding site. in EG10156-MONOMER
No match for The active-site magnesium ion is coordinated by three aspartate residues (401, 403, 555). Two of them form part of 
the PDXD active-site motif. in EG10238-MONOMER
No match for Based on crystal structures, Glu-265 and Asp-309 coordinate a divalent cation. in EG10239-MONOMER
No match for Divalent magnesium ions are chelated by three aspartate residues, two in the conserved DPD sequence (345, 347) 
and one in the conserved EGYMD sequence (269). in EG10239-MONOMER
No match for These residues are thought to coordinate the one or two divalent magnesium ions required for the 
gyrase 

In [18]:
filter_protein_df = filter_protein_df.drop(columns=['metal_features'])
filter_protein_df

Unnamed: 0,id,common_name,other_features,enzyme_reaction,cofactors,metal_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],"[ENZRXN0-7992, ENZRXN0-7993, ENZRXN0-8629, ENZ...",[],{}
1,1-PFK-MONOMER,1-phosphofructokinase,[ATP],[],[],{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[NADP],[2-DEHYDROPANTOATE-REDUCT-ENZRXN],[],{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[ENZRXN0-6250],[],{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],"[ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",[],{}
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[ATP],[],[],{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[],[],{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,[],[],[],{'ZN+2': 1}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[],[],{}


In [19]:
filter_protein_df['other_features_processed'] = 0
filter_protein_df['other_features_processed'] = filter_protein_df['other_features_processed'].astype(object)

ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD'}

for i in range(len(filter_protein_df.index)):

    other_features = filter_protein_df.loc[i, 'other_features']

    other_feature_count_dict = {}
    existing_matches = set()

    for feature in other_features:

        # eliminate duplicates
        if feature not in existing_matches:

            existing_matches.add(feature)

            if feature in ACCEPTED_OTHER_FEATURES:
                if feature in other_feature_count_dict:
                    other_feature_count_dict[feature] += 1
                else:
                    other_feature_count_dict[feature] = 1

    filter_protein_df.at[i, 'other_features_processed'] = other_feature_count_dict

In [20]:
filter_protein_df = filter_protein_df.drop(columns=['other_features'])

filter_protein_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"[ENZRXN0-7992, ENZRXN0-7993, ENZRXN0-8629, ENZ...",[],{},{}
1,1-PFK-MONOMER,1-phosphofructokinase,[],[],{},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[2-DEHYDROPANTOATE-REDUCT-ENZRXN],[],{},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[ENZRXN0-6250],[],{},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","[ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",[],{},{}
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[],{},{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[],{},{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,[],[],{'ZN+2': 1},{}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[],{},{}


In [21]:
filter_protein_df.to_parquet('notebooks/fbagd/data/processed_proteins.parquet', index=False)

# Annotate complexation with EcoCyc data

In [22]:
complex_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t').loc[:, ['id', 'stoichiometry']]


removed_complexes = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions_removed.tsv', sep='\t')

# remove rows where id starts with '#'
complex_df = complex_df[~complex_df['id'].str.startswith('#')].reset_index(drop=True)

# remove rows of complex_df where id matches an id in removed_complexes
complex_df = complex_df[~complex_df['id'].isin(removed_complexes['id'])].reset_index(drop=True)
complex_df.stoichiometry = complex_df.stoichiometry.astype(object)


for i, stoich in enumerate(complex_df.loc[:, 'stoichiometry']):

    if type(stoich) is str and stoich[0] == '{':
        stoich = stoich.replace('null', '-1')
        stoich = ast.literal_eval(stoich)

        complex_df.at[i, 'stoichiometry'] = stoich

    else:
        complex_df.at[i, 'stoichiometry'] = {}


# for each row, find dict entry with positive value
for i in range(len(complex_df.index)):

    stoich = complex_df.loc[i, 'stoichiometry']

    for k,v in stoich.items():
        if v > 0:
            complex_df.at[i, 'id'] = k

complex_df

Unnamed: 0,id,stoichiometry
0,1-PFK,"{'1-PFK': 1, '1-PFK-MONOMER': -2}"
1,2OXOGLUTARATEDEH-CPLX,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY..."
3,3-ISOPROPYLMALISOM-CPLX,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':..."
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3..."
...,...,...
1063,CPLX0-8053,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}"
1064,CPLX0-8253,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON..."
1065,SRP-CPLX,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN..."
1066,CPLX0-7796APO,"{'CPLX0-7796APO': 1, 'PD04032': -2}"


In [23]:
complex_df["cofactors"] = 0
complex_df["cofactors"] = complex_df["cofactors"].astype(object)

complex_df["enzyme_reaction"] = 0
complex_df["enzyme_reaction"] = complex_df["enzyme_reaction"].astype(object)

for i in range(len(complex_df.index)):

    if i % 100 == 0:
        print(i)

    complex = complex_df.loc[i, 'id']

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{complex}&detail=low'

    r = s.get(req_str)
    if r.status_code != 200:
        print(complex, r.status_code)
        complex_df.at[i, 'enzyme_reaction'] = set()
        complex_df.at[i, 'cofactors'] = set()
        continue

    o = xmltodict.parse(r.content)['ptools-xml']['Protein']


    # if enzyme
    if 'catalyzes' in o:
        o = o['catalyzes']['Enzymatic-Reaction']

        if type(o) is dict:
            o = [o]

        cofactor_set = set()
        enz_rxn_set = set()

        for enzrxn in o:
            enz_id = enzrxn['@frameid']

            enz_rxn_set.add(enz_id)

            enz_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{enz_id}&detail=high'

            rz = s.get(enz_req_str)
            oe = xmltodict.parse(rz.content)['ptools-xml']['Enzymatic-Reaction']

            if "cofactor" in oe:
                oe = oe['cofactor']

                if type(oe) is dict:
                    oe = [oe]

                for cofactor in oe:
                    cof = cofactor['Compound']['@frameid']
                    cofactor_set.add(cof)

        complex_df.at[i, 'enzyme_reaction'] = enz_rxn_set
        complex_df.at[i, 'cofactors'] = cofactor_set

    else:
        complex_df.at[i, 'enzyme_reaction'] = set()
        complex_df.at[i, 'cofactors'] = set()

0
100
200
CPLX0-2423 404
300
CPLX0-3976 404
400
500
600
700
800
900
RECFOR-CPLX 404
1000
CPLX0-7796APO 404


In [24]:
# save complex_df to parquet in a way that preserves dicts


In [25]:
for col in ['enzyme_reaction', 'cofactors']:
    complex_df[col] = complex_df[col].apply(lambda x: list(x))

complex_df.to_parquet('notebooks/fbagd/data/raw_complexes.parquet', index=False, engine='fastparquet')
complex_df

Unnamed: 0,id,stoichiometry,cofactors,enzyme_reaction
0,1-PFK,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",[MG+2],[1PFRUCTPHOSPHN-ENZRXN]
1,2OXOGLUTARATEDEH-CPLX,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...","[FAD, LIPOIC-ACID, MG+2, THIAMINE-PYROPHOSPHATE]",[2OXOGLUTARATEDEH-ENZRXN]
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...","[MN+2, MG+2]",[3-ISOPROPYLMALDEHYDROG-ENZRXN]
3,3-ISOPROPYLMALISOM-CPLX,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...",[CPD-7],[3-ISOPROPYLMALISOM-ENZRXN]
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",[MG+2],[3-METHYL-2-OXOBUT-OHCH3XFER-ENZRXN]
...,...,...,...,...
1063,CPLX0-8053,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",[],[]
1064,CPLX0-8253,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",[],[]
1065,SRP-CPLX,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN...",[],[]
1066,CPLX0-7796APO,"{'CPLX0-7796APO': 1, 'PD04032': -2}",[],[]


## Create new column for monomer component stoichiometry

In [26]:
test_complex_df = pd.read_parquet('notebooks/fbagd/data/raw_complexes.parquet')
test_complex_df

# convert cofactors and enzyme_reaction to str
for col in ['stoichiometry', 'enzyme_reaction', 'cofactors']:
    test_complex_df[col] = test_complex_df[col].astype(str)

    # interpret as str literal
    test_complex_df[col] = test_complex_df[col].apply(lambda x: ast.literal_eval(x))

test_complex_df

complex_names = complex_df['id'].tolist()
monomer_names = filter_protein_df['id'].tolist()

In [149]:
def recursive_component_tree(current_component_name, complex_table, protein_table,
                             current_multiplier=1, component_list=None, parent=None):
    """
    Recursively find all downstream components of a given complex.
    """

    complex_names = complex_table['id'].tolist()
    monomer_names = protein_table['id'].tolist()


    my_children = {}

    if component_list is None:
        component_list = []


    if current_component_name in complex_names:


        cplx_idx = complex_table.index[complex_table['id'] == current_component_name][0]
        stoichiometry = complex_table.at[cplx_idx, 'stoichiometry']

        for component_name, coefficient in stoichiometry.items():

            if coefficient < 0 and component_name != current_component_name:

                child_multiplier = abs(coefficient) * current_multiplier

                new_child = recursive_component_tree(component_name, complex_table, protein_table, child_multiplier, component_list, current_component_name)
                my_children = my_children | new_child


            elif coefficient > 0 and component_name == current_component_name:
                continue

            else:
                raise ValueError(f"key {k} and value {v} for complex {component_name} not processed properly.")

        component_list.append({'name': current_component_name,'parent': parent, 'children': list(my_children.keys()),
                               'multiplier': current_multiplier, })


    elif current_component_name in monomer_names:

        # TODO check if enzrxn
        my_children = None

        component_list.append({'parent': parent, 'name': current_component_name, 'multiplier': current_multiplier, 'children': my_children})

    else:
        print(f"component {current_component_name} not found in complex or protein tables")

        return {}


    if parent is None:
        return {current_component_name: my_children}, component_list
    else:
        return {current_component_name: my_children}


In [150]:
complex_tree_structure, nodes = recursive_component_tree('CPLX0-8167', complex_df, filter_protein_df)
pp.pprint(nodes)

[{'children': None,
  'multiplier': 4,
  'name': 'HYAA-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 4,
  'name': 'HYAB-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 2,
  'name': 'HYAC-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': ['HYAA-MONOMER', 'HYAB-MONOMER', 'HYAC-MONOMER'],
  'multiplier': 2,
  'name': 'FORMHYDROGI-CPLX',
  'parent': 'CPLX0-8167'},
 {'children': ['FORMHYDROGI-CPLX'],
  'multiplier': 1,
  'name': 'CPLX0-8167',
  'parent': None}]


In [153]:
complex_df['monomer_component_stoichiometry'] = 0
complex_df['monomer_component_stoichiometry'] = complex_df['monomer_component_stoichiometry'].astype(object)

for i in range(len(complex_df.index)):
    complex_name = complex_df.loc[i, 'id']
    complex_tree_structure, nodes = recursive_component_tree(complex_name, complex_df, filter_protein_df)

    monomer_components = {node['name']: node['multiplier'] for node in nodes if node['children'] is None}

    complex_df.at[i, 'monomer_component_stoichiometry'] = monomer_components

component CPLX0-7701 not found in complex or protein tables
component CPLX0-7677 not found in complex or protein tables
component MONOMER0-1781 not found in complex or protein tables
component CPLX0-7702 not found in complex or protein tables
component CSRB-RNA not found in complex or protein tables
component RNPB-RNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component IS061-RNA not found in complex or protein tables
component CSRC-RNA not found in complex or protein tables
component FFS-RNA not found in complex or protein tables


In [155]:
filter_complex_df = complex_df.loc[:, ["id", "monomer_component_stoichiometry", "cofactors"]]
filter_complex_df

Unnamed: 0,id,monomer_component_stoichiometry,cofactors
0,1-PFK,{'1-PFK-MONOMER': 2},[MG+2]
1,2OXOGLUTARATEDEH-CPLX,"{'E1O-MONOMER': 12, 'E2O-MONOMER': 24, 'E3-MON...","[FAD, LIPOIC-ACID, MG+2, THIAMINE-PYROPHOSPHATE]"
2,3-ISOPROPYLMALDEHYDROG-CPLX,{'3-ISOPROPYLMALDEHYDROG-MONOMER': 2},"[MN+2, MG+2]"
3,3-ISOPROPYLMALISOM-CPLX,"{'LEUC-MONOMER': 1, 'LEUD-MONOMER': 1}",[CPD-7]
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,{'3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER': 10},[MG+2]
...,...,...,...
1063,CPLX0-8053,{'EG10942-MONOMER': 1},[]
1064,CPLX0-8253,{'EG11447-MONOMER': 9},[]
1065,SRP-CPLX,{'EG10300-MONOMER': 1},[]
1066,CPLX0-7796APO,{'PD04032': 2},[]


In [156]:
filter_complex_df.to_parquet('notebooks/fbagd/data/processed_complexes.parquet', index=False)

# Get cofactor elemental composition and more

In [188]:
compound = 'CPD-6'
req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{compound}&detail=full'
r = s.get(req_str)
if r.status_code != 200:
    print(compound, r.status_code)

o = xmltodict.parse(r.content)['ptools-xml']
print(compound)
pp.pprint(o['Compound']['cml']['molecule']['atomArray']['atom'])

CPD-6
[{'@elementType': 'FE',
  '@id': 'CPD-6-atom1',
  '@x2': '-52053.0',
  '@y2': '-9036.0'},
 {'@elementType': 'FE',
  '@id': 'CPD-6-atom2',
  '@x2': '-29830.0',
  '@y2': '-8839.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom3',
  '@x2': '-25705.0',
  '@y2': '-15984.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom4',
  '@x2': '-23996.0',
  '@y2': '-3006.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom5',
  '@x2': '-56178.0',
  '@y2': '-1891.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom6',
  '@x2': '-56178.0',
  '@y2': '-16180.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom7',
  '@x2': '-40155.0',
  '@y2': '-13528.0'},
 {'@elementType': 'S',
  '@id': 'CPD-6-atom8',
  '@x2': '-40043.0',
  '@y2': '-4248.0'},
 {'@elementType': 'R',
  '@id': 'CPD-6-atom9',
  '@x2': '-64428.0',
  '@y2': '-1891.0'},
 {'@elementType': 'R',
  '@id': 'CPD-6-atom10',
  '@x2': '-17455.0',
  '@y2': '-15984.0'},
 {'@elementType': 'R',
  '@id': 'CPD-6-atom11',
  '@x2': '-15746.0',
  '@y2': '-3006.0'},
 {'@ele

In [204]:
# get set of all cofactors
all_metal_cofactors = set()
all_other_cofactors = set()

for i in range(len(filter_protein_df.index)):

    metal_cofactors = filter_protein_df.loc[i, 'metal_features_processed']
    other_cofactors = filter_protein_df.loc[i, 'other_features_processed']

    all_metal_cofactors = all_metal_cofactors | set(metal_cofactors.keys())
    all_other_cofactors = all_other_cofactors | set(other_cofactors.keys())

filter_cofactor_df = pd.DataFrame(columns=['id', 'elemental_composition'])
filter_cofactor_df['id'] = list(all_metal_cofactors | all_other_cofactors)

filter_cofactor_df['elemental_composition'] = 0
filter_cofactor_df['elemental_composition'] = filter_cofactor_df['elemental_composition'].astype(object)

# for each cofactor, get elemental composition
for i in range(len(filter_cofactor_df.index)):

    compound = filter_cofactor_df.loc[i, 'id']
    atom_dict = {}

    url_name = compound.replace('+', '%2b')

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{url_name}&detail=full'

    r = s.get(req_str)
    if r.status_code != 200:
        print(compound, r.status_code)
        filter_cofactor_df.at[i, 'elemental_composition'] = {}
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    atoms = o['Compound']['cml']['molecule']['atomArray']['atom']
    if type(atoms) is dict:
        atoms = [atoms]

    for atom in atoms:
        element = atom['@elementType']

        # either add new element or add to existing element
        if element not in atom_dict:
            atom_dict[element] = 1
        else:
            atom_dict[element] += 1

    filter_cofactor_df.at[i, 'elemental_composition'] = atom_dict

Any+2 404
heme 404


In [205]:
filter_cofactor_df

Unnamed: 0,id,elemental_composition
0,FAD,"{'C': 27, 'N': 9, 'O': 15, 'P': 2}"
1,3FE-4S,"{'FE': 3, 'S': 4}"
2,MN+2,{'MN': 1}
3,CA+2,{'CA': 1}
4,THIAMINE-PYROPHOSPHATE,"{'C': 12, 'N': 4, 'O': 7, 'P': 2, 'S': 1}"
5,Any+2,{}
6,heme,{}
7,FE+2,{'FE': 1}
8,MG+2,{'MG': 1}
9,FMN,"{'C': 17, 'N': 4, 'O': 9, 'P': 1}"


In [206]:
# save as parquet
filter_cofactor_df.to_parquet('notebooks/fbagd/data/processed_cofactors.parquet', index=False)

# For each complex, look up components and check how many are missing from protein features.

In [207]:
# filter_protein_df = pd.read_parquet('notebooks/fbagd/data/processed_proteins.parquet')
# filter_complex_df = pd.read_parquet('notebooks/fbagd/data/processed_complexes.parquet')
# filter_cofactor_df = pd.read_parquet('notebooks/fbagd/data/processed_cofactors.parquet')

In [211]:
filter_complex_df

Unnamed: 0,id,monomer_component_stoichiometry,cofactors
0,1-PFK,"{'1-PFK-MONOMER': 2.0, '2-OCTAPRENYL-METHOXY-B...",[MG+2]
1,2OXOGLUTARATEDEH-CPLX,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...","[FAD, LIPOIC-ACID, MG+2, THIAMINE-PYROPHOSPHATE]"
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...","[MN+2, MG+2]"
3,3-ISOPROPYLMALISOM-CPLX,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[CPD-7]
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[MG+2]
...,...,...,...
1063,CPLX0-8053,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[]
1064,CPLX0-8253,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[]
1065,SRP-CPLX,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[]
1066,CPLX0-7796APO,"{'1-PFK-MONOMER': None, '2-OCTAPRENYL-METHOXY-...",[]


In [None]:
 # for each complex, look up components and get cofactors for those components.
for i in range(len(filter_complex_df.index)):
    complex_id = filter_complex_df.loc[i, 'id']
    complex_components = filter_complex_df.loc[i, 'monomer_component_stoichiometry']
    complex_cofactors = filter_complex_df.loc[i, 'cofactors']

    current_combined_features = set()


    for component_name, component_count in complex_components.items():
        # get index of component in filter_protein_df
        component_index = filter_protein_df.index[filter_protein_df['id'] == component_name].tolist()[0]


        component_metals = filter_protein_df.at[component_index, 'metal_features_processed']
        component_other = filter_protein_df.at[component_index, 'other_features_processed']

        current_combined_features.update(set(component_metals.keys()))
        current_combined_features.update(set(component_other.keys()))

        # print(f'component_name: {component_name}, component_metals: {component_metals}, component_other: {component_other}, cofactors: {complex_cofactors}')

    # print difference between combined features and complex cofactors
    if len(complex_cofactors - current_combined_features) > 0:
        print(f'{complex_cofactors - current_combined_features}, {complex_id}, {current_combined_features}, {complex_cofactors}')


    filter_complex_df.at[i, 'cofactors'] = complex_cofactors

Seems neglible. Let's just use the protein features.

# Create matrices to get cofactor counts

In [224]:
filter_protein_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"[ENZRXN0-7992, ENZRXN0-7993, ENZRXN0-8629, ENZ...",[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
1,1-PFK-MONOMER,1-phosphofructokinase,[],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[2-DEHYDROPANTOATE-REDUCT-ENZRXN],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[ENZRXN0-6250],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","[ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,[],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[],"{'3FE-4S': None, 'Any+2': None, 'CA+2': None, ...","{'FAD': None, 'FMN': None, 'PYRIDOXAL_PHOSPHAT..."


In [247]:
# create protein name to index mapping
protein_name_to_index = {}
for i in range(len(filter_protein_df.index)):
    protein_name_to_index[filter_protein_df.at[i, 'id']] = i

# C matrix: complexes x proteins
C = np.zeros((len(filter_complex_df.index), len(filter_protein_df.index)))

for i in range(10): # range(len(filter_complex_df.index)):

    complex_components = filter_complex_df.loc[i, 'monomer_component_stoichiometry']

    # TODO consider cofactors
    # complex_cofactors = filter_complex_df.loc[i, 'cofactors']

    for component_name, component_count in complex_components.items():
        if component_count is not None:             # side effect of parquet
            # get index of component in filter_protein_df
            component_index = protein_name_to_index[component_name]

            C[i, component_index] = component_count

# append an identity matrix to C
C = np.concatenate((C, np.identity(len(filter_protein_df.index))), axis=0)

C_names = list(filter_complex_df['id']) + list(filter_protein_df['id'])

In [248]:
C

array([[0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [389]:
# create cofactor name to index mapping
cofactor_name_to_index = {}
for i in range(len(filter_cofactor_df.index)):
    cofactor_name_to_index[filter_cofactor_df.at[i, 'id']] = i

cofactor_names = list(filter_cofactor_df['id'])

# P matrix: proteins x cofactors
P = np.zeros((len(filter_protein_df.index), len(filter_cofactor_df.index)))

for i in range(len(filter_protein_df.index)):
    protein_metals = filter_protein_df.loc[i, 'metal_features_processed']
    protein_other = filter_protein_df.loc[i, 'other_features_processed']

    for metal, count in protein_metals.items():
        if count is not None:             # side effect of parquet
            cofactor_index = cofactor_name_to_index[metal]
            P[i, cofactor_index] = count

    for other, count in protein_other.items():
        if count is not None:             # side effect of parquet
            cofactor_index = cofactor_name_to_index[other]
            P[i, cofactor_index] = count


In [250]:
P

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [251]:
# create list of unique elements
unique_elements = set()
for i in range(len(filter_cofactor_df.index)):
    cofactor = filter_cofactor_df.at[i, 'elemental_composition']
    unique_elements.update(cofactor.keys())

unique_elements = list(unique_elements)

# create E matrix: cofactors x elements
E = np.zeros((len(filter_cofactor_df.index), len(unique_elements)))

for i in range(len(filter_cofactor_df.index)):
    cofactor = filter_cofactor_df.at[i, 'elemental_composition']

    for element, count in cofactor.items():
        if count is not None:             # side effect of parquet
            element_index = unique_elements.index(element)
            E[i, element_index] = count

In [255]:
unique_elements

['S',
 'FE',
 'CA',
 'NA',
 'MG',
 'N',
 'R',
 'K',
 'MN',
 'NI',
 'CO',
 'C',
 'MO',
 'O',
 'P',
 'CU',
 'H',
 'ZN']

In [245]:
C_to_E = C @ P @ E

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [265]:
C_names[1577]

'EG10471-MONOMER'

# Now ... add the counts >:o

In [266]:
time = '50'
date = '2023-06-09'
experiment = 'fba-redux'
entry = f'{experiment}_{time}_{date}'
folder = f'out/fbagd/{entry}/'

In [267]:
output = np.load(folder + 'output.npy',allow_pickle='TRUE').item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()
output = output['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

In [268]:
bulk

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40879,40880,40881,40882,40883,40884,40885,40886,40887,40888
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1934,1006,864,1590,2772,3232
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1936,1006,864,1590,2774,3232
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1938,1006,864,1590,2774,3234
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1940,1006,864,1590,2775,3234
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1006,864,1591,2776,3234
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1006,864,1591,2776,3235
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1007,866,1592,2776,3237
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1007,866,1594,2778,3237
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1007,866,1595,2780,3238
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1008,866,1595,2781,3239


In [286]:
f = open(folder + 'agent_steps.pkl', 'rb')
agent = dill.load(f)
f.close()

metabolism = agent['ecoli-metabolism-redux']
stoichiometry = metabolism.stoichiometry


In [319]:
initial_state = json.load(open('data/wcecoli_t0.json'))

bulk_ids = [item[0] for item in initial_state['bulk']]

bulk.columns = bulk_ids

In [344]:
ecocyc_to_wcm_map = {}

# combined complex and protein names
protein_names = list(filter_protein_df['id']) + list(filter_complex_df['id'])

for name in protein_names:

    # find complex name in bulk_ids
    found = False

    try:
        idx = bulk_ids.index(name+'[c]')
        ecocyc_to_wcm_map[name] = name+'[c]'
        found = True
        # print(f'found {complex_name} at {idx}')

    except ValueError:
        # delete key
        found = False


    if found == False:

        for id in bulk_ids:
            if name+'[' in id and bulk.loc[:, id].sum() > 0:
                #print(f'found {name} in {id} with nonzero count')
                ecocyc_to_wcm_map[name] = id
                found = True
                break           # ensures preferring nonzero counts

            elif name+'[' in id:
                # print(f'found {name} in {id} with zero count')
                ecocyc_to_wcm_map[name] = id
                found = True

    if found == False:
        ecocyc_to_wcm_map[name] = '--TRANS-ACENAPHTHENE-12-DIOL[j]' # should be none
        print(f'could not find {name}')


could not find MONOMER0-1241
could not find MONOMER0-4223


In [347]:
complex_wcm_names = [ecocyc_to_wcm_map[name] for name in C_names]

counts = bulk.loc[0, complex_wcm_names]

# Finally ... add the counts >:o

In [390]:
factored_cofactor_elements = np.array(counts).reshape(-1,1) * C @ P

In [391]:
factored_cofactor_elements

array([[  0.,   0.,   0., ...,   0.,   0.,   0.],
       [172.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [392]:
cofactor_names

['FAD',
 '3FE-4S',
 'MN+2',
 'CA+2',
 'THIAMINE-PYROPHOSPHATE',
 'Any+2',
 'heme',
 'FE+2',
 'MG+2',
 'FMN',
 'CPD-6',
 'CO+2',
 'ZN+2',
 'CPD-3',
 'K+',
 'NI+2',
 'CPD-7',
 'NA+',
 'CU+2',
 'CPD-8123',
 'PYRIDOXAL_PHOSPHATE']

In [393]:
C_names[5262]

'THRESYN-MONOMER'