In [113]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict

# Shared pretty-printer for nested API replies; structures nested deeper than six levels are elided.
pp = pprint.PrettyPrinter(depth=6)

# Work from the repository checkout so the relative data paths used by later cells resolve.
repo_root = os.path.expanduser('~/vivarium-ecoli')
os.chdir(repo_root)


In [114]:
complexation_rxn_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t')
stoich_series = complexation_rxn_df.loc[:,['id', 'stoichiometry']].dropna().reset_index(drop=True)

# Flatten every reaction into one record per participating molecule:
# [molecule id, coefficient, reaction id, <three annotation slots filled in later>].
stoich_list = []
for rxn_id, raw_stoich in zip(stoich_series['id'], stoich_series['stoichiometry']):
    # The stoichiometry text may contain the bare token `null`, which literal_eval
    # cannot parse; -1 stands in for it.
    parsed_stoich = ast.literal_eval(raw_stoich.replace('null', '-1'))
    stoich_list.extend(
        [molecule, coefficient, rxn_id, None, None, None]
        for molecule, coefficient in parsed_stoich.items()
    )

In [115]:
# One row per (molecule, reaction) pair; the last three columns start empty (None).
record_columns = ['complex', 'stoichiometry', 'reaction', 'enzyme-reaction', 'cofactors', 'metal_binding']
pre_complex_df = pd.DataFrame(stoich_list, columns=record_columns)

# Keep only molecules with a positive coefficient (presumably the complex each reaction forms).
positive_coefficient = pre_complex_df['stoichiometry'] > 0
pre_complex_df = pre_complex_df[positive_coefficient].reset_index(drop=True)
pre_complex_df

Unnamed: 0,complex,stoichiometry,reaction,enzyme-reaction,cofactors,metal_binding
0,1-PFK,1,1-PFK_RXN,,,
1,2OXOGLUTARATEDEH-CPLX,1,2OXOGLUTARATEDEH-CPLX_RXN,,,
2,3-ISOPROPYLMALDEHYDROG-CPLX,1,3-ISOPROPYLMALDEHYDROG-CPLX_RXN,,,
3,3-ISOPROPYLMALISOM-CPLX,1,3-ISOPROPYLMALISOM-CPLX_RXN,,,
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,1,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX_RXN,,,
...,...,...,...,...,...,...
1096,CPLX0-8053,1,CPLX0-8053_RXN,,,
1097,CPLX0-8253,1,CPLX0-8253_RXN,,,
1098,SRP-CPLX,1,SRP-CPLX_RXN,,,
1099,CPLX0-7796APO,1,CPLX0-7796APO_RXN,,,


## Connect to EcoCyc to go from Complex -> Cofactor + Metal sites

In [116]:
s = requests.Session() # create session

# Credentials must never live in the notebook: read them from the environment
# (export BIOCYC_EMAIL / BIOCYC_PASSWORD before starting the kernel). A missing
# variable raises KeyError here instead of sending an empty login.
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
biocyc_credentials = {
    'email': os.environ['BIOCYC_EMAIL'],
    'password': os.environ['BIOCYC_PASSWORD'],
}

# Post login credentials to session:
login_response = s.post('https://websvc.biocyc.org/credentials/login/', data=biocyc_credentials)
# Fail fast on a rejected login rather than letting every later query fail.
login_response.raise_for_status()
login_response

<Response [200]>

In [124]:
# Example query: ask the BioCyc API for the monomers of a single complex.
protein = 'CPLX0-8167'

# Alternative endpoint returning the full record for the same frame:
# req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'
req_str = f'https://websvc.biocyc.org/apixml?fn=monomers-of-protein&id=ECOLI:{protein}&detail=none'

r = s.get(req_str)
parsed_reply = xmltodict.parse(r.content)

# Everything of interest sits under the 'ptools-xml' root element.
o = parsed_reply['ptools-xml']
pp.pprint(o)

{'@ptools-version': '27.0',
 '@xml:base': 'http://BioCyc.org/apixml?fn=monomers-of-protein%26id=ECOLI:CPLX0-8167%26detail=NONE',
 'Protein': [{'@frameid': 'HYAC-MONOMER',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:HYAC-MONOMER'},
             {'@frameid': 'HYAB-MONOMER',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:HYAB-MONOMER'},
             {'@frameid': 'HYAA-MONOMER',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:HYAA-MONOMER'}],
 'metadata': {'num_results': '3',
              'query': 'fn=monomers-of-protein&id=ECOLI:CPLX0-8167&detail=NONE',
              'service_name': 'apixml',
              'url': 'http://BioCyc.org/'}}


In [27]:
def _as_list(node):
    """Return ``node`` unchanged if it is a list, otherwise wrap it in a one-element list.

    xmltodict yields a single mapping for an element that occurs once and a list
    for an element that repeats; this gives callers one shape to iterate over.
    """
    return node if isinstance(node, list) else [node]


def _fetch_frame(frame_id, detail, root_tag):
    """Fetch one ECOLI frame via the logged-in session ``s`` and return its parsed record.

    Parameters:
        frame_id: frame identifier placed in the ``id=ECOLI:...`` query field.
        detail: value of the ``detail`` query field ('low' or 'high' in this cell).
        root_tag: element expected under 'ptools-xml' (e.g. 'Protein').

    Returns:
        The parsed mapping for ``root_tag``, or None when the request does not
        return HTTP 200 or the reply has no such element. Both cases are printed
        so skipped frames stay visible in the cell output.
    """
    response = s.get(f'https://websvc.biocyc.org/getxml?id=ECOLI:{frame_id}&detail={detail}')
    if response.status_code != 200:
        print(frame_id, response.status_code)
        return None

    record = xmltodict.parse(response.content)['ptools-xml'].get(root_tag)
    if record is None:
        print(frame_id, f'no {root_tag} record')
    return record


# Annotate every complex with (a) the metal-binding-site comments of its protein
# components and (b) the cofactors of the enzymatic reactions it catalyzes.
for i in range(len(pre_complex_df.index)):

    complex_id = pre_complex_df.loc[i, 'complex']

    complex_record = _fetch_frame(complex_id, 'low', 'Protein')
    if complex_record is None:
        continue

    # don't allow non-top-level complexes that dont have enzymatic reactions.
    if 'component-of' in complex_record and 'catalyzes' not in complex_record:
        continue

    # Collect metal-binding comments across ALL components. The set is created
    # once per complex so one component's sites cannot overwrite another's.
    feature_set = set()
    for component in _as_list(complex_record.get('component', [])):

        if 'Protein' not in component:
            continue

        component_id = component['Protein']['@frameid']
        component_record = _fetch_frame(component_id, 'high', 'Protein')
        if component_record is None:
            continue

        for feature in _as_list(component_record.get('has-feature', [])):
            feature_info = feature['Feature']
            if 'parent' not in feature_info:
                continue

            category = feature_info['parent']['Feature']['@frameid']
            if category == 'Metal-Binding-Sites' and 'comment' in feature_info:
                feature_set.add(feature_info['comment']['#text'])

    # Leave the column as None when nothing was found; later cells test for None.
    if feature_set:
        pre_complex_df.at[i, 'metal_binding'] = list(feature_set)

    # if enzyme
    if 'catalyzes' in complex_record:
        cofactor_set = set()
        for enzrxn in _as_list(complex_record['catalyzes']['Enzymatic-Reaction']):
            enz_id = enzrxn['@frameid']

            enzrxn_record = _fetch_frame(enz_id, 'high', 'Enzymatic-Reaction')
            if enzrxn_record is None:
                continue

            # With several enzymatic reactions the last one fetched is recorded.
            pre_complex_df.at[i, 'enzyme-reaction'] = enz_id

            for cofactor in _as_list(enzrxn_record.get('cofactor', [])):
                cofactor_set.add(cofactor['Compound']['@frameid'])

        pre_complex_df.at[i, 'cofactors'] = cofactor_set



CPLX0-2423 404
CPLX0-3976 404
RECFOR-CPLX 404
CPLX0-7796APO 404


In [28]:
# Union of every UniProt-sourced metal-binding comment seen across all complexes.
total_features = set()

for row in range(len(pre_complex_df.index)):
    raw_features = pre_complex_df.loc[row, 'metal_binding']
    if raw_features is None:
        continue

    # only include elements that have UniProt in string
    metal_binding = [text for text in raw_features if 'UniProt' in text]

    # Store the filtered list back on the row and fold it into the global set.
    pre_complex_df.at[row, 'metal_binding'] = metal_binding
    total_features.update(metal_binding)

total_features

{'UniProt: Calcium 1.',
 'UniProt: Calcium 1; via carbonyl oxygen.',
 'UniProt: Calcium 1; via carbonyl oxygen; in dimeric form.',
 'UniProt: Calcium 1; via carbonyl oxygen; shared with dimeric partner.',
 'UniProt: Calcium 2.',
 'UniProt: Calcium 2; in dimeric form.',
 'UniProt: Calcium 2; via carbonyl oxygen.',
 'UniProt: Calcium 2; via carbonyl oxygen; in dimeric form.',
 'UniProt: Calcium 3; in monomeric form.',
 'UniProt: Calcium 3; shared with dimeric partner.',
 'UniProt: Calcium 3; via carbonyl oxygen; shared with dimeric partner.',
 'UniProt: Calcium.',
 'UniProt: Calcium; via carbonyl oxygen.',
 'UniProt: Cobalt (adenosylcob(III)alamin axial ligand).',
 'UniProt: Cobalt.',
 'UniProt: Cobalt; via tele nitrogen.',
 'UniProt: Copper.',
 'UniProt: Copper; via pros nitrogen.',
 'UniProt: Copper; via tele nitrogen.',
 'UniProt: Divalent metal cation 1.',
 'UniProt: Divalent metal cation 1; via carbamate group.',
 'UniProt: Divalent metal cation 1; via carbonyl oxygen.',
 'UniProt: 

In [29]:
# Metal names as they appear in the binding-site comments, mapped to molecule ids.
# Keys are regular-expression fragments (note the escaped parentheses), so the
# ones containing backslashes are raw strings: `\(` in a normal string literal is
# an invalid escape sequence.
ALLOWED_METAL_NAMES = {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'MO+2', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                       'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                       r'Iron-sulfur \(4Fe-4S\)': 'CPD-7', r'Iron-sulfur \(2Fe-2S\)': 'CPD-6', 'heme': 'heme', 'Molybdate': 'CPD-3'}

# Same mapping keyed by the literal (unescaped) text, used to translate a matched name.
NON_REGEX_METAL = {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'MO+2', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                       'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                       'Iron-sulfur (4Fe-4S)': 'CPD-7', 'Iron-sulfur (2Fe-2S)': 'CPD-6', 'heme': 'heme', 'Molybdate': 'CPD-3'}

# Placeholder 0 in an object column; rows with annotations receive a dict below.
pre_complex_df['metal_binding_processed'] = 0
pre_complex_df['metal_binding_processed'] = pre_complex_df['metal_binding_processed'].astype(object)

# A metal name followed by " <digit>" plus punctuation, bare punctuation, or " (".
# The trailing character is always part of the match and is stripped again below.
metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(rf'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(pre_complex_df.index)):

    metal_binding = pre_complex_df.loc[i, 'metal_binding']
    if metal_binding is None:
        continue

    metal_count_dict = {}
    existing_matches = set()

    for feature in metal_binding:
        matches = metal_regex.search(feature)
        if not matches:
            continue

        # Drop the trailing punctuation / parenthesis captured by the pattern.
        metal = matches.group(0)[:-1]

        # eliminate duplicates: "Calcium 1" and "Calcium 2" are distinct sites,
        # while a repeated "Calcium 1" is counted once.
        if metal in existing_matches:
            continue
        existing_matches.add(metal)

        if 'heme' in feature:
            metal = metal.replace('Iron', 'heme')

        # check if last char of metal is a number, then crop it and the space before it
        if metal[-1].isdigit():
            metal = metal[:-2]

        metal = metal.strip()

        # replace metal name with allowed metal name
        metal = NON_REGEX_METAL[metal]

        metal_count_dict[metal] = metal_count_dict.get(metal, 0) + 1

    pre_complex_df.at[i, 'metal_binding_processed'] = metal_count_dict

In [30]:
# Display the table after metal-binding processing.
pre_complex_df

Unnamed: 0,complex,stoichiometry,reaction,enzyme-reaction,cofactors,metal_binding,metal_binding_processed
0,1-PFK,1,1-PFK_RXN,1PFRUCTPHOSPHN-ENZRXN,[MG+2],,0
1,2OXOGLUTARATEDEH-CPLX,1,2OXOGLUTARATEDEH-CPLX_RXN,2OXOGLUTARATEDEH-ENZRXN,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",,0
2,3-ISOPROPYLMALDEHYDROG-CPLX,1,3-ISOPROPYLMALDEHYDROG-CPLX_RXN,3-ISOPROPYLMALDEHYDROG-ENZRXN,"[MG+2, MN+2]","[UniProt: Magnesium or manganese., UniProt: Ma...",{'MG+2': 1}
3,3-ISOPROPYLMALISOM-CPLX,1,3-ISOPROPYLMALISOM-CPLX_RXN,3-ISOPROPYLMALISOM-ENZRXN,[CPD-7],[UniProt: Iron-sulfur (4Fe-4S).],{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,1,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX_RXN,3-METHYL-2-OXOBUT-OHCH3XFER-ENZRXN,[MG+2],[UniProt: Magnesium.],{'MG+2': 1}
...,...,...,...,...,...,...,...
1096,CPLX0-8053,1,CPLX0-8053_RXN,,,,0
1097,CPLX0-8253,1,CPLX0-8253_RXN,,,,0
1098,SRP-CPLX,1,SRP-CPLX_RXN,,,,0
1099,CPLX0-7796APO,1,CPLX0-7796APO_RXN,,,,0


In [31]:
# Normalise the "nothing found" placeholders so both columns always hold a sized
# container: the integer 0 in metal_binding_processed becomes an empty dict and
# None in cofactors becomes an empty list. Using map keeps the cell idempotent.
pre_complex_df['metal_binding_processed'] = pre_complex_df['metal_binding_processed'].map(
    lambda counts: {} if not isinstance(counts, dict) and counts == 0 else counts
)
pre_complex_df['cofactors'] = pre_complex_df['cofactors'].map(
    lambda cofactors: [] if cofactors is None else cofactors
)

pre_complex_df.loc[:, ['complex', 'cofactors', 'metal_binding_processed']]


Unnamed: 0,complex,cofactors,metal_binding_processed
0,1-PFK,[MG+2],{}
1,2OXOGLUTARATEDEH-CPLX,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",{}
2,3-ISOPROPYLMALDEHYDROG-CPLX,"[MG+2, MN+2]",{'MG+2': 1}
3,3-ISOPROPYLMALISOM-CPLX,[CPD-7],{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,[MG+2],{'MG+2': 1}
...,...,...,...
1096,CPLX0-8053,[],{}
1097,CPLX0-8253,[],{}
1098,SRP-CPLX,[],{}
1099,CPLX0-7796APO,[],{}


In [40]:
# only include rows that have cofactors or metal binding
has_cofactors = pre_complex_df['cofactors'].map(len) > 0
has_metal_sites = pre_complex_df['metal_binding_processed'].map(len) > 0
pre_complex_df.loc[has_cofactors | has_metal_sites, ['complex', 'cofactors', 'metal_binding_processed']]

Unnamed: 0,complex,cofactors,metal_binding_processed
0,1-PFK,[MG+2],{}
1,2OXOGLUTARATEDEH-CPLX,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",{}
2,3-ISOPROPYLMALDEHYDROG-CPLX,"[MG+2, MN+2]",{'MG+2': 1}
3,3-ISOPROPYLMALISOM-CPLX,[CPD-7],{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,[MG+2],{'MG+2': 1}
...,...,...,...
1083,UPPSYN-CPLX,[MG+2],{'MG+2': 1}
1084,URACIL-PRIBOSYLTRANS-CPLX,[MG+2],{}
1085,URPHOS-CPLX,[K+],{}
1088,XYLISOM-CPLX,"[MG+2, MN+2]",{'MG+2': 2}


In [55]:
# Keep the complexes that carry at least one cofactor or one metal-binding entry,
# reduced to the three annotation columns and re-indexed from zero.
n_cofactors = pre_complex_df['cofactors'].map(len)
n_metal_sites = pre_complex_df['metal_binding_processed'].map(len)
annotated_rows = (n_cofactors > 0) | (n_metal_sites > 0)

filtered_complex_df = pre_complex_df.loc[
    annotated_rows, ['complex', 'cofactors', 'metal_binding_processed']
].reset_index(drop=True)

filtered_complex_df

Unnamed: 0,complex,cofactors,metal_binding_processed
0,1-PFK,[MG+2],{}
1,2OXOGLUTARATEDEH-CPLX,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",{}
2,3-ISOPROPYLMALDEHYDROG-CPLX,"[MG+2, MN+2]",{'MG+2': 1}
3,3-ISOPROPYLMALISOM-CPLX,[CPD-7],{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,[MG+2],{'MG+2': 1}
...,...,...,...
442,UPPSYN-CPLX,[MG+2],{'MG+2': 1}
443,URACIL-PRIBOSYLTRANS-CPLX,[MG+2],{}
444,URPHOS-CPLX,[K+],{}
445,XYLISOM-CPLX,"[MG+2, MN+2]",{'MG+2': 2}


In [57]:
# Object-typed column (placeholder 0) that receives one {molecule: count} dict per row.
filtered_complex_df['combined'] = 0
filtered_complex_df['combined'] = filtered_complex_df['combined'].astype(object)

# Merge the two annotation sources per complex. Metal-binding counts take
# precedence; cofactors not already covered by them are added with a count of 1.
for row in range(len(filtered_complex_df.index)):
    cofactor_ids = filtered_complex_df.at[row, 'cofactors']
    metal_counts = filtered_complex_df.at[row, 'metal_binding_processed']

    if not cofactor_ids and not metal_counts:
        # nothing to combine; the placeholder stays in place
        continue

    if not cofactor_ids:
        # only metal-binding data: reuse its dict unchanged
        merged = metal_counts
    elif not metal_counts:
        # only cofactors: each gets a stoichiometry of 1
        merged = {cofactor: 1 for cofactor in cofactor_ids}
    else:
        # both present: start from a copy of the metal counts, then add the cofactors they lack
        merged = metal_counts.copy()
        for cofactor in set(cofactor_ids) - set(metal_counts):
            merged[cofactor] = 1

    filtered_complex_df.at[row, 'combined'] = merged

filtered_complex_df

Unnamed: 0,complex,cofactors,metal_binding_processed,combined
0,1-PFK,[MG+2],{},{'MG+2': 1}
1,2OXOGLUTARATEDEH-CPLX,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",{},"{'LIPOIC-ACID': 1, 'THIAMINE-PYROPHOSPHATE': 1..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,"[MG+2, MN+2]",{'MG+2': 1},"{'MG+2': 1, 'MN+2': 1}"
3,3-ISOPROPYLMALISOM-CPLX,[CPD-7],{'CPD-7': 1},{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,[MG+2],{'MG+2': 1},{'MG+2': 1}
...,...,...,...,...
442,UPPSYN-CPLX,[MG+2],{'MG+2': 1},{'MG+2': 1}
443,URACIL-PRIBOSYLTRANS-CPLX,[MG+2],{},{'MG+2': 1}
444,URPHOS-CPLX,[K+],{},{'K+': 1}
445,XYLISOM-CPLX,"[MG+2, MN+2]",{'MG+2': 2},"{'MG+2': 2, 'MN+2': 1}"


In [59]:
# Save to the path that the "Annotate table with counts" cell reads back
# ('notebooks/fbagd/cofactors_new.csv', relative to the working directory set in
# the first cell); writing to the bare file name left the two cells out of sync.
filtered_complex_df.to_csv('notebooks/fbagd/cofactors_new.csv', index=False)

# Experiment with UniProt API

In [104]:
# Use UniProt api to fetch cofactor for protein P00448
import urllib.request
import json

url = 'https://www.ebi.ac.uk/proteins/api/proteins/P00448'

# Download the raw JSON payload; the connection is closed as soon as it is read.
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as f:
    response = f.read()

data = json.loads(response.decode('utf-8'))

# get cofactors: print the structured entries followed by the free-text note
for comment in data['comments']:
    if 'cofactors' not in comment:
        continue
    print(comment['cofactors'], comment['text'], sep='\n')


[{'name': 'Mn(2+)', 'dbReference': {'type': 'CHEBI', 'id': 'CHEBI:29035'}}]
[{'value': 'Binds 1 Mn(2+) ion per subunit'}]


In [100]:
# Pretty-print every comment entry of the UniProt record held in `data`.
pp.pprint(data['comments'])

[{'text': [{'value': 'Destroys superoxide anion radicals which are normally '
                     'produced within the cells and which are toxic to '
                     'biological systems'}],
  'type': 'FUNCTION'},
 {'reaction': {'dbReferences': [{'id': 'RHEA:20696', 'type': 'Rhea'},
                                {'id': 'CHEBI:15378', 'type': 'ChEBI'},
                                {'id': 'CHEBI:15379', 'type': 'ChEBI'},
                                {'id': 'CHEBI:16240', 'type': 'ChEBI'},
                                {'id': 'CHEBI:18421', 'type': 'ChEBI'}],
               'ecNumber': '1.15.1.1',
               'name': '2 H(+) + 2 superoxide = H2O2 + O2'},
  'type': 'CATALYTIC_ACTIVITY'},
 {'cofactors': [{'dbReference': {'id': 'CHEBI:29035', 'type': 'CHEBI'},
                 'name': 'Mn(2+)'}],
  'text': [{'value': 'Binds 1 Mn(2+) ion per subunit'}],
  'type': 'COFACTOR'},
 {'text': [{'value': 'Homodimer'}], 'type': 'SUBUNIT'},
 {'text': [{'evidences': [{'code': 'ECO:0000

# Annotate table with counts

In [62]:
# Reload the saved annotation table, keeping only the complex id and its combined
# {molecule: count} mapping. Missing cells become the text 'None' so that every
# entry of `combined` can be parsed back from its string form.
cofactor_table = pd.read_csv('notebooks/fbagd/cofactors_new.csv', sep=',')
complex_df = cofactor_table.fillna('None').loc[:, ['complex', 'combined']]
complex_df['combined'] = complex_df['combined'].apply(ast.literal_eval)

complex_df

Unnamed: 0,complex,combined
0,1-PFK,{'MG+2': 1}
1,2OXOGLUTARATEDEH-CPLX,"{'LIPOIC-ACID': 1, 'THIAMINE-PYROPHOSPHATE': 1..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'MG+2': 1, 'MN+2': 1}"
3,3-ISOPROPYLMALISOM-CPLX,{'CPD-7': 1}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,{'MG+2': 1}
...,...,...
442,UPPSYN-CPLX,{'MG+2': 1}
443,URACIL-PRIBOSYLTRANS-CPLX,{'MG+2': 1}
444,URPHOS-CPLX,{'K+': 1}
445,XYLISOM-CPLX,"{'MG+2': 2, 'MN+2': 1}"


In [63]:
# Identify the simulation run to analyse; the three parts are joined into the
# name of the run's output directory.
experiment = 'fba-redux'
time = '50'
date = '2023-06-13'

entry = f'{experiment}_{time}_{date}'
folder = f'out/fbagd/{entry}/'

In [64]:
# Load the saved simulation output. The file stores a pickled dict inside a 0-d
# object array, hence allow_pickle=True (a real boolean, not the string 'TRUE')
# and .item() to unwrap it.
# NOTE: allow_pickle can execute arbitrary code; only load files you produced yourself.
output = np.load(folder + 'output.npy', allow_pickle=True).item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()

# Keep only agent '0' and pull out the two listeners used below.
output = output['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']


In [65]:
# Load the pickled agent steps; the context manager closes the file even if
# deserialisation raises.
# NOTE: dill, like pickle, can execute arbitrary code; only load trusted files.
with open(folder + 'agent_steps.pkl', 'rb') as f:
    agent = dill.load(f)

In [66]:
# Pull the metabolism step out of the loaded agent and keep handles on the two
# attributes used below.
metabolism = agent['ecoli-metabolism-redux']
stoichiometry, bulk_ids = metabolism.stoichiometry, metabolism.bulk_ids

In [67]:
# Wrap the saved bulk data in a DataFrame whose columns are the metabolism step's bulk ids
# (rows are presumably one per saved timestep; TODO confirm).
bulk = pd.DataFrame(output['bulk'], columns=bulk_ids)

In [74]:
# Look up how one complex is spelled among the bulk columns (including its suffix).
for name in bulk.columns:
    if 'CPLX0-7533' not in name:
        continue
    print(name)

CPLX0-7533[o]


In [80]:
# Attach to every complex its bulk count from row 24 of `bulk`.
# The '[c]'-suffixed id is preferred; otherwise any bulk id that contains the
# complex name and is at most three characters longer is used (the last such
# match wins). Complexes with no match keep a missing count.
for complex_name in complex_df['complex']:
    row_mask = complex_df['complex'] == complex_name
    cytoplasm_id = complex_name + '[c]'

    if cytoplasm_id in bulk_ids:
        complex_df.loc[row_mask, 'counts'] = bulk.at[24, cytoplasm_id]
        continue

    max_length = len(complex_name) + 4
    for candidate in list(bulk_ids):
        if complex_name in candidate and len(candidate) < max_length:
            complex_df.loc[row_mask, 'counts'] = bulk.at[24, candidate]


In [85]:
# Rank complexes by count, largest first; rows whose count is missing sort to the end.
complex_df.sort_values(by='counts', ascending=False)

Unnamed: 0,complex,combined,counts
359,ISOCITHASE-CPLX,"{'MG+2': 1, 'MN+2': 1}",12315.0
427,SUPEROX-DISMUTMN-CPLX,{'MN+2': 1},10096.0
399,PHOSGLYCMUTASE,{'23-DIPHOSPHOGLYCERATE': 1},10058.0
305,ENOLASE-CPLX,{'MG+2': 1},6645.0
18,ACSERLYA-CPLX,{'PYRIDOXAL_PHOSPHATE': 1},6269.0
...,...,...,...
307,ENTMULTI-CPLX,{'MG+2': 1},
321,GCVMULTI-CPLX,{'LIPOIC-ACID': 1},
383,NQOR-CPLX,{'FMN': 1},
400,PHOSPHASERDECARB-DIMER,{'CPD0-2654': 1},


In [89]:
def _expand_cofactors(cofactor_counts):
    """Turn a {cofactor: stoichiometry} dict into a list with each cofactor repeated.

    Example: {'MG+2': 2, 'MN+2': 1} -> ['MG+2', 'MG+2', 'MN+2'].
    Values that are not dicts (e.g. lists produced by an earlier run of this
    cell) are returned unchanged, which makes the cell safe to re-run.
    """
    if not isinstance(cofactor_counts, dict):
        return cofactor_counts
    return [
        cofactor
        for cofactor, copies in cofactor_counts.items()
        for _ in range(copies)
    ]


# Replace each `combined` dict with a list holding every cofactor once per
# stoichiometric copy.
complex_df['combined'] = complex_df['combined'].map(_expand_cofactors)

complex_df


Unnamed: 0,complex,combined,counts
0,1-PFK,[MG+2],31.0
1,2OXOGLUTARATEDEH-CPLX,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",86.0
2,3-ISOPROPYLMALDEHYDROG-CPLX,"[MG+2, MN+2]",1065.0
3,3-ISOPROPYLMALISOM-CPLX,[CPD-7],2631.0
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,[MG+2],174.0
...,...,...,...
442,UPPSYN-CPLX,[MG+2],56.0
443,URACIL-PRIBOSYLTRANS-CPLX,[MG+2],1068.0
444,URPHOS-CPLX,[K+],176.0
445,XYLISOM-CPLX,"[MG+2, MG+2, MN+2]",1.0


In [84]:
# One row per (complex, cofactor copy), each carrying the complex's count ...
cofactor_rows = complex_df[['combined', 'counts']].explode('combined')
# ... summed per cofactor and ranked from most to least abundant.
cofactor_totals = cofactor_rows.groupby('combined').sum()
cofactor_df = cofactor_totals.sort_values(by='counts', ascending=False)
cofactor_df

Unnamed: 0,combined,counts
0,MG+2,31.0
1,LIPOIC-ACID,86.0
1,THIAMINE-PYROPHOSPHATE,86.0
1,FAD,86.0
1,MG+2,86.0
...,...,...
443,MG+2,1068.0
444,K+,176.0
445,MG+2,1.0
445,MN+2,1.0


In [180]:
# Add an object-typed placeholder column (filled with 0) that can later hold one
# element-count dict per cofactor.
cofactor_df['molecular_composition'] = pd.Series(0, index=cofactor_df.index, dtype=object)

cofactor_df

Unnamed: 0_level_0,counts,molecular_composition
cofactors,Unnamed: 1_level_1,Unnamed: 2_level_1
MG+2,55738,0
MN+2,27211,0
PYRIDOXAL_PHOSPHATE,24413,0
ZN+2,12807,0
23-DIPHOSPHOGLYCERATE,10058,0
FAD,8954,0
K+,5157,0
CPD-7,4556,0
FE+2,3256,0
THIAMINE-PYROPHOSPHATE,1868,0


In [181]:
# For every cofactor, fetch its compound record and count its atoms per element.
for cofactor_name in cofactor_df.index:
    # Percent-encode '+' (e.g. MG+2 -> MG%2b2) before placing the id in the query string.
    mol = cofactor_name.replace('+', '%2b')
    mol_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{mol}&detail=low'

    r = s.get(mol_str)
    o = xmltodict.parse(r.content)['ptools-xml']['Compound']

    # Records without a 'cml' element get an empty composition and are reported.
    if 'cml' not in o:
        print(f'No cml for {cofactor_name}')
        cofactor_df.at[cofactor_name, 'molecular_composition'] = {}
        continue

    # A single atom arrives as a dict rather than a list; treat both alike.
    atom_array = o['cml']['molecule']['atomArray']['atom']
    if type(atom_array) is dict:
        atom_array = [atom_array]

    # Tally the atoms by element symbol.
    element_dict = {}
    for atom in atom_array:
        element = atom['@elementType']
        element_dict[element] = element_dict.get(element, 0) + 1

    cofactor_df.at[cofactor_name, 'molecular_composition'] = element_dict

No cml for FeS-Centers
No cml for CPD-17649


In [182]:
# Hand-assign a 4 Fe / 4 S composition to the two entries for which no structure
# was returned (NOTE(review): confirm that 4Fe-4S is right for both ids).
for cofactor_name in ('FeS-Centers', 'CPD-17649'):
    cofactor_df.at[cofactor_name, 'molecular_composition'] = dict(FE=4, S=4)

In [183]:
# Expand each composition dict into one column per element; elements a cofactor lacks become 0.
element_matrix = cofactor_df['molecular_composition'].apply(pd.Series).fillna(0)
element_matrix

Unnamed: 0_level_0,MG,MN,C,N,O,P,ZN,K,FE,S,NI,CU,R,CO,CA,H,MO,Proteins
cofactors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MG+2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MN+2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PYRIDOXAL_PHOSPHATE,0.0,0.0,8.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZN+2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23-DIPHOSPHOGLYCERATE,0.0,0.0,3.0,0.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAD,0.0,0.0,27.0,9.0,15.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPD-7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FE+2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
THIAMINE-PYROPHOSPHATE,0.0,0.0,12.0,4.0,7.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
# Number of atoms of each metal held by each cofactor:
# (total count of the cofactor) x (atoms of that metal per cofactor molecule).
# A single index-aligned multiplication replaces filling an empty frame with 0
# and assigning float columns into it, which triggers dtype downcast/upcast warnings.
metals = ['MG', 'MN', 'ZN', 'FE', 'NI', 'CU', 'CO', 'CA', 'MO']
metal_distribution = element_matrix[metals].mul(cofactor_df['counts'], axis=0)


In [185]:
# Iron column of the metal distribution, largest contribution first.
metal_distribution.sort_values(by='FE', ascending=False)['FE']

cofactors
CPD-7                     18224.0
FE+2                       3256.0
CPD-6                      1270.0
Heme-b                     1025.0
HEME_O                      814.0
3FE-4S                      636.0
FeS-Centers                 304.0
FE+3                        161.0
PROTOHEME                   139.0
SIROHEME                     98.0
CPD-23429                    46.0
HEME_D                       13.0
HEME_C                       13.0
NADPH                         0.0
CPD-17649                     0.0
AMMONIUM                      0.0
CPD-24862                     0.0
ADENOSYLCOBALAMIN             0.0
CPD-15873                     0.0
CPD-18260                     0.0
TOPAQUINONE                   0.0
CPD-8123                      0.0
FMNH2                         0.0
CPD0-1882                     0.0
CPD0-2654                     0.0
MG+2                          0.0
LYS                           0.0
CA+2                          0.0
PYRIDOXAL_PHOSPHATE           0.0
ZN+2

# Chain iron content back to enzymes

In [186]:
# NOTE(review): the saved output of this cell lists columns (stoichiometry, reaction,
# enzyme-reaction, cofactors) that no visible cell adds to complex_df; it presumably
# comes from an earlier kernel state. Confirm before relying on it.
complex_df.loc[:]

Unnamed: 0,complex,stoichiometry,reaction,enzyme-reaction,cofactors,counts
0,1-PFK,1,1-PFK_RXN,1PFRUCTPHOSPHN-ENZRXN,[MG+2],31
1,2OXOGLUTARATEDEH-CPLX,1,2OXOGLUTARATEDEH-CPLX_RXN,2OXOGLUTARATEDEH-ENZRXN,"[LIPOIC-ACID, THIAMINE-PYROPHOSPHATE, FAD, MG+2]",86
2,3-ISOPROPYLMALDEHYDROG-CPLX,1,3-ISOPROPYLMALDEHYDROG-CPLX_RXN,3-ISOPROPYLMALDEHYDROG-ENZRXN,"[MG+2, MN+2]",1065
3,3-ISOPROPYLMALISOM-CPLX,1,3-ISOPROPYLMALISOM-CPLX_RXN,3-ISOPROPYLMALISOM-ENZRXN,[CPD-7],2631
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,1,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX_RXN,3-METHYL-2-OXOBUT-OHCH3XFER-ENZRXN,[MG+2],174
...,...,...,...,...,...,...
1096,CPLX0-8053,1,CPLX0-8053_RXN,,[],208
1097,CPLX0-8253,1,CPLX0-8253_RXN,,[],0
1098,SRP-CPLX,1,SRP-CPLX_RXN,,[],0
1099,CPLX0-7796APO,1,CPLX0-7796APO_RXN,,[],50


In [223]:
# filter cofactors in element matrix that have FE coefficient > 0
fe_per_cofactor = element_matrix.loc[element_matrix['FE'] > 0, 'FE']
fe_cofactors = list(fe_per_cofactor.index)

# The per-complex cofactor lists live in 'cofactors' when that column exists and
# otherwise in 'combined' (the column built by the cells above). In 'combined' a
# cofactor appears once per stoichiometric copy, so each copy gets its own row.
cofactor_column = 'cofactors' if 'cofactors' in complex_df.columns else 'combined'

# filter complex_df to only include complexes where one of the cofactors is in fe_cofactors
exploded_complexes = (
    complex_df
    .explode(cofactor_column)
    .dropna()
    .rename(columns={cofactor_column: 'cofactors'})
    .reset_index(drop=True)
)
# .copy() so the new columns are added to an independent frame, not to a view.
fe_complexes = exploded_complexes.loc[
    exploded_complexes['cofactors'].isin(fe_cofactors), ['complex', 'cofactors', 'counts']
].copy()

# add column that multiplies counts by number of FE atoms in cofactor based on element matrix
fe_complexes['fe_counts'] = fe_complexes['counts'] * fe_complexes['cofactors'].map(fe_per_cofactor)

# add a column that divides the fe_counts by the total sum of fe_counts (as a percentage)
fe_complexes['fe_counts_norm'] = 100 * fe_complexes['fe_counts'] / fe_complexes['fe_counts'].sum()

fe_complexes

Unnamed: 0,complex,cofactors,counts,fe_counts,fe_counts_norm
7,3-ISOPROPYLMALISOM-CPLX,CPD-7,2631,10524.0,40.478480
31,ADHE-CPLX,FE+2,93,93.0,0.357706
43,ANGLYC3PDEHYDROG-CPLX,FeS-Centers,69,276.0,1.061579
48,APP-UBIOX-CPLX,HEME_D,13,13.0,0.050002
49,APP-UBIOX-CPLX,Heme-b,13,13.0,0.050002
...,...,...,...,...,...
415,SULFITE-REDUCT-CPLX,CPD-7,93,372.0,1.430824
418,SULFITE-REDUCT-CPLX,SIROHEME,93,93.0,0.357706
419,SUPEROX-DISMUTFE-CPLX,FE+3,161,161.0,0.619255
424,THREODEHYD-CPLX,FE+2,33,33.0,0.126928


In [219]:
# Show the element-count matrix again for reference.
element_matrix

Unnamed: 0_level_0,MG,MN,C,N,O,P,ZN,K,FE,S,NI,CU,R,CO,CA,H,MO,Proteins
cofactors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MG+2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MN+2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PYRIDOXAL_PHOSPHATE,0.0,0.0,8.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZN+2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23-DIPHOSPHOGLYCERATE,0.0,0.0,3.0,0.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAD,0.0,0.0,27.0,9.0,15.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CPD-7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FE+2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
THIAMINE-PYROPHOSPHATE,0.0,0.0,12.0,4.0,7.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
