In [1]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict

pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

s = requests.Session() # create session
# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'cellulararchitect@protonmail.com', 'password':'Cellman0451'})


<Response [200]>

In [2]:
# get a set of all monomers with an associated uniprot id
proteins_df = pd.read_csv('reconstruction/ecoli/flat/proteins.tsv', sep='\t').loc[:, ["id", "common_name"]]

proteins_df["metal_features"] = 0
proteins_df["metal_features"] = proteins_df["metal_features"].astype(object)

proteins_df["other_features"] = 0
proteins_df["other_features"] = proteins_df["other_features"].astype(object)

proteins_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,0,0
1,1-PFK-MONOMER,1-phosphofructokinase,0,0
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,0,0
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,0,0
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",0,0
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,0,0
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,0,0
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,0,0
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,0,0


## Get relevant EcoCyc data and annotate protein monomer table

In [3]:
for i in range(len(proteins_df.index)):
    if i % 100 == 0:
        print(i)

    protein = proteins_df.loc[i, 'id']

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'

    r = s.get(req_str)
    if r.status_code != 200:
        print(protein, r.status_code)
        proteins_df.loc[i, 'other_features'] = 'NO ECOCYC ENTRY'
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    metal_set = set()
    other_feature_set = set()
    if 'Protein' in o and 'has-feature' in o['Protein']:
        features = o['Protein']['has-feature']

        if type(features) is dict:
            features = [features]

        for feature in features:
            if 'parent' not in feature['Feature']:
                continue

            category = feature['Feature']['parent']['Feature']['@frameid']
            if category == 'Metal-Binding-Sites' and 'comment' in feature['Feature']:

                # Detect match to any of the allowed metal names and allowed cofactor names and add to list
                comment = feature['Feature']['comment']['#text']
                metal_set.add(comment)

                proteins_df.at[i, 'metal_features'] = list(metal_set)

            if category == 'Nucleotide-Phosphate-Binding-Regions' and 'attached-group' in feature['Feature'] and 'Compound' in feature['Feature']['attached-group']:
                attached_group = feature['Feature']['attached-group']['Compound']['@frameid']
                other_feature_set.add(attached_group)

            if category == 'N6-pyridoxal-phosphate-Lys-Modifications':
                other_feature_set.add('PYRIDOXAL_PHOSPHATE')

            if category == 'Protein-Segments' and 'comment' in feature['Feature'] and 'Thiamine' in feature['Feature']['comment']['#text']:
                other_feature_set.add('THIAMINE-PYROPHOSPHATE')

            if category == 'Selenocysteine-site':
                metal_set.add('L-SELENOCYSTEINE')

            proteins_df.at[i, 'metal_features'] = list(metal_set)
            proteins_df.at[i, 'other_features'] = list(other_feature_set)

    else:
        proteins_df.loc[i, 'other_features'] = list()
        proteins_df.loc[i, 'metal_features'] = list()



0
100
200
300
400
500
600
700
800
900
1000
1100
EG11708-MONOMER 404
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400


In [4]:
proteins_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[]
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[]
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[implicated in a second zinc binding site, zin...",[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[]


In [5]:
proteins_df.to_parquet('notebooks/fbagd/protein_features.parquet', index=False)

ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column metal_features with type object')

## Reload protein feature table

In [8]:
proteins_df = pd.read_csv('notebooks/fbagd/protein_features.csv')
proteins_df['metal_features'] = proteins_df['metal_features'].apply(ast.literal_eval)
# for rows of proteins with where other_features is set, convert from string to set with literal_eval
proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'] = \
    proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'].apply(ast.literal_eval)

filter_protein_df = proteins_df.copy().loc[:, ['id', 'common_name', 'metal_features', 'other_features']]
filter_protein_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[]
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[]
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[zinc-coordinating residues, implicated in a s...",[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[]


## Process raw EcoCyc annotations into standard EcoCyc names

In [18]:
ALLOWED_METAL_NAMES =   {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'MO+2', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                        'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                        'Iron-sulfur \(4Fe-4S\)': 'CPD-7', 'Iron-sulfur \(2Fe-2S\)': 'CPD-6',
                         'Iron-sulfur \(4Fe-4S-S-AdoMet\)': 'CPD-7', 'Iron-sulfur \(3Fe-4S\)': '3FE-4S', 'Iron-oxo-sulfur \(4Fe-2O-2S\)': 'CPD-7',
                        'heme': 'heme', 'Molybdate': 'CPD-3', 'heme B': 'heme',
                         'L-SELENOCYSTEINE': 'L-SELENOCYSTEINE',
                        'Divalent metal cation': 'Any+2'}

# remove all \ characters from keys in ALLOWED_METAL_NAMES
NON_REGEX_METAL = {key.replace('\\', ''): value for key, value in ALLOWED_METAL_NAMES.items()}

filter_protein_df['metal_features_processed'] = 0
filter_protein_df['metal_features_processed'] = filter_protein_df['metal_features_processed'].astype(object)

metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(f'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(filter_protein_df.index)):

    if filter_protein_df.loc[i, 'metal_features'] is not None and filter_protein_df.loc[i, 'metal_features'] != 0:
        metal_binding = filter_protein_df.loc[i, 'metal_features']

        metal_count_dict = {}
        existing_matches = set()

        for feature in metal_binding:
            matches = metal_regex.search(feature)
            if matches:
                metal = matches.group(0)[:-1]

                # eliminate duplicates
                if metal not in existing_matches:

                    existing_matches.add(metal)

                    if 'heme' in feature:
                        metal = metal.replace('Iron', 'heme')

                    # check if last char of metal is a number, then crop
                    if metal[-1].isdigit():
                        metal = metal[:-2]

                    metal = metal.strip()

                    # replace metal name with allowed metal name
                    metal = NON_REGEX_METAL[metal]

                    if metal in metal_count_dict:
                        metal_count_dict[metal] += 1
                    else:
                        metal_count_dict[metal] = 1

            else:
                print(f'No match for {feature} in {filter_protein_df.loc[i, "id"]}')




        filter_protein_df.at[i, 'metal_features_processed'] = metal_count_dict

No match for UniProt: Magnesium or manganese. in 3-ISOPROPYLMALDEHYDROG-MONOMER
No match for conserved, Fe(III) binding motif in BASS-MONOMER
No match for predicted heme d ligand in CYDA-MONOMER
No match for UniProt: Fe(2+); catalytic. in CYTDEAM-MONOMER
No match for UniProt: Zn(2+); catalytic. in CYTDEAM-MONOMER
No match for The amino-terminus of ClpA contains a Zinc binding site. in EG10156-MONOMER
No match for The active-site magnesium ion is coordinated by three aspartate residues (401, 403, 555). Two of them form part of 
the PDXD active-site motif. in EG10238-MONOMER
No match for Divalent magnesium ions are chelated by three aspartate residues, two in the conserved DPD sequence (345, 347) 
and one in the conserved EGYMD sequence (269). in EG10239-MONOMER
No match for Based on crystal structures, Glu-265 and Asp-309 coordinate a divalent cation. in EG10239-MONOMER
No match for These residues are thought to coordinate the one or two divalent magnesium ions required for the 
gyrase 

In [19]:
filter_protein_df = filter_protein_df.drop(columns=['metal_features'])
filter_protein_df

Unnamed: 0,id,common_name,other_features,metal_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],{}
1,1-PFK-MONOMER,1-phosphofructokinase,[ATP],{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[NADP],{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],{}
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[ATP],{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,[],{'ZN+2': 1}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],{}


In [20]:
filter_protein_df['other_features_processed'] = 0
filter_protein_df['other_features_processed'] = filter_protein_df['other_features_processed'].astype(object)

ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD'}

for i in range(len(filter_protein_df.index)):

    if filter_protein_df.loc[i, 'other_features'] is not None and filter_protein_df.loc[i, 'other_features'] != 0:
        other_features = filter_protein_df.loc[i, 'other_features']

        other_feature_count_dict = {}
        existing_matches = set()

        for feature in other_features:

            # eliminate duplicates
            if feature not in existing_matches:

                existing_matches.add(feature)

                if feature in ACCEPTED_OTHER_FEATURES:
                    if feature in other_feature_count_dict:
                        other_feature_count_dict[feature] += 1
                    else:
                        other_feature_count_dict[feature] = 1

        filter_protein_df.at[i, 'other_features_processed'] = other_feature_count_dict

In [90]:
filter_protein_df = filter_protein_df.drop(columns=['other_features'])

filter_protein_df

KeyError: "['other_features'] not found in axis"

In [92]:
filter_protein_df.to_csv('notebooks/fbagd/processed_proteins.tsv', sep='\t', index=False)

# Annotate complexation with EcoCyc data

In [78]:
complex_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t').loc[:, ['id', 'stoichiometry']]


removed_complexes = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions_removed.tsv', sep='\t')

# remove rows where id starts with '#'
complex_df = complex_df[~complex_df['id'].str.startswith('#')].reset_index(drop=True)

# remove rows of complex_df where id matches an id in removed_complexes
complex_df = complex_df[~complex_df['id'].isin(removed_complexes['id'])].reset_index(drop=True)
complex_df.stoichiometry = complex_df.stoichiometry.astype(object)

for i, stoich in enumerate(complex_df.loc[:, 'stoichiometry']):

    if type(stoich) is str and stoich[0] == '{':
        stoich = stoich.replace('null', '-1')
        stoich = ast.literal_eval(stoich)

        complex_df.at[i, 'stoichiometry'] = stoich

    else:
        complex_df.at[i, 'stoichiometry'] = {}


# for each row, find dict entry with positive value
for i in range(len(complex_df.index)):

    stoich = complex_df.loc[i, 'stoichiometry']

    for k,v in stoich.items():
        if v > 0:
            complex_df.at[i, 'id'] = k

complex_df

Unnamed: 0,id,stoichiometry
0,1-PFK,"{'1-PFK': 1, '1-PFK-MONOMER': -2}"
1,2OXOGLUTARATEDEH-CPLX,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY..."
3,3-ISOPROPYLMALISOM-CPLX,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':..."
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3..."
...,...,...
1063,CPLX0-8053,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}"
1064,CPLX0-8253,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON..."
1065,SRP-CPLX,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN..."
1066,CPLX0-7796APO,"{'CPLX0-7796APO': 1, 'PD04032': -2}"


In [79]:
for i in range(len(complex_df.index)):

    if i % 100 == 0:
        print(i)

    complex = complex_df.loc[i, 'id']

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{complex}&detail=low'

    r = s.get(req_str)
    if r.status_code != 200:
        print(complex, r.status_code)
        continue

    o = xmltodict.parse(r.content)['ptools-xml']['Protein']


    # note if not top-level complex
    if 'component-of' in o:
        complex_df.at[i, 'top_level'] = False
    else:
        complex_df.at[i, 'top_level'] = True


    # if enzyme
    if 'catalyzes' in o:
        o = o['catalyzes']['Enzymatic-Reaction']

        if type(o) is dict:
            o = [o]

        cofactor_set = set()
        enz_rxn_set = set()

        for enzrxn in o:
            enz_id = enzrxn['@frameid']

            enz_rxn_set.add(enz_id)

            enz_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{enz_id}&detail=high'

            rz = s.get(enz_req_str)
            oe = xmltodict.parse(rz.content)['ptools-xml']['Enzymatic-Reaction']

            if "cofactor" in oe:
                oe = oe['cofactor']

                if type(oe) is dict:
                    oe = [oe]

                for cofactor in oe:
                    cof = cofactor['Compound']['@frameid']
                    cofactor_set.add(cof)

        complex_df.at[i, 'enzyme_reaction'] = enz_rxn_set
        complex_df.at[i, 'cofactors'] = cofactor_set

0
100
200
CPLX0-2423 404
300
CPLX0-3976 404
400
500
600
700
800
900
RECFOR-CPLX 404
1000
CPLX0-7796APO 404


In [80]:
complex_df

Unnamed: 0,id,stoichiometry,top_level,enzyme_reaction,cofactors
0,1-PFK,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",True,1PFRUCTPHOSPHN-ENZRXN,MG+2
1,2OXOGLUTARATEDEH-CPLX,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...",True,{2OXOGLUTARATEDEH-ENZRXN},"{LIPOIC-ACID, FAD, THIAMINE-PYROPHOSPHATE, MG+2}"
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...",True,{3-ISOPROPYLMALDEHYDROG-ENZRXN},"{MN+2, MG+2}"
3,3-ISOPROPYLMALISOM-CPLX,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...",True,{3-ISOPROPYLMALISOM-ENZRXN},{CPD-7}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",True,{3-METHYL-2-OXOBUT-OHCH3XFER-ENZRXN},{MG+2}
...,...,...,...,...,...
1063,CPLX0-8053,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",True,,
1064,CPLX0-8253,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",True,,
1065,SRP-CPLX,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN...",True,,
1066,CPLX0-7796APO,"{'CPLX0-7796APO': 1, 'PD04032': -2}",,,


## Create new column for monomer component stoichiometry

In [81]:
complex_names = complex_df['id'].tolist()

# create new column for complex name

complex_df['monomer_component_stoichiometry'] = 0
complex_df['monomer_component_stoichiometry'] = complex_df['monomer_component_stoichiometry'].astype(object)

monomer_names = filter_protein_df['id'].tolist()



for i in range(len(complex_df.index)):
    stoich_1 = complex_df.loc[i, 'stoichiometry']

    monomer_component_stoichiometry = {}

    for component_name_1, component_count_1 in stoich_1.items():
        if component_count_1 < 0:
            # get monomer components of complex
            if component_name_1 in monomer_names:
                monomer_component_stoichiometry[component_name_1] = abs(component_count_1)

            elif component_name_1 in complex_names:
                count_1 = abs(component_count_1)

                # find row of complex_df where id matches component_name
                stoich_2 = complex_df.loc[complex_names.index(component_name_1), 'stoichiometry']

                for component_name_2, component_count_2 in stoich_2.items():
                    if component_count_2 < 0:
                        if component_name_2 in monomer_names:
                            monomer_component_stoichiometry[component_name_2] = abs(component_count_2) * count_1

                        elif component_name_2 in complex_names:
                            count_2 = abs(component_count_2)

                            # find row of complex_df where id matches component_name
                            stoich_3 = complex_df.loc[complex_names.index(component_name_2), 'stoichiometry']

                            for component_name_3, component_count_3 in stoich_3.items():
                                if component_count_3 < 0:
                                    if component_name_3 in monomer_names:
                                        monomer_component_stoichiometry[component_name_3] = abs(component_count_3) * count_2 * count_1

                                    elif component_name_3 in complex_names:
                                        count_3 = abs(component_count_3)

                                        # find row of complex_df where id matches component_name
                                        stoich_4 = complex_df.loc[complex_names.index(component_name_3), 'stoichiometry']

                                        for component_name_4, component_count_4 in stoich_4.items():
                                            if component_count_4 < 0:
                                                monomer_component_stoichiometry[component_name_4] =  abs(component_count_4) * count_3 * count_2 * count_1

                                            if component_name_4 not in monomer_names:
                                                print(f'4th degree {component_name_4} not in monomer_names')


    complex_df.at[i, 'monomer_component_stoichiometry'] = monomer_component_stoichiometry

complex_df.loc[:, ['id', 'monomer_component_stoichiometry']]

Unnamed: 0,id,monomer_component_stoichiometry
0,1-PFK,{'1-PFK-MONOMER': 2}
1,2OXOGLUTARATEDEH-CPLX,"{'E1O-MONOMER': 12, 'E2O-MONOMER': 24, 'E3-MON..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,{'3-ISOPROPYLMALDEHYDROG-MONOMER': 2}
3,3-ISOPROPYLMALISOM-CPLX,"{'LEUC-MONOMER': 1, 'LEUD-MONOMER': 1}"
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,{'3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER': 10}
...,...,...
1063,CPLX0-8053,{'EG10942-MONOMER': 1}
1064,CPLX0-8253,{'EG11447-MONOMER': 9}
1065,SRP-CPLX,{'EG10300-MONOMER': 1}
1066,CPLX0-7796APO,{'PD04032': 2}


In [82]:
filter_complex_df = complex_df.loc[:, ["id", "monomer_component_stoichiometry", "top_level", "cofactors"]]

In [95]:
filter_complex_df.to_parquet('notebooks/fbagd/processed_complexes.parquet', index=False)

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

# For each complex, look up components and annotate complex with components.

In [89]:
# for each complex, look up components and get cofactors for those components.
for i in range(len(filter_complex_df.index)):
    complex_id = filter_complex_df.loc[i, 'id']
    complex_components = filter_complex_df.loc[i, 'monomer_component_stoichiometry']
    complex_cofactors = filter_complex_df.loc[i, 'cofactors']

    for component_name, component_count in complex_components.items():
        # get index of component in filter_protein_df
        component_index = filter_protein_df.index[filter_protein_df['id'] == component_name].tolist()[0]


        component_metals = filter_protein_df.at[component_index, 'metal_features_processed']
        component_other = filter_protein_df.at[component_index, 'other_features_processed']
        print(f'component_name: {component_name}, component_metals: {component_metals}, component_other: {component_other}')


    filter_complex_df.at[i, 'cofactors'] = complex_cofactors

component_name: 1-PFK-MONOMER, component_metals: {}, component_other: {}
component_name: E1O-MONOMER, component_metals: {}, component_other: {}
component_name: E2O-MONOMER, component_metals: {}, component_other: {}
component_name: E3-MONOMER, component_metals: {}, component_other: {'FAD': 1}
component_name: 3-ISOPROPYLMALDEHYDROG-MONOMER, component_metals: {'MG+2': 1}, component_other: {}
component_name: LEUC-MONOMER, component_metals: {'CPD-7': 1}, component_other: {}
component_name: LEUD-MONOMER, component_metals: {}, component_other: {}
component_name: 3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER, component_metals: {'MG+2': 1}, component_other: {}
component_name: 3-OXOACYL-ACP-SYNTHII-MONOMER, component_metals: {}, component_other: {}
component_name: 6PFK-1-MONOMER, component_metals: {'MG+2': 1}, component_other: {}
component_name: 6PFK-2-MONOMER, component_metals: {'MG+2': 1, 'K+': 1}, component_other: {}
component_name: 6PGLUCONDEHYDROG-MONOMER, component_metals: {}, component_other: 

# Old components EcoCyc code, insert in loop for cofactors for complex_df

In [None]:
components = o['component']
if type(components) is dict:
    components = [components]

for component in components:
    if 'Protein' in component and '@frameid' in component['Protein']:
        component_id = component['Protein']['@frameid']

        component_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{component_id}&detail=low'

        r = s.get(component_req_str)
        if r.status_code != 200:
            print(component_id, r.status_code)
            continue

        component_o = xmltodict.parse(r.content)['ptools-xml']['Protein']

        if 'coefficient' in component:
            component_coefficient = int(component['coefficient']['#text'])
        else:
            component_coefficient = 1

        if 'gene' in component_o:
            # print(f'{component_o["@frameid"]} has gene {component_o["gene"]["Gene"]["@frameid"]}')
            complex_component_dict[component_o["@frameid"]] = component_coefficient
        else:
            # print(f'{component_o["@frameid"]} does not have gene')

            nested_components = component_o['component']

            if type(nested_components) is dict:
                nested_components = [nested_components]

            for nested_component in nested_components:
                if 'Protein' in nested_component and '@frameid' in nested_component['Protein']:
                    nested_component_id = nested_component['Protein']['@frameid']

                    nested_component_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{nested_component_id}&detail=low'

                    r = s.get(nested_component_req_str)
                    if r.status_code != 200:
                        print(nested_component_id, r.status_code)
                        continue

                    nested_component_o = xmltodict.parse(r.content)['ptools-xml']['Protein']

                    if 'coefficient' in nested_component_o:
                        nested_component_coefficient = int(nested_component_o['coefficient']['#text'])
                    else:
                        nested_component_coefficient = 1

                    if 'gene' in nested_component_o:
                        # print(f'{nested_component_o["@frameid"]} has gene {nested_component_o["gene"]["Gene"]["@frameid"]}')
                        complex_component_dict[nested_component_o["@frameid"]] = nested_component_coefficient * component_coefficient
                    else:
                        print(f'Doubly nested protein {nested_component_o["@frameid"]} does not have gene?')
                        continue

        # pp.pprint(component_o)

complex_df.at[i, 'components'] = complex_component_dict
# print(complex_component_dict)