In [65]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict

pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

s = requests.Session() # create session
# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'cellulararchitect@protonmail.com', 'password':'Cellman0451'})


<Response [200]>

In [66]:
# get a set of all monomers with an associated uniprot id
proteins_df = pd.read_csv('reconstruction/ecoli/flat/proteins.tsv', sep='\t').loc[:, ["id", "common_name"]]

proteins_df["metal_features"] = 0
proteins_df["metal_features"] = proteins_df["metal_features"].astype(object)

proteins_df["other_features"] = 0
proteins_df["other_features"] = proteins_df["other_features"].astype(object)

proteins_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,0,0
1,1-PFK-MONOMER,1-phosphofructokinase,0,0
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,0,0
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,0,0
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",0,0
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,0,0
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,0,0
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,0,0
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,0,0


In [67]:
protein = 'DALADEHYDROGA-MONOMER'

req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'

r = s.get(req_str)

o = xmltodict.parse(r.content)['ptools-xml']
pp.pprint(o['Protein']['has-feature'])

[{'Feature': {'@ID': 'ECOLI:FTR0-1606133',
              '@detail': 'full',
              '@frameid': 'FTR0-1606133',
              '@orgid': 'ECOLI',
              'accession-1': {'#text': 'PF01266.27', '@datatype': 'string'},
              'evidence': {'Evidence-Code': {'@ID': 'ECOLI:EV-COMP-AINF',
                                             '@class': 'true',
                                             '@detail': 'full',
                                             '@frameid': 'EV-COMP-AINF',
                                             '@orgid': 'ECOLI',
                                             'comment': {'#text': 'Automated '
                                                                  'inference.  '
                                                                  'A computer '
                                                                  'inferred '
                                                                  'this '
                                          

In [71]:
for i in range(len(proteins_df.index)):
    if i % 100 == 0:
        print(i)

    protein = proteins_df.loc[i, 'id']

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'

    r = s.get(req_str)
    if r.status_code != 200:
        print(protein, r.status_code)
        proteins_df.loc[i, 'other_features'] = 'NO ECOCYC ENTRY'
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    metal_set = set()
    other_feature_set = set()
    if 'Protein' in o and 'has-feature' in o['Protein']:
        features = o['Protein']['has-feature']

        if type(features) is dict:
            features = [features]

        for feature in features:
            if 'parent' not in feature['Feature']:
                continue

            category = feature['Feature']['parent']['Feature']['@frameid']
            if category == 'Metal-Binding-Sites' and 'comment' in feature['Feature']:

                # Detect match to any of the allowed metal names and allowed cofactor names and add to list
                comment = feature['Feature']['comment']['#text']
                metal_set.add(comment)

                proteins_df.at[i, 'metal_features'] = list(metal_set)

            if category == 'Nucleotide-Phosphate-Binding-Regions' and 'attached-group' in feature['Feature'] and 'Compound' in feature['Feature']['attached-group']:
                attached_group = feature['Feature']['attached-group']['Compound']['@frameid']
                other_feature_set.add(attached_group)

            if category == 'N6-pyridoxal-phosphate-Lys-Modifications':
                other_feature_set.add('PYRIDOXAL_PHOSPHATE')

            if category == 'Protein-Segments' and 'comment' in feature['Feature'] and 'Thiamine' in feature['Feature']['comment']['#text']:
                other_feature_set.add('THIAMINE-PYROPHOSPHATE')

            proteins_df.at[i, 'metal_features'] = list(metal_set)
            proteins_df.at[i, 'other_features'] = list(other_feature_set)



0
100
200
300
400
500
600
700
800
900
1000
1100
EG11708-MONOMER 404
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400


In [73]:
proteins_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[]
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[]
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[UniProt: Zinc., implicated in a second zinc b...",[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[]


In [74]:
proteins_df.to_csv('notebooks/fbagd/protein_features.csv', index=False)

# Filter protein features.

In [75]:
filter_protein_df = proteins_df.copy().loc[:, ['id', 'common_name', 'metal_features', 'other_features']]
filter_protein_df

Unnamed: 0,id,common_name,metal_features,other_features
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[]
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP]
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP]
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[]
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[]
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP]
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[]
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[UniProt: Zinc., implicated in a second zinc b...",[]
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[]


## First process metal names

In [76]:
ALLOWED_METAL_NAMES = {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'MO+2', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                       'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                       'Iron-sulfur \(4Fe-4S\)': 'CPD-7', 'Iron-sulfur \(2Fe-2S\)': 'CPD-6', 'heme': 'heme', 'Molybdate': 'CPD-3'}

NON_REGEX_METAL = {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'MO+2', 'Nickel': 'NI+2', 'Tungsten': 'W', 'Zinc': 'ZN+2',
                       'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                       'Iron-sulfur (4Fe-4S)': 'CPD-7', 'Iron-sulfur (2Fe-2S)': 'CPD-6', 'heme': 'heme', 'Molybdate': 'CPD-3'}

filter_protein_df['metal_features_processed'] = 0
filter_protein_df['metal_features_processed'] = filter_protein_df['metal_features_processed'].astype(object)

metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(f'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(filter_protein_df.index)):

    if filter_protein_df.loc[i, 'metal_features'] is not None and filter_protein_df.loc[i, 'metal_features'] != 0:
        metal_binding = filter_protein_df.loc[i, 'metal_features']

        metal_count_dict = {}
        existing_matches = set()

        for feature in metal_binding:
            matches = metal_regex.search(feature)
            if matches:
                metal = matches.group(0)[:-1]

                # eliminate duplicates
                if metal not in existing_matches:

                    existing_matches.add(metal)

                    if 'heme' in feature:
                        metal = metal.replace('Iron', 'heme')

                    # check if last char of metal is a number, then crop
                    if metal[-1].isdigit():
                        metal = metal[:-2]

                    metal = metal.strip()

                    # replace metal name with allowed metal name
                    metal = NON_REGEX_METAL[metal]

                    if metal in metal_count_dict:
                        metal_count_dict[metal] += 1
                    else:
                        metal_count_dict[metal] = 1




        filter_protein_df.at[i, 'metal_features_processed'] = metal_count_dict

In [78]:
filter_protein_df = filter_protein_df.drop(columns=['metal_features'])
filter_protein_df

Unnamed: 0,id,common_name,other_features,metal_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],{}
1,1-PFK-MONOMER,1-phosphofructokinase,[ATP],{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[NADP],{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],{}
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[ATP],{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,[],{'ZN+2': 1}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],{}


In [79]:
filter_protein_df['other_features_processed'] = 0
filter_protein_df['other_features_processed'] = filter_protein_df['other_features_processed'].astype(object)

ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD'}

for i in range(len(filter_protein_df.index)):

    if filter_protein_df.loc[i, 'other_features'] is not None and filter_protein_df.loc[i, 'other_features'] != 0:
        other_features = filter_protein_df.loc[i, 'other_features']

        other_feature_count_dict = {}
        existing_matches = set()

        for feature in other_features:

            # eliminate duplicates
            if feature not in existing_matches:

                existing_matches.add(feature)

                if feature in ACCEPTED_OTHER_FEATURES:
                    if feature in other_feature_count_dict:
                        other_feature_count_dict[feature] += 1
                    else:
                        other_feature_count_dict[feature] = 1

        filter_protein_df.at[i, 'other_features_processed'] = other_feature_count_dict

In [81]:
filter_protein_df = filter_protein_df.drop(columns=['other_features'])
filter_protein_df

Unnamed: 0,id,common_name,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,{},{}
1,1-PFK-MONOMER,1-phosphofructokinase,{},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",{},{}
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{'ZN+2': 1},{}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{}


In [101]:
complex_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t').loc[:, ['id', 'stoichiometry', 'common_name']]


removed_complexes = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions_removed.tsv', sep='\t')

# remove rows where id starts with '#'
complex_df = complex_df[~complex_df['id'].str.startswith('#')].reset_index(drop=True)

# remove rows of complex_df where id matches an id in removed_complexes
complex_df = complex_df[~complex_df['id'].isin(removed_complexes['id'])].reset_index(drop=True)
complex_df.stoichiometry = complex_df.stoichiometry.astype(object)

for i, stoich in enumerate(complex_df.loc[:, 'stoichiometry']):

    if type(stoich) is str and stoich[0] == '{':
        stoich = stoich.replace('null', '-1')
        stoich = ast.literal_eval(stoich)

        complex_df.at[i, 'stoichiometry'] = stoich

    else:
        complex_df.at[i, 'stoichiometry'] = {}

complex_df

Unnamed: 0,id,stoichiometry,common_name
0,1-PFK_RXN,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",
1,2OXOGLUTARATEDEH-CPLX_RXN,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...",2-oxoglutarate dehydrogenase complex
2,3-ISOPROPYLMALDEHYDROG-CPLX_RXN,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...",
3,3-ISOPROPYLMALISOM-CPLX_RXN,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...",3-isopropylmalate dehydratase
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX_RXN,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",
...,...,...,...
1063,CPLX0-8053_RXN,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",SelB-L-selenocysteinyl-tRNA<sup>sec</sup>
1064,CPLX0-8253_RXN,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",CsrA complex with CsrC RNA
1065,SRP-CPLX_RXN,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN...",Signal Recognition Particle
1066,CPLX0-7796APO_RXN,"{'CPLX0-7796APO': 1, 'PD04032': -2}",


In [128]:
# create new column for complex name
complex_df['complex_name'] = ''

complex_df['monomer_component_stoichiometry'] = 0
complex_df['monomer_component_stoichiometry'] = complex_df['monomer_component_stoichiometry'].astype(object)

monomer_names = filter_protein_df['id'].tolist()

# for row of complex_df, find key of stoichiometry dict with positive value, then set complex_name to that key
for i in range(len(complex_df.index)):
    stoichiometry = complex_df.loc[i, 'stoichiometry']

    monomer_component_stoichiometry = {}

    for component_name, component_count in stoichiometry.items():

        if component_count > 0:
            complex_df.at[i, 'complex_name'] = component_name

        else:
            # get monomer components of complex
            if component_name in monomer_names:
                monomer_component_stoichiometry[component_name] = component_count

            else:
                outer_count = abs(component_count)

                # find row of complex_df where id matches component_name
                component_stoich = complex_df.loc[complex_df['id'] == component_name, 'stoichiometry']

                for inner_component_name, inner_component_count in component_stoich.items():
                    if inner_component_count < 0:
                        monomer_component_stoichiometry[inner_component_name] = abs(inner_component_count) * outer_count

    complex_df.at[i, 'monomer_component_stoichiometry'] = monomer_component_stoichiometry

complex_df.loc[:, ['id', 'complex_name', 'monomer_component_stoichiometry']]

Unnamed: 0,id,complex_name,monomer_component_stoichiometry
0,1-PFK_RXN,1-PFK,{'1-PFK-MONOMER': -2}
1,2OXOGLUTARATEDEH-CPLX_RXN,2OXOGLUTARATEDEH-CPLX,{}
2,3-ISOPROPYLMALDEHYDROG-CPLX_RXN,3-ISOPROPYLMALDEHYDROG-CPLX,{'3-ISOPROPYLMALDEHYDROG-MONOMER': -2}
3,3-ISOPROPYLMALISOM-CPLX_RXN,3-ISOPROPYLMALISOM-CPLX,"{'LEUC-MONOMER': -1, 'LEUD-MONOMER': -1}"
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX_RXN,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,{'3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER': -10}
...,...,...,...
1063,CPLX0-8053_RXN,CPLX0-8053,{'EG10942-MONOMER': -1}
1064,CPLX0-8253_RXN,CPLX0-8253,{'EG11447-MONOMER': -9}
1065,SRP-CPLX_RXN,SRP-CPLX,{'EG10300-MONOMER': -1}
1066,CPLX0-7796APO_RXN,CPLX0-7796APO,{'PD04032': -2}


In [125]:
filter_protein_df['id'].str.contains('1-PFK')

0       False
1        True
2       False
3       False
4       False
        ...  
4415    False
4416    False
4417    False
4418    False
4419    False
Name: id, Length: 4420, dtype: bool

In [126]:
filter_protein_df

Unnamed: 0,id,common_name,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,{},{}
1,1-PFK-MONOMER,1-phosphofructokinase,{},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",{},{}
...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{'ZN+2': 1},{}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{}


In [120]:
# check if 'name' is in filter_protein_df['id']
'1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER' in filter_protein_df.loc[:, 'id']

False