In [23]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

import matplotlib.pyplot as plt
import dill
import requests
import xmltodict

# Open an authenticated BioCyc session.
# The account e-mail and password are read from the BIOCYC_EMAIL / BIOCYC_PASSWORD
# environment variables so that no secret is stored in the notebook itself
# (a missing variable raises KeyError before any request is sent).
# NOTE(review): credentials that were previously committed in this cell should be rotated.
biocyc_email = os.environ['BIOCYC_EMAIL']
biocyc_password = os.environ['BIOCYC_PASSWORD']

s = requests.Session() # create session
# Post login credentials to session:
login_response = s.post('https://websvc.biocyc.org/credentials/login/',
                        data={'email': biocyc_email, 'password': biocyc_password})
# Stop here on an HTTP error instead of issuing thousands of unauthenticated requests later.
login_response.raise_for_status()
login_response

<Response [200]>

In [24]:
# Load the complexation reactions and keep only the rows that have both an id and a stoichiometry.
complexation_rxn_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t')
stoich_series = complexation_rxn_df.loc[:,['id', 'stoichiometry']].dropna().reset_index(drop=True)

# Flatten every reaction's stoichiometry dict into [molecule, coefficient, reaction id] rows.
stoich_list = []
for reaction_id, raw_stoich in zip(stoich_series['id'], stoich_series['stoichiometry']):
    # 'null' coefficients are replaced by -1 so the text parses as a Python literal.
    parsed_stoich = ast.literal_eval(raw_stoich.replace('null', '-1'))
    stoich_list.extend(
        [molecule, coefficient, reaction_id]
        for molecule, coefficient in parsed_stoich.items()
    )

pre_complex_df = pd.DataFrame(stoich_list, columns=['complex', 'stoichiometry', 'reaction'])
pre_complex_df

Unnamed: 0,complex,stoichiometry,reaction
0,1-PFK,1,1-PFK_RXN
1,1-PFK-MONOMER,-2,1-PFK_RXN
2,2OXOGLUTARATEDEH-CPLX,1,2OXOGLUTARATEDEH-CPLX_RXN
3,E1O,-1,2OXOGLUTARATEDEH-CPLX_RXN
4,E2O,-1,2OXOGLUTARATEDEH-CPLX_RXN
...,...,...,...
2811,FFS-RNA,-1,SRP-CPLX_RXN
2812,CPLX0-7796APO,1,CPLX0-7796APO_RXN
2813,PD04032,-2,CPLX0-7796APO_RXN
2814,ARCB-CPLX,1,ARCB-CPLX_RXN


In [25]:
# Build the monomer table: id and common name from the flat file, plus an
# empty-string `uniprot_id` column to be filled in by the lookup loop.
proteins_df = (
    pd.read_csv('reconstruction/ecoli/flat/proteins.tsv', sep='\t')
    [["id", "common_name"]]
    .assign(uniprot_id="")
)

proteins_df

Unnamed: 0,id,common_name,uniprot_id
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,
1,1-PFK-MONOMER,1-phosphofructokinase,
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",
...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,


In [55]:
def _lookup_uniprot_id(session, protein_id):
    """Look up the UniProt accession that BioCyc links to an EcoCyc protein frame.

    Args:
        session: ``requests.Session`` used to query the BioCyc web service.
        protein_id: value of the ``id`` column for the protein to look up.

    Returns:
        A ``(value, message)`` tuple. ``value`` is the UniProt accession, or one
        of the sentinels 'NO ECOCYC ENTRY', 'NO PROTEIN ENTRY', 'NO DBLINK ENTRY',
        'NO UNIPROT ENTRY' naming the step at which the lookup failed.
        ``message`` is the line to print for a failed lookup, or None on success.
    """
    response = session.get(f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein_id}&detail=high')
    if response.status_code != 200:
        return 'NO ECOCYC ENTRY', f'{protein_id} {response.status_code}'

    record = xmltodict.parse(response.content)['ptools-xml']
    if 'Protein' not in record:
        return 'NO PROTEIN ENTRY', f'no protein for {protein_id}'
    if 'dblink' not in record['Protein']:
        return 'NO DBLINK ENTRY', f'no dblink for {protein_id}'

    dblinks = record['Protein']['dblink']
    # A single <dblink> element is parsed as one mapping rather than a list.
    # isinstance (not `type(...) is dict`) also covers the OrderedDict that
    # older xmltodict releases return.
    if isinstance(dblinks, dict):
        dblinks = [dblinks]

    found = False
    uniprot_id = None
    for link in dblinks:
        if link.get('dblink-db') == 'UNIPROT' and 'dblink-oid' in link:
            # No early exit: if several UNIPROT links exist, the last one wins.
            uniprot_id = link['dblink-oid']
            found = True

    if not found:
        return 'NO UNIPROT ENTRY', f'no uniprot id for {protein_id}'
    return uniprot_id, None


# Fill proteins_df['uniprot_id'] row by row, so an interrupted run keeps its partial results.
# Iterating over the index labels (rather than range(len(...))) keeps .loc correct
# even if proteins_df does not carry a default 0..n-1 index.
for n_done, row_label in enumerate(proteins_df.index):
    if n_done % 100 == 0:
        print(n_done)

    uniprot_value, failure_message = _lookup_uniprot_id(s, proteins_df.loc[row_label, 'id'])
    if failure_message is not None:
        print(failure_message)

    proteins_df.loc[row_label, 'uniprot_id'] = uniprot_value

0
no protein for AGAA-MONOMER
100
200
300
no protein for EG10010-MONOMER
400
500
no protein for EG10659-MONOMER
600
700
no protein for EG11092-MONOMER
800
900
no protein for EG11304-MONOMER
no protein for EG11386-MONOMER
1000
no protein for EG11496-MONOMER
1100
EG11708-MONOMER 404
no protein for EG11766-MONOMER
no protein for EG11770-MONOMER
1200
no protein for EG11778-MONOMER
no protein for EG11780-MONOMER
1300
no protein for EG11927-MONOMER
no protein for EG11986-MONOMER
no protein for EG12020-MONOMER
no protein for EG12051-MONOMER
1400
no protein for EG12161-MONOMER
no protein for EG12208-MONOMER
no protein for EG12227-MONOMER
no protein for EG12238-MONOMER
no protein for EG12261-MONOMER
1500
no protein for EG12364-MONOMER
1600
no protein for EG12881-MONOMER
1700
no protein for G0-16653-MONOMER
no protein for G0-16659-MONOMER
no protein for G0-16666-MONOMER
no protein for G0-16672-MONOMER
no protein for G0-16713-MONOMER
no protein for G0-16715-MONOMER
no protein for G0-16716-MONOMER

In [56]:
# Persist the protein table (including the looked-up uniprot_id column) without the index.
proteins_df.to_csv(path_or_buf='notebooks/fbagd/proteins.csv', index=False)

In [95]:
# Reload the saved protein table from disk instead of repeating the BioCyc lookups.
proteins_df = pd.read_csv(filepath_or_buffer='notebooks/fbagd/proteins.csv')
proteins_df

Unnamed: 0,id,common_name,uniprot_id
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,P26647
1,1-PFK-MONOMER,1-phosphofructokinase,P0AEW9
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,P0A9J4
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,P09151
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",P0A887
...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,Q6BEX0
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,P39328
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,P39172
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,P39832


# Use UniProt IDs to get cofactor and subunit comments

In [105]:
# Empty table with the columns used to record cofactor information per protein.
cofactor_slot_columns = [
    'protein',
    'cofactor_uniprot_name',
    'cofactor_uniprot_id',
    'cofactor_uniprot_db',
    'cofactor_comment',
    'subunit_comment',
]
cofactor_slot_df = pd.DataFrame(columns=cofactor_slot_columns)
cofactor_slot_df

Unnamed: 0,protein,cofactor_uniprot_name,cofactor_uniprot_id,cofactor_uniprot_db,comment


In [94]:
def _print_comment_text(comment, label, missing_message):
    """Print the free text of one UniProt comment, or `missing_message` if it has none.

    The parsed 'text' entry is either a plain string or a mapping whose
    '#text' key holds the string; both forms are handled.
    """
    if 'text' not in comment:
        print(missing_message)
        return
    text = comment['text']
    print(label, text['#text'] if isinstance(text, dict) else text)


# Rows are collected in a plain list and turned into a DataFrame once after the loop.
# The previous `cofactor_slot_df.append(...)` call discarded its return value, so the
# table was never filled (and DataFrame.append no longer exists in pandas >= 2.0).
cofactor_records = []

for i in range(100): # range(len(proteins_df.index)):
    if i % 100 == 0:
        print(i)

    protein = proteins_df.loc[i, 'uniprot_id']

    # Skip rows whose BioCyc lookup ended in one of the failure sentinels.
    if protein in ['NO UNIPROT ENTRY', 'NO PROTEIN ENTRY', 'NO DBLINK ENTRY', 'NO ECOCYC ENTRY']:
        continue

    # use uniprot api to get protein associated with uniprot id
    req_str = f'https://rest.uniprot.org/uniprotkb/{protein}.xml'

    r = s.get(req_str)
    if r.status_code != 200:
        # Report and skip instead of failing inside the XML parser on an error page.
        print(protein, r.status_code)
        continue
    o = xmltodict.parse(r.content)

    print('\n\n' + protein)

    entry = o['uniprot']['entry']
    if 'comment' not in entry:
        print('no protein comment')
        continue

    comments = entry['comment']
    # A single <comment> element is parsed as one mapping rather than a list.
    if isinstance(comments, dict):
        comments = [comments]

    cofactor_n = 0
    for comment in comments:
        if comment['@type'] == 'cofactor':
            cofactor_n += 1

            # One cofactor comment may list several alternative cofactors ("options").
            cofactor_slot = comment.get('cofactor') or []
            if isinstance(cofactor_slot, dict):
                cofactor_slot = [cofactor_slot]

            for j, cofactor in enumerate(cofactor_slot):
                print(f'Cofactor {cofactor_n}, option {j+1}', cofactor['name'], cofactor['dbReference']['@id'], cofactor['dbReference']['@type'])
                cofactor_records.append({'protein': protein,
                                         'cofactor_uniprot_name': cofactor['name'],
                                         'cofactor_uniprot_id': cofactor['dbReference']['@id'],
                                         'cofactor_uniprot_db': cofactor['dbReference']['@type']})

            _print_comment_text(comment, 'Comment: ', 'No cofactor slot comment')

        if comment['@type'] == 'subunit':
            _print_comment_text(comment, 'Subunit: ', 'Subunit: no subunit comment')

# Rebuild the table from this run's records, keeping the column layout of the existing
# cofactor_slot_df; columns that no record provides are left empty (NaN).
# Re-running the cell replaces the rows rather than duplicating them.
cofactor_slot_df = pd.DataFrame(cofactor_records, columns=cofactor_slot_df.columns)

0


P26647


P0AEW9
Cofactor 1, option 1 Mg(2+) CHEBI:18420 ChEBI
Comment:  Can also use Mn(2+) or Co(2+), with lower efficiency.
Subunit:  Homodimer.


P0A9J4
Subunit:  Monomer.


P09151
Cofactor 1, option 1 Mn(2+) CHEBI:29035 ChEBI
No cofactor slot comment
Subunit:  Homodimer.


P0A887
Subunit:  Component of the Ubi complex metabolon, which regroups five ubiquinone biosynthesis proteins (UbiE, UbiF, UbiG, UbiH and UbiI) and two accessory factors (UbiK and the lipid-binding protein UbiJ).


P0A6A0


P31057
Cofactor 1, option 1 Mg(2+) CHEBI:18420 ChEBI
Cofactor 1, option 2 Mn(2+) CHEBI:29035 ChEBI
Cofactor 1, option 3 Co(2+) CHEBI:48828 ChEBI
Cofactor 1, option 4 Zn(2+) CHEBI:29105 ChEBI
Comment:  Binds 1 Mg(2+) ion per subunit. Can also use Mn(2+), Co(2+) and Zn(2+) to a lesser extent.
Subunit:  Homodecamer; pentamer of dimers.


P30125
Cofactor 1, option 1 Mg(2+) CHEBI:18420 ChEBI
Cofactor 1, option 2 Mn(2+) CHEBI:29035 ChEBI
Comment:  Binds 1 Mg(2+) or Mn(2+) ion per subunit.
Subuni

In [93]:
# Inspect the raw comment records UniProt returns for one accession.
protein = 'P09152'

# use uniprot api to get protein associated with uniprot id
# (the older https://www.uniprot.org/uniprot/{protein}.xml URL was replaced by the REST endpoint below)
req_str = f'https://rest.uniprot.org/uniprotkb/{protein}.xml'

r = s.get(req_str)
o = xmltodict.parse(r.content)

uniprot_entry = o['uniprot']['entry']
pp.pprint(uniprot_entry['comment'])

[{'@type': 'function',
  'text': 'The nitrate reductase enzyme complex allows E.coli to use nitrate '
          'as an electron acceptor during anaerobic growth. The alpha chain is '
          'the actual site of nitrate reduction.'},
 {'@type': 'catalytic activity',
  'reaction': {'dbReference': [{'@id': 'RHEA:56144', '@type': 'Rhea'},
                               {'@id': 'CHEBI:15377', '@type': 'ChEBI'},
                               {'@id': 'CHEBI:16301', '@type': 'ChEBI'},
                               {'@id': 'CHEBI:17632', '@type': 'ChEBI'},
                               {'@id': 'CHEBI:24646', '@type': 'ChEBI'},
                               {'@id': 'CHEBI:132124', '@type': 'ChEBI'},
                               {'@id': '1.7.5.1', '@type': 'EC'}],
               'text': 'a quinol + nitrate = a quinone + H2O + nitrite'}},
 {'@type': 'cofactor',
  'cofactor': {'dbReference': {'@id': 'CHEBI:49883', '@type': 'ChEBI'},
               'name': '[4Fe-4S] cluster'},
  'text': 'Bin

# Get the monomers of a complex

In [5]:
# Open an authenticated BioCyc session.
# The account e-mail and password are read from the BIOCYC_EMAIL / BIOCYC_PASSWORD
# environment variables so that no secret is stored in the notebook itself
# (a missing variable raises KeyError before any request is sent).
# NOTE(review): credentials that were previously committed in this cell should be rotated.
biocyc_email = os.environ['BIOCYC_EMAIL']
biocyc_password = os.environ['BIOCYC_PASSWORD']

s = requests.Session() # create session
# Post login credentials to session:
login_response = s.post('https://websvc.biocyc.org/credentials/login/',
                        data={'email': biocyc_email, 'password': biocyc_password})
# Stop here on an HTTP error instead of continuing with an unauthenticated session.
login_response.raise_for_status()
login_response

<Response [200]>

In [10]:
# Fetch the high-detail BioCyc XML record for one frame and show its Protein element.
# NOTE(review): the name `complex` shadows the Python builtin `complex` for the rest
# of the kernel session; it is kept here because other cells may refer to it.
complex = 'LEUC-MONOMER'

req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{complex}&detail=high'
r = s.get(req_str)

ptools_xml = xmltodict.parse(r.content)['ptools-xml']
o = ptools_xml['Protein']
pp.pprint(o)

{'@ID': 'ECOLI:LEUC-MONOMER',
 '@detail': 'full',
 '@frameid': 'LEUC-MONOMER',
 '@orgid': 'ECOLI',
 'citation': [{'Publication': {'@ID': 'ECOLI:PUB-320178',
                               '@detail': 'full',
                               '@frameid': 'PUB-320178',
                               '@orgid': 'ECOLI',
                               'author': [{'#text': 'Davis MG',
                                           '@datatype': 'string'},
                                          {'#text': 'Calvo JM',
                                           '@datatype': 'string'}],
                               'pubmed-id': {'#text': '320178',
                                             '@datatype': 'string'},
                               'source': {'#text': 'J Bacteriol 129(2);1078-90',
                                          '@datatype': 'string'},
                               'title': {'#text': 'Isolation and '
                                                  'characterization of lambd

In [11]:
# Call BioCyc's `monomers-of-protein` API function for LEUC-MONOMER at low detail
# and pretty-print the whole parsed response.
req_str = (
    'https://websvc.biocyc.org/apixml'
    '?fn=monomers-of-protein'
    '&id=ECOLI:LEUC-MONOMER'
    '&detail=low'
)

r = s.get(req_str)
o = xmltodict.parse(r.content)

pp.pprint(o)

{'ptools-xml': {'@ptools-version': '27.0',
                '@xml:base': 'http://BioCyc.org/apixml?fn=monomers-of-protein%26id=ECOLI:LEUC-MONOMER%26detail=LOW',
                'Protein': {'@ID': 'ECOLI:LEUC-MONOMER',
                            '@detail': 'low',
                            '@frameid': 'LEUC-MONOMER',
                            '@orgid': 'ECOLI',
                            'common-name': {'#text': '3-isopropylmalate '
                                                     'dehydratase subunit LeuC',
                                            '@datatype': 'string'},
                            'component-of': {'Protein': {'@frameid': '3-ISOPROPYLMALISOM-CPLX',
                                                         '@orgid': 'ECOLI',
                                                         '@resource': 'getxml?ECOLI:3-ISOPROPYLMALISOM-CPLX'}},
                            'gene': {'Gene': {'@frameid': 'EG11576',
                                              '@orgid': 'E