# Setup

In [1]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import json

pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

ALLOWED_METAL_NAMES =   {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'CPD-8123', 'Nickel': 'NI+2', 'Zinc': 'ZN+2',
                        'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                        'Iron-sulfur \(4Fe-4S\)': 'CPD-7', 'Iron-sulfur \(2Fe-2S\)': 'CPD-6',
                         'Iron-sulfur \(4Fe-4S-S-AdoMet\)': 'CPD-7', 'Iron-sulfur \(3Fe-4S\)': '3FE-4S', 'Iron-oxo-sulfur \(4Fe-2O-2S\)': 'CPD-7',
                         'Iron-sulfur': 'CPD-7', # has to be after others since it is a substring of others
                        'heme': 'Heme-b', 'Molybdate': 'CPD-3', 'heme B': 'Heme-b', 'Cobalamin': 'COB-I-ALAMIN',
                         'Selenocysteine': 'L-SELENOCYSTEINE',
                        'Divalent metal cation': 'Any+2'}



# What else should I include, copilot?
# 1. Transcription
# 2. Translation
# 3. Iron-sulfur cluster assembly
# 4. DNA repair
# 5. DNA replication
# 6. Cell division
# 7. Cell cycle
# 8. Cell wall biosynthesis


ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD', 'LIPOIC-ACID', 'BIOTIN'}

## Connect to api

In [2]:
password = input("Enter Password: ")

In [4]:
s = requests.Session() # create session
# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'cellulararchitect@protonmail.com', 'password': password})

<Response [200]>

## api test

In [6]:
# example entry
# entity = 'PWY0-1356'
entity = '6PFRUCTPHOS-RXN'
req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{entity}&detail=full'

r = s.get(req_str)
if r.status_code != 200:
    print(entity, r.status_code)

o = xmltodict.parse(r.content)
pp.pprint(o['ptools-xml'])

{'@ptools-version': '27.0',
 '@xml:base': 'http://BioCyc.org/getxml?ECOLI:6PFRUCTPHOS-RXN',
 'Reaction': {'@ID': 'ECOLI:6PFRUCTPHOS-RXN',
              '@detail': 'full',
              '@frameid': '6PFRUCTPHOS-RXN',
              '@orgid': 'ECOLI',
              'comment': {'#text': 'This is a key control step in glycolysis '
                                   '[<a '
                                   'href="http://www.ncbi.nlm.nih.gov/pubmed/153704">Uyeda79</a>]',
                          '@datatype': 'string'},
              'ec-number': {'#text': 'EC-2.7.1.11', 'official': 'T'},
              'enzymatic-reaction': {'Enzymatic-Reaction': [{'@ID': 'ECOLI:6PFRUCTPHOS-ENZRXN',
                                                             '@detail': 'low',
                                                             '@frameid': '6PFRUCTPHOS-ENZRXN',
                                                             '@orgid': 'ECOLI',
                                                            

In [5]:
entity = 'Fermentation'

fn = 'get-class-all-instances'
req_str = f'https://websvc.biocyc.org/apixml?fn={fn}&id=ECOLI:{entity}&detail=none'

r = s.get(req_str)
if r.status_code != 200:
    print(entity, r.status_code)

o = xmltodict.parse(r.content)
pp.pprint(o['ptools-xml'])

{'@ptools-version': '27.0',
 '@xml:base': 'http://BioCyc.org/apixml?fn=get-class-all-instances%26id=ECOLI:Fermentation%26detail=NONE',
 'Pathway': [{'@frameid': 'PWY-5437',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:PWY-5437'},
             {'@frameid': 'PWY0-1312',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:PWY0-1312'},
             {'@frameid': 'PWY-5480',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:PWY-5480'},
             {'@frameid': 'PWY-8274',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:PWY-8274'},
             {'@frameid': 'PWY-5485',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:PWY-5485'},
             {'@frameid': 'FERMENTATION-PWY',
              '@orgid': 'ECOLI',
              '@resource': 'getxml?ECOLI:FERMENTATION-PWY'}],
 'metadata': {'num_results': '6',
              'query': 'fn=get-class-all-instances&id=ECOLI:Fermenta

In [6]:
# example entry
# entity = 'PWY0-1356'
entity = 'GO:0008150'
req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{entity}&detail=full'

r = s.get(req_str)
if r.status_code != 200:
    print(entity, r.status_code)

o = xmltodict.parse(r.content)
pp.pprint(o['ptools-xml'])

{'@ptools-version': '27.0',
 '@xml:base': 'http://BioCyc.org/getxml?ECOLI:GO:0008150',
 'GO-Term': {'@ID': 'ECOLI:GO:0008150',
             '@class': 'true',
             '@detail': 'full',
             '@frameid': 'GO:0008150',
             '@orgid': 'ECOLI',
             'comment': {'#text': 'Note that, in addition to forming the root '
                                  'of the biological process ontology, this '
                                  'term is recommended for use for the '
                                  'annotation of gene products whose '
                                  'biological process is unknown. When this '
                                  'term is used for annotation, it indicates '
                                  'that no information was available about the '
                                  'biological process of the gene product '
                                  'annotated as of the date the annotation was '
                                  "made; t

# Getting raw data

## Fetch EcoCyc pathway tree

In [7]:
def recursive_pathway_tree(current_node, parent_node, node_dict, level):


    if current_node in node_dict.keys():

        if level < node_dict[current_node]['level']:
            node_dict[current_node]['level'] = level

        node_dict[current_node]['parents'].append(parent_node)

        return

    else:

        req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{current_node}&detail=high'

        node_dict[current_node] = {'parents': [], 'children': [], 'level': level, 'common_name': 'N/A'}

        if parent_node is not None:
            node_dict[current_node]['parents'].append(parent_node)

        r = s.get(req_str)
        if r.status_code != 200:
            print(current_node, r.status_code)
            return

        o = xmltodict.parse(r.content)

        if 'common-name' in o['ptools-xml']['Pathway']:
            node_dict[current_node]['common_name'] = o['ptools-xml']['Pathway']['common-name']['#text']


        subclasses = o['ptools-xml']['Pathway']['subclass'] if 'subclass' in o['ptools-xml']['Pathway'] else []
        if type(subclasses) is dict:
            subclasses = [subclasses]

        instances = o['ptools-xml']['Pathway']['instance'] if 'instance' in o['ptools-xml']['Pathway'] else []
        if type(instances) is dict:
            instances = [instances]

        pathways = subclasses + instances

        # print(f'{current_node}: {len(subclasses)}, {len(instances)}')

        for pathway in pathways:
            pathway_id = pathway['Pathway']['@frameid']

            node_dict[current_node]['children'].append(pathway_id)
            recursive_pathway_tree(pathway_id, current_node, node_dict, level+1)


    return


def get_pathway_ith_level_parents(cur_pathway_idx, pathway_matrix, name_list, level_vector, level=2, parent_dict=None):

    if parent_dict is None:
        parent_dict = {}

    cur_pathway_level = level_vector[cur_pathway_idx]

    if cur_pathway_level == level:
        parent_dict[name_list[cur_pathway_idx]] = cur_pathway_level

    parent_slice = pathway_matrix[:, cur_pathway_idx]
    parent_idxs = np.where(parent_slice != 0)[0]


    for idx in parent_idxs:

        _ = get_pathway_ith_level_parents(idx, pathway_matrix, name_list, level_vector, level, parent_dict)

    return parent_dict



In [8]:
entity = 'Pathways'

fn = 'get-class-direct-subs'
req_str = f'https://websvc.biocyc.org/apixml?fn={fn}&id=ECOLI:{entity}&detail=none'

r = s.get(req_str)
o = xmltodict.parse(r.content)['ptools-xml']['Pathway']
top_level_pathways = [pathway['@frameid'] for pathway in o]

pathway_node_dict = {}

for top_node in top_level_pathways:

    recursive_pathway_tree(top_node, None, pathway_node_dict, level=1)

ARG+POLYAMINE-SYN 404


In [9]:
pathway_df = pd.DataFrame(pathway_node_dict).T.reset_index(names='id')
pathway_df

Unnamed: 0,id,parents,children,level,common_name
0,Transport-Pathways,[],[],1,Transport
1,Metabolic-Clusters,[],"[OANTIGEN-PWY, PWY-7206, PWY-7184, PWY-7801, P...",1,Metabolic Clusters
2,OANTIGEN-PWY,"[Metabolic-Clusters, SUGAR-NUCLEOTIDES, Super-...",[],2,<i>O</i>-antigen building blocks biosynthesis ...
3,PWY-7206,"[Metabolic-Clusters, Pyrimidine-Deoxyribonucle...",[],2,pyrimidine deoxyribonucleotides dephosphorylation
4,PWY-7184,"[Metabolic-Clusters, Pyrimid-Deoxyribonucleot-...",[],2,pyrimidine deoxyribonucleotides <i>de novo</i>...
...,...,...,...,...,...
1168,PWY0-1299,[Acid-Resistance],[],3,arginine dependent acid resistance
1169,Mercury-Detoxification,[Detoxification],[],2,Mercury Detoxification
1170,Antibiotic-Resistance,[Detoxification],"[Vancomycin-Resistnace, PWY0-1338]",2,Antibiotic Resistance
1171,Vancomycin-Resistnace,[Antibiotic-Resistance],[],3,Vancomycin Resistance


## Fetch protein monomer annotations

In [10]:
# get a set of all monomers with an associated uniprot id
proteins_df = pd.read_csv('reconstruction/ecoli/flat/proteins.tsv', sep='\t', comment='#').loc[:, ["id", "common_name", "seq"]]

for column in ["enzyme_reaction", "cofactors", "metal_features", "other_features", "direct_annotations", "go_annotations"]:
    proteins_df[column] = 0
    proteins_df[column] = proteins_df[column].astype(object)

proteins_df

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,metal_features,other_features,direct_annotations,go_annotations
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,MLYIFRLIITVIYSILVCVFGSIYCLFSPRNPKHVATFGHMFGRLA...,0,0,0,0,0,0
1,1-PFK-MONOMER,1-phosphofructokinase,MSRRVATITLNPAYDLVGFCPEIERGEVNLVKTTGLHAAGKGINVA...,0,0,0,0,0,0
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,MKITVLGCGALGQLWLTALCKQGHEVQGWLRVPQPYCSVNLVETDG...,0,0,0,0,0,0
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVG...,0,0,0,0,0,0
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",MVDKSQETTHFGFQTVAKEQKADMVAHVFHSVASKYDVMNDLMSFG...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,MTTDQHQEILRTEGLSKFFPGVKALDNVDFSLRRGEIMALLGENGA...,0,0,0,0,0,0
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...,0,0,0,0,0,0
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,MLHKKTLLFAALSAALWGGATQAADAAVVASLKPVGFIASAIADGV...,0,0,0,0,0,0
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,MIELLFPGWLAGIMLACAAGPLGSFVVWRRMSYFGDTLAHASLLGV...,0,0,0,0,0,0


In [11]:
for i in range(len(proteins_df.index)):

    if i % 100 == 0:
        print(i)

    protein = proteins_df.loc[i, 'id']
    proteins_df.at[i, 'other_features'] = set()
    proteins_df.at[i, 'metal_features'] = set()
    proteins_df.at[i, 'enzyme_reaction'] = set()
    proteins_df.at[i, 'cofactors'] = set()
    proteins_df.at[i, 'direct_annotations'] = set()
    proteins_df.at[i, 'go_annotations'] = set()

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{protein}&detail=high'



    r = s.get(req_str)
    if r.status_code != 200:
        print(protein, r.status_code)
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    metal_set = set()
    other_feature_set = set()
    if 'Protein' in o and 'has-feature' in o['Protein']:
        features = o['Protein']['has-feature']

        if type(features) is dict:
            features = [features]

        for feature in features:
            if 'parent' not in feature['Feature']:
                continue

            category = feature['Feature']['parent']['Feature']['@frameid']
            if category == 'Metal-Binding-Sites' and 'comment' in feature['Feature']:

                # Detect match to any of the allowed metal names and allowed cofactor names and add to list
                comment = feature['Feature']['comment']['#text']
                metal_set.add(comment)

                proteins_df.at[i, 'metal_features'] = list(metal_set)

            if category == 'Nucleotide-Phosphate-Binding-Regions' and 'attached-group' in feature['Feature'] and 'Compound' in feature['Feature']['attached-group']:
                attached_group = feature['Feature']['attached-group']['Compound']['@frameid']
                other_feature_set.add(attached_group)

            if category == 'N6-pyridoxal-phosphate-Lys-Modifications':
                other_feature_set.add('PYRIDOXAL_PHOSPHATE')

            if category == 'Conserved-Regions' and 'comment' in feature['Feature'] and 'Lipoyl' in feature['Feature']['comment']['#text']:
                other_feature_set.add('LIPOIC-ACID')

            if category == 'Conserved-Regions' and 'comment' in feature['Feature'] and 'Biotinyl' in feature['Feature']['comment']['#text']:
                other_feature_set.add('BIOTIN')

            if category == 'Protein-Segments' and 'comment' in feature['Feature'] and 'Thiamine' in feature['Feature']['comment']['#text']:
                other_feature_set.add('THIAMINE-PYROPHOSPHATE')

            if category == 'Selenocysteine-sites':
                print('found selenocysteine')
                metal_set.add('Selenocysteine.')

            proteins_df.at[i, 'metal_features'] = list(metal_set)
            proteins_df.at[i, 'other_features'] = list(other_feature_set)

        if 'catalyzes' in o['Protein']:
            oc = o['Protein']['catalyzes']['Enzymatic-Reaction']

            if type(oc) is dict:
                oc = [oc]

            cofactor_set = set()
            enz_rxn_set = set()

            for enzrxn in oc:
                enz_id = enzrxn['@frameid']

                enz_rxn_set.add(enz_id)

                enz_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{enz_id}&detail=high'

                rz = s.get(enz_req_str)
                oe = xmltodict.parse(rz.content)['ptools-xml']['Enzymatic-Reaction']

                if "cofactor" in oe:
                    oe = oe['cofactor']

                    if type(oe) is dict:
                        oe = [oe]

                    for cofactor in oe:
                        cof = cofactor['Compound']['@frameid']
                        cofactor_set.add(cof)

            proteins_df.at[i, 'enzyme_reaction'] = enz_rxn_set
            proteins_df.at[i, 'cofactors'] = cofactor_set

    # save uniprot ID
    if 'Protein' in o and 'dblink' in o['Protein']:
        db_links = o['Protein']['dblink']

        if type(db_links) is dict:
            db_links = [db_links]

        for db_link in db_links:
            if 'dblink-db' in db_link and db_link['dblink-db'] == 'UNIPROT':
                uniprot_id = db_link['dblink-oid']
                proteins_df.at[i, 'uniprot_id'] = uniprot_id

    # get gene pathway annotation
    if 'Protein' in o and 'gene' in o['Protein']:

        gene = o['Protein']['gene']['Gene']['@frameid']
        gene_req_str = f'https://websvc.biocyc.org/apixml?fn=pathways-of-gene&id=ECOLI:{gene}&detail=none'

        rg = s.get(gene_req_str)
        og = xmltodict.parse(rg.content)['ptools-xml']

        if 'Pathway' in og:
            pathways = og['Pathway']

            if type(pathways) is dict:
                pathways = [pathways]

            for pathway in pathways:
                pathway_id = pathway['@frameid']
                proteins_df.at[i, 'direct_annotations'].add(pathway_id)


    if 'Protein' in o and 'has-go-term' in o['Protein']:
        go_terms = o['Protein']['has-go-term']

        annotation_set = set()

        if type(go_terms) is dict:
            go_terms = [go_terms]

        for go_term in go_terms:

            annotation_set.add(go_term['GO-Term']['@frameid'])



        proteins_df.at[i, 'go_annotations'] = annotation_set




0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
found selenocysteine
found selenocysteine
found selenocysteine
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400


In [12]:
proteins_df

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,metal_features,other_features,direct_annotations,go_annotations,uniprot_id
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,MLYIFRLIITVIYSILVCVFGSIYCLFSPRNPKHVATFGHMFGRLA...,"{ENZRXN0-7991, 1-ACYLGLYCEROL-3-P-ACYLTRANSFER...",{},[],[],"{PWY0-1319, PWY-5667}","{GO:0005886, GO:0016020, GO:0016740, GO:001602...",P26647
1,1-PFK-MONOMER,1-phosphofructokinase,MSRRVATITLNPAYDLVGFCPEIERGEVNLVKTTGLHAAGKGINVA...,{},{},[],[ATP],{PWY0-1314},"{GO:0000166, GO:0016301, GO:0008443, GO:001631...",P0AEW9
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,MKITVLGCGALGQLWLTALCKQGHEVQGWLRVPQPYCSVNLVETDG...,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[],[NADP],{PANTO-PWY},"{GO:0015940, GO:0005737, GO:0008677, GO:001661...",P0A9J4
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVG...,{ENZRXN0-6250},{},[],[],{LEUSYN-PWY},"{GO:0009082, GO:0046912, GO:0005737, GO:000398...",P09151
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",MVDKSQETTHFGFQTVAKEQKADMVAHVFHSVASKYDVMNDLMSFG...,"{2-OCTAPRENYL-METHOXY-BENZOQ-METH-ENZRXN, ADOM...",{},[],[],"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0102027, GO:0030580, GO:0008168, GO:000674...",P0A887
...,...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,MTTDQHQEILRTEGLSKFFPGVKALDNVDFSLRRGEIMALLGENGA...,{},{},[],[ATP],{},"{GO:0000166, GO:0005886, GO:0016020, GO:001688...",Q6BEX0
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...,{},{},[],[],{},"{GO:0005886, GO:0016020, GO:0140271, GO:005505...",P39328
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,MLHKKTLLFAALSAALWGGATQAADAAVVASLKPVGFIASAIADGV...,{},{},"[zinc-coordinating residues, implicated in a s...",[],{},"{GO:0006811, GO:0042597, GO:0030001, GO:001602...",P39172
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,MIELLFPGWLAGIMLACAAGPLGSFVVWRRMSYFGDTLAHASLLGV...,{},{},[],[],{},"{GO:0006811, GO:0005886, GO:0016020, GO:005508...",P39832


In [13]:
# use uniprot ID to fetch uniprot entry for each protein
for i in range(len(proteins_df.index)):

    if i == 20:
        break


    protein = proteins_df.at[i, 'id']
    uniprot_id = proteins_df.at[i, 'uniprot_id']

    if uniprot_id is None:
        continue

    req_str = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.xml'

    r = s.get(req_str)
    if r.status_code != 200:
        print(protein, r.status_code)
        continue

    o = xmltodict.parse(r.content)['uniprot']

    if 'entry' in o and 'comment' in o['entry']:

        comments = o['entry']['comment']

        if type(comments) is dict:
            comments = [comments]

        for comment in comments:
            if 'text' in comment and 'Metal binding' in comment['text']:
                print(protein, comment['text'])
                proteins_df.at[i, 'metal_features'] = ['METAL-BINDING']
                break

## Reload protein feature table

In [14]:
# proteins_df = pd.read_parquet('notebooks/fbagd/data/raw_protein_features.parquet')
# proteins_df['metal_features'] = proteins_df['metal_features'].apply(ast.literal_eval)
# # for rows of proteins with where other_features is set, convert from string to set with literal_eval
# proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'] = \
#     proteins_df.loc[proteins_df['other_features'].str.startswith('['), 'other_features'].apply(ast.literal_eval)

filter_protein_df = proteins_df.copy().loc[:, ['id', 'common_name', 'metal_features', 'other_features', 'enzyme_reaction', 'cofactors', 'direct_annotations', 'go_annotations']]
filter_protein_df

Unnamed: 0,id,common_name,metal_features,other_features,enzyme_reaction,cofactors,direct_annotations,go_annotations
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,[],[],"{ENZRXN0-7991, 1-ACYLGLYCEROL-3-P-ACYLTRANSFER...",{},"{PWY0-1319, PWY-5667}","{GO:0005886, GO:0016020, GO:0016740, GO:001602..."
1,1-PFK-MONOMER,1-phosphofructokinase,[],[ATP],{},{},{PWY0-1314},"{GO:0000166, GO:0016301, GO:0008443, GO:001631..."
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,[],[NADP],{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},{PANTO-PWY},"{GO:0015940, GO:0005737, GO:0008677, GO:001661..."
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,[],[],{ENZRXN0-6250},{},{LEUSYN-PWY},"{GO:0009082, GO:0046912, GO:0005737, GO:000398..."
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",[],[],"{2-OCTAPRENYL-METHOXY-BENZOQ-METH-ENZRXN, ADOM...",{},"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0102027, GO:0030580, GO:0008168, GO:000674..."
...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,[],[ATP],{},{},{},"{GO:0000166, GO:0005886, GO:0016020, GO:001688..."
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,[],[],{},{},{},"{GO:0005886, GO:0016020, GO:0140271, GO:005505..."
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,"[zinc-coordinating residues, implicated in a s...",[],{},{},{},"{GO:0006811, GO:0042597, GO:0030001, GO:001602..."
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,[],[],{},{},{},"{GO:0006811, GO:0005886, GO:0016020, GO:005508..."


## Annotate complexation with EcoCyc data

In [17]:
complex_df = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions.tsv', sep='\t', comment='#').loc[:, ['id','common_name', 'stoichiometry']]


removed_complexes = pd.read_csv('reconstruction/ecoli/flat/complexation_reactions_removed.tsv', sep='\t', comment='#')
removed_complex_ids = removed_complexes['id'].tolist()
# add ribosome back in to complex_df
removed_complex_ids.remove('CPLX0-3964_RXN')

# remove rows where id starts with '#'
complex_df = complex_df[~complex_df['id'].str.startswith('#')].reset_index(drop=True)

# remove rows of complex_df where id matches an id in removed_complexes
complex_df = complex_df[~complex_df['id'].isin(removed_complex_ids)].reset_index(drop=True)
complex_df.stoichiometry = complex_df.stoichiometry.astype(object)


for i, stoich in enumerate(complex_df.loc[:, 'stoichiometry']):

    if type(stoich) is str and stoich[0] == '{':
        stoich = stoich.replace('null', '-1')
        stoich = ast.literal_eval(stoich)

        complex_df.at[i, 'stoichiometry'] = stoich

    else:
        complex_df.at[i, 'stoichiometry'] = {}


# for each row, find dict entry with positive value
for i in range(len(complex_df.index)):

    stoich = complex_df.loc[i, 'stoichiometry']

    for k,v in stoich.items():
        if v > 0:
            complex_df.at[i, 'id'] = k

complex_df

Unnamed: 0,id,common_name,stoichiometry
0,1-PFK,,"{'1-PFK': 1, '1-PFK-MONOMER': -2}"
1,2OXOGLUTARATEDEH-CPLX,2-oxoglutarate dehydrogenase complex,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':..."
2,3-ISOPROPYLMALDEHYDROG-CPLX,,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY..."
3,3-ISOPROPYLMALISOM-CPLX,3-isopropylmalate dehydratase,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':..."
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3..."
...,...,...,...
1088,CPLX0-3964,ribosome,"{'CPLX0-3964': 1, 'CPLX0-3953': -1, 'CPLX0-396..."
1089,CPLX0-8028,CsrA complex with McaS RNA,"{'CPLX0-8028': 1, 'IS061-RNA': -1, 'CPLX0-7956..."
1090,CPLX0-8053,SelB-L-selenocysteinyl-tRNA<sup>sec</sup>,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}"
1091,CPLX0-8253,CsrA complex with CsrC RNA,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON..."


In [18]:
complex_df["cofactors"] = 0
complex_df["cofactors"] = complex_df["cofactors"].astype(object)

complex_df["enzyme_reaction"] = 0
complex_df["enzyme_reaction"] = complex_df["enzyme_reaction"].astype(object)

for i in range(len(complex_df.index)):

    if i % 100 == 0:
        print(i)

    complex = complex_df.loc[i, 'id']

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{complex}&detail=low'

    r = s.get(req_str)
    if r.status_code != 200:
        print(complex, r.status_code)
        complex_df.at[i, 'enzyme_reaction'] = set()
        complex_df.at[i, 'cofactors'] = set()
        continue

    oo = xmltodict.parse(r.content)['ptools-xml']['Protein']


    # if enzyme
    if 'catalyzes' in oo:
        o = oo['catalyzes']['Enzymatic-Reaction']

        if type(o) is dict:
            o = [o]

        cofactor_set = set()
        enz_rxn_set = set()

        for enzrxn in o:

            # i assume common names will be the same, if no existing replace it
            if 'common-name' not in oo:
                complex_df.at[i, 'common_name'] = enzrxn['common-name']['#text']

            enz_id = enzrxn['@frameid']

            enz_rxn_set.add(enz_id)

            enz_req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{enz_id}&detail=high'

            rz = s.get(enz_req_str)
            oe = xmltodict.parse(rz.content)['ptools-xml']['Enzymatic-Reaction']

            if "cofactor" in oe:
                oe = oe['cofactor']

                if type(oe) is dict:
                    oe = [oe]

                for cofactor in oe:
                    cof = cofactor['Compound']['@frameid']
                    cofactor_set.add(cof)

        complex_df.at[i, 'enzyme_reaction'] = enz_rxn_set
        complex_df.at[i, 'cofactors'] = cofactor_set

    else:
        complex_df.at[i, 'enzyme_reaction'] = set()
        complex_df.at[i, 'cofactors'] = set()

0
100
200
300
CPLX0-7450 404
400
500
600
700
800
900
1000


In [19]:
complex_df

Unnamed: 0,id,common_name,stoichiometry,cofactors,enzyme_reaction
0,1-PFK,1-phosphofructokinase,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",{MG+2},{1PFRUCTPHOSPHN-ENZRXN}
1,2OXOGLUTARATEDEH-CPLX,2-oxoglutarate dehydrogenase complex,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...","{THIAMINE-PYROPHOSPHATE, FAD, LIPOIC-ACID, MG+2}",{2OXOGLUTARATEDEH-ENZRXN}
2,3-ISOPROPYLMALDEHYDROG-CPLX,3-isopropylmalate dehydrogenase,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...","{MN+2, MG+2}",{3-ISOPROPYLMALDEHYDROG-ENZRXN}
3,3-ISOPROPYLMALISOM-CPLX,3-isopropylmalate dehydratase,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...",{CPD-7},{3-ISOPROPYLMALISOM-ENZRXN}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,3-methyl-2-oxobutanoate hydroxymethyltransferase,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",{MG+2},{3-METHYL-2-OXOBUT-OHCH3XFER-ENZRXN}
...,...,...,...,...,...
1088,CPLX0-3964,ribosome,"{'CPLX0-3964': 1, 'CPLX0-3953': -1, 'CPLX0-396...",{},{}
1089,CPLX0-8028,CsrA complex with McaS RNA,"{'CPLX0-8028': 1, 'IS061-RNA': -1, 'CPLX0-7956...",{},{}
1090,CPLX0-8053,SelB-L-selenocysteinyl-tRNA<sup>sec</sup>,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",{},{}
1091,CPLX0-8253,CsrA complex with CsrC RNA,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",{},{}


## Get cofactor elemental composition and more

In [20]:
# get set of all cofactors
all_metal_cofactors = set(ALLOWED_METAL_NAMES.values())
all_other_cofactors = ACCEPTED_OTHER_FEATURES

filter_cofactor_df = pd.DataFrame(columns=['id', 'common_name', 'elemental_composition'])
filter_cofactor_df['id'] = list(all_metal_cofactors | all_other_cofactors)

filter_cofactor_df['elemental_composition'] = 0
filter_cofactor_df['elemental_composition'] = filter_cofactor_df['elemental_composition'].astype(object)

# for each cofactor, get elemental composition
for i in range(len(filter_cofactor_df.index)):

    compound = filter_cofactor_df.loc[i, 'id']
    atom_dict = {}

    url_name = compound.replace('+', '%2b')

    req_str = f'https://websvc.biocyc.org/getxml?id=ECOLI:{url_name}&detail=full'

    r = s.get(req_str)
    if r.status_code != 200:
        print(compound, r.status_code)
        filter_cofactor_df.at[i, 'elemental_composition'] = {}
        continue

    o = xmltodict.parse(r.content)['ptools-xml']

    filter_cofactor_df['common_name'][i] = o['Compound']['common-name']['#text']

    atoms = o['Compound']['cml']['molecule']['atomArray']['atom']
    if type(atoms) is dict:
        atoms = [atoms]

    for atom in atoms:
        element = atom['@elementType']

        # either add new element or add to existing element
        if element not in atom_dict:
            atom_dict[element] = 1
        else:
            atom_dict[element] += 1

    filter_cofactor_df.at[i, 'elemental_composition'] = atom_dict

Any+2 404


In [21]:
filter_cofactor_df

Unnamed: 0,id,common_name,elemental_composition
0,PYRIDOXAL_PHOSPHATE,pyridoxal 5'-phosphate,"{'C': 8, 'N': 1, 'O': 6, 'P': 1}"
1,LIPOIC-ACID,(<i>R</i>)-lipoate,"{'C': 8, 'O': 2, 'S': 2}"
2,3FE-4S,a [3Fe-4S] iron-sulfur cluster,"{'FE': 3, 'S': 4}"
3,ZN+2,Zn<SUP>2+</SUP>,{'ZN': 1}
4,THIAMINE-PYROPHOSPHATE,thiamine diphosphate,"{'C': 12, 'N': 4, 'O': 7, 'P': 2, 'S': 1}"
5,FMN,FMN,"{'C': 17, 'N': 4, 'O': 9, 'P': 1}"
6,Any+2,,{}
7,NA+,Na<SUP>+</SUP>,{'NA': 1}
8,MG+2,Mg<SUP>2+</SUP>,{'MG': 1}
9,FE+2,Fe<sup>2+</sup>,{'FE': 1}


## Save tables to avoid having to re-download

In [22]:
# save complex_df to csv in a way that preserves dicts
complex_df.to_csv('notebooks/cofactors/data/raw_complexes.csv', index=False)
proteins_df.to_csv('notebooks/cofactors/data/raw_proteins.csv', index=False)
filter_cofactor_df.to_csv('notebooks/cofactors/data/raw_cofactors.csv', index=False)
pathway_df.to_csv('notebooks/cofactors/data/raw_pathways.csv', index=False)

In [23]:
proteins_df

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,metal_features,other_features,direct_annotations,go_annotations,uniprot_id
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,MLYIFRLIITVIYSILVCVFGSIYCLFSPRNPKHVATFGHMFGRLA...,"{ENZRXN0-7991, 1-ACYLGLYCEROL-3-P-ACYLTRANSFER...",{},[],[],"{PWY0-1319, PWY-5667}","{GO:0005886, GO:0016020, GO:0016740, GO:001602...",P26647
1,1-PFK-MONOMER,1-phosphofructokinase,MSRRVATITLNPAYDLVGFCPEIERGEVNLVKTTGLHAAGKGINVA...,{},{},[],[ATP],{PWY0-1314},"{GO:0000166, GO:0016301, GO:0008443, GO:001631...",P0AEW9
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,MKITVLGCGALGQLWLTALCKQGHEVQGWLRVPQPYCSVNLVETDG...,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[],[NADP],{PANTO-PWY},"{GO:0015940, GO:0005737, GO:0008677, GO:001661...",P0A9J4
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVG...,{ENZRXN0-6250},{},[],[],{LEUSYN-PWY},"{GO:0009082, GO:0046912, GO:0005737, GO:000398...",P09151
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",MVDKSQETTHFGFQTVAKEQKADMVAHVFHSVASKYDVMNDLMSFG...,"{2-OCTAPRENYL-METHOXY-BENZOQ-METH-ENZRXN, ADOM...",{},[],[],"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0102027, GO:0030580, GO:0008168, GO:000674...",P0A887
...,...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,MTTDQHQEILRTEGLSKFFPGVKALDNVDFSLRRGEIMALLGENGA...,{},{},[],[ATP],{},"{GO:0000166, GO:0005886, GO:0016020, GO:001688...",Q6BEX0
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...,{},{},[],[],{},"{GO:0005886, GO:0016020, GO:0140271, GO:005505...",P39328
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,MLHKKTLLFAALSAALWGGATQAADAAVVASLKPVGFIASAIADGV...,{},{},"[zinc-coordinating residues, implicated in a s...",[],{},"{GO:0006811, GO:0042597, GO:0030001, GO:001602...",P39172
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,MIELLFPGWLAGIMLACAAGPLGSFVVWRRMSYFGDTLAHASLLGV...,{},{},[],[],{},"{GO:0006811, GO:0005886, GO:0016020, GO:005508...",P39832


In [None]:

if 'Protein' in o and 'has-go-term' in o['Protein']:
    go_terms = o['Protein']['has-go-term']

    annotation_set = set()

    if type(go_terms) is dict:
        go_terms = [go_terms]

    for go_term in go_terms:
        # example entry
        entity = go_term['GO-Term']['@frameid']
        entity = entity.replace(':', '%3A')
        req_str = f'https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{entity}'

        r = s.get(req_str)
        if r.status_code != 200:
            print(entity, r.status_code)

        o = json.loads(r.content)
        if len(o['results']) != 1:
            print('error', entity, len(o['results']))
            continue
        elif o['results'][0]['aspect'] == 'biological_process':
            annotation_set.add(o['results'][0]['name'])

            o['ptools-xml']['Protein']['gene']['Gene']['@frameid']

    proteins_df.at[i, 'direct_annotations'] = annotation_set

In [None]:
# example entry
entity = go_term['GO-Term']['@frameid']
entity = entity.replace(':', '%3A')
req_str = f'https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{entity}'

r = s.get(req_str)
if r.status_code != 200:
    print(entity, r.status_code)

o = json.loads(r.content)
if len(o['results']) != 1:
    print('error', entity, len(o['results']))
    continue
elif o['results'][0]['aspect'] == 'biological_process':
    annotation_set.add(o['results'][0]['name'])