In [1]:
import numpy as np
import plotly.express as px
import seaborn as sns
import pandas as pd
import os

os.chdir(os.path.expanduser('~/dev/vivarium-ecoli'))

import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
from ecoli.processes.metabolism_redux import NetworkFlowModel, FlowResult, MetabolismRedux
%matplotlib inline

# Reaction frame IDs grouped under the name FREE_RXNS.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
FREE_RXNS = [
    "TRANS-RXN-145",
    "TRANS-RXN0-545",
    "TRANS-RXN0-474",
]

# Global seaborn look for every figure drawn below.
sns.set(
    context='talk',
    style='darkgrid',
    palette='viridis',
)

In [2]:
notebook_path = f'notebooks/Heena notebooks/Metabolism_New Genes/'
df_WCM_gene = pd.read_csv(notebook_path + 'WCM gene.csv')
df_WCM_rxn = pd.read_csv(notebook_path + 'WCM genes.csv')
df_WCM_gene.iloc[200:206,:]
df_WCM_rxn.head(10)

Unnamed: 0,Gene ID (EcoCyc),Gene locus ID,Gene name,RNA type,Category,Macklin et al. (2020),Latest version (20220602),Description,Belongs in metabolism,proposed category,comment,Activity expected,new demand,general enzyme
0,G0-16636,b4759,3'ETS-<i>leuZ</i>,miscRNA,Uncategorized,False,False,,,,,,,
1,G0-16718,b4634,aaaD,pseudo,Uncategorized,False,False,,,,,,,
2,G0-16720,b4693,aaaE,pseudo,Uncategorized,False,False,,,,,,,
3,G7686,b3241,aaeA,mRNA,Unclear/under-characterized,False,True,,,,,,,
4,G7685,b3240,aaeB,mRNA,Unclear/under-characterized,False,True,,,,,,,
5,G7688,b3243,aaeR,mRNA,Transcription regulation,False,False,,,,,,,
6,G7687,b3242,aaeX,mRNA,Unknown function,False,False,,,,,,,
7,EG11679,b2836,aas,mRNA,Metabolism,True,True,,,,,,,
8,EG11112,b0885,aat,mRNA,Protein maturation peptide addition,True,True,,,,,,,
9,G6670,b1338,abgA,mRNA,Unknown function,False,True,,,,,,,


In [3]:
# (rows, columns) of the gene table
print((len(df_WCM_gene.index), len(df_WCM_gene.columns)))

(4736, 9)


In [34]:
# Rows categorised as "Metabolism" that are flagged False in the
# "Macklin et al. (2020)" column and True in "Latest version (20220602)".
df_WCM_gene_metabolism = df_WCM_gene.loc[
    df_WCM_gene["Category"].eq("Metabolism")
    & df_WCM_gene["Macklin et al. (2020)"].eq(False)
    & df_WCM_gene["Latest version (20220602)"].eq(True)
]

# Same selection on the second table, additionally dropping rows whose
# "Belongs in metabolism" entry is exactly "no".
df_WCM_rxn_metabolism = df_WCM_rxn.loc[
    df_WCM_rxn["Category"].eq("Metabolism")
    & df_WCM_rxn["Macklin et al. (2020)"].eq(False)
    & df_WCM_rxn["Latest version (20220602)"].eq(True)
    & df_WCM_rxn["Belongs in metabolism"].ne("no")
]

In [35]:
# (rows, columns) of the filtered gene table
print((len(df_WCM_gene_metabolism.index), len(df_WCM_gene_metabolism.columns)))
# print(df_WCM_rxn_metabolism.shape)
# Both dataframes cover the same metabolism genes; they differ only in their feature columns.

(307, 9)


In [36]:
# np.all(df_WCM_gene_metabolism["Gene ID (EcoCyc)"].isin(df_WCM_rxn_metabolism["Gene ID (EcoCyc)"]))

In [37]:
# Preview the first 20 filtered genes.
df_WCM_gene_metabolism.iloc[:20]

Unnamed: 0,Gene ID (EcoCyc),Gene locus ID,Gene name,RNA type,Category,Macklin et al. (2020),Latest version (20220602),Machinery (from column E) Implemented?,Comment
20,EG10022,b4015,aceA,mRNA,Metabolism,False,True,,
21,EG10023,b4014,aceB,mRNA,Metabolism,False,True,,
42,EG11942,b4067,actP,mRNA,Metabolism,False,True,,
47,EG11724,b3714,adeP,mRNA,Metabolism,False,True,,
52,EG12462,b4115,adiC,mRNA,Metabolism,False,True,,
57,EG11101,b0476,aes,mRNA,Metabolism,False,True,,
66,G7634,b3136,agaS,mRNA,Metabolism,False,True,,
69,EG10033,b1002,agp,mRNA,Metabolism,False,True,,
72,EG11384,b0605,ahpC,mRNA,Metabolism,False,True,,
91,G6275,b0505,allA,mRNA,Metabolism,False,True,,


## Connect to EcoCyc to allocate descriptions to each metabolic gene

In [4]:
# Connect to the EcoCyc/BioCyc web-service API.
# Credentials are read from the BIOCYC_EMAIL / BIOCYC_PASSWORD environment
# variables (with an interactive prompt as fallback) so that no secret is
# stored in the notebook. The password that used to be committed here should
# be rotated.
from getpass import getpass

biocyc_email = os.environ.get('BIOCYC_EMAIL') or input('BioCyc e-mail: ')
biocyc_password = os.environ.get('BIOCYC_PASSWORD') or getpass('BioCyc password: ')

s = requests.Session()  # session keeps the login cookie for all later requests
login_response = s.post(
    'https://websvc.biocyc.org/credentials/login/',
    data={'email': biocyc_email, 'password': biocyc_password},
)
# Fail here, with the HTTP error, instead of on the first data request.
login_response.raise_for_status()
login_response

<Response [200]>

In [39]:
 df_WCM_gene_metabolism["Gene ID (EcoCyc)"].tail(10)

4291    EG12439
4474      G6845
4475      G6846
4476      G6847
4477      G6848
4543      G7248
4573      G7408
4670    EG12517
4671    EG12518
4672    EG12520
Name: Gene ID (EcoCyc), dtype: object

In [40]:
from tqdm import tqdm
import sys  # not used by this cell any more; kept in case other cells rely on it

# For every newly added metabolic gene, query BioCyc for its MultiFun classes,
# protein products, pathways (with their parent classes), enzymes and
# reactions. Each result list gets exactly one entry per gene, in the order of
# `metabolic_genes`.
#
# Storage convention (unchanged): one hit is stored as a bare frame-ID string,
# several hits as a list of strings and no hit as NaN -- except `reactions`,
# which is always a list whenever at least one reaction exists.
metabolic_genes = df_WCM_gene_metabolism["Gene ID (EcoCyc)"]
multifunction_id = []
pathways = []
pathways_parent = []
enzymes = []
products = []
reactions = []

for gene in tqdm(metabolic_genes):
    req_func = f"https://websvc.biocyc.org/getxml?id=ECOLI:{gene}&detail=full"
    req_path = f"https://websvc.biocyc.org/apixml?fn=pathways-of-gene&id=ECOLI:{gene}&detail=full"
    req_enzm = f"https://websvc.biocyc.org/apixml?fn=enzymes-of-gene&id=ECOLI:{gene}&detail=full"
    req_rxns = f"https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id=ECOLI:{gene}&detail=full"

    response_func = s.get(req_func)
    response_path = s.get(req_path)
    response_enzm = s.get(req_enzm)
    response_rxns = s.get(req_rxns)

    # Abort with a message naming the gene and status instead of a bare
    # sys.exit(), which gives no hint about what went wrong.
    if response_path.status_code != 200:
        raise RuntimeError(
            f"BioCyc pathways-of-gene request for {gene} returned HTTP "
            f"{response_path.status_code}: {req_path}"
        )

    # The gene frame also carries the gene's products.
    gene_node = xmltodict.parse(response_func.content)['ptools-xml']['Gene']
    output_path = xmltodict.parse(response_path.content)['ptools-xml']
    output_enzyme = xmltodict.parse(response_enzm.content)['ptools-xml']
    output_rxns = xmltodict.parse(response_rxns.content)['ptools-xml']

    # MultiFun classes: the parent frame(s) of the gene.
    # xmltodict yields a list for repeated elements and a dict for a single one.
    parent_node = gene_node['parent']
    if isinstance(parent_node, list):
        multifunction_id.append([p['Gene']['@frameid'] for p in parent_node])
    else:
        multifunction_id.append(parent_node['Gene']['@frameid'])

    # Protein product(s) of the gene.
    protein_node = gene_node['product']['Protein']
    if isinstance(protein_node, list):
        products.append([p['@frameid'] for p in protein_node])
    else:
        products.append(protein_node['@frameid'])

    # Pathways the gene takes part in, plus the parent classes of each pathway.
    if "Pathway" in output_path:
        pathway_node = output_path['Pathway']
        if isinstance(pathway_node, list):
            # Several pathways: the parents of all of them are flattened
            # into a single list.
            pathway_ids = []
            parent_ids = []
            for pathway in pathway_node:
                pathway_ids.append(pathway['@frameid'])
                pathway_parent = pathway['parent']
                if isinstance(pathway_parent, list):
                    parent_ids.extend(par['Pathway']['@frameid'] for par in pathway_parent)
                else:
                    parent_ids.append(pathway_parent['Pathway']['@frameid'])
            pathways.append(pathway_ids)
            pathways_parent.append(parent_ids)
        else:
            pathway_parent = pathway_node['parent']
            if isinstance(pathway_parent, list):
                parent_ids = [par['Pathway']['@frameid'] for par in pathway_parent]
            else:
                parent_ids = pathway_parent['Pathway']['@frameid']
            pathways.append(pathway_node['@frameid'])
            pathways_parent.append(parent_ids)
    else:
        pathways.append(np.nan)
        pathways_parent.append(np.nan)

    # Enzymes encoded by the gene.
    if "Protein" in output_enzyme:
        enzyme_node = output_enzyme['Protein']
        if isinstance(enzyme_node, list):
            enzymes.append([protein['@frameid'] for protein in enzyme_node])
        else:
            enzymes.append(enzyme_node['@frameid'])
    else:
        enzymes.append(np.nan)

    # Reactions of the gene; a single reaction is still wrapped in a list.
    if "Reaction" in output_rxns:
        reaction_node = output_rxns['Reaction']
        if isinstance(reaction_node, list):
            reactions.append([rxn['@frameid'] for rxn in reaction_node])
        else:
            reactions.append([reaction_node['@frameid']])
    else:
        reactions.append(np.nan)
        


  0%|          | 0/307 [00:00<?, ?it/s][A
  0%|          | 1/307 [00:01<09:13,  1.81s/it][A
  1%|          | 2/307 [00:03<08:39,  1.70s/it][A
  1%|          | 3/307 [00:04<08:10,  1.61s/it][A
  1%|▏         | 4/307 [00:06<08:09,  1.61s/it][A
  2%|▏         | 5/307 [00:08<07:58,  1.58s/it][A
  2%|▏         | 6/307 [00:30<43:25,  8.66s/it][A
  2%|▏         | 7/307 [00:32<31:48,  6.36s/it][A
  3%|▎         | 8/307 [00:33<24:06,  4.84s/it][A
  3%|▎         | 9/307 [00:35<19:07,  3.85s/it][A
  3%|▎         | 10/307 [00:37<15:44,  3.18s/it][A
  4%|▎         | 11/307 [00:38<13:19,  2.70s/it][A
  4%|▍         | 12/307 [00:40<11:45,  2.39s/it][A
  4%|▍         | 13/307 [00:42<10:38,  2.17s/it][A
  5%|▍         | 14/307 [00:43<10:09,  2.08s/it][A
  5%|▍         | 15/307 [00:45<09:31,  1.96s/it][A
  5%|▌         | 16/307 [00:47<08:54,  1.84s/it][A
  6%|▌         | 17/307 [00:49<09:04,  1.88s/it][A
  6%|▌         | 18/307 [00:50<08:48,  1.83s/it][A
  6%|▌         | 19/307 [00:5

In [41]:
multifunction_id[-10:]

[['BC-4.3.A.1.m', 'BC-6.1'],
 'BC-1.8',
 'BC-1.8',
 'BC-1.8',
 'BC-1.8',
 'BC-1.2.3',
 'BC-1',
 'BC-4.3.A.1.p',
 'BC-4.3.A.1.a',
 ['BC-4.3.A.1.m', 'BC-6.1']]

In [42]:
from IPython.display import display, HTML
from functools import lru_cache


@lru_cache(maxsize=None)
def get_multifunc_output(function_id):
    """Fetch one frame from BioCyc and return its parent frame ID and common name.

    Results are memoised per ``function_id``: walking up the class hierarchy
    for many genes keeps asking for the same ancestor frames, and each lookup
    is a network round trip.

    Parameters
    ----------
    function_id : str
        Frame ID to look up in the ECOLI database (e.g. a MultiFun class ID).

    Returns
    -------
    tuple of (str, str)
        ``(parent_frame_id, common_name)`` of the requested frame.

    Raises
    ------
    requests.HTTPError
        If BioCyc answers with a 4xx/5xx status.
    KeyError, TypeError
        If the response lacks the expected ``Gene`` / ``parent`` /
        ``common-name`` elements, or ``parent`` is not a single element.
    """
    req_func = f"https://websvc.biocyc.org/getxml?id=ECOLI:{function_id}&detail=full"
    response_func = s.get(req_func)
    # Surface HTTP failures directly rather than as an opaque XML parse error.
    response_func.raise_for_status()
    output = xmltodict.parse(response_func.content)['ptools-xml']['Gene']

    parent = output['parent']['Gene']['@frameid']
    common_name = output['common-name']['#text']
    return parent, common_name

def pretty_print(df):
    """Display `df` as an HTML table in the notebook.

    The two-character sequence backslash + n found in the generated HTML is
    replaced by ``<br>`` so that such cell contents render on separate lines.
    Returns whatever ``display`` returns.
    """
    html_table = df.to_html()
    html_table = html_table.replace("\\n", "<br>")
    return display(HTML(html_table))

In [43]:
# # parse through the multifunction id to get multifunction name
# # or I can try getting the unique funcs of all multifunction and request multifunction name and map them back
# # instead of parsing through a bigger loop. But no biggy, the dataframe isn't that huge. 
# multifunction_name = []
# for multifunction in tqdm(multifunction_id):
#     if isinstance(multifunction, tuple):
#         multifunction_name_str = ""
#         for function in multifunction:
#             parent, common_name = get_multifunc_output(function)
#             while parent != 'MultiFun' and common_name != "UNCLASSIFIED":
#                 parent, common_name_ = get_multifunc_output(parent)
#                 common_name = common_name_ + " -> " + common_name
#             multifunction_name_str = multifunction_name_str + common_name + '\n'
#             
#         multifunction_name.append(multifunction_name_str)
#     else:
#         parent, common_name = get_multifunc_output(multifunction)
#         # multifunction_name.append(common_name)
#         # if multifunction = 'Unclassified-Genes': import ipdb; ipdb.set_trace()
#         while parent != 'MultiFun' and common_name != "UNCLASSIFIED":
#             parent, common_name_ = get_multifunc_output(parent)
#             common_name = common_name_ + " -> " + common_name
#         multifunction_name.append(common_name)
# 


In [44]:
# Turn every MultiFun ID into a readable "ancestor -> ... -> class" name.
# Each gene ends up with a list of names (one per MultiFun ID), also when it
# has only a single ID.
multifunction_name = []
for entry in tqdm(multifunction_id):
    function_ids = entry if isinstance(entry, list) else [entry]
    names_for_gene = []
    for function_id in function_ids:
        parent, lineage = get_multifunc_output(function_id)
        # Climb the hierarchy, prepending each ancestor's name, until the
        # parent is 'MultiFun' or the name is exactly "UNCLASSIFIED".
        while not (parent == 'MultiFun' or lineage == "UNCLASSIFIED"):
            parent, ancestor_name = get_multifunc_output(parent)
            lineage = ancestor_name + " -> " + lineage
        names_for_gene.append(lineage)
    multifunction_name.append(names_for_gene)


  0%|          | 0/307 [00:00<?, ?it/s][A
  0%|          | 1/307 [00:01<05:36,  1.10s/it][A
  1%|          | 2/307 [00:02<05:41,  1.12s/it][A
  1%|          | 3/307 [00:04<08:04,  1.59s/it][A
  1%|▏         | 4/307 [00:05<07:11,  1.42s/it][A
  2%|▏         | 5/307 [00:07<08:20,  1.66s/it][A
  2%|▏         | 6/307 [00:08<07:36,  1.52s/it][A
  2%|▏         | 7/307 [00:09<06:16,  1.26s/it][A
  3%|▎         | 8/307 [00:10<06:01,  1.21s/it][A
  3%|▎         | 9/307 [00:11<05:58,  1.20s/it][A
  3%|▎         | 10/307 [00:14<07:29,  1.51s/it][A
  4%|▎         | 11/307 [00:16<08:34,  1.74s/it][A
  4%|▍         | 12/307 [00:18<09:01,  1.84s/it][A
  4%|▍         | 13/307 [00:19<07:52,  1.61s/it][A
  5%|▍         | 14/307 [00:20<07:13,  1.48s/it][A
  5%|▍         | 15/307 [00:23<09:50,  2.02s/it][A
  5%|▌         | 16/307 [00:24<07:56,  1.64s/it][A
  6%|▌         | 17/307 [00:26<08:28,  1.75s/it][A
  6%|▌         | 18/307 [00:27<07:31,  1.56s/it][A
  6%|▌         | 19/307 [00:2

## Save as CSV

In [45]:
import csv

# Build the annotation table from the first three columns of the filtered
# gene table plus the lists collected from BioCyc (one entry per gene, same
# order as the rows).
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of df_WCM_gene_metabolism (SettingWithCopyWarning).
df_metabolic_gene_annotation = df_WCM_gene_metabolism.iloc[:, :3].copy()
df_metabolic_gene_annotation["Enzyme encoded"] = enzymes
df_metabolic_gene_annotation["Pathways"] = pathways
df_metabolic_gene_annotation["Pathways parent"] = pathways_parent
df_metabolic_gene_annotation["Protein products"] = products
df_metabolic_gene_annotation["MultiFuntional ID"] = multifunction_id
df_metabolic_gene_annotation["MultiFuntional name"] = multifunction_name
df_metabolic_gene_annotation["Reactions"] = reactions
# Assigning a Series aligns on the row index; rows missing from
# df_WCM_rxn_metabolism get NaN.
# NOTE(review): column 7 is presumably "Description" -- confirm and select it by name.
df_metabolic_gene_annotation["Description by Cyrus"] = df_WCM_rxn_metabolism.iloc[:, 7]

# pretty_print(df_metabolic_gene_annotation)
df_metabolic_gene_annotation.to_csv(
    'notebooks/Heena notebooks/Metabolism_New Genes/new_metabolic_gene_annotation.csv',
    index=False,
)
df_metabolic_gene_annotation.to_csv(
    'notebooks/Heena notebooks/Metabolism_New Genes/new_metabolic_gene_annotation.tsv',
    sep="\t",
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)
pretty_print(df_metabolic_gene_annotation.head(10))

Unnamed: 0,Gene ID (EcoCyc),Gene locus ID,Gene name,Enzyme encoded,Pathways,Pathways parent,Protein products,MultiFuntional ID,MultiFuntional name,Reactions,Description by Cyrus
20,EG10022,b4015,aceA,ISOCIT-LYASE,GLYOXYLATE-BYPASS,Energy-Metabolism,ISOCIT-LYASE-MONOMER,BC-1.7.2,[metabolism -> central intermediary metabolism -> glyoxylate bypass],[ISOCIT-CLEAV-RXN],acetate transport + metabolism
21,EG10023,b4014,aceB,MALATE-SYNTHASE,GLYOXYLATE-BYPASS,Energy-Metabolism,MALATE-SYNTHASE,BC-1.7.2,[metabolism -> central intermediary metabolism -> glyoxylate bypass],[MALSYN-RXN],acetate transport + metabolism
42,EG11942,b4067,actP,CPLX0-7955,,,YJCG-MONOMER,"[BC-4.2.A, BC-6.1]","[transport -> Electrochemical potential driven transporters -> Porters (Uni-, Sym- and Antiporters), cell structure -> membrane]","[RXN0-1981, RXN0-5111, TRANS-RXN0-576]",acetate transport + metabolism
47,EG11724,b3714,adeP,EG11724-MONOMER,,,EG11724-MONOMER,BC-4.2.A,"[transport -> Electrochemical potential driven transporters -> Porters (Uni-, Sym- and Antiporters)]",[TRANS-RXN0-447],adenine transport
52,EG12462,b4115,adiC,CPLX0-7535,,,YJDE-MONOMER,"[BC-4.2.A, BC-6.1]","[transport -> Electrochemical potential driven transporters -> Porters (Uni-, Sym- and Antiporters), cell structure -> membrane]",[RXN0-2162],"transport, acid resistance"
57,EG11101,b0476,aes,CPLX0-8033,,,EG11101-MONOMER,BC-1.1.1,[metabolism -> carbon utilization -> carbon compounds],[ACETYLESTERASE-RXN],general fatty acid degradation
66,G7634,b3136,agaS,G7634-MONOMER,,,G7634-MONOMER,BC-1.1,[metabolism -> carbon utilization],[RXN-13548],"enzyme, only in some strains"
69,EG10033,b1002,agp,GLUCOSE-1-PHOSPHAT-CPLX,,,GLUCOSE-1-PHOSPHAT-MONOMER,BC-1.7.9,[metabolism -> central intermediary metabolism -> misc. glucose metabolism],"[GLUCOSE-1-PHOSPHAT-RXN, RXN0-1001]",periplasmic phosphatase to import g6p
72,EG11384,b0605,ahpC,CPLX0-245,,,EG11384-MONOMER,BC-5.6.2,[cell processes -> protection -> detoxification],"[R4-RXN, RXN-19953, RXN-19954, RXN-20692, RXN-20691]",general peroxidase
91,G6275,b0505,allA,G6275-MONOMER,PWY-5705,"[Allantoin-degradation, Super-Pathways]",G6275-MONOMER,"[BC-1.7.26, BC-1.8.3]","[metabolism -> central intermediary metabolism -> allantoin assimilation, metabolism -> metabolism of other compounds -> nitrogen metabolism]",[UREIDOGLYCOLATE-LYASE-RXN],allantoin purine utilization nitrogen source


In [None]:
# `df` is not defined anywhere in this notebook, so the original line raised a
# NameError on a fresh kernel.
# NOTE(review): assumed the annotation table built above was meant -- confirm.
df_metabolic_gene_annotation.head()

In [None]:
# Functional-term names of every annotated gene (the "MultiFuntional name" column)
data = df_metabolic_gene_annotation.loc[:, "MultiFuntional name"]

# Function to split functional terms into individual pathways and then split those pathways into components
def split_terms(term):
    """Expand hierarchical term(s) into every prefix of their '->' path.

    Parameters
    ----------
    term : str or iterable of str
        Either one string holding newline-separated terms, or a list of term
        strings (the form stored in the "MultiFuntional name" column). The
        original implementation only accepted a string and failed on lists.

    Returns
    -------
    list of str
        For each term, all cumulative prefixes joined by '->', e.g.
        ``"a->b"`` gives ``["a", "a->b"]``. Whitespace around the components
        is kept untouched so that the ids stay consistent with callers that
        derive a parent id by splitting on '->' again.
    """
    raw_terms = [term] if isinstance(term, str) else list(term)
    prefixes = []
    for raw_term in raw_terms:
        for single_term in raw_term.split('\n'):  # split by newline first
            parts = single_term.split('->')
            prefixes.extend('->'.join(parts[:depth + 1]) for depth in range(len(parts)))
    return prefixes

# Collect one {'id', 'parent'} record per hierarchy node of every term; the
# parent id is the node id with its last '->' component removed.
split_data = [
    {'id': node_id, 'parent': '->'.join(node_id.split('->')[:-1])}
    for term in data
    for node_id in split_terms(term)
]


In [None]:
# Node records -> dataframe
df_split = pd.DataFrame(split_data)

# Number of occurrences of each distinct (id, parent) node
df_split_count = (
    df_split
    .groupby(['id', 'parent'])
    .size()
    .reset_index(name='count')
)

# Sunburst chart of the term hierarchy, sized by occurrence count
sunburst_options = dict(
    names='id',
    parents='parent',
    values='count',
    title='Hierarchical Functional Terms',
)
fig = px.sunburst(df_split_count, **sunburst_options)

# Write the figure to an HTML file
fig.write_html("notebooks/Heena notebooks/sunburst_chart.html")

# All Metabolic Genes

In [5]:
# Every "Metabolism" row flagged True in "Latest version (20220602)",
# regardless of its "Macklin et al. (2020)" flag.
df_WCM_gene_metabolism_all = df_WCM_gene.loc[
    df_WCM_gene["Category"].eq("Metabolism")
    & df_WCM_gene["Latest version (20220602)"].eq(True)
]

In [6]:
# Connect to the EcoCyc/BioCyc web-service API.
# Credentials are read from the BIOCYC_EMAIL / BIOCYC_PASSWORD environment
# variables (with an interactive prompt as fallback) so that no secret is
# stored in the notebook. The password that used to be committed here should
# be rotated.
from getpass import getpass

biocyc_email = os.environ.get('BIOCYC_EMAIL') or input('BioCyc e-mail: ')
biocyc_password = os.environ.get('BIOCYC_PASSWORD') or getpass('BioCyc password: ')

s = requests.Session()  # session keeps the login cookie for all later requests
login_response = s.post(
    'https://websvc.biocyc.org/credentials/login/',
    data={'email': biocyc_email, 'password': biocyc_password},
)
# Fail here, with the HTTP error, instead of on the first data request.
login_response.raise_for_status()
login_response

<Response [200]>

In [10]:
from tqdm import tqdm
import sys  # not used by this cell any more; kept in case other cells rely on it

# For every metabolic gene of the latest model version, query BioCyc for its
# protein products, pathways (with their parent classes), enzymes and
# reactions. Each result list gets exactly one entry per gene, in the order of
# `metabolic_genes`.
#
# Storage convention (unchanged): one hit is stored as a bare frame-ID string,
# several hits as a list of strings and no hit as NaN -- except `reactions`,
# which is always a list whenever at least one reaction exists.
metabolic_genes = df_WCM_gene_metabolism_all["Gene ID (EcoCyc)"].tolist()
# remove problematic ones, which will be entered manually
metabolic_genes.remove('EG10637')
pathways = []
pathways_parent = []
enzymes = []
products = []
reactions = []

for gene in tqdm(metabolic_genes):
    req_func = f"https://websvc.biocyc.org/getxml?id=ECOLI:{gene}&detail=full"
    req_path = f"https://websvc.biocyc.org/apixml?fn=pathways-of-gene&id=ECOLI:{gene}&detail=full"
    req_enzm = f"https://websvc.biocyc.org/apixml?fn=enzymes-of-gene&id=ECOLI:{gene}&detail=full"
    req_rxns = f"https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id=ECOLI:{gene}&detail=full"

    response_func = s.get(req_func)
    response_path = s.get(req_path)
    response_enzm = s.get(req_enzm)
    response_rxns = s.get(req_rxns)

    # Abort with a message naming the gene and status instead of a bare
    # sys.exit(), which gives no hint about what went wrong.
    if response_path.status_code != 200:
        raise RuntimeError(
            f"BioCyc pathways-of-gene request for {gene} returned HTTP "
            f"{response_path.status_code}: {req_path}"
        )

    # The gene frame also carries the gene's products.
    gene_node = xmltodict.parse(response_func.content)['ptools-xml']['Gene']
    output_path = xmltodict.parse(response_path.content)['ptools-xml']
    output_enzyme = xmltodict.parse(response_enzm.content)['ptools-xml']
    output_rxns = xmltodict.parse(response_rxns.content)['ptools-xml']

    # Protein product(s) of the gene.
    # xmltodict yields a list for repeated elements and a dict for a single one.
    protein_node = gene_node['product']['Protein']
    if isinstance(protein_node, list):
        products.append([p['@frameid'] for p in protein_node])
    else:
        products.append(protein_node['@frameid'])

    # Pathways the gene takes part in, plus the parent classes of each pathway.
    if "Pathway" in output_path:
        pathway_node = output_path['Pathway']
        if isinstance(pathway_node, list):
            # Several pathways: the parents of all of them are flattened
            # into a single list.
            pathway_ids = []
            parent_ids = []
            for pathway in pathway_node:
                pathway_ids.append(pathway['@frameid'])
                pathway_parent = pathway['parent']
                if isinstance(pathway_parent, list):
                    parent_ids.extend(par['Pathway']['@frameid'] for par in pathway_parent)
                else:
                    parent_ids.append(pathway_parent['Pathway']['@frameid'])
            pathways.append(pathway_ids)
            pathways_parent.append(parent_ids)
        else:
            pathway_parent = pathway_node['parent']
            if isinstance(pathway_parent, list):
                parent_ids = [par['Pathway']['@frameid'] for par in pathway_parent]
            else:
                parent_ids = pathway_parent['Pathway']['@frameid']
            pathways.append(pathway_node['@frameid'])
            pathways_parent.append(parent_ids)
    else:
        pathways.append(np.nan)
        pathways_parent.append(np.nan)

    # Enzymes encoded by the gene.
    if "Protein" in output_enzyme:
        enzyme_node = output_enzyme['Protein']
        if isinstance(enzyme_node, list):
            enzymes.append([protein['@frameid'] for protein in enzyme_node])
        else:
            enzymes.append(enzyme_node['@frameid'])
    else:
        enzymes.append(np.nan)

    # Reactions of the gene; a single reaction is still wrapped in a list.
    if "Reaction" in output_rxns:
        reaction_node = output_rxns['Reaction']
        if isinstance(reaction_node, list):
            reactions.append([rxn['@frameid'] for rxn in reaction_node])
        else:
            reactions.append([reaction_node['@frameid']])
    else:
        reactions.append(np.nan)
        

100%|██████████| 1246/1246 [12:36<00:00,  1.65it/s] 


In [40]:
# Scratch check: remove and show the gene ID at position 743 of a throw-away copy.
temp = list(df_WCM_gene_metabolism_all["Gene ID (EcoCyc)"])
temp.pop(743)

'EG10638'

In [14]:
import pandas as pd

# Look each gene name up by its gene ID instead of hand-removing 'nanA' from a
# parallel list: the names then stay aligned with exactly the genes that were
# queried, whatever was excluded from `metabolic_genes`.
gene_name_by_id = dict(
    zip(
        df_WCM_gene_metabolism_all['Gene ID (EcoCyc)'],
        df_WCM_gene_metabolism_all['Gene name'],
    )
)
gene_name = [gene_name_by_id[gene_id] for gene_id in metabolic_genes]

# One row per queried gene; all lists are index-aligned with `metabolic_genes`.
df_all_metabolic_gene_annotation = pd.DataFrame({
    'Gene ID (EcoCyc)': metabolic_genes,
    'Gene name': gene_name,
    'Enzyme encoded': enzymes,
    'Pathways': pathways,
    'Pathways parent': pathways_parent,
    'Protein products': products,
    'Reactions': reactions,
})

df_all_metabolic_gene_annotation.to_csv('notebooks/Heena notebooks/Metabolism_New Genes/all_metabolic_gene_annotation.csv', index=False)

In [8]:
# Show the XML parsing failure for EG10637, the gene removed from
# `metabolic_genes` above as problematic. The response is fetched explicitly
# so the cell does not depend on whatever `response_func` the last loop left
# behind, and the error is caught so that "Run All" keeps going.
from xml.parsers.expat import ExpatError

response_func = s.get("https://websvc.biocyc.org/getxml?id=ECOLI:EG10637&detail=full")
try:
    xmltodict.parse(response_func.content)
except ExpatError as err:
    print(f"ExpatError: {err}")
else:
    print("EG10637 parsed without error")

ExpatError: reference to invalid character number: line 54, column 109

In [9]:
# Raw bytes of the most recent getxml response held in `response_func`;
# presumably inspected to find the invalid character reference the XML parser
# rejected -- the full payload is large, consider slicing it before display.
response_func.content

b'<?xml version="1.0" encoding="iso-8859-1"?>\n<ptools-xml ptools-version=\'29.0\' xml:base=\'http://BioCyc.org/getxml?ECOLI:EG10637\'><metadata><url>http://BioCyc.org/</url>\n<service_name>getxml</service_name>\n<query>ECOLI:EG10637</query>\n<num_results>1</num_results>\n<PGDB orgid=\'ECOLI\' version=\'29.0\'><species datatype=\'string\'>Escherichia coli</species>\n<strain datatype=\'string\'>K-12 substr. MG1655</strain>\n<dblink><dblink-db>NCBI-TAXONOMY-DB</dblink-db>\n<dblink-oid>511145</dblink-oid>\n<dblink-relationship>unification</dblink-relationship>\n<dblink-URL>http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&amp;id=511145</dblink-URL>\n</dblink>\n</PGDB>\n</metadata>\n<Gene ID=\'ECOLI:EG10637\' orgid=\'ECOLI\' frameid=\'EG10637\' detail=\'full\'><parent><Gene resource=\'getxml?ECOLI:Genes\' orgid=\'ECOLI\' frameid=\'Genes\' class=\'true\'/></parent>\n<replicon><Genetic-Element resource=\'getxml?ECOLI:COLI-K12\' orgid=\'ECOLI\' frameid=\'COLI-K12\'/></replicon