In [13]:
import numpy as np
import plotly.express as px
import seaborn as sns
import pandas as pd
import os

os.chdir(os.path.expanduser('~/dev/vivarium-ecoli'))

import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
from ecoli.processes.metabolism_redux import NetworkFlowModel, FlowResult, MetabolismRedux
%matplotlib inline

# Identifiers collected under FREE_RXNS (presumably EcoCyc transport reaction IDs).
# NOTE(review): no cell visible in this notebook reads FREE_RXNS - confirm it is still needed.
FREE_RXNS = [
    "TRANS-RXN-145",
    "TRANS-RXN0-545",
    "TRANS-RXN0-474",
]

# Global seaborn appearance for the figures drawn in this notebook.
sns.set(context='talk', style='darkgrid', palette='viridis')

In [14]:
# Folder (relative to the working directory set by os.chdir above) holding this notebook's CSV files.
notebook_path = 'notebooks/Heena notebooks/Metabolism_New Genes/'

# Load the gene table and show a short slice of it as a sanity check.
wcm_gene_csv = notebook_path + 'WCM gene.csv'
df_WCM_gene = pd.read_csv(wcm_gene_csv)
df_WCM_gene.iloc[200:206]
# df_WCM_rxn.head(10)

Unnamed: 0,Gene ID (EcoCyc),Gene locus ID,Gene name,RNA type,Category,Macklin et al. (2020),Latest version (20220602),Machinery (from column E) Implemented?,Comment
200,EG11624,b0864,artP,mRNA,Metabolism,True,True,,
201,EG11626,b0862,artQ,mRNA,Metabolism,True,True,,
202,EG10085,b2716,ascB,mRNA,Unknown function,False,True,,
203,EG10086,b2715,ascF,mRNA,Metabolism,False,True,,
204,EG10087,b2714,ascG,mRNA,Transcription regulation,False,False,,
205,EG10088,b3433,asd,mRNA,Metabolism,True,True,,


In [15]:
print(df_WCM_gene.shape)

(4736, 9)


In [16]:
# Keep the rows whose Category is "Metabolism" and that are flagged False in the
# "Macklin et al. (2020)" column but True in the "Latest version (20220602)" column.
is_metabolism = df_WCM_gene["Category"].eq("Metabolism")
not_in_macklin = df_WCM_gene["Macklin et al. (2020)"].eq(False)
in_latest_version = df_WCM_gene["Latest version (20220602)"].eq(True)
df_WCM_gene_metabolism = df_WCM_gene[is_metabolism & not_in_macklin & in_latest_version]
# df_WCM_rxn_metabolism  = df_WCM_rxn[(df_WCM_rxn.Category == "Metabolism") & (df_WCM_rxn["Macklin et al. (2020)"] == False) & \
#                           (df_WCM_rxn["Latest version (20220602)"] == True) & (df_WCM_rxn["Belongs in metabolism"] != "no")]

In [17]:
print(df_WCM_gene_metabolism.shape)
# print(df_WCM_rxn_metabolism.shape)
# Both tables share the same metabolism genes; the only difference between the two dataframes is their feature columns.

(306, 9)


In [18]:
# np.all(df_WCM_gene_metabolism["Gene ID (EcoCyc)"].isin(df_WCM_rxn_metabolism["Gene ID (EcoCyc)"]))

In [19]:
df_WCM_gene_metabolism.head(20)

Unnamed: 0,Gene ID (EcoCyc),Gene locus ID,Gene name,RNA type,Category,Macklin et al. (2020),Latest version (20220602),Machinery (from column E) Implemented?,Comment
20,EG10022,b4015,aceA,mRNA,Metabolism,False,True,,
21,EG10023,b4014,aceB,mRNA,Metabolism,False,True,,
42,EG11942,b4067,actP,mRNA,Metabolism,False,True,,
47,EG11724,b3714,adeP,mRNA,Metabolism,False,True,,
52,EG12462,b4115,adiC,mRNA,Metabolism,False,True,,
57,EG11101,b0476,aes,mRNA,Metabolism,False,True,,
66,G7634,b3136,agaS,mRNA,Metabolism,False,True,,
69,EG10033,b1002,agp,mRNA,Metabolism,False,True,,
72,EG11384,b0605,ahpC,mRNA,Metabolism,False,True,,
91,G6275,b0505,allA,mRNA,Metabolism,False,True,,


## Connect to EcoCyc to allocate descriptions to each metabolic gene

In [20]:
# Connect to the EcoCyc (BioCyc) web-service API.
#
# SECURITY: credentials must never be stored in the notebook. They are read from the
# BIOCYC_EMAIL / BIOCYC_PASSWORD environment variables, falling back to an interactive
# prompt. The password that used to be hard-coded in this cell has been exposed to
# anyone with access to the notebook and should be rotated.
import getpass

biocyc_email = os.environ.get('BIOCYC_EMAIL') or input('BioCyc account email: ')
biocyc_password = os.environ.get('BIOCYC_PASSWORD') or getpass.getpass('BioCyc password: ')

s = requests.Session() # create session
# Post login credentials to session:
login_response = s.post(
    'https://websvc.biocyc.org/credentials/login/',
    data={'email': biocyc_email, 'password': biocyc_password},
)
# Stop here with a descriptive HTTPError if the login was rejected, instead of
# failing later inside the per-gene request loop.
login_response.raise_for_status()
login_response

<Response [200]>

In [25]:
from tqdm import tqdm
import sys

# EcoCyc gene IDs of the filtered metabolic genes; one API lookup is made per gene.
metabolic_genes = df_WCM_gene_metabolism["Gene ID (EcoCyc)"]

# Per-gene accumulators read by the annotation-table cell further down.
# NOTE(review): only `reactions` is filled by this cell. The other five lists stay
# empty, so the annotation-table cell cannot use them until their collectors are
# reinstated. Key paths those collectors read (each value is a dict for one entry or
# a list of dicts for several, and can be flattened with frame_ids below):
#   multifunction_id : getxml?id=ECOLI:{gene}      -> ['ptools-xml']['Gene']['parent'], then ['Gene'] of each entry
#   products         : getxml?id=ECOLI:{gene}      -> ['ptools-xml']['Gene']['product']['Protein']
#   pathways         : apixml?fn=pathways-of-gene  -> ['ptools-xml']['Pathway'] (key absent when there is none)
#   pathways_parent  : apixml?fn=pathways-of-gene  -> ['parent'] of each pathway, then ['Pathway'] of each entry
#   enzymes          : apixml?fn=enzymes-of-gene   -> ['ptools-xml']['Protein'] (key absent when there is none)
multifunction_id = []
pathways = []
pathways_parent = []
enzymes = []
products = []
reactions = []


def frame_ids(node):
    """Return the '@frameid' of every entry under an xmltodict node.

    xmltodict gives a dict for a single child element and a list of dicts for
    repeated ones; both shapes are normalised to a list of IDs here.
    """
    entries = node if isinstance(node, list) else [node]
    return [entry['@frameid'] for entry in entries]


for gene in tqdm(metabolic_genes):
    # Only the reactions endpoint is queried: it is the only response parsed below,
    # and every extra request per gene slows the loop down proportionally.
    req_rxns = f"https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id=ECOLI:{gene}&detail=full"
    response_rxns = s.get(req_rxns)

    # Raise an HTTPError naming the status and URL rather than calling sys.exit(),
    # which would stop the kernel without saying which request failed.
    response_rxns.raise_for_status()

    output_rxns = xmltodict.parse(response_rxns.content)['ptools-xml']

    # parse through output for rxns <- gene
    # One entry per gene keeps `reactions` aligned with `metabolic_genes`:
    # a list of reaction frame IDs, or NaN when the gene has no reaction.
    if "Reaction" in output_rxns:
        reactions.append(frame_ids(output_rxns['Reaction']))
    else:
        reactions.append(np.nan)
        

 58%|█████▊    | 177/306 [01:22<01:00,  2.14it/s]


KeyboardInterrupt: 

In [None]:
# One-off lookup: query a single API function for a single gene and keep the
# 'Reaction' part of the parsed answer.
fun = "reactions-of-gene"
gene = "EG10041"

req = f"https://websvc.biocyc.org/apixml?fn={fun}&id=ECOLI:{gene}&detail=full"
response_func = s.get(req)

parsed_response = xmltodict.parse(response_func.content)
output_path = parsed_response['ptools-xml']['Reaction']

In [None]:
from IPython.display import display, HTML
import functools


@functools.lru_cache(maxsize=None)
def get_multifunc_output(function_id):
    """Fetch one EcoCyc frame and return its parent frame ID and common name.

    Results are memoised per ``function_id``: the lineage walks in this notebook
    ask for the same ancestor frames again and again, and each miss costs one
    HTTP request.

    Args:
        function_id: Frame ID (a string) appended to ``ECOLI:`` in the request.

    Returns:
        Tuple ``(parent, common_name)`` where ``parent`` is the '@frameid' of the
        frame's parent and ``common_name`` is the text of its 'common-name'.

    Raises:
        requests.HTTPError: if the web service answers with an error status.
        KeyError: if the parsed XML lacks the expected keys.
    """
    req_func = f"https://websvc.biocyc.org/getxml?id=ECOLI:{function_id}&detail=full"
    response_func = s.get(req_func)
    # Surface HTTP failures here instead of as an opaque XML/KeyError below.
    response_func.raise_for_status()
    output = xmltodict.parse(response_func.content)['ptools-xml']['Gene']

    parent_node = output['parent']
    # A frame with several parents is parsed as a list, which used to raise a
    # TypeError on the lookup below.
    # NOTE(review): only the first parent is followed - confirm that is the lineage wanted.
    if isinstance(parent_node, list):
        parent_node = parent_node[0]
    parent = parent_node['Gene']['@frameid']
    common_name = output['common-name']['#text']
    return parent, common_name

def pretty_print(df):
    """Display ``df`` as an HTML table with line breaks inside cells.

    The frame is rendered with ``to_html()``, every literal backslash-n
    sequence in the markup is replaced by ``<br>``, and the result is shown
    through ``display(HTML(...))``.

    Returns:
        Whatever ``display`` returns.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    shown = display(HTML(table_markup))
    return shown

In [None]:
# Parse through the multifunction IDs to get the multifunction names.
# An alternative would be to resolve the unique IDs once and map the names back
# instead of walking the hierarchy inside this loop, but the dataframe is small.

def _multifun_lineage(function_id):
    """Return 'ancestor -> ... -> name' for one MultiFun class ID.

    Walks up through the parents reported by get_multifunc_output until the
    parent is 'MultiFun' (or the class itself is named "UNCLASSIFIED").
    """
    parent, common_name = get_multifunc_output(function_id)
    while parent != 'MultiFun' and common_name != "UNCLASSIFIED":
        parent, common_name_ = get_multifunc_output(parent)
        common_name = common_name_ + " -> " + common_name
    return common_name


multifunction_name = []
for multifunction in tqdm(multifunction_id):
    # Multi-valued entries are lists (the second naming cell checks for `list`), so
    # testing for `tuple` alone sent whole lists to get_multifunc_output. Both
    # sequence types are accepted here.
    if isinstance(multifunction, (list, tuple)):
        # One lineage per line, each terminated by '\n'.
        multifunction_name.append(
            "".join(_multifun_lineage(function) + '\n' for function in multifunction)
        )
    else:
        multifunction_name.append(_multifun_lineage(multifunction))



In [None]:
# Resolve every MultiFun ID to its full lineage name. Each gene ends up with a
# list of lineage strings, whether it carried one ID or a list of them.
multifunction_name = []
for multifunction in tqdm(multifunction_id):
    # Treat a lone ID like a one-element list so that a single code path serves both cases.
    function_ids = multifunction if isinstance(multifunction, list) else [multifunction]

    lineages = []
    for function_id in function_ids:
        parent, lineage = get_multifunc_output(function_id)
        # Prepend ancestor names until the parent is 'MultiFun' or the class is "UNCLASSIFIED".
        while not (parent == 'MultiFun' or lineage == "UNCLASSIFIED"):
            parent, ancestor_name = get_multifunc_output(parent)
            lineage = ancestor_name + " -> " + lineage
        lineages.append(lineage)

    multifunction_name.append(lineages)

## Visualization

In [None]:
import csv
# Create the annotation dataframe: the first three gene columns plus one column per
# collected annotation list.
# .copy() makes it an independent frame; assigning new columns to the bare .iloc
# slice writes onto a view/copy of df_WCM_gene_metabolism and triggers pandas'
# SettingWithCopyWarning.
df_metabolic_gene_annotation = df_WCM_gene_metabolism.iloc[:,:3].copy()
# Each list must hold exactly one entry per row of df_WCM_gene_metabolism,
# otherwise pandas raises a length-mismatch ValueError.
df_metabolic_gene_annotation["Enzyme encoded"] = enzymes
df_metabolic_gene_annotation["Pathways"] = pathways
df_metabolic_gene_annotation["Pathways parent"] = pathways_parent
df_metabolic_gene_annotation["Protein products"] = products
df_metabolic_gene_annotation["MultiFuntional ID"] = multifunction_id
df_metabolic_gene_annotation["MultiFuntional name"] = multifunction_name
df_metabolic_gene_annotation["Reactions"] = reactions
# NOTE(review): df_WCM_rxn_metabolism is only defined in commented-out code earlier in
# this notebook, so this line raises NameError until that table is loaded again.
# The assignment aligns on the row index of the two frames.
df_metabolic_gene_annotation["Description by Cyrus"] = df_WCM_rxn_metabolism.iloc[:,7]

# pretty_print(df_metabolic_gene_annotation)
# Persist the table as CSV and as a TSV in which non-numeric fields are quoted.
df_metabolic_gene_annotation.to_csv('notebooks/Heena notebooks/Metabolism_New Genes/new_metabolic_gene_annotation.csv', index=False)
df_metabolic_gene_annotation.to_csv('notebooks/Heena notebooks/Metabolism_New Genes/new_metabolic_gene_annotation.tsv', sep="\t", quoting=csv.QUOTE_NONNUMERIC, index=False)
pretty_print(df_metabolic_gene_annotation.head(10))

In [None]:
# Preview the annotation table. No variable named `df` is defined anywhere in this
# notebook, so the bare `df.head()` raised NameError on a fresh kernel.
# NOTE(review): assumed to mean the frame built in the previous cell - confirm.
df_metabolic_gene_annotation.head()

In [None]:
# MultiFun lineage names of the annotated genes: the "MultiFuntional name" column
# (a Series with one entry per gene, not a sample dataframe)
data = df_metabolic_gene_annotation["MultiFuntional name"]

# Function to split functional terms into individual pathways and then split those pathways into components
def split_terms(term):
    """Expand a gene's functional terms into every prefix of each hierarchy path.

    Args:
        term: Either a single string holding one path per line (paths separated
            by newlines) or a list/tuple of path strings. Levels inside a path are
            separated by '->'. Anything else (e.g. NaN or None for a gene without
            terms) is treated as "no terms".

    Returns:
        A flat list with, for each path, all of its cumulative prefixes from the
        top level down to the full path, e.g. 'A -> B' gives ['A ', 'A -> B'].
        Blank paths (such as the one after a trailing newline) are skipped so no
        empty node is produced.
    """
    if isinstance(term, str):
        pathways = term.split('\n')  # Split by newline first
    elif isinstance(term, (list, tuple)):
        # The naming cell that stores one list of lineages per gene lands here.
        pathways = term
    else:
        return []

    split_data = []
    for pathway in pathways:
        if not isinstance(pathway, str) or not pathway.strip():
            continue
        parts = pathway.split('->')
        split_data.extend('->'.join(parts[:i+1]) for i in range(len(parts)))
    return split_data

# Flatten every gene's terms into sunburst records: each prefix becomes an 'id' and is
# paired with the prefix one level above it as 'parent' ('' for a top-level term).
split_data = [
    {'id': split_term, 'parent': '->'.join(split_term.split('->')[:-1])}
    for term in data
    for split_term in split_terms(term)
]


In [None]:
# Tabulate the (id, parent) records and count how often each pair occurs.
df_split = pd.DataFrame(split_data)
df_split_count = (
    df_split
    .groupby(['id', 'parent'])
    .size()
    .reset_index(name='count')
)

# Sunburst of the term hierarchy, with wedges sized by the counts above.
sunburst_options = dict(
    names='id',
    parents='parent',
    values='count',
    title='Hierarchical Functional Terms',
)
fig = px.sunburst(df_split_count, **sunburst_options)

# Write the interactive figure to a standalone HTML file.
fig.write_html("notebooks/Heena notebooks/sunburst_chart.html")

# Scratch

In [None]:
df_metabolic_gene_annotation.head()

In [None]:
df_metabolic_gene_annotation.to_csv('notebooks/Heena notebooks/metabolic_gene_annotation.csv', index=False)

In [22]:
# Folder from which the saved annotation table is read back.
NOTEBOOK_DIR = 'notebooks/Heena notebooks/'
annotation_csv = os.path.join(NOTEBOOK_DIR, "metabolic_gene_annotation.csv")
metabolic_gene_annotation = pd.read_csv(annotation_csv)

FileNotFoundError: [Errno 2] No such file or directory: 'notebooks/Heena notebooks/metabolic_gene_annotation.csv'

In [None]:
import ast
def string_to_list(s):
    """Turn the text of a Python literal (e.g. "['A', 'B']") into a NumPy array.

    Args:
        s: Cell value to convert, normally the string form of a list.

    Returns:
        ``np.array`` of the evaluated literal. When evaluation or array
        construction raises ValueError or SyntaxError (e.g. an empty or NaN
        cell), the raw value itself is wrapped with ``np.array`` instead,
        which gives a 0-d array for a plain string.
    """
    try:
        parsed = ast.literal_eval(s)  # literal_eval only accepts literals, never arbitrary code
        converted = np.array(parsed)
    except (ValueError, SyntaxError):
        converted = np.array(s)
    return converted

metabolic_gene_annotation = pd.read_csv(os.path.join(NOTEBOOK_DIR, "metabolic_gene_annotation.csv"), converters={'Reactions': string_to_list})


In [None]:
temp = metabolic_gene_annotation['Reactions'].to_numpy()
temp

In [23]:
np.hstack(temp)

NameError: name 'temp' is not defined