# The purpose of this noteboook is to update the `metabolic_reactions.tsv` flat file in ecoli reconstruction with:
* a new column indicating whether the reactions are new. i.e. added in 2022. 
* a new column indicating whether the reactions are using energy

In [1]:
import numpy as np
import ast
import seaborn as sns
import pandas as pd
import os
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import cvxpy as cp
import itertools
from scipy.special import logsumexp
%matplotlib inline
# sns.set(style='darkgrid', palette='viridis', context='talk')

os.chdir(os.path.expanduser('~/dev/vivarium-ecoli/'))


In [2]:
FLAT_DIR = f'reconstruction/ecoli/flat/'
NOTEBOOK_DIR = f'notebooks/Heena notebooks/Metabolism_New Genes'

In [3]:
# create a function to deal with non-homogeous dtype of columns. mixture of strings and lists
def string_to_list(s):
    try:
        return np.array(ast.literal_eval(s))  # Safely convert string to array
    except (ValueError, SyntaxError):
        return np.array(s)  # Return as array if object is not a list and conversion fails (e.g., NaN values)

In [22]:
# load annotated new metabolic gene tsv and metabolic_reactions.tsv
metabolic_gene_annotation = pd.read_csv(os.path.join(NOTEBOOK_DIR, "new_metabolic_gene_annotation.csv"), converters={'Reactions': string_to_list})
metabolic_reactions = pd.read_csv(os.path.join(FLAT_DIR, "metabolic_reactions.tsv"),sep='\t', skiprows=5)
modified = pd.read_csv(os.path.join(FLAT_DIR, "metabolic_reactions_modified.tsv"),sep='\t')
added = pd.read_csv(os.path.join(FLAT_DIR, "metabolic_reactions_added.tsv"),sep='\t', skiprows=1)

In [23]:
modified

Unnamed: 0,id,stoichiometry,direction,catalyzed_by,is_new,_comments
0,ADENOSINE-NUCLEOSIDASE-RXN,"{""ADENOSINE[CCO-CYTOSOL]"": -1, ""WATER[CCO-CYTO...",L2R,"[""EG11082-MONOMER""]",0,Replaced D-Ribofuranose (compound class that c...
1,INOSINE-NUCLEOSIDASE-RXN,"{""INOSINE[CCO-CYTOSOL]"": -1, ""WATER[CCO-CYTOSO...",L2R,"[""EG11082-MONOMER""]",0,Replaced D-Ribofuranose (compound class that c...
2,RXN-14882[CCO-CYTOSOL]-CPD-15818//D-Ribofurano...,"{""CPD-15818[CCO-CYTOSOL]"": -1, ""CPD0-1108[CCO-...",BOTH,[],0,Replaced D-Ribofuranose (compound class that c...
3,RXN-14882[CCO-EXTRACELLULAR]-CPD-15818//D-Ribo...,"{""CPD-15818[CCO-EXTRACELLULAR]"": -1, ""CPD0-110...",BOTH,[],0,Replaced D-Ribofuranose (compound class that c...
4,RXN-14882[CCO-PERI-BAC]-CPD-15818//D-Ribofuran...,"{""CPD-15818[CCO-PERI-BAC]"": -1, ""CPD0-1108[CCO...",BOTH,[],0,Replaced D-Ribofuranose (compound class that c...
5,RXN0-361,"{""CYTIDINE[CCO-CYTOSOL]"": -1, ""WATER[CCO-CYTOS...",L2R,"[""EG11082-MONOMER"", ""CPLX0-7904"", ""CPLX0-8280""]",0,Replaced D-Ribofuranose (compound class that c...
6,RXN0-363,"{""XANTHOSINE[CCO-CYTOSOL]"": -1, ""WATER[CCO-CYT...",L2R,"[""EG11082-MONOMER""]",0,Replaced D-Ribofuranose (compound class that c...
7,URIDINE-NUCLEOSIDASE-RXN,"{""URIDINE[CCO-CYTOSOL]"": -1, ""WATER[CCO-CYTOSO...",L2R,"[""CPLX0-7904"", ""CPLX0-8280"", ""EG11082-MONOMER""]",0,Replaced D-Ribofuranose (compound class that c...
8,ASPAMINOTRANS-RXN,"{""L-ASPARTATE[CCO-CYTOSOL]"": -1, ""2-KETOGLUTAR...",BOTH,"[""ASPAMINOTRANS-DIMER"", ""TYRB-DIMER""]",0,TYRB-DIMER added as an alternative catalyst ba...
9,RXN-15346,"{""CPD-16551[CCO-CYTOSOL]"": -1, ""RIBOSE-5P[CCO-...",L2R,[],0,Replaced CPD-15895 (aldehydo-D-ribose-5P) with...


In [26]:
# add new column in metabolic_reactions indicating whether it is a new reaction
is_new = [] # whether a metabolic rxn is new or not
rxn_in_metabolic = [] #new rxn names that are in metabolic rxn
all_new_rxn = np.hstack(metabolic_gene_annotation['Reactions'])
all_new_rxn = [rxn for rxn in all_new_rxn if rxn != ''] #remove empty rxns
all_new_rxn = np.unique(all_new_rxn) #remove duplicates

for idx,reaction in enumerate(metabolic_reactions.iloc[:,0]):
    is_new_reaction_described_in_reactions = [new_rxn in reaction for new_rxn in all_new_rxn]
    
    if any(is_new_reaction_described_in_reactions): #if reaction name in metabolic_reactions is found in all_new_rxn
        # a reaction may have several variations catalyzed by different enzymes, which leads to longer reaction name stored in sim_data
        # only the 'new_reactions' catalyzed by that encoded in metabolic_gene_annotation['Enzymes'] are the correct new_reactions

        # get row index of new reaction in metabolic_gene_annotation - get enzyme encoded by new gene catalyzing the reaction
        new_rxn_name = all_new_rxn[is_new_reaction_described_in_reactions]
        index_new_rxn = metabolic_gene_annotation[metabolic_gene_annotation['Reactions'].apply(
                        lambda x: any(rxn in x for rxn in new_rxn_name))].index
        enzyme_encoded_new = metabolic_gene_annotation["Enzyme encoded"][index_new_rxn]
        enzyme_encoded_reaction = metabolic_reactions.catalyzed_by[idx] #catalyzed_by of metabolic_reactions of the reaction
        
        rxn_in_metabolic.append(new_rxn_name)
        if any(enzyme in enzyme_encoded_reaction for enzyme in enzyme_encoded_new):
            is_new.append(1) #True
        else:
            is_new.append(0) #False
    else:
        is_new.append(0) #False

metabolic_reactions["is_new"] = is_new

In [28]:
# upon initial investigation of the is_new matrix, I realize that not all new rxns are included in the metabolic_reactions.tsv file. 
# I want to extract those new rxns that aren't included and manually investigate them to determine whether they shuld be included or not
rxn_in_metabolic = np.unique(np.hstack(rxn_in_metabolic))
print(f'There are {len(all_new_rxn) - len(rxn_in_metabolic)} new reactions missing from the metabolic_reactions.tsv file:')
print(all_new_rxn[~np.isin(all_new_rxn, rxn_in_metabolic)])


There are 14 new reactions missing from the metabolic_reactions.tsv file:
['1.5.1.34-RXN' '3-NUCLEOTID-RXN' 'CYCPHOSDIESTER-RXN' 'R303-RXN'
 'RXN-15119' 'RXN-16650' 'RXN-24038' 'RXN-24042' 'RXN-24048' 'RXN-24049'
 'RXN0-1804' 'RXN0-5001' 'RXN0-5285' 'TRANS-RXN-387']


In [29]:
all_new_rxn[~np.isin(all_new_rxn, rxn_in_metabolic)]

array(['1.5.1.34-RXN', '3-NUCLEOTID-RXN', 'CYCPHOSDIESTER-RXN',
       'R303-RXN', 'RXN-15119', 'RXN-16650', 'RXN-24038', 'RXN-24042',
       'RXN-24048', 'RXN-24049', 'RXN0-1804', 'RXN0-5001', 'RXN0-5285',
       'TRANS-RXN-387'], dtype='<U33')

In [30]:
# add new column in metabolic_reactions indicating whether a reaction uses energy
def uses_energy(reaction):
    """
    Check if a reaction uses energy.
    """
    # Define keywords that indicate energy usage
    energy_keywords = ['ATP', 'ADP', 'AMP', 'NADH', 'NADPH', 'FADH2', 'GTP', 'GDP']
    
    # Check if any of the keywords are in the reaction string
    return any(keyword in reaction for keyword in energy_keywords)

In [31]:
# Update supplementary metabolic reactions files - modified and added
# all added reactions aren't new. They are reactions not included in ecocyc and are manually added
added.insert(loc=4,column='is_new',value=0)

# all modified reactions happen to be old. Make sense though because we probably didn't try modifying any of the new reactions
modified.insert(loc=4,column='is_new',value=0)

ValueError: cannot insert is_new, already exists

In [32]:
# save updated metabolic reactions as tsv
col_list = ["id", "direction"]
for colname in col_list:
    metabolic_reactions[colname] = metabolic_reactions[colname].apply(lambda x:'"' + str(x) + '"')
    added[colname] = added[colname].apply(lambda x:'"' + str(x) + '"')
    modified[colname] = modified[colname].apply(lambda x:'"' + str(x) + '"')

added["_comments"] = added["_comments"].apply(lambda x:'"' + str(x) + '"')
modified["_comments"] = modified["_comments"].apply(lambda x:'"' + str(x) + '"')

In [None]:
metabolic_reactions.head(2)

In [37]:
import csv
metabolic_reactions[:0].to_csv("reconstruction/ecoli/flat/metabolic_reactions.tsv", sep = "\t", index=False, quoting=csv.QUOTE_NONNUMERIC)
metabolic_reactions.to_csv("reconstruction/ecoli/flat/metabolic_reactions.tsv", sep = "\t", index = False, header = False, mode="a", quoting=csv.QUOTE_NONE)

In [38]:
added[:0].to_csv("reconstruction/ecoli/flat/metabolic_reactions_added.tsv", sep = "\t", index=False, quoting=csv.QUOTE_NONNUMERIC)
modified[:0].to_csv("reconstruction/ecoli/flat/metabolic_reactions_modified.tsv", sep = "\t", index=False, quoting=csv.QUOTE_NONNUMERIC)

added.to_csv("reconstruction/ecoli/flat/metabolic_reactions_added.tsv", sep = "\t", index = False, header = False, mode="a", quoting=csv.QUOTE_NONE)
modified.to_csv("reconstruction/ecoli/flat/metabolic_reactions_modified.tsv", sep = "\t", index = False, header = False, mode="a", quoting=csv.QUOTE_NONE)

In [36]:
metabolic_reactions[:0]

Unnamed: 0,id,stoichiometry,direction,catalyzed_by,is_new
