In [1]:
import numpy as np
import ast
import seaborn as sns
import pandas as pd
import os
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import cvxpy as cp
import itertools
from scipy.special import logsumexp

%matplotlib inline
# sns.set(style='darkgrid', palette='viridis', context='talk')

# Work from the repository checkout so the relative paths used in later cells
# ('out/...', 'reconstruction/...', 'notebooks/...') resolve.
# NOTE(review): this hard-codes a per-user checkout location; adjust if the repo lives elsewhere.
os.chdir(os.path.expanduser('~/dev/vivarium-ecoli'))

# Project imports come after the chdir — presumably so the `ecoli` package is found
# from the repository root; TODO confirm against the environment in use.
from ecoli.processes.metabolism_redux_classic import NetworkFlowModel, FlowResult
from ecoli.library.schema import numpy_schema, bulk_name_to_idx, listener_schema, counts
from ecoli.processes.registries import topology_registry
# Topology looked up in the registry under the name "ecoli-metabolism-redux".
TOPOLOGY = topology_registry.access("ecoli-metabolism-redux")


In [2]:
# Converter for columns with non-homogeneous content (a mixture of plain strings and stringified lists).
def string_to_list(s):
    """Parse a cell value into a NumPy array.

    The value is first evaluated as a Python literal with ``ast.literal_eval``
    (so ``"[1, 2]"`` becomes ``array([1, 2])``). If evaluation or the array
    conversion raises ``ValueError`` or ``SyntaxError`` (e.g. plain text or an
    empty cell), the raw value itself is wrapped in an array instead.

    Parameters
    ----------
    s : object
        Cell content, typically a string.

    Returns
    -------
    numpy.ndarray
        Array built from the parsed literal, or from ``s`` unchanged on failure.
    """
    try:
        # Safe evaluation: only Python literals are accepted, no arbitrary code.
        parsed = ast.literal_eval(s)
        converted = np.array(parsed)
    except (ValueError, SyntaxError):
        # Not a literal (or not convertible): keep the original value as-is.
        converted = np.array(s)
    return converted

### Load Experiments

In [28]:
# Identify the simulation run to analyse; together these fields name the output folder.
time = '400'
date = '2025-01-13'
experiment = 'metabolism-redux-classic-BASAL_OLD-modified_process_new_rxn_name'
entry = f'{experiment}_{time}_{date}'
folder = f'out/cofactors/{entry}/'  # trailing slash kept: paths are built as folder + filename

# The saved output is unpickled from the .npy file and unwrapped with .item().
# allow_pickle is the boolean True (the string 'TRUE' only worked because any
# non-empty string is truthy — 'FALSE' would have enabled pickling as well).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
sim_output = np.load(folder + '0_output.npy', allow_pickle=True).item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()

# Drill down to the single agent ('0') and pull out the pieces used below.
output = sim_output['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

In [29]:
# Load the pickled agent steps saved alongside the simulation output.
# The context manager closes the file handle even if dill.load raises.
# NOTE: dill, like pickle, can execute arbitrary code on load — only open trusted files.
with open(folder + 'agent_steps.pkl', 'rb') as f:
    agent = dill.load(f)

### Create Dataframe

In [30]:
# create simulation flux dataframes - OLD
# The saved metabolism step is bound to `self` so the attribute accesses below
# read like code inside the metabolism_redux_classic class.
self = agent['ecoli-metabolism-redux-classic']
reaction_names = self.reaction_names

# Estimated fluxes: one column per reaction name; the first row (NAs) is dropped.
sim_fluxes = pd.DataFrame(fba["estimated_fluxes"])
sim_fluxes.columns = reaction_names
sim_fluxes = sim_fluxes.iloc[1:]

# Kinetic target fluxes, likewise without the first row (NAs).
target_fluxes = pd.DataFrame(fba["target_kinetic_fluxes"]).iloc[1:]

In [31]:
# FBA reaction ids that the process parameters list as new.
fba_new_reaction_ids = self.parameters["fba_new_reaction_ids"]
# Lookup from FBA reaction id to base reaction id (used later to collapse FBA ids to base ids).
# NOTE(review): this reads `self._parameters` while the line above reads `self.parameters` — confirm the difference is intentional.
fba_reaction_ids_to_base_reaction_ids = self._parameters['fba_reaction_ids_to_base_reaction_ids']

In [32]:
#### ALL REACTION FLUX #####

# Time-averaged flux per reaction (mean down each column).
# Open question from the author: should this be normalized by cell mass?
sim_fluxes_avg = sim_fluxes.mean(axis=0)

# Plotting table, one row per reaction: raw average and log10(average + 1e-6).
# The 1e-6 offset keeps zero fluxes finite under the log (0 -> -6).
df = pd.DataFrame({
    'sim_fluxes_avg': sim_fluxes_avg,
    'sim_fluxes_avg_log': np.log10(sim_fluxes_avg + 10**-6),
})

# Label every reaction as old, then overwrite the label for the new FBA reaction ids.
df['is_new'] = 'Old Reactions'
df.loc[fba_new_reaction_ids, 'is_new'] = 'New Reactions'


#### KINETICALLY CONSTRAINED REACTION FLUX #####
# Average target flux of the kinetically constrained reactions, skipping the
# first entry (the author's note gives the units as atoms/second — TODO confirm).
target_kinetic_flux = np.array(fba['target_kinetic_fluxes'][1:]).mean(axis=0)
kinetic_reactions = self.kinetic_constraint_reactions

# Estimated-flux rows for those reactions, plus the log10 target with the same 1e-6 offset.
# The target array is assigned positionally — assumes it is ordered like kinetic_reactions; verify.
df_kinetic = df.loc[kinetic_reactions].copy()
df_kinetic['target'] = np.log10(target_kinetic_flux + 10**-6)

# Colour per reaction category for plotting.
category_colors = {
    'Old Reactions': 'purple',
    'New Reactions': 'orange',
}

### New Reactions

In [34]:
# Kinetically constrained reactions flagged as new, with their target flux - 43 reactions total
df_kinetic_constrained_new = df_kinetic.loc[df_kinetic['is_new'] == 'New Reactions']
df_kinetic_constrained_new

Unnamed: 0,sim_fluxes_avg,sim_fluxes_avg_log,is_new,target
1.13.11.16-RXN,0.0,-6.0,New Reactions,2.0
2.1.1.79-RXN-CPD-18361/S-ADENOSYLMETHIONINE//CPD-18373/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.1.1.79-RXN-CPD-18362/S-ADENOSYLMETHIONINE//CPD-18406/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.1.1.79-RXN-CPD-18367/S-ADENOSYLMETHIONINE//CPD-18371/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.1.1.79-RXN-CPD-18369/S-ADENOSYLMETHIONINE//CPD-18372/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.1.1.79-RXN-CPD-18392/S-ADENOSYLMETHIONINE//CPD-18405/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.1.1.79-RXN-CPD-18403/S-ADENOSYLMETHIONINE//CPD-18404/ADENOSYL-HOMO-CYS/PROTON.67.,0.0,-6.0,New Reactions,2.003148
2.9.1.1-RXN,0.0,-6.0,New Reactions,-6.0
3.1.3.68-RXN[CCO-CYTOSOL]-2-DEOXY-D-GLUCOSE-6-PHOSPHATE/WATER//2-DEOXY-D-GLUCOSE/Pi.71.__G6932-MONOMER,0.0,-6.0,New Reactions,4.522923
4.3.1.15-RXN,0.0,-6.0,New Reactions,2.673021


In [35]:
# Number of kinetically constrained reactions flagged as new.
len(df_kinetic.loc[df_kinetic['is_new'].eq('New Reactions')])

43

### Confirm: Are All New Reactions included in the model? Short Answer: No

In [36]:
# Reference tables used to convert reactions back to EcoCyc gene IDs.
FLAT_DIR = 'reconstruction/ecoli/flat/'
NOTEBOOK_DIR = 'notebooks/Heena notebooks/Metabolism_New Genes'

# Gene annotation table; each 'Reactions' cell is parsed with string_to_list.
new_metabolic_gene_annotation = pd.read_csv(
    os.path.join(NOTEBOOK_DIR, "new_metabolic_gene_annotation.csv"),
    converters={'Reactions': string_to_list},
)

# Reaction table; the first 4 lines of the TSV are skipped (presumably header
# comments — TODO confirm) and 'catalyzed_by' is parsed with string_to_list.
metabolic_reactions = pd.read_csv(
    os.path.join(FLAT_DIR, "metabolic_reactions.tsv"),
    sep='\t',
    skiprows=4,
    converters={'catalyzed_by': string_to_list},
)

#### Quick check to see if I have all the new reactions.
#### i.e. fba_rxn (565) -> metabolic_reactions.tsv -> base_reactions(365)

In [52]:
# Flatten every row's 'Reactions' array into one vector and count the distinct ids.
# One is subtracted from the distinct count — presumably to discount a placeholder
# entry coming from rows without reactions; TODO confirm that such an entry exists.
annotated_reaction_ids = np.hstack(new_metabolic_gene_annotation.Reactions)
distinct_reaction_ids = np.unique(annotated_reaction_ids)
total_base_reactions = len(distinct_reaction_ids) - 1
print(f'Total Number of New Reaction Base IDs: {total_base_reactions}')

Total Number of New Reaction Base IDs: 365


In [53]:
# Collapse every new FBA reaction id to its base reaction id and de-duplicate.
# np.unique already returns its result sorted, so no separate np.sort is needed.
# The comprehension variable is named rxn_id (not `id`) so the builtin id() is
# not shadowed in the notebook namespace.
base_new_reaction_ids = np.unique(
    [fba_reaction_ids_to_base_reaction_ids[rxn_id] for rxn_id in fba_new_reaction_ids]
)
print(f'Simulation Number of New Reaction Base IDs: {len(base_new_reaction_ids)}')
print(f'Number of Missing New Reaction Base IDs: {total_base_reactions-len(base_new_reaction_ids)}')

Simulation Number of New Reaction Base IDs: 349
Number of Missing New Reaction Base IDs: 16


<p> <span style='background :#fcd1d7' color = 'b' > By mapping fba_new_reaction_ids back to their base IDs, we can tell that a total of 16 new reactions are not being incorporated into the whole-cell (wc) model. </span> </p>

In [67]:
# Expected number of NEW FBA reaction ids implied by metabolic_reactions.tsv:
#   rows flagged is_new
# + one extra per flagged row whose direction is 'BOTH'
# + simulation ids containing '__' (counted by the author as kinetic reactions)
is_new_row = metabolic_reactions.is_new == 1
is_reversible_row = metabolic_reactions.direction == 'BOTH'

len_new_metabolic_reactions = int(is_new_row.sum())
add_reversible_reactions = int((is_new_row & is_reversible_row).sum())
add_kinetic_reactions = sum('__' in rxn_id for rxn_id in fba_new_reaction_ids)

print(f'The number of new FBA_reaction_ids should be: {len_new_metabolic_reactions+add_reversible_reactions+add_kinetic_reactions}')
print(f'The number of new FBA_reaction_ids in sim: {len(fba_new_reaction_ids)}')

The number of new FBA_reaction_ids should be: 565
The number of new FBA_reaction_ids in sim: 565


### More info on the 16 missing new reactions

In [72]:
# Show the averaged-flux rows for every new FBA reaction id (all ids in fba_new_reaction_ids).
df.loc[fba_new_reaction_ids]

Unnamed: 0,sim_fluxes_avg,sim_fluxes_avg_log,is_new
1.1.1.271-RXN (reverse),0.0,-6.0,New Reactions
1.11.1.15-RXN,0.0,-6.0,New Reactions
1.13.11.16-RXN,0.0,-6.0,New Reactions
1.97.1.4-A-RXN,0.0,-6.0,New Reactions
1PFRUCTPHOSN-RXN,0.0,-6.0,New Reactions
...,...,...,...
RXN0-5462[CCO-CYTOSOL]-GTP/WATER//GDP/Pi/PROTON.38.__G7841-MONOMER,0.0,-6.0,New Reactions
RXN0-5462[CCO-CYTOSOL]-GTP/WATER//GDP/Pi/PROTON.38.__EG12104-MONOMER,0.0,-6.0,New Reactions
RXN0-5462[CCO-CYTOSOL]-GTP/WATER//GDP/Pi/PROTON.38.__EG10021-MONOMER,0.0,-6.0,New Reactions
RXN0-6732__CPLX0-7958,0.0,-6.0,New Reactions
