# Setup

In [1]:
import numpy as np

import seaborn as sns
import pandas as pd
import polars as pl
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import json
from scipy.sparse import csr_matrix

from ecoli.analysis.causality_network.build_network import PROTEINS_IN_METABOLISM

# Shared pretty-printer for inspecting nested structures; nesting deeper than
# six levels is elided in the printed output.
pp = pprint.PrettyPrinter(depth=6)

# Every relative path used in this notebook is resolved from this directory,
# so switch the working directory to it up front.
_repo_root = os.path.expanduser('~/vivarium-ecoli')
os.chdir(_repo_root)

# Reading the matrix CSV files
def read_matrix(file_path, sparse=False):
    """Load a header-less CSV file as a matrix.

    Args:
        file_path: Path of the CSV file. The file is read with
            ``has_header=False``, so the first row is treated as data.
        sparse: When truthy, the loaded values are wrapped in a
            ``scipy.sparse.csr_matrix``; otherwise the NumPy array is
            returned as-is.

    Returns:
        The file contents as a NumPy array, or as a CSR matrix when
        ``sparse`` is truthy.
    """
    dense_values = pl.read_csv(file_path, has_header=False).to_numpy()
    return csr_matrix(dense_values) if sparse else dense_values

def read_names(file_path):
    """Read a text file of names into a NumPy string array.

    The file is parsed with ``np.genfromtxt`` using a newline delimiter and
    ``dtype=str``, so each line becomes one string entry.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Whatever ``np.genfromtxt`` produces for the file: an array of strings
        with one entry per line.
    """
    # NOTE(review): all other np.genfromtxt defaults still apply (for example
    # its comment-character handling) -- confirm the names never rely on them.
    names = np.genfromtxt(file_path, delimiter="\n", dtype=str)
    return names


# Get relevant indices

In [2]:
# Load the saved initial simulation state. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it to the
# garbage collector.
with open('data/vivecoli_t1.json') as initial_state_file:
    initial_state = json.load(initial_state_file)

# The first element of every bulk entry is used as that molecule's id; the
# resulting list is later assigned as the column labels of the bulk tables.
bulk_ids = [item[0] for item in initial_state['agents']['0']['bulk']]

In [3]:
# experiment = 'metabolism-redux-classic-minimal'
entry = 'sim_example'
folder = f'out/cofactors/{entry}/'

# The saved .npy file is unwrapped with .item(), so it presumably holds a
# single pickled dict; allow_pickle must be the boolean True for that to load.
output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()
output = output_all['agents']['0']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

# NOTE(review): dill.load can execute arbitrary code while unpickling -- only
# point this at files produced by a trusted simulation run.
# The context manager guarantees the handle is closed even if loading fails.
with open(folder + 'agent_steps.pkl', 'rb') as f:
    agent = dill.load(f)

# Identifier arrays exposed by the listener steps; they are used to locate a
# specific gene / mRNA column in the listener outputs.
rna_listen = agent['rna_synth_prob_listener']
rna_count_listen = agent['RNA_counts_listener']
gene_ids = rna_listen.gene_ids
mrna_tu_ids = rna_count_listen.mRNA_TU_ids

# Label the bulk-count columns with the molecule ids from the initial state.
bulk.columns = bulk_ids

# Add the counts of unique molecules onto the listed bulk columns (suffix
# '[c]'). A unique-molecule key that is absent from the listener output is
# skipped silently.
for unique_key, bulk_id in [('active_ribosome', 'CPLX0-3962'), ('active_ribosome', 'CPLX0-3953'),
                            ('active_RNAP', 'APORNAP-CPLX')]:
    if unique_key in output['listeners']['unique_molecule_counts']:
        unique_count = output['listeners']['unique_molecule_counts'][unique_key]
        bulk.loc[:, bulk_id + '[c]'] += unique_count

In [4]:
# get specific indices
# NOTE(review): the two `==` comparisons below only yield element-wise boolean
# masks if gene_ids / mrna_tu_ids are NumPy arrays -- confirm against the
# listener attributes.
specific_gene_id = 'EG10282'
specific_gene_idx = gene_ids == specific_gene_id

specific_mrna_tu_id = 'TU0-14816[c]'
specific_mrna_tu_idx = mrna_tu_ids == specific_mrna_tu_id

# bulk_ids is a plain list, so .index() returns the integer position of the
# molecule and raises ValueError if the id is missing.
specific_protein_id = 'FRUCBISALD-CLASSII[c]'
specific_protein_idx = bulk_ids.index(specific_protein_id)

mete_protein_id = 'HOMOCYSMET-MONOMER[c]'
mete_protein_idx = bulk_ids.index(mete_protein_id)

rpme_protein_id = 'CPLX0-3962[c]'
# Look up the rpme id itself; the previous code passed mete_protein_id here,
# which made rpme_protein_idx a duplicate of mete_protein_idx.
rpme_protein_idx = bulk_ids.index(rpme_protein_id)

# Import complex_wcm_names and the C, P, and E matrices

In [5]:
# Names used further down to select the matching columns of the bulk table.
complex_wcm_names = read_names('notebooks/cofactors/data/complex_wcm_names.csv')

# The three matrices are loaded in sparse form, in the order C, P, E; later
# cells chain them as `counts @ C @ P @ E[:, element_indices]`.
C, P, E = (
    read_matrix(matrix_path, sparse=True)
    for matrix_path in (
        'notebooks/cofactors/data/C_matrix.csv',
        'notebooks/cofactors/data/P_matrix.csv',
        'notebooks/cofactors/data/E_matrix.csv',
    )
)

# Ids searched to find the column(s) of E for a given element.
element_ids = read_names("notebooks/cofactors/data/element_ids.txt")

# Save multi-generation timeseries

In [None]:
# Build one wide table of per-timestep series across the listed lineage files.
# For each file the first recorded timestep is dropped from every series, and
# the per-lineage tables are stacked in the order listed.

# The element lookup does not depend on the lineage, so resolve it once.
elements = ["ZN"]
element_indices = [np.where(element_ids == element)[0][0] for element in elements]

folder = 'out/cofactors/lineage/'

# Collect the per-lineage frames and concatenate once after the loop; this
# avoids re-copying the accumulated table on every iteration and removes the
# need for a first-iteration special case.
lineage_frames = []
for cur_lineage in ['lineage_0', 'lineage_2', 'lineage_4', 'lineage_6']:
    # The saved file is unwrapped with .item(), so it presumably holds a single
    # pickled dict; allow_pickle must be the boolean True for that to load.
    output = np.load(folder + cur_lineage + '.npy', allow_pickle=True).item()['agents']['0']

    mass = output['listeners']['mass']['cell_mass']
    bulk = pd.DataFrame(output['bulk'])
    gene_counts = np.array(output['listeners']['rna_synth_prob']['gene_copy_number'][1:])
    rna_counts = np.array(output['listeners']['rna_counts']['mRNA_counts'][1:])

    bulk.columns = bulk_ids

    # Add unique-molecule counts onto the listed bulk columns; keys missing
    # from the listener output are skipped silently.
    for unique_key, bulk_id in [('active_ribosome', 'CPLX0-3962'), ('active_ribosome', 'CPLX0-3953'),
                                ('active_RNAP', 'APORNAP-CPLX')]:
        if unique_key in output['listeners']['unique_molecule_counts']:
            unique_count = output['listeners']['unique_molecule_counts'][unique_key]
            bulk.loc[:, bulk_id + '[c]'] += unique_count

    # Every series skips the first timestep so all columns have equal length.
    mass = np.array(mass)[1:]
    gene_counts = gene_counts[:, specific_gene_idx].flatten()
    rna_counts = rna_counts[:, specific_mrna_tu_idx].flatten()
    protein_counts = bulk.loc[:, specific_protein_id].values[1:]

    mete_counts = bulk.loc[:, mete_protein_id].values[1:]
    rpme_counts = bulk.loc[:, rpme_protein_id].values[1:]

    # Chain the complex counts through C, P and the selected column(s) of E.
    # The multiplication order is kept left-to-right exactly as before.
    total_zn_counts = ((np.array(bulk.loc[1:, complex_wcm_names.tolist()])) @ C @ P @ E[:, element_indices]).flatten()

    # make a dataframe
    series_df = pl.DataFrame([mass, gene_counts, rna_counts, protein_counts, total_zn_counts, mete_counts, rpme_counts],
                             schema=['mass', 'gene_counts', 'rna_counts', 'protein_counts', 'total_zn_counts', 'mete_counts', 'rpme_counts'])
    lineage_frames.append(series_df)

all_series_df = pl.concat(lineage_frames, how='vertical')

# add row index timestep (runs continuously across the stacked lineages)
all_series_df = all_series_df.with_row_index(name='timestep')

In [8]:
# Reshape the wide table to long format: 'timestep' stays as the identifier
# and every other column becomes a ('variable', 'value') pair.
# NOTE(review): newer polars releases may prefer `unpivot` over `melt` --
# confirm against the installed version before switching.
melt_series_df = all_series_df.melt(
    id_vars=['timestep'],
    variable_name='variable',
    value_name='value',
)

# Persist the long-format table as CSV.
melt_series_df.write_csv('notebooks/cofactors/data/lineage_timeseries.csv')

# Minimal media

In [None]:
# Build one wide table of per-timestep series across the listed lineage files
# from the 'lineage-min' output folder. For each file the first recorded
# timestep is dropped from every series, and the per-lineage tables are
# stacked in the order listed.

# The element lookup does not depend on the lineage, so resolve it once.
elements = ["ZN"]
element_indices = [np.where(element_ids == element)[0][0] for element in elements]

folder = 'out/cofactors/lineage-min/'

# Collect the per-lineage frames and concatenate once after the loop; this
# avoids re-copying the accumulated table on every iteration and removes the
# need for a first-iteration special case.
lineage_frames = []
for cur_lineage in ['lineage_0', 'lineage_2', 'lineage_4']:
    # The saved file is unwrapped with .item(), so it presumably holds a single
    # pickled dict; allow_pickle must be the boolean True for that to load.
    output = np.load(folder + cur_lineage + '.npy', allow_pickle=True).item()['agents']['0']

    mass = output['listeners']['mass']['cell_mass']
    bulk = pd.DataFrame(output['bulk'])
    gene_counts = np.array(output['listeners']['rna_synth_prob']['gene_copy_number'][1:])
    rna_counts = np.array(output['listeners']['rna_counts']['mRNA_counts'][1:])

    bulk.columns = bulk_ids

    # Add unique-molecule counts onto the listed bulk columns; keys missing
    # from the listener output are skipped silently.
    for unique_key, bulk_id in [('active_ribosome', 'CPLX0-3962'), ('active_ribosome', 'CPLX0-3953'),
                                ('active_RNAP', 'APORNAP-CPLX')]:
        if unique_key in output['listeners']['unique_molecule_counts']:
            unique_count = output['listeners']['unique_molecule_counts'][unique_key]
            bulk.loc[:, bulk_id + '[c]'] += unique_count

    # Every series skips the first timestep so all columns have equal length.
    mass = np.array(mass)[1:]
    gene_counts = gene_counts[:, specific_gene_idx].flatten()
    rna_counts = rna_counts[:, specific_mrna_tu_idx].flatten()
    protein_counts = bulk.loc[:, specific_protein_id].values[1:]

    mete_counts = bulk.loc[:, mete_protein_id].values[1:]
    rpme_counts = bulk.loc[:, rpme_protein_id].values[1:]

    # Chain the complex counts through C, P and the selected column(s) of E.
    # The multiplication order is kept left-to-right exactly as before.
    total_zn_counts = ((np.array(bulk.loc[1:, complex_wcm_names.tolist()])) @ C @ P @ E[:, element_indices]).flatten()

    # make a dataframe
    series_df = pl.DataFrame([mass, gene_counts, rna_counts, protein_counts, total_zn_counts, mete_counts, rpme_counts],
                             schema=['mass', 'gene_counts', 'rna_counts', 'protein_counts', 'total_zn_counts', 'mete_counts', 'rpme_counts'])
    lineage_frames.append(series_df)

all_series_df = pl.concat(lineage_frames, how='vertical')

# add row index timestep (runs continuously across the stacked lineages)
all_series_df = all_series_df.with_row_index(name='timestep')

In [None]:
# Reshape the wide table to long format: 'timestep' stays as the identifier
# and every other column becomes a ('variable', 'value') pair.
# NOTE(review): newer polars releases may prefer `unpivot` over `melt` --
# confirm against the installed version before switching.
melt_series_df = all_series_df.melt(
    id_vars=['timestep'],
    variable_name='variable',
    value_name='value',
)

# Persist the long-format table as CSV.
melt_series_df.write_csv('notebooks/cofactors/data/lineage_min_timeseries.csv')