In [65]:
import polars as pl
import numpy as np
import os
import json
import pandas as pd
from scipy.sparse import csr_matrix
# Work from the repository checkout so the relative paths used in later cells resolve.
repo_root = os.path.expanduser('~/vivarium-ecoli')
os.chdir(repo_root)


In [66]:
def read_names(file_path):
    """Read a text file with one entry per line into an array of strings.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    numpy.ndarray
        String array produced by ``np.genfromtxt`` using a newline delimiter.
    """
    names = np.genfromtxt(fname=file_path, delimiter="\n", dtype=str)
    return names

def read_matrix(file_path, sparse=False, separator=","):
    """Read a header-less delimited text file into a matrix.

    Parameters
    ----------
    file_path : str or path-like
        Location of the delimited file.
    sparse : bool, default False
        When True, wrap the parsed values in a ``scipy.sparse.csr_matrix``;
        otherwise return the dense NumPy array.
    separator : str, default ","
        Single-character field delimiter handed to ``pl.read_csv``. The default
        keeps the previous comma-separated behaviour; pass ``"\t"`` for TSV files.

    Returns
    -------
    numpy.ndarray or scipy.sparse.csr_matrix
    """
    dense = pl.read_csv(file_path, has_header=False, separator=separator).to_numpy()
    return csr_matrix(dense) if sparse else dense


Cmatrix = read_matrix("notebooks/mia notebooks/C_matrix.csv", sparse=True)
complex_ids = list(read_names("notebooks/mia notebooks/complex_ids.txt"))
monomer_ids = list(read_names("notebooks/mia notebooks/monomer_ids.txt"))
ecocyc_data = pd.read_excel("notebooks/mia notebooks/All_polypeptides_from_ecocyc_data.xlsx")
# The validation file is tab-separated, so it must not be parsed with the comma default
# (that would collapse every row into a single column).
# NOTE(review): has_header=False keeps any header line as the first data row — confirm
# whether this file has a header or comment lines that should be skipped.
validation_data = read_matrix(
    "validation/ecoli/flat/li_protein_synthesis_rates_2014.tsv", separator="\t"
)

In [67]:
# Identify the simulation run to analyse; `folder` is the output directory derived from these parts.
experiment = 'validation_experiment'
date = '2024-04-23'
time = '1300'
entry = '_'.join((experiment, time, date))
folder = 'out/cofactors/' + entry + '/'

In [68]:
# Load the saved simulation output (a pickled dict stored inside a .npy file).
# NOTE: allow_pickle=True executes pickle on load — only open files from a trusted source.
output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
output = output_all['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pl.DataFrame(output['bulk'])

# The first entry of the flux record is dropped here; the exchange record is kept whole.
fluxes = np.array(fba['estimated_fluxes'][1:])
exchanges = fba['estimated_exchange_dmdt']

ans = output['listeners']['unique_molecule_counts']['active_ribosome']

In [69]:
# Recover the molecule names for the bulk columns from the initial simulation state.
# The file is opened in a context manager so the handle is closed once parsing finishes.
with open('data/wcecoli_t0.json') as initial_state_file:
    initial_state = json.load(initial_state_file)

# The first element of each bulk entry is used as the molecule id.
bulk_ids = [item[0] for item in initial_state['bulk']]

# Label the bulk count columns with those ids (modifies `bulk` in place).
bulk.columns = bulk_ids

In [70]:
# Strip the protein [location] suffix from each bulk id by dropping its last three characters.
# NOTE(review): assumes every bulk id ends in a 3-character tag such as "[c]" — confirm.
# Comprehensions are used so no module-level name shadows the builtin `id`.
ecocyc_ids = [bulk_id[:-3] for bulk_id in bulk_ids]

# Positional index of every entry in ecocyc_ids (0 .. len - 1).
ecocyc_id_idxs = list(range(len(ecocyc_ids)))


In [71]:
# Group the bulk indices belonging to each protein of interest (option 2).
# Lookup tables between full bulk ids (with location tag) and their column index.
id_idx_dict = {bulk_id: idx for idx, bulk_id in enumerate(bulk_ids)}
idx_id_dict = {idx: bulk_id for bulk_id, idx in id_idx_dict.items()}

# Map each location-stripped id to every bulk index that carries it, built in one pass.
# This replaces a per-protein substring scan followed by removing items from the list
# while iterating over it, which skipped elements and left non-matching indices behind.
idxs_by_ecocyc_id = {}
for idx, ecocyc_id in enumerate(ecocyc_ids):
    idxs_by_ecocyc_id.setdefault(ecocyc_id, []).append(idx)

protein_ids = complex_ids + monomer_ids
all_names = []
name_groups = []
idx_groups = []

# NOTE: repeated ids in protein_ids are only recorded once, so all_names / name_groups /
# idx_groups line up with each other, and with protein_ids only when it has no duplicates.
seen_names = set()
for p in protein_ids:
    if p in seen_names:
        continue
    seen_names.add(p)
    # Only exact matches on the stripped id count as this protein.
    indexes = list(idxs_by_ecocyc_id.get(p, []))
    name_group = [idx_id_dict.get(idx) for idx in indexes]
    all_names.append(p)
    # Each group is wrapped in a one-element list, the shape the counting cell expects.
    name_groups.append([name_group])
    idx_groups.append([indexes])

In [72]:
# Fuse and collect the counts of the proteins of interest (those in the C matrix):
# one row per row of `bulk`, one column per entry of protein_ids.
bulk_interest_protein_counts = np.zeros([bulk.shape[0], len(protein_ids)])
bulk_PC_dict = {}

# Column of each protein = its first position in protein_ids. Looking the column up by
# name keeps the output aligned with protein_ids even if duplicate ids were skipped when
# idx_groups was built.
column_of = {}
for col, name in enumerate(protein_ids):
    column_of.setdefault(name, col)

for protein_id, idxs in zip(all_names, idx_groups):
    # idx_groups wraps every index list in an outer list, so flatten it; keep only the
    # indices whose location-stripped id is exactly this protein.
    member_idxs = [
        idx for group in idxs for idx in group if ecocyc_ids[idx] == protein_id
    ]
    if not member_idxs:
        # Protein has no bulk column: its counts stay at zero.
        continue
    member_columns = [bulk_ids[idx] for idx in member_idxs]
    # Sum across the matching bulk columns so every matching entry contributes
    # (previously only the first index was ever used).
    counts = bulk.select(member_columns).to_numpy().sum(axis=1)
    bulk_interest_protein_counts[:, column_of[protein_id]] = counts
    bulk_PC_dict[protein_id] = idxs

In [73]:
# Matrix product of the dense count array with Cmatrix (loaded earlier as a scipy CSR matrix).
# NOTE(review): presumably the rows of Cmatrix follow the order of protein_ids — confirm.
sim_protein_counts = bulk_interest_protein_counts @ Cmatrix

In [75]:
# Pull the protein and gene columns out of the EcoCyc spreadsheet for the lookups below.
EC_proteins = ecocyc_data["Proteins"]
EC_genes = ecocyc_data["Genes"]

In [85]:
# Find the gene name for each monomer in the bulk data.
# Forward and inverse lookups over the EcoCyc rows. The inverse tables are built from the
# EcoCyc columns themselves (they were previously built from the bulk id table by mistake).
EC_id_idx_dict = {protein: row for row, protein in enumerate(EC_proteins)}
EC_idx_id_dict = dict(enumerate(EC_proteins))

EC_Gene_idx_dict = {gene: row for row, gene in enumerate(EC_genes)}
EC_idx_Gene_dict = dict(enumerate(EC_genes))

# Object array so it can hold gene names; entries stay None when no EcoCyc row matches.
# (The old float-zero array made the `gene not in BD_gene_ids` guard always false, so the
# loop body never ran.)
BD_gene_ids = np.full(len(monomer_ids), None, dtype=object)
unmatched_monomers = []
for i, monomer in enumerate(monomer_ids):
    # Prefer an exact protein match; otherwise fall back to the first EcoCyc protein
    # entry that contains the monomer id as a substring.
    row = EC_id_idx_dict.get(monomer)
    if row is None:
        row = next(
            (
                candidate_row
                for protein, candidate_row in EC_id_idx_dict.items()
                if isinstance(protein, str) and monomer in protein  # skips blank (NaN) cells
            ),
            None,
        )
    if row is None:
        # Keep track of misses instead of raising on an empty match list.
        unmatched_monomers.append(monomer)
        continue
    BD_gene_ids[i] = EC_idx_Gene_dict[row]

len(BD_gene_ids)
	

4434

In [None]:
#TODO: find the gene id for each monomer in the validation data set

In [None]:
# Display the array loaded earlier from the li_protein_synthesis_rates_2014.tsv validation file.
validation_data

In [None]:
# Alias only: validation_proteins refers to the same object as validation_data (no copy is made).
validation_proteins = validation_data
