In [1]:
import pickle

from    acevedo_clss_and_fcns import * 
from    cobra.io.mat import *

# Load the metabolic model from a MATLAB .mat file (load_matlab_model comes from cobra.io.mat).
model               = load_matlab_model("./COBRA_models/GEM_Recon3_thermocurated_redHUMAN_AA.mat")
# Concentration table; later cells read a "label" column and per-metabolite columns from it.
concentration_data        = pd.read_parquet("./results/dataframes/concentrations/augmented_metabolite_data_v2.parquet.gzip")
# Translation table; later cells use its "Simbolo_traductor" and "Recon3_ID" columns to rename concentration columns.
feature_names       = pd.read_csv("./metabolites_data/metabolite_names.csv")
# Graph derived from the model by a helper that is not defined in this notebook
# (presumably provided by acevedo_clss_and_fcns — confirm what node/edge attributes it sets).
grafo_nx            = cobra_to_networkx(model)

  from .autonotebook import tqdm as notebook_tqdm
No defined compartments in model model. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, g, i, l, m, n, r, x


In [2]:
# Each combined column is copied into two individually named columns and then
# dropped (presumably metabolite pairs reported as one measurement — confirm).
for _joint_col, _split_cols in (
    ('Leu.Ile',    ('Leu',  'Ile')),
    ('C4OH.C3DC',  ('C3DC', 'C4OH')),
    ('C5.OH.C4DC', ('C4DC', 'C5OH')),
):
    for _new_col in _split_cols:
        concentration_data[_new_col] = concentration_data[_joint_col]
    concentration_data.drop(_joint_col, axis=1, inplace=True)

# Every symbol in the translation table must now exist as a column.
_missing_symbols = set(feature_names.Simbolo_traductor) - set(concentration_data.columns)
assert len(_missing_symbols) == 0

node_list = list(grafo_nx.nodes)

# Rename the columns from translation-table symbols to their Recon3 ids.
_symbol_to_recon3 = feature_names.set_index("Simbolo_traductor")["Recon3_ID"].to_dict()
concentration_data.rename(columns=_symbol_to_recon3, inplace=True)

# Apart from "label", every column must correspond to a node of the graph.
_feature_columns = set(concentration_data.columns) - {"label"}
assert _feature_columns.issubset(set(grafo_nx.nodes))

# Give every edge the same weight (1) and verify that only one distinct
# weight value exists afterwards.
w = dict.fromkeys(grafo_nx.edges(), 1)
nx.set_edge_attributes(grafo_nx, w, "weight")

_edge_weights = list(nx.get_edge_attributes(grafo_nx, "weight").values())
assert len(np.unique(_edge_weights)) == 1

In [3]:
# Pre-computed flux sample tables, one per condition. Later cells read reaction
# columns such as "r0399" from them.
# NOTE(review): the "_7_000" suffix presumably refers to the number of samples — confirm.
flux_samples_CONTROL_7_000  = pd.read_parquet("./results/dataframes/fluxes/flux_samples_CONTROL_7_000.parquet.gzip")
flux_samples_PKU_7_000      = pd.read_parquet("./results/dataframes/fluxes/flux_samples_PKU_7_000.parquet.gzip")

In [4]:
def get_sample_subset(full_samples, concentration_data, label, random_state=None):
    """Draw as many rows from ``full_samples`` as ``concentration_data`` has rows with ``label``.

    Rows are drawn with replacement, so the requested count may exceed
    ``len(full_samples)``. The input frames are not modified.

    Parameters
    ----------
    full_samples : pandas.DataFrame
        Pool of rows to draw from.
    concentration_data : pandas.DataFrame
        Frame with a ``"label"`` column; the number of rows carrying ``label``
        determines how many rows are drawn.
    label : hashable
        Label value to count in ``concentration_data`` and to write into the
        ``"label"`` column of the result.
    random_state : int, numpy.random.Generator, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        original unseeded behaviour; pass a seed to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index and a constant ``"label"``
        column (an existing ``"label"`` column would be overwritten).

    Raises
    ------
    KeyError
        If ``label`` does not occur in ``concentration_data["label"]``.
    """
    label_counts = concentration_data["label"].value_counts()
    if label not in label_counts.index:
        raise KeyError(f"label {label!r} not found in concentration_data['label']")

    n_rows = int(label_counts.loc[label])
    sample_subset = (
        full_samples
        .sample(n_rows, replace=True, random_state=random_state)
        .reset_index(drop=True)
    )
    sample_subset["label"] = label

    return sample_subset


# Draw one flux sample per concentration row of each class (0 and 1).
flux_samples_CONTROL = get_sample_subset(flux_samples_CONTROL_7_000, concentration_data, 0)
flux_samples_PKU     = get_sample_subset(flux_samples_PKU_7_000,     concentration_data, 1)

# Sanity checks on reaction r0399: its largest sampled flux must exceed 30 in
# the label-0 set and stay below 1 in the label-1 set.
# NOTE(review): r0399 is presumably the reaction impaired in PKU — confirm.
assert flux_samples_CONTROL["r0399"].max() > 30
assert flux_samples_PKU["r0399"].max() < 1
assert all(flux_samples_CONTROL.columns == flux_samples_PKU.columns)

# Align the column order of both frames, stack them and renumber the rows.
flux_samples_CONTROL = flux_samples_CONTROL.reindex(columns=flux_samples_PKU.columns)
flux_samples = (
    pd.concat([flux_samples_CONTROL, flux_samples_PKU], axis=0)
    .reset_index(drop=True)
)

# The stacked frame must keep the column count, match the concentration table
# row for row in size, and preserve the per-class r0399 separation on average.
assert len(flux_samples.columns) == len(flux_samples_CONTROL.columns)
assert len(concentration_data) == len(flux_samples)
assert flux_samples.loc[flux_samples["label"] == 0, "r0399"].mean() > 30
assert flux_samples.loc[flux_samples["label"] == 1, "r0399"].mean() < 1


In [5]:
def get_largest_cc(G):
    """Return a deep copy of the part of ``G`` spanned by its largest connected component.

    Connectivity is determined on ``nx.Graph(G)``. The result is a new graph of
    the same class as ``G`` holding the component's nodes (with their
    attributes), every edge of ``G`` whose two endpoints are in the component
    (with its attributes), and the graph-level attributes of ``G``.

    The function asserts that the result is undirected and connected, so it
    fails with ``AssertionError`` for a directed ``G``.
    """
    biggest_component = max(nx.connected_components(nx.Graph(G)), key=len)

    # Start from an empty graph of the same type and copy the relevant parts.
    sub = G.__class__()
    sub.add_nodes_from((node, G.nodes[node]) for node in biggest_component)
    sub.add_edges_from(
        (src, dst, attrs)
        for src, neighbours in G.adj.items()
        if src in biggest_component
        for dst, attrs in neighbours.items()
        if dst in biggest_component
    )
    sub.graph.update(G.graph)

    # Consistency checks: the result is no larger than G, covers exactly the
    # chosen component, is undirected, and is connected.
    assert len(G.nodes) >= len(sub.nodes)
    assert len(G.edges) >= len(sub.edges)
    assert len(sub.nodes) == len(biggest_component)
    assert not sub.is_directed()
    assert nx.is_connected(nx.Graph(sub))

    # Deep copy so attribute objects are not shared with G.
    return copy.deepcopy(sub)


# Id prefixes of the metabolites to delete from the graph. Each prefix ends in
# "_" so that appending a compartment key yields a "<prefix><compartment>" node
# id such as "atp_c".
# Fixed: the calcium entry was "ca2" (no trailing underscore), which produced
# ids like "ca2c" that cannot match that pattern and were silently skipped by
# remove_nodes_from.
# NOTE(review): "gdt_" looks like a typo for "gdp_" (it sits next to "gtp_",
# mirroring the "adp_"/"atp_" pair) — confirm against the model and correct.
mets = ['nad_', 'nadh_', "nadp_", "nadph_", "adp_", "atp_", "gdt_", "gtp_",
        "pi_", "ppi_", "pppi_", "co2_", "hco3_", "h2o_", "h2o2_", "h_", "o2_", "oh1_",
        "o2s_", "fad_",  "fadh2_", "nh4_", "so3_", "so4_", "cl_", "k_", "na1_",
        "i_", "fe2_", "fe3_", "mg2_", "ca2_", "zn2_", "M02382_"]

# One candidate node id per (prefix, compartment) pair. Ids that do not exist
# in the graph are ignored by remove_nodes_from.
to_remove = [
    prefix + compartment
    for prefix in mets
    for compartment in model.compartments.keys()
]

grafo_nx.remove_nodes_from(to_remove)

# Removing nodes can split the graph; keep only its largest connected component.
grafo_nx = get_largest_cc(grafo_nx)

# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump with the highest protocol, so this produces the same kind of file
# on every NetworkX version.
with open("./results/graphs/NX_recon_graph.gpickle", "wb") as _graph_file:
    pickle.dump(grafo_nx, _graph_file, pickle.HIGHEST_PROTOCOL)


In [6]:
# Order both tables by class label and renumber their rows so that row i of
# one table carries the same label as row i of the other.
concentration_data = concentration_data.sort_values(by=['label']).reset_index(drop=True)
flux_samples = flux_samples.sort_values(by=['label']).reset_index(drop=True)
assert all(concentration_data["label"] == flux_samples["label"])

# Keep the labels aside, then strip them from both feature tables.
Labels = flux_samples["label"]

flux_samples = flux_samples.drop(columns="label")
concentration_data = concentration_data.drop(columns="label")

In [20]:
# Placeholder feature table: one row per sample, one column per graph node,
# every cell set to the constant 1e-10.
_graph_nodes = list(grafo_nx.nodes)
_blank_shape = (len(concentration_data), len(_graph_nodes))
blank_features = pd.DataFrame(
    np.full(_blank_shape, 1e-10),
    columns=_graph_nodes,
).reset_index(drop=True)

# All three tables must have the same number of rows, and every concentration
# column must be a node of the graph.
assert len(blank_features) == len(concentration_data) == len(flux_samples)
assert set(concentration_data.columns).issubset(set(_graph_nodes))





In [10]:
# Convert each table to a plain dict of the form {column name: list of values}.
blank_features_dict = blank_features.to_dict(orient="list")  
flux_samples_dict   = flux_samples.to_dict(orient="list")  
concentrations_dict = concentration_data.to_dict(orient="list")  

In [60]:
import copy

def update_df_features(base: dict, new: dict):
    """Return a copy of ``base`` in which keys shared with ``new`` take ``new``'s values.

    Only keys already present in ``base`` appear in the result, in ``base``'s
    key order; keys that exist only in ``new`` are ignored.

    Every value in the result is an independent deep copy, so mutating the
    result never affects ``base`` or ``new``. (Previously values taken from
    ``new`` were shared by reference, and ``base`` entries that were about to
    be overwritten were deep-copied for nothing.)

    Parameters
    ----------
    base : dict
        Mapping providing the key set and the default values.
    new : dict
        Mapping providing replacement values for the keys it shares with ``base``.

    Returns
    -------
    dict
        New dictionary with the same keys as ``base``.
    """
    return {
        key: copy.deepcopy(new[key] if key in new else base_value)
        for key, base_value in base.items()
    }


# Two feature dictionaries keyed like blank_features_dict: entries whose key
# also appears in the concentration (resp. flux) dictionary take that
# dictionary's values; all other entries keep the blank placeholder values.
features_only_concentrations_dict = update_df_features(blank_features_dict, concentrations_dict)
features_only_fluxes_dict         = update_df_features(blank_features_dict, flux_samples_dict)



In [75]:
def new_nx_from_dict(nx_G_in, feature_dict):
    """Return a deep copy of ``nx_G_in`` with ``feature_dict`` stored as node attribute ``'x'``.

    ``feature_dict`` is handed to ``nx.set_node_attributes`` and is expected to
    map node ids to their feature values. The input graph is left untouched.

    After assignment the nodes ``'r0399'`` and ``'phe_L_c'`` are used as spot
    checks: their ``'x'`` values must have the same length and must equal the
    corresponding entries of ``feature_dict``. (The length comparison used to
    be a bare expression whose result was discarded; it is now asserted.)

    Parameters
    ----------
    nx_G_in : networkx graph
        Graph to copy; must contain the nodes ``'r0399'`` and ``'phe_L_c'``.
    feature_dict : dict
        Node id -> feature value; must contain ``'r0399'`` and ``'phe_L_c'``.

    Returns
    -------
    networkx graph
        The annotated copy.

    Raises
    ------
    KeyError
        If a spot-check node is missing from the graph or from ``feature_dict``.
    AssertionError
        If a spot check fails.
    """
    nx_G = copy.deepcopy(nx_G_in)
    nx.set_node_attributes(nx_G, feature_dict, 'x')

    node_data = nx_G.nodes(data=True)
    assert len(node_data['r0399']['x']) == len(node_data['phe_L_c']['x'])
    assert node_data['phe_L_c']['x'] == feature_dict['phe_L_c']
    assert node_data['r0399']['x'] == feature_dict['r0399']

    return nx_G


# Graph copy whose node attribute 'x' holds the concentration-only features.
nx_features_only_concentrations = new_nx_from_dict(grafo_nx, features_only_concentrations_dict)

# Read the attribute back as a table with one column per node.
conc_df = pd.DataFrame(nx.get_node_attributes(nx_features_only_concentrations, 'x'))

# The columns whose absolute column sum reaches 2e-6 must be exactly the keys
# of concentrations_dict.
_column_totals = conc_df.sum()
_non_blank_nodes = _column_totals[_column_totals.abs() >= 2e-6].index
assert set(_non_blank_nodes) == set(concentrations_dict)


In [65]:

# Graph copy whose node attribute 'x' holds the flux-only features.
nx_features_only_fluxes         = new_nx_from_dict(grafo_nx, features_only_fluxes_dict)  

# Read the 'x' attribute back as a table with one column per node.
flux_df = pd.DataFrame(
nx.get_node_attributes(nx_features_only_fluxes, 'x'))

In [78]:
# Ids of every reaction and metabolite in the model.
rxn_list_recon: list[str] = [rxn.id for rxn in model.reactions]
met_list_recon: list[str] = [met.id for met in model.metabolites]

# Split the graph into its two bipartite node sets. The larger set is taken to
# be the reactions; the assertions below verify that choice against the model.
first_partition, second_partition = bipartite.sets(grafo_nx)

if len(first_partition) > len(second_partition):
    rxn_partition, met_partition = first_partition, second_partition
else:
    rxn_partition, met_partition = second_partition, first_partition

_model_rxn_ids = set(rxn_list_recon)
_model_met_ids = set(met_list_recon)
assert set(rxn_partition).issubset(_model_rxn_ids) and set(met_partition).issubset(_model_met_ids)
assert len(set(rxn_partition) - _model_rxn_ids) == 0
assert len(set(met_partition) - _model_met_ids) == 0

# Boolean masks derived from the "bipartite" node attribute: truthy values go
# to mask_rxns, the complement to mask_mets.
# NOTE(review): assumes a truthy "bipartite" value marks reaction nodes — confirm in cobra_to_networkx.
partition_list = np.array(list(nx.get_node_attributes(grafo_nx, "bipartite").values()))
mask_rxns = partition_list.astype(bool)
mask_mets = ~partition_list.astype(bool)

In [73]:
# Show the columns of flux_df whose column sum has an absolute value below 2e-7.
flux_df.sum().loc[lambda x: abs(x)<2e-7]  # append .index.tolist() to get only the column names

EX_pchol_hs_e       0.000000e+00
ATPH1e              0.000000e+00
LCAT2e              0.000000e+00
LCAT30e             0.000000e+00
CAMPt               0.000000e+00
                        ...     
EX_g3pc_e           0.000000e+00
EX_gmp_e            0.000000e+00
EX_gpi_sig_e        3.767842e-09
PCHOLEIC_HSt1e      0.000000e+00
PCHOLN183_HSPLA2    0.000000e+00
Length: 914, dtype: float64