# Overview

Draw `n_likelihood_samples` flux samples from each model state, where the constraints (reaction bounds) defining each state are themselves random variables drawn from uniform priors.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import requests

import cobra
import cobra.test
from cobra.test import create_test_model
from cobra.util.solver import linear_reaction_coefficients
from cobra.flux_analysis import flux_variability_analysis
from cobra.sampling import sample

import numpy as np
import pandas as pd
import os
import re
from functools import reduce
import itertools
import time
from tqdm import tqdm
import pickle

import cobrapy_func as cf

def get_subsets(S, m):
    """Return every distinct size-``m`` combination of ``S`` as a list of lists.

    Combinations are generated in the order produced by
    ``itertools.combinations`` and duplicates (which arise when ``S`` itself
    contains repeated elements) are dropped, keeping the first occurrence.
    The result order is therefore deterministic, unlike iterating a ``set``.

    Elements of ``S`` must be hashable.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_combos = dict.fromkeys(itertools.combinations(S, m))
    return [list(combo) for combo in unique_combos]

In [None]:
# Load zbgem2 model
t0 = time.time()
fn_zgem2 = "/Users/don/Documents/mlardelli/data/zebragem_20200228_mod.xml"
model = cobra.io.read_sbml_model(fn_zgem2)
print(f"Loaded SBML file in {time.time() - t0:.2f}s")

# Print a short summary of the loaded model
print(f"num reactions = {len(model.reactions)}")
print(f"num metabs = {len(model.metabolites)}")
print(f"num genes = {len(model.genes)}")
print(f"num exchanges = {len(model.exchanges)}")
print("")
model.objective = "BIO_L_2"

# Load DE genes
d0 = pd.read_csv("/Users/don/Documents/mlardelli/data/de_genes.csv")

# For the genes in the DE list, get only those which are in ZBGEM.
# Membership is tested against a set so the filter is linear in the number
# of DE genes rather than (DE genes x model genes).
# NOTE(review): this compares the raw `ncbi_id` column values with the model's
# gene ids; if the two use different types (e.g. int vs str) nothing will
# match -- confirm the dtype of d0["ncbi_id"].
zbgem_gene_ls = [g.id for g in model.genes]
zbgem_gene_set = set(zbgem_gene_ls)

de_genes_ls = [ncbi_id for ncbi_id in d0["ncbi_id"] if ncbi_id in zbgem_gene_set]
print(f"Num. DE genes present in ZBGEM = {len(de_genes_ls)}")

de_df = d0.loc[d0["ncbi_id"].isin(de_genes_ls)]

# Get dict of reactions affected by DE genes (gene id -> list of reaction ids)
rxn_dict = {
    ncbi_id: [rxn.id for rxn in model.genes.get_by_id(ncbi_id).reactions]
    for ncbi_id in de_genes_ls
}

# Flat, de-duplicated list of every reaction touched by at least one DE gene.
# Sorted so that the order (and everything derived from it downstream) is
# reproducible between runs instead of depending on set iteration order.
affected_rxn_ls = sorted(
    {rxn_id for rxn_ids in rxn_dict.values() for rxn_id in rxn_ids}
)

# Get affected reactions AFTER checking GPR, regardless of fold change direction:
# keep a reaction only if cf.eval_gpr returns a truthy value for its
# gene-reaction rule given the DE gene list.
affected_rxn_ls2 = [
    rxn_id
    for rxn_id in affected_rxn_ls
    if cf.eval_gpr(model.reactions.get_by_id(rxn_id).gene_reaction_rule, de_genes_ls)
]

print(f"{len(affected_rxn_ls2)} reactions impacted by DE genes")

In [None]:
# List every affected reaction with its annotations, bounds and, for each of
# its genes, the DE direction recorded in d0 (0 when absent or ambiguous).
for r_id in affected_rxn_ls2:
    rxn = model.reactions.get_by_id(r_id)
    kegg_reaction_id = rxn.annotation.get("kegg.reaction")
    kegg_ec_id = rxn.annotation.get("ec-code")
    bounds = rxn.bounds

    gene_chunks = []
    for ncbi_id in [g.id for g in rxn.genes]:
        fc_direction_arr = d0.loc[d0["ncbi_id"]==ncbi_id].DE_Direction.values
        n_hits = len(fc_direction_arr)

        if n_hits == 1:
            fc_dir_val = fc_direction_arr[0]
        else:
            # No entry, or more than one: fall back to 0 and warn on duplicates
            fc_dir_val = 0
            if n_hits > 1:
                print(f"WARNING: Multiple entries found for NCBI Id:{ncbi_id}")

        gene_chunks.append(f"{ncbi_id}({str(fc_dir_val)}), ")

    printout_ln = f"{r_id}|{kegg_reaction_id}|{kegg_ec_id}|{bounds}: " + "".join(gene_chunks)
    print(printout_ln)

# Get Means of Priors by Running Unconstrained Model

In [None]:
# Optimise the model with its current bounds; the resulting fluxes are used
# downstream as the prior means.
model.objective = "BIO_L_2"
soln = model.optimize()

dv0 = soln.to_frame().reset_index().rename(columns={"index":"rxn_id"})
# Put contents of dv0 into a dictionary (reaction id -> flux) for faster lookup.
# Built in a single pass instead of re-filtering the DataFrame per reaction.
dv0_dict = dict(zip(dv0["rxn_id"], dv0["fluxes"]))

# Actual Modelling: Iteratively Changing Model Bounds, Resetting Each Time

* NB: def. *Shadow price* := change in objective func value per unit increase in the RHS of a constraint.
* Introduce informative priors to all reactions with nonzero baseline fluxes (sigma = 0.1)
* Introduce differential effects for affected reactions

In [None]:
# Start from a freshly loaded model
model = cobra.io.read_sbml_model(fn_zgem2)
model.objective = "BIO_L_2"

# Assume all affected reactions have FC down, except CTPS1, R1351_r, INSTt2r, CTPS2
fc_up_rxn_ls = ["CTPS1", "R1351_r", "INSTt2r", "CTPS2"]
n_iter = 100
n_likelihood_samples = 500

# Set output paths
path0 = "/Users/don/Documents/mlardelli"
#fn_out_posterior= path0+"/data/bayesian_mc_posterior_"+str(int(n_iter/1000))+".csv"
#fn_out_posterior_ofv = path0+"/data/bayesian_mc_posterior_ofv_"+str(int(n_iter/1000))+".csv"
fn_out_posterior = path0+"/data/bayesian_mc_posterior_test.csv"
fn_out_posterior_ofv = path0+"/data/bayesian_mc_posterior_ofv_test.csv"

fn_out_pickle = path0+"/data/bayesian_mc_"+str(int(n_iter/1000))+"k.p"
fn_out_plots = path0+"/plots/bayesian_mc_"+str(int(n_iter/1000))+"k_v2.pdf"

# Start simulation
super_contents = []
df_opt_ls = []
# One uniform(0, 1) draw per (affected reaction, iteration):
# rows index reactions, columns index iterations.
u_arr = np.random.uniform(0, 1, size=(len(affected_rxn_ls2), n_iter))
for model_idx in tqdm(range(n_iter)):
    # Bound changes are made inside the model's context manager so that each
    # iteration starts again from the original bounds.
    with model:
        # ===== SETTING HYPERPARAMS: change the bounds of affected reactions =====
        for rxn_idx, rxn_id in enumerate(affected_rxn_ls2):
            # Look the reaction up in the *current* model; the bounds must be
            # set on this object, not on one left over from an earlier cell.
            rxn = model.reactions.get_by_id(rxn_id)

            # set a prior that approximates DE
            # Multiply the prior mean by a uniform r.v.
            prior_mean = dv0_dict[rxn_id]
            u = u_arr[rxn_idx, model_idx]

            # Multiplier to increase or decrease bounds, based on fold change:
            # up-regulated reactions are scaled by (1 + u), all others by u.
            multiplier = 1.0 + u if rxn_id in fc_up_rxn_ls else u

            # Constrain the bound on the side the baseline flux lies on;
            # reactions with zero baseline flux are left untouched.
            if prior_mean > 0:
                rxn.upper_bound = prior_mean * multiplier
            elif prior_mean < 0:
                rxn.lower_bound = prior_mean * multiplier

        # full optimize: one-row frame of optimal fluxes, one column per reaction
        dt = (
            model.optimize()
            .to_frame()
            .reset_index()
            .rename(columns={"index":"rxn_id"})
            .drop(columns=["reduced_costs"])
            .set_index("rxn_id")
            .T
        )
        dt = dt.reset_index().drop(columns="index")
        df_opt_ls.append(dt)
        df_flux_samples = sample(model, n_likelihood_samples, method="achr")

        # Write out.
        # NOTE: rows are appended without a header, so the files carry no
        # column names and output from earlier runs accumulates in them.
        dt.to_csv(fn_out_posterior_ofv, mode='a', header=False, index=False)
        df_flux_samples.to_csv(fn_out_posterior, mode='a', header=False, index=False)
        


# Post-process simulation output

Deprecated: the simulation output is too large to process in memory. Use the other notebook, which processes it with `dask`, instead.

In [None]:
# The simulation cell appends its rows with header=False, so these files have
# no header row; with the default settings pandas would consume the first data
# row as column names. Supply the reaction ids explicitly instead.
# NOTE(review): assumes the CSV columns follow the order of model.reactions
# (the order of the solution / sampling frames that were written) -- confirm.
rxn_id_cols = [r.id for r in model.reactions]
d0 = pd.read_csv(path0+"/data/bayesian_mc_posterior_test.csv", header=None, names=rxn_id_cols)
d1 = pd.read_csv(path0+"/data/bayesian_mc_posterior_ofv_test.csv", header=None, names=rxn_id_cols)

In [None]:
# Display the frame loaded from bayesian_mc_posterior_ofv_test.csv as cell output.
d1

In [None]:
# plot
num_rows = 7
num_cols = 4
affected_rxn_ls2.sort()

fig, axarr = plt.subplots(num_rows, num_cols, figsize=(20, 2.4*num_rows))

idx = 0
for i in np.arange(num_rows):
    for j in np.arange(num_cols):
        arr = list(d0[affected_rxn_ls2[idx]])
        # Set colour
        if arr == [0]*n_iter:
            hist_colour = "#9CD0FF"
        else:
            hist_colour = "#156AB7"
        
        axarr[i, j].hist(arr, bins=100, color=hist_colour)
        axarr[i, j].set_yscale('log')
        
        kegg_annot = model.reactions.get_by_id(affected_rxn_ls2[idx]).annotation.get("kegg.reaction")
        ec_code = model.reactions.get_by_id(affected_rxn_ls2[idx]).annotation.get("ec-code")
        
        if kegg_annot is not None:
            my_subplot_title = affected_rxn_ls2[idx] + " (" + kegg_annot + ")\n" + str(ec_code)
        else:
            my_subplot_title = affected_rxn_ls2[idx] + "\n" + str(ec_code)
            
        axarr[i, j].set_title(my_subplot_title)
        
        idx +=1

fig.subplots_adjust(wspace=0.16, hspace=0.55)

#plt.savefig(path0+"/plots/bayesian_mc_"+str(int(n_iter/1000))+"k_test.pdf", bbox_inches="tight")

In [None]:
# Ad-hoc check: show the "ec-code" annotation of reaction R1174_m (None if it has none).
model.reactions.get_by_id("R1174_m").annotation.get("ec-code")