In [1]:
import statisUtils as utils
import pandas as pd
import numpy as np
import random
from scipy.stats import loguniform, lognorm
from scipy import stats
import time
import math
from scipy.stats import gaussian_kde

import ABCSampler as abcSampler
import os


In [2]:
N = 20 # initial N value; first argument to ABCSMCSampler (presumably the particle count per round - TODO confirm)
T = 5 # fixed T value; passed to ABCSMCSampler (presumably the number of SMC rounds - TODO confirm)
incr_N = 50 # additive step for N: the sweep does N += incr_N after each run (not a multiplicative factor)

MAX_N = 100 # maximum value for N (inclusive: the sweep runs while N <= MAX_N)

output_dir = 'test_gtm_samples' # root folder for the saved posterior .npy files and the estimations CSV

In [3]:
# Two count tables read without a header row; the first CSV column becomes the row index.
df_cast = pd.read_csv('data/cast_genes_count.csv', header=None, index_col=[0]) # gene x cells
df_c57 = pd.read_csv('data/c57_genes_count.csv', header=None, index_col=[0]) # gene x cells

# Element-wise sum of the two tables. pandas aligns on row and column labels,
# so any label present in only one table yields NaN in the result.
df = df_cast + df_c57

In [4]:
def get_df(result):
    """
    Turn a sampler result array into a DataFrame.

    `result` is flattened to one dimension and every flattened element becomes
    one row of the returned frame (e.g. dict elements give one column per key).
    """
    records = list(result.flatten())
    return pd.DataFrame(records)

In [5]:
def get_bf_and_bs(temp_df):
    """
    compute burst frequency and burst size from the parameters

    input is the dataframe containing the parameters as columns.
    need columns named 'kon', 'ron', 'koff', 'roff' and 'mu'

    The frame is modified in place and also returned, with four added columns:
        tau_off = kon / ron
        tau_on  = koff / roff
        bf      = 1 / (tau_on + tau_off)    (burst frequency)
        bs      = mu * tau_on               (burst size)

    Raises KeyError if a required column is missing. The check runs before any
    column is written, so a failing call leaves the caller's frame untouched.
    """
    required = ('kon', 'ron', 'koff', 'roff', 'mu')
    missing = [name for name in required if name not in temp_df]
    if missing:
        raise KeyError(f"get_bf_and_bs: missing required column(s): {missing}")

    temp_df['tau_off'] = temp_df['kon']/temp_df['ron']
    temp_df['tau_on'] = temp_df['koff']/temp_df['roff']
    temp_df['bf'] = 1./(temp_df['tau_on'] + temp_df['tau_off'])
    temp_df['bs'] = temp_df['mu'] * temp_df['tau_on']

    return temp_df

In [6]:
def get_log_bf_and_bs(temp_df):
    """
    Add the burst statistics and their base-10 logarithms to `temp_df`.

    First delegates to get_bf_and_bs, then appends the columns 'log_kon',
    'log_koff', 'log_bs' and 'log_bf' (in that order), each holding log10 of
    the matching source column. Returns the augmented frame.
    """
    temp_df = get_bf_and_bs(temp_df)

    for source in ('kon', 'koff', 'bs', 'bf'):
        temp_df['log_' + source] = [np.log10(value) for value in temp_df[source]]

    return temp_df

In [7]:
def best_parameters(temp_df, density_kernel_name=None, save_kde=False):
    """
    Return the row position of the highest-density parameter set.

    density_kernel_name: name of an existing column holding precomputed
        densities. When None, a gaussian kde is fitted on the ('bs', 'bf')
        columns and evaluated at those same points.
    save_kde: only used when the kde is recomputed. Any value other than
        False is used as the column name under which the densities are
        stored in temp_df (intended to be a str).

    If several rows share the maximum density, the first one is returned.
    """
    if density_kernel_name is not None:
        # Reuse densities that were stored on a previous call.
        density = temp_df[density_kernel_name]
    else:
        # Calculate the point density
        points = np.vstack([temp_df['bs'], temp_df['bf']])
        density = gaussian_kde(points)(points)
        if save_kde is not False:
            temp_df[save_kde] = density

    peak = np.amax(density)
    at_peak = np.where(density == peak)
    return at_peak[0][0]

In [8]:
# Gene analysed below; `row` is that gene's row of the summed count table `df`.
gene_name = 'Fn1'
row = df.loc[gene_name]

In [9]:
def lognpdf(x, mu, sigma):
    """
    Log-normal probability density evaluated at x.

    mu and sigma are the mean and standard deviation of log(x); they map onto
    scipy's lognorm as shape=sigma, loc=0, scale=exp(mu).
    """
    return lognorm.pdf(x, sigma, loc=0, scale=np.exp(mu))

In [10]:
# ---------------------------------------------------------------------------
# Loop-invariant setup. Nothing below depends on N, so it is built once
# instead of on every iteration of the sweep.
# ---------------------------------------------------------------------------

data = np.array(row)

# Keep the non-negative entries (this also drops NaN), then keep only the
# smallest 95% of the values and sort them.
gene = data[data>=0]
k = int(0.95*len(gene))
gene = gene[np.argpartition(gene, k)[:k]]
gene = np.sort(gene)

# Summary moments of the trimmed data. They are not passed to the sampler;
# they are kept as notebook-level variables for inspection.
data_mean = np.mean(gene)
data_var = np.var(gene)
data_noise = data_var/(data_mean**2)

statis_data = utils.statisData(gene)

# Distance handed to the sampler: Euclidean norm of the element-wise
# log-ratio between the observed statistics and the candidate statistics `s`.
rho = lambda s: np.sqrt(
            np.sum(
                np.log(statis_data/(s))**2
            )
)

# Statistics of the model for a parameter vector; the lambda's `k` is its own
# parameter and is unrelated to the trimming count `k` above.
f = lambda k: utils.statisGTM(k,4)
epsilon = 1

# NOTE: the prior mixes numpy's and the stdlib's global RNGs and neither is
# seeded here, so repeated runs draw different samples.
prior = lambda: np.array([
        5 * np.random.uniform(), # kon ~ U[0,5]
        10**random.uniform(-1,1), # ron ~ logU[-1,1] base 10
        5 * np.random.uniform(), # koff ~ U[0,5]
        10**random.uniform(-1,1), # roff ~ logU[-1,1] base 10
        100 * np.random.uniform(), # rsyn ~ U[0,100] as coded; the original note says the paper assumes an upper bound of 50
        1]) # rdeg=1

proposal_sigma = 0.2

# np.random.lognormal takes the mean of the underlying normal, so `x` is used
# on the log scale here, while proposal_pdf applies np.log to its *_prior
# arguments itself.
# NOTE(review): presumably ABCSMCSampler passes log-parameters to `proposal` - confirm.
proposal = lambda x: np.random.lognormal(x,proposal_sigma)

# Product of five independent log-normal densities, one per sampled parameter.
proposal_pdf = lambda kon_post,kon_prior,ron_post,ron_prior,koff_post,koff_prior,roff_post,roff_prior,mu_post,mu_prior: \
lognpdf(mu_post,np.log(mu_prior),proposal_sigma) * lognpdf(kon_post,np.log(kon_prior),proposal_sigma) \
* lognpdf(ron_post,np.log(ron_prior),proposal_sigma) * lognpdf(koff_post,np.log(koff_prior),proposal_sigma) \
* lognpdf(roff_post,np.log(roff_prior),proposal_sigma)

# Output locations. The directories are created up front so that np.save and
# to_csv do not fail when the notebook is run from a fresh checkout.
posterior_dir = f'{output_dir}/posterior'
estimations_csv = f'{output_dir}/gtm_N_T_estimations.csv'
os.makedirs(posterior_dir, exist_ok=True)

# ---------------------------------------------------------------------------
# Sweep over N: one sampler run per value, from the current N up to MAX_N.
# ---------------------------------------------------------------------------
while N <= MAX_N:

    # The recorded time covers the sampler call and saving the posterior.
    start_processing = time.time()
    print(f"Start Processing gene: {gene_name}, N= {N}, T= {T}")

    result, flag = abcSampler.ABCSMCSampler(N,prior,f,rho,epsilon,T,proposal,proposal_pdf,gene_name)

    np.save(f'{posterior_dir}/gene_{gene_name}_N{N}_T{T}.npy', np.array([gene_name, gene, result], dtype=object))

    end_processing = time.time()
    total_processing = end_processing - start_processing
    print(f"Finish Processing gene: {gene_name}, N= {N}, T= {T}, time= {total_processing}")
    print()

    # save best params
    result_df = get_df(result)
    result_df = get_log_bf_and_bs(result_df)

    bp = best_parameters(result_df, save_kde='density_kernel')
    bp_data = result_df.iloc[bp].copy()

    # One-row frame labelled with this run's settings.
    bp_data = bp_data.to_frame().transpose()
    bp_data.index = [f'N{N}_T{T}']

    # save number of samples and process time
    num_samples = N * T
    bp_data['num_samples'] = num_samples
    bp_data['process_time'] = total_processing

    # save the data: append to the estimations CSV if it exists, else create it
    if os.path.exists(estimations_csv):
        df_NT = pd.read_csv(estimations_csv, index_col=[0])
        df_new = pd.concat([df_NT, bp_data])
        df_new.to_csv(estimations_csv)
    else:
        bp_data.to_csv(estimations_csv)

    N += incr_N
print("End")

Start Processing gene: Fn1, N= 20, T= 5
Finish Processing gene: Fn1, N= 20, T= 5, time= 9.617774486541748

Start Processing gene: Fn1, N= 70, T= 5
Finish Processing gene: Fn1, N= 70, T= 5, time= 79.90861773490906

