In [1]:
import numpy as np
from scipy import stats
from numpy import mean
from numpy.random import beta, poisson
from scipy.special import j_roots
from scipy.special import beta as beta_fun
from matplotlib import pyplot as plt
import pandas as pd
import anndata as ad

from joblib import delayed, Parallel

import warnings
warnings.filterwarnings('ignore')

In [2]:
adata = ad.read('../data/processed/mus_musculus_preprocessed.h5ad')

In [3]:
cells = len(adata.obs.index)
cells

682

In [4]:
# moment-based inference
def MomentInference(vals, export_moments=False):
    # code from Anton Larsson's R implementation
    from scipy import stats # needs imports inside function when run in ipyparallel
    import numpy as np
    m1 = float(np.mean(vals))
    m2 = float(sum(vals*(vals - 1))/len(vals))
    m3 = float(sum(vals*(vals - 1)*(vals - 2))/len(vals))
    
    # sanity check on input (e.g. need at least on expression level)
    if sum(vals) == 0: return np.nan
    if m1 == 0: return np.nan
    if m2 == 0: return np.nan
    
    r1=m1
    r2=m2/m1
    r3=m3/m2
    
    if (r1*r2-2*r1*r3 + r2*r3) == 0: return np.nan
    if ((r1*r2 - 2*r1*r3 + r2*r3)*(r1-2*r2+r3)) == 0: return np.nan
    if (r1 - 2*r2 + r3) == 0: return np.nan
        
    lambda_est = (2*r1*(r3-r2))/(r1*r2-2*r1*r3 + r2*r3)
    mu_est = (2*(r3-r2)*(r1-r3)*(r2-r1))/((r1*r2 - 2*r1*r3 + r2*r3)*(r1-2*r2+r3))
    v_est = (2*r1*r3 - r1*r2 - r2*r3)/(r1 - 2*r2 + r3)
    
    if export_moments:
        return np.array([lambda_est, mu_est, v_est, r1, r2, r3])
    
    return np.array([lambda_est, mu_est, v_est])
def MaximumLikelihood(vals, export_asymp_ci = False, fix = 0, export_fun = False):
    from scipy.interpolate import interp1d
    from scipy.optimize import minimize
    from scipy import special
    from scipy.stats import poisson,norm
    from scipy.special import j_roots
    from scipy.special import beta as beta_fun    
    import numpy as np
    if len(vals) == 0:
        return np.array([np.nan, np.nan, np.nan])
    def dBP(at, alpha, bet, lam):
        at.shape = (len(at), 1)
        np.repeat(at, 50, axis = 1)
        def fun(at, m):
            if(max(m) < 1e6):
                return(poisson.pmf(at,m))
            else:
                return(norm.pdf(at,loc=m,scale=sqrt(m)))
        
        x,w = j_roots(50,alpha = bet - 1, beta = alpha - 1)
        gs = np.sum(w*fun(at, m = lam*(1+x)/2), axis=1)
        prob = 1/beta_fun(alpha, bet)*2**(-alpha-bet+1)*gs
        return(prob)
    def LogLikelihood(x, vals):
        kon = x[0]
        koff = x[1]
        ksyn = x[2]
        return(-np.sum(np.log( dBP(vals,kon,koff,ksyn) + 1e-10) ) )
    x0 = MomentInference(vals)
    if np.isnan(x0).any():
        x0 = np.array([10,10,10])
    bnds = ((1e-3,1e3),(1e-3,1e3), (1, 1e10))
    vals_ = np.copy(vals) # Otherwise the structure is violated.
    try:
        ll = minimize(LogLikelihood, x0, args = (vals_), method='L-BFGS-B', bounds = bnds)
    except:
        if export_fun:
            return np.array([np.nan,np.nan,np.nan]), np.nan
        return np.array([np.nan,np.nan,np.nan])
    #se = ll.hess_inv.todense().diagonal()
    if export_fun:
        return ll.x, ll.fun
    estim = ll.x
    return estim

In [5]:
def dBP(at, alpha, bet, lam):
    at.shape = (len(at),1)
    np.repeat(at, 50, axis = 1)
    def fun(at, m):
        if(max(m) < 1e6):
            return(stats.poisson.pmf(at,m))
        else:
            return(stats.norm.pdf(at,loc=m,scale=sqrt(m)))
    if alpha <= 0 or bet <= 0:
        return np.nan
    x,w = j_roots(50,alpha = bet-1, beta = alpha - 1)
    gs = np.sum(w*fun(at, m = lam*(1+x)/2), axis=1)
    prob = 1/beta_fun(alpha, bet)*2**(-alpha-bet+1)*gs
    return(prob)

In [6]:
spliced = adata.to_df('spliced').transpose()
unspliced = adata.to_df('unspliced').transpose()
total = adata.to_df('total').transpose()
unspliced_by_total = adata.to_df('unspliced_by_total').transpose()


gene_df = pd.DataFrame(index=adata.var.index)

In [7]:
gene_df['spliced_kon'] = 0.0
gene_df['spliced_koff'] = 0.0
gene_df['spliced_ksyn'] = 0.0
gene_df['unspliced_kon'] = 0.0
gene_df['unspliced_koff'] = 0.0
gene_df['unspliced_ksyn'] = 0.0
gene_df['total_s_u_kon'] = 0.0
gene_df['total_s_u_koff'] = 0.0
gene_df['total_s_u_ksyn'] = 0.0
gene_df['unspliced_by_total_kon'] = 0.0
gene_df['unspliced_by_total_koff'] = 0.0
gene_df['unspliced_by_total_ksyn'] = 0.0

In [8]:
counter = 1

for gene in adata.var.index:
    spliced_vector = spliced.loc[gene][pd.notnull(spliced.loc[gene])]
    unspliced_vector = unspliced.loc[gene][pd.notnull(unspliced.loc[gene])]
    total_vector = total.loc[gene][pd.notnull(total.loc[gene])]
    unspliced_by_total_vector = unspliced_by_total.loc[gene][pd.notnull(unspliced_by_total.loc[gene])]
    
    
    
    tot_kon_spliced, tot_koff_spliced, tot_ksyn_spliced = MaximumLikelihood(spliced_vector)
    
    tot_kon_unspliced, tot_koff_unspliced, tot_ksyn_unspliced = MaximumLikelihood(unspliced_vector) 
    
    tot_kon, tot_koff, tot_ksyn = MaximumLikelihood(total_vector)
    
    unspliced_by_tot_kon, unspliced_by_tot_koff, unspliced_by_tot_ksyn = MaximumLikelihood(unspliced_by_total_vector)
    
    gene_df.loc[gene]['spliced_kon'] = tot_kon_spliced
    gene_df.loc[gene]['spliced_koff'] = tot_koff_spliced
    gene_df.loc[gene]['spliced_ksyn'] = tot_ksyn_spliced
    gene_df.loc[gene]['unspliced_kon'] = tot_kon_unspliced
    gene_df.loc[gene]['unspliced_koff'] = tot_koff_unspliced
    gene_df.loc[gene]['unspliced_ksyn'] = tot_ksyn_unspliced
    gene_df.loc[gene]['total_s_u_kon'] = tot_kon
    gene_df.loc[gene]['total_s_u_koff'] = tot_koff
    gene_df.loc[gene]['total_s_u_ksyn'] = tot_ksyn
    gene_df.loc[gene]['unspliced_by_total_kon'] = unspliced_by_tot_kon
    gene_df.loc[gene]['unspliced_by_total_koff'] = unspliced_by_tot_koff
    gene_df.loc[gene]['unspliced_by_total_ksyn'] = unspliced_by_tot_ksyn
    
    print(counter, end=" ")
    
    counter
    +=1
    
#     if counter > 20:
#         break


# def parallel_calc(gene, spliced, unspliced, total, unspliced_by_total, gene_df):
#     spliced_vector = spliced.loc[gene][pd.notnull(spliced.loc[gene])]
#     unspliced_vector = unspliced.loc[gene][pd.notnull(unspliced.loc[gene])]
#     total_vector = total.loc[gene][pd.notnull(total.loc[gene])]
#     unspliced_by_total_vector = unspliced_by_total.loc[gene][pd.notnull(unspliced_by_total.loc[gene])]
    
    
    
#     tot_kon_spliced, tot_koff_spliced, tot_ksyn_spliced = MaximumLikelihood(spliced_vector)
    
#     tot_kon_unspliced, tot_koff_unspliced, tot_ksyn_unspliced = MaximumLikelihood(unspliced_vector) 
    
#     tot_kon, tot_koff, tot_ksyn = MaximumLikelihood(total_vector)
    
#     unspliced_by_tot_kon, unspliced_by_tot_koff, unspliced_by_tot_ksyn = MaximumLikelihood(unspliced_by_total_vector)
    
#     gene_df.loc[gene]['spliced_kon'] = tot_kon_spliced
#     gene_df.loc[gene]['spliced_koff'] = tot_koff_spliced
#     gene_df.loc[gene]['spliced_ksyn'] = tot_ksyn_spliced
#     gene_df.loc[gene]['unspliced_kon'] = tot_kon_unspliced
#     gene_df.loc[gene]['unspliced_koff'] = tot_koff_unspliced
#     gene_df.loc[gene]['unspliced_ksyn'] = tot_ksyn_unspliced
#     gene_df.loc[gene]['total_kon'] = tot_kon
#     gene_df.loc[gene]['total_koff'] = tot_koff
#     gene_df.loc[gene]['total_ksyn'] = tot_ksyn
#     gene_df.loc[gene]['unspliced_by_total_kon'] = unspliced_by_tot_kon
#     gene_df.loc[gene]['unspliced_by_total_koff'] = unspliced_by_tot_koff
#     gene_df.loc[gene]['unspliced_by_total_ksyn'] = unspliced_by_tot_ksyn


1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [9]:
# gene_df = Parallel(n_jobs=8)(delayed(parallel_calc)(gene, spliced, unspliced, total, unspliced_by_total, gene_df) for gene in ['Mrpl15', '4732440D04Rik', 'Cops5', 'Arfgef1', 'Tram1'])

In [10]:
gene_df.to_csv('../data/temp_params.csv')

In [11]:
gene_df.head(20)

Unnamed: 0_level_0,spliced_kon,spliced_koff,spliced_ksyn,unspliced_kon,unspliced_koff,unspliced_ksyn,total_s_u_kon,total_s_u_koff,total_s_u_ksyn,unspliced_by_total_kon,unspliced_by_total_koff,unspliced_by_total_ksyn
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Mrpl15,2.176995,12.495542,2190.929841,0.075268,1.504743,139.701396,2.176995,12.495542,2190.929841,0.001,1.362095,1.0
4732440D04Rik,0.049667,4.569173,80.35925,0.001,156.828722,1.0,0.049667,4.569173,80.35925,0.001,156.828722,1.0
Cops5,3.656598,29.40966,11898.4166,0.114069,2.747117,277.678309,3.656598,29.40966,11898.4166,0.001,1.361855,32.486808
Arfgef1,1.756567,184.983916,22803.915975,0.635087,7.516546,1120.810458,1.756567,184.983916,22803.915975,,,
Tram1,0.001,8.26497,1.0,0.28215,5.544402,826.352109,0.001,8.26497,1.0,0.001,1.362074,1.092651
Lactb2,1.42031,20.437491,2798.786369,0.291413,8.678639,887.255924,1.42031,20.437491,2798.786369,,,
Stau2,0.325817,10.823475,1125.213682,0.278515,6.29844,906.718476,0.325817,10.823475,1125.213682,,,
Ube2w,0.001,10.898325,1.0,0.655978,11.261678,1739.530588,0.001,10.898325,1.0,,,
Tceb1,5.033632,0.001,1.0,0.314899,9.892844,774.792224,5.033632,0.001,1.0,0.001,1.361827,51.752746
Mcm3,0.242151,3.439452,3880.615969,0.014734,2.00861,130.751026,0.242151,3.439452,3880.615969,,,


In [12]:
unspliced_by_total_vector

GCTTAGAGAAGACC    2.775956e+00
GCTTAGGTAAGTGG    3.158763e+01
GCTTAGTCCGTGTT    8.668085e+01
GTTGGAGTCTATTA    2.077857e+01
GTTGGAGAAGGCGT    9.988889e+00
                      ...     
GCGGTTAATCACGA    5.854680e+16
GCGGTTCTTCATCC    4.400000e+00
GCGGTTCATGAGTA    3.900000e+01
GCGGTTAACCAGAG    1.220000e+01
GCGGTTCACCGTCT    1.850000e+01
Name: Mid1, Length: 682, dtype: float64

In [13]:
bdata = adata

In [14]:
temp = gene_df

In [15]:
adata.var = adata.var.join(gene_df)

In [16]:
adata.var

Unnamed: 0_level_0,Accession,Chromosome,End,Start,Strand,sum_allele_c57,sum_allele_cast,sum_ratio_allele_c57,sum_ratio_allele_cast,ratio_sum_allele_c57,...,spliced_ksyn,unspliced_kon,unspliced_koff,unspliced_ksyn,total_s_u_kon,total_s_u_koff,total_s_u_ksyn,unspliced_by_total_kon,unspliced_by_total_koff,unspliced_by_total_ksyn
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mrpl15,ENSMUSG00000033845,1,4785739,4773206,-,1175.0,1092.0,0.518306,0.481694,153.475673,...,2190.929841,0.075268,1.504743,139.701396,2.176995,12.495542,2190.929841,0.001,1.362095,1.000000
4732440D04Rik,ENSMUSG00000090031,1,6214590,6213293,-,1498.0,1261.0,0.542950,0.457050,308.882750,...,80.359250,0.001000,156.828722,1.000000,0.049667,4.569173,80.359250,0.001,156.828722,1.000000
Cops5,ENSMUSG00000025917,1,10038127,10024602,-,8192.0,7966.0,0.506993,0.493007,323.482196,...,11898.416600,0.114069,2.747117,277.678309,3.656598,29.409660,11898.416600,0.001,1.361855,32.486808
Arfgef1,ENSMUSG00000067851,1,10232670,10137571,-,1841.0,1842.0,0.499864,0.500136,284.025982,...,22803.915975,0.635087,7.516546,1120.810458,1.756567,184.983916,22803.915975,,,
Tram1,ENSMUSG00000025935,1,13589864,13564702,-,7514.0,7028.0,0.516710,0.483290,350.605317,...,1.000000,0.282150,5.544402,826.352109,0.001000,8.264970,1.000000,0.001,1.362074,1.092651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Exosc7,ENSMUSG00000025785,9,123136129,123113215,+,3556.0,3447.0,0.507782,0.492218,317.695221,...,1772.273585,0.096363,1.875123,239.556712,1.229338,6.279444,1772.273585,,,
Lars2,ENSMUSG00000035202,9,123462664,123366940,+,376.0,763.0,0.330114,0.669886,114.946627,...,1391.569082,0.211345,6.729131,513.758124,0.912919,18.076687,1391.569082,,,
Sacm1l,ENSMUSG00000025240,9,123592598,123529882,+,1444.0,1262.0,0.533629,0.466371,267.715853,...,,0.443060,15.381805,1889.522526,,,,,,
Gm5637,ENSMUSG00000046993,X,60046229,60045111,-,225.0,361.0,0.383959,0.616041,92.535714,...,3463.062499,0.001000,156.828722,1.000000,1.607009,161.705438,3463.062499,0.001,156.828722,1.000000


In [17]:
bdata.write('../data/processed/mus_musculus_preprocessed_arme_params_with_spliced_data.h5ad')