In [1]:
import sys, os
from Deconvolution.BLADE import Framework
import numpy as np
from numpy import transpose as t
import itertools
import pickle
from scipy.optimize import nnls
from sklearn.svm import SVR
from sklearn.svm import NuSVR

from sklearn.metrics import mean_squared_error as mse
import pandas as pd

# modules for visualization
import qgrid
from matplotlib import pyplot as plt
import seaborn as sns

### Run BLADE with TCGA bulk and Puram scRNA-seq reference


#### Application of deconvolution methods

From here, we will apply the following three methods for further performance comparison:
1. BLADE (estimation of cellular fraction + group-mode/high-resolution-mode purification)
2. NNLS (estimation of fraction)
3. SVR followed by NNLS (estimation of fraction + group-mode purification) - similar to CIBERSORTx


##### 1. Application of BLADE

These are the key parameters used in BLADE (note that there is default settings of these parameters, if not specified):
- Hyperparameters (`hyperpars`): `Alpha`, `Alpha0`, `Kappa0` and `SigmaY`, each of which can be defined as a list of options. BLADE takes an empirical Bayes approach to find the optimal parameter set given the all possible combinations. 
- `Nrep`: Number of repeat for evaluating each parameter configuration.
- `Nrepfinal`: Number of repeated optimizations for the final parameter set.
- `Njob`: Number of parallel jobs.

In [2]:
hyperpars = {
    'Alpha': [1, 10],
    'Alpha0': [0.1, 1, 5],
    'Kappa0': [1, 0.5, 0.1],
    'SY': [1,0.3,0.5],
}

Nrep=3
Nrepfinal=10
Njob=10

In [3]:
# read in marker genes, highly variable and top 100 DEGs for each cell type
marker_genes = pd.read_csv("/home/cke/Puram/top100DEGs.txt",header=None).iloc[0,:]

In [4]:
# df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std.tsv",sep='\t',index_col=0)
# df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean.tsv",sep='\t',index_col=0)

# merged all tumor cell types
df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std_simple.tsv",sep='\t',index_col=0)
df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean_simple.tsv",sep='\t',index_col=0)

In [5]:
df_Puram_std

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000121410,1.083922,0.818420,0.399890,0.801904,0.564758,0.838012,0.614700,0.668682,0.000000,0.544478
ENSG00000268895,0.511951,0.016481,0.386574,0.758462,0.462413,1.065588,0.401408,0.406487,0.000000,0.499750
ENSG00000148584,0.006775,0.018113,0.035513,0.017553,0.003137,0.030807,0.024420,0.022276,0.000000,0.012275
ENSG00000175899,0.108961,0.651788,3.116251,3.586992,2.773042,0.804513,2.095236,0.318067,2.588845,0.313355
ENSG00000245105,0.089944,0.028935,0.747009,0.484618,0.415440,0.724833,0.226807,0.407835,0.000000,0.324657
...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,0.362794,0.374291,0.219057,0.196687,0.118434,0.137730,0.457341,0.240729,0.266944,0.254686
ENSG00000162378,0.457661,0.168491,0.897044,0.904963,0.659679,0.528316,0.632295,0.442593,0.147025,0.536564
ENSG00000159840,0.923420,2.148970,2.143931,2.233748,2.064606,1.246275,1.610354,1.907724,1.370751,1.663473
ENSG00000074755,0.630889,0.826259,0.918515,0.843143,0.451562,0.614904,0.610981,0.911177,0.707439,0.588526


In [6]:
df_Puram_std_filtered = df_Puram_std.loc[marker_genes,:]
df_Puram_std_filtered

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000180879,1.257461,2.726457,2.710988,2.915231,2.363175,2.918310,3.168524,3.068894,2.396490,1.738203
ENSG00000170476,1.750666,0.000000,0.710504,0.307905,1.299676,0.689692,1.284985,1.241789,0.000000,0.259155
ENSG00000099958,2.134986,0.000000,0.328539,0.831592,1.015015,0.834168,1.115906,0.494280,2.431806,0.115939
ENSG00000134285,2.081012,0.872949,2.350889,1.810899,1.354811,0.897654,2.458720,1.861098,0.000000,1.773220
ENSG00000051108,2.285642,3.411570,2.796153,3.090888,2.619385,3.183815,3.090631,3.497793,2.388702,2.321041
...,...,...,...,...,...,...,...,...,...,...
ENSG00000237945,0.628379,0.461398,0.711661,0.571490,0.528650,0.545089,1.158162,1.031948,0.686544,1.251798
ENSG00000264230,0.000000,0.000000,0.033306,0.094715,0.000000,0.856802,2.672011,0.188092,1.293606,2.960032
ENSG00000145592,1.589604,1.576437,1.739076,1.562400,1.404541,1.508172,2.049650,1.831794,1.691156,1.043949
ENSG00000185479,0.113223,1.212813,0.280492,0.383305,1.096321,0.000000,2.595832,0.397527,0.578254,3.217219


In [7]:
df_Puram_mean

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000121410,0.531012,0.306200,0.077734,0.256194,0.219202,0.305957,0.151037,0.124142,0.000000,0.197286
ENSG00000268895,0.091276,0.002308,0.054535,0.153220,0.078708,0.238673,0.057073,0.036260,0.000000,0.112074
ENSG00000148584,0.000998,0.005308,0.004459,0.002982,0.000482,0.007019,0.007832,0.007393,0.000000,0.002747
ENSG00000175899,0.021167,0.137365,6.034123,5.435018,2.885008,0.259597,0.708786,0.079807,3.576743,0.031501
ENSG00000245105,0.007657,0.004052,0.171338,0.077473,0.057223,0.105998,0.021405,0.030704,0.000000,0.038320
...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,0.206693,0.230500,0.215373,0.172395,0.091906,0.180291,0.333693,0.335935,0.180180,0.134803
ENSG00000162378,0.223046,0.182567,0.487771,0.448538,0.316599,0.268201,0.374778,0.341616,0.115098,0.285960
ENSG00000159840,0.215561,1.949729,1.787872,1.973410,2.810183,0.403501,0.868257,0.814331,0.739055,1.611830
ENSG00000074755,0.165944,0.254342,0.334769,0.280260,0.149263,0.143537,0.181321,0.233642,0.357566,0.250754


In [8]:
df_Puram_mean_filtered = df_Puram_mean.loc[marker_genes,:]
df_Puram_mean_filtered

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000180879,10.001214,2.102411,4.077745,2.925292,3.936930,4.720721,3.872875,2.088499,2.207058,4.495932
ENSG00000170476,8.951077,0.000000,0.087531,0.016827,0.301197,0.062960,0.238289,0.249583,0.000000,0.018437
ENSG00000099958,7.248571,0.000000,0.034302,0.140362,0.217747,0.144230,0.306464,0.039378,1.384221,0.008055
ENSG00000134285,7.519426,0.122237,1.358591,0.727107,0.454236,0.173548,1.422514,0.599019,0.000000,1.226526
ENSG00000051108,9.693298,4.652936,3.725663,3.612597,5.125562,3.371624,3.045614,5.830777,1.949654,2.859569
...,...,...,...,...,...,...,...,...,...,...
ENSG00000237945,0.491258,0.550611,0.713128,0.426353,0.345971,0.424640,1.119608,0.864275,0.492570,1.971369
ENSG00000264230,0.000000,0.000000,0.002066,0.004290,0.000000,0.136055,1.510223,0.005348,0.296774,3.733746
ENSG00000145592,1.580382,1.572156,2.273377,1.817331,1.571413,1.027130,2.463960,1.120156,1.913820,3.708434
ENSG00000185479,0.020205,0.298878,0.031823,0.049894,0.309595,0.000000,1.496623,0.028290,0.151539,3.621807


In [16]:
# GET LOG SCALE OF MEAN COUNTS, LOG2(MEAN_COUNT+1)
df_Puram_mean_log2 = np.log2(df_Puram_mean_filtered+1)

In [17]:
df_Puram_mean_log2

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000180879,3.459591,1.633390,2.344188,1.972800,2.303614,2.516197,2.284773,1.626906,1.681250,2.458364
ENSG00000170476,3.314853,0.000000,0.121057,0.024075,0.379839,0.088087,0.308348,0.321446,0.000000,0.026357
ENSG00000099958,3.044144,0.000000,0.048657,0.189492,0.284215,0.194377,0.385668,0.055721,1.253518,0.011574
ENSG00000134285,3.090756,0.166378,1.237925,0.788357,0.540261,0.230877,1.276505,0.677187,0.000000,1.154795
ENSG00000051108,3.418635,2.499000,2.240517,2.205579,2.614842,2.128169,2.016359,2.772050,1.560546,1.948440
...,...,...,...,...,...,...,...,...,...,...
ENSG00000237945,0.576529,0.632837,0.776633,0.512331,0.428648,0.510598,1.083797,0.898614,0.577799,1.571128
ENSG00000264230,0.000000,0.000000,0.002977,0.006176,0.000000,0.184033,1.327816,0.007695,0.374927,2.242982
ENSG00000145592,1.367585,1.362978,1.710780,1.494329,1.362561,1.019438,1.792422,1.084171,1.542912,2.235247
ENSG00000185479,0.028859,0.377266,0.045196,0.070244,0.389121,0.000000,1.319978,0.040247,0.203563,2.208457


In [9]:
df_TCGA = pd.read_csv("/home/cke/TCGA-HNSC.htseq_counts_exp2.tsv",sep='\t',index_col=0)

In [10]:
df_TCGA

Unnamed: 0_level_0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,2237.0,2740.0,2686.0,2086.0,10167.0,1154.0,1978.0,1930.0,1066.0,2538.0,...,3340.0,929.0,1707.0,2218.0,2537.0,492.0,2741.0,8492.0,770.0,923.0
ENSG00000000005,2.0,0.0,0.0,1.0,9.0,6.0,1.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,41.0,0.0,0.0,3.0,1.0,0.0
ENSG00000000419,1606.0,1691.0,1649.0,2333.0,3021.0,2766.0,1762.0,1668.0,1760.0,1268.0,...,1388.0,2332.0,1926.0,1574.0,1171.0,976.0,1952.0,1578.0,1569.0,1183.0
ENSG00000000457,1063.0,803.0,917.0,1288.0,537.0,527.0,482.0,671.0,600.0,1066.0,...,608.0,472.0,410.0,852.0,557.0,376.0,1003.0,818.0,620.0,607.0
ENSG00000000460,1208.0,317.0,402.0,1105.0,459.0,747.0,331.0,476.0,874.0,950.0,...,206.0,386.0,210.0,729.0,127.0,363.0,1103.0,281.0,591.0,344.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000276543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000277120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000280767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
merge_genes_mean = pd.merge(df_Puram_mean_log2,df_TCGA,left_index=True,right_index=True,how='inner')
merge_genes_std = pd.merge(df_Puram_std_filtered,df_TCGA,left_index=True,right_index=True,how='inner')
#21706 genes in common
# df_TCGA_shared = merge_genes_mean.iloc[:,24:]
# df_shared_mean = merge_genes_mean.iloc[:,:24]
# df_shared_std = merge_genes_std.iloc[:,:24]

#simple tumor cell type setup
df_TCGA_shared = merge_genes_mean.iloc[:,10:]
df_shared_mean = merge_genes_mean.iloc[:,:10]
df_shared_std = merge_genes_std.iloc[:,:10]

In [12]:
df_TCGA_shared

Unnamed: 0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
ENSG00000003402,3716.0,7287.0,9488.0,4673.0,4920.0,7693.0,1554.0,3204.0,4489.0,8346.0,...,3321.0,4735.0,2864.0,6284.0,6405.0,3210.0,11701.0,6709.0,6089.0,6139.0
ENSG00000003436,225.0,243.0,1435.0,680.0,2224.0,1251.0,210.0,327.0,628.0,2468.0,...,475.0,708.0,229.0,909.0,979.0,2948.0,294.0,893.0,250.0,737.0
ENSG00000003436,225.0,243.0,1435.0,680.0,2224.0,1251.0,210.0,327.0,628.0,2468.0,...,475.0,708.0,229.0,909.0,979.0,2948.0,294.0,893.0,250.0,737.0
ENSG00000004399,3730.0,1313.0,10135.0,4290.0,7236.0,5582.0,2177.0,3769.0,5900.0,4726.0,...,1414.0,4838.0,1344.0,4589.0,1523.0,6001.0,1724.0,1800.0,3584.0,3484.0
ENSG00000004468,335.0,3221.0,2401.0,327.0,1589.0,387.0,1130.0,169.0,134.0,1118.0,...,415.0,89.0,61.0,2183.0,248.0,682.0,3891.0,407.0,400.0,1637.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000267041,147.0,18.0,89.0,228.0,61.0,113.0,61.0,11.0,113.0,268.0,...,63.0,86.0,47.0,115.0,57.0,198.0,149.0,39.0,100.0,80.0
ENSG00000271503,582.0,1741.0,18561.0,3442.0,2321.0,3514.0,721.0,9888.0,1433.0,3434.0,...,233.0,4951.0,1629.0,1453.0,231.0,4955.0,1526.0,562.0,3113.0,1489.0
ENSG00000272573,7.0,1.0,8.0,6.0,197.0,95.0,47.0,2.0,127.0,2.0,...,1.0,101.0,14.0,1.0,253.0,1.0,0.0,7.0,0.0,33.0
ENSG00000276975,63.0,10.0,1.0,13.0,0.0,4.0,0.0,15.0,2.0,3.0,...,1.0,3.0,1.0,53.0,8.0,2.0,14.0,1.0,4.0,1.0


In [13]:
df_shared_mean

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000003402,3.965319,5.158222,3.034486,2.156816,2.896742,1.691081,2.324375,2.891413,2.606933,2.534943
ENSG00000003436,0.004004,0.223417,3.744232,2.499596,0.149699,0.003945,0.438807,0.012854,1.302279,0.312333
ENSG00000003436,0.004004,0.223417,3.744232,2.499596,0.149699,0.003945,0.438807,0.012854,1.302279,0.312333
ENSG00000004399,0.000000,0.549420,1.687792,0.472600,0.767809,0.212628,0.121773,0.074112,0.000000,0.066365
ENSG00000004468,3.556487,0.332463,0.155182,0.062074,0.789517,0.272665,0.488337,0.137704,0.000000,0.806148
...,...,...,...,...,...,...,...,...,...,...
ENSG00000267041,0.459932,0.508493,0.573710,0.467061,0.277953,0.487566,0.664987,0.791584,0.557444,0.305656
ENSG00000271503,1.423306,1.932294,1.663294,1.551110,1.769293,1.809516,1.999731,5.516435,1.619902,1.179146
ENSG00000272573,0.053286,0.000000,0.062059,2.373150,0.073171,0.055892,0.075262,0.086315,4.939921,0.038955
ENSG00000276975,0.193497,0.232019,0.228565,0.187650,0.109833,0.205785,0.316925,0.368559,0.203035,0.115146


In [14]:
df_shared_std

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000003402,2.203674,1.620045,1.906755,1.597623,1.914676,1.295219,1.716956,2.026647,1.608838,1.423351
ENSG00000003436,0.024432,0.897956,3.062287,2.653942,0.725784,0.022833,1.362985,0.189990,2.006773,0.984210
ENSG00000003436,0.024432,0.897956,3.062287,2.653942,0.725784,0.022833,1.362985,0.189990,2.006773,0.984210
ENSG00000004399,0.000000,1.134139,1.784824,1.075307,1.042394,0.785650,0.585981,0.546017,0.000000,0.326545
ENSG00000004468,2.985176,1.135200,0.893962,0.538615,1.682291,1.085830,1.651405,0.875682,0.000000,1.437999
...,...,...,...,...,...,...,...,...,...,...
ENSG00000267041,0.333189,0.341332,0.512605,0.283333,0.267111,0.305385,0.692498,0.380377,0.456139,0.333618
ENSG00000271503,0.850516,1.243896,0.933702,0.815524,1.504521,0.950195,1.444765,3.276136,1.334287,0.898732
ENSG00000272573,0.378618,0.000000,0.461570,3.130770,0.411163,0.590576,0.614152,0.650731,2.526449,0.303349
ENSG00000276975,0.139156,0.159220,0.181959,0.141089,0.105930,0.139984,0.398076,0.178212,0.185654,0.120230


Given the configuration above, BLADE is applied to each of the simulation dataset created previously.  

BLADE produce several outcomes:
- `final_obj`: final BLADE object with optimized variational parameters
- `best_obj`: BLADE object trained with the best parameter set found by the Empirical Bayes framework. Empirical Bayes framework is applied after selecting a subset of samples (5 samples; indicated by `Ind_sample` below), and thus the outcome contains only 5 samples. If `Nsample` <= 5, `final_obj` is identical to `best_obj`.
- `best_set`: Best parameter set defined by Empirical Bayes framework.
- `outs`: Outcome of BLADE for every possible combination of hyperparameters, used in the Empirical Bayes framework. 


- There are nan in mean and std matrix! NAs are filled with 0?

full tumor type setup:
- ngenes = 21706 common genes
- ncells = 24, including all 16 tumor types
- nsample = 546


simple tumor type setup:
- ngenes = 21706 common genes
- ncells = 10, all tumor types are merged, including one NA type?
- nsample = 546
- marker genes = 900 (including 9 genes not shared)


In [15]:
df_TCGA_shared.to_numpy().shape

(891, 546)

In [None]:
Y = df_TCGA_shared.to_numpy()
mean = df_shared_mean.to_numpy() 
sd = df_shared_std.to_numpy()

outfile = './BLADE/data/PuramTCGA_BLADE.pickle'

final_obj, best_obj, best_set, outs = Framework(
    mean, sd, Y,
    Alphas=hyperpars['Alpha'], Alpha0s=hyperpars['Alpha0'], 
    Kappa0s=hyperpars['Kappa0'], SYs=hyperpars['SY'],
    Nrep=Nrep, Njob=Njob, Nrepfinal=Nrepfinal)

pickle.dump(
    {
        'final_obj': final_obj,
        'best_obj': best_obj,
        'best_set': best_set,
        'outs' : outs
    }, open(outfile, 'wb')
    )

all of 891 genes are used for optimization.
All samples are used during the optimization.
Initialization with Support vector regression


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    1.4s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    1.5s
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.7s
[Parallel(n_jobs=10)]: Done  41 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    2.0s
[Parallel(n_jobs=10)]: Done  65 tasks      | elapsed:    2.2s
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:    2.4s
[Parallel(n_jobs=10)]: Done  93 tasks      | elapsed:    2.7s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    2.9s
[Parallel(n_jobs=10)]: Done 125 tasks      | elapsed:    3.1s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:    3.4s
[Parallel(n_jobs=10)]: Done 161 tasks      | elapsed:    3.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    3.9s
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed:  

No feature filtering is done (fsel = 0)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, 

  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed: 13.4min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed: 73.0min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  r

### test run with a small subset of scrna reference, take randomly 1000 genes from both ref and bulk counts
> reduced hyperparam combinations

In [None]:
hyperpars_test = {
    'Alpha': [1, 10],
    'Alpha0': [0.1, 1, 5],
    'Kappa0': [1, 0.5, 0.1],
    'SY': [1,0.3,0.5],
}

Nrep=3
Nrepfinal=10
Njob=10

df_mean_sample = df_shared_mean.sample(n=2000)
df_std_sample = df_shared_std[df_shared_std.index.isin(df_mean_sample.index)]
Y_sample = df_TCGA_shared[df_TCGA_shared.index.isin(df_mean_sample.index)]

Y = Y_sample.to_numpy()
mean = df_mean_sample.to_numpy() 
sd = df_std_sample.to_numpy()

outfile = './BLADE/data/PuramTCGA_BLADE_Sample2000.pickle'

final_obj, best_obj, best_set, outs = Framework(
    mean, sd, Y,
    Alphas=hyperpars_test['Alpha'], Alpha0s=hyperpars_test['Alpha0'], 
    Kappa0s=hyperpars_test['Kappa0'], SYs=hyperpars_test['SY'],
    Nrep=Nrep, Njob=Njob, Nrepfinal=Nrepfinal)

pickle.dump(
    {
        'final_obj': final_obj,
        'best_obj': best_obj,
        'best_set': best_set,
        'outs' : outs
    }, open(outfile, 'wb')
    )


all of 2000 genes are used for optimization.
All samples are used during the optimization.
Initialization with Support vector regression


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    7.1s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    8.0s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    8.7s
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    8.9s
[Parallel(n_jobs=10)]: Done  41 tasks      | elapsed:   10.4s
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:   11.2s
[Parallel(n_jobs=10)]: Done  65 tasks      | elapsed:   12.0s
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:   12.7s
[Parallel(n_jobs=10)]: Done  93 tasks      | elapsed:   14.1s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:   15.0s
[Parallel(n_jobs=10)]: Done 125 tasks      | elapsed:   16.5s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:   17.8s
[Parallel(n_jobs=10)]: Done 161 tasks      | elapsed:   19.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   20.4s
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed:  

No feature filtering is done (fsel = 0)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, 

  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed: 61.8min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed: 166.9min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  21 tasks      | el

In [17]:
BLADE_out = pickle.load(open("/home/cke/BLADE/data/PuramTCGA_BLADEout.pickle", 'rb'))

In [5]:
obj = BLADE_out['final_obj']
    
outcomes = {
    'BLADE': {
        'Fraction': t(obj.ExpF(obj.Beta)), 
        'Signature': np.mean(obj.Nu, 0), #group mode purification
        'HighRes': obj.Nu                #highresolution mode purification
    }}

In [6]:
obj

<Deconvolution.BLADE.BLADE at 0x7f83873621a0>

In [16]:
outcomes['BLADE']['HighRes']

array([[[ 2.30605347e+00,  2.62865000e+00,  2.01893963e+00, ...,
          1.96210937e+00,  1.85478124e+00,  1.83446388e+00],
        [ 6.35951152e-03,  2.91678640e-01,  2.23678787e+00, ...,
          1.88555078e-02,  1.20403895e+00,  4.00104090e-01],
        [ 6.35951152e-03,  2.91678640e-01,  2.23678787e+00, ...,
          1.88555078e-02,  1.20403895e+00,  4.00104090e-01],
        ...,
        [ 7.56285149e-02, -2.32898899e-02,  8.71423805e-02, ...,
          1.19657303e-01,  2.57131386e+00,  5.64923866e-02],
        [ 2.61605710e-01,  3.02278431e-01,  2.99706804e-01, ...,
          4.54576315e-01,  2.68324539e-01,  1.68840096e-01],
        [ 5.18714323e-01,  6.69181601e-01,  6.11033364e-01, ...,
          7.63625170e-01,  6.10380723e-01,  3.93237680e-01]],

       [[ 2.32762247e+00,  2.62662350e+00,  2.02001522e+00, ...,
          1.96436580e+00,  1.85123888e+00,  1.83665612e+00],
        [ 6.37943928e-03,  2.91813744e-01,  2.23744276e+00, ...,
          1.88113739e-02,  1.20469337e

In [15]:
filtered_celltypefrac_BLADE = pd.DataFrame(outcomes['BLADE']['Fraction'])
filtered_celltypefrac_BLADE

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,536,537,538,539,540,541,542,543,544,545
0,0.104603,0.100206,0.122808,0.108579,0.123022,0.125823,0.098201,0.09994,0.098655,0.117692,...,0.094491,0.108451,0.102236,0.105779,0.082436,0.119341,0.110944,0.104061,0.141004,0.097803
1,0.066228,0.088058,0.066535,0.057493,0.104468,0.090629,0.091832,0.077035,0.092922,0.047814,...,0.091859,0.078456,0.096658,0.086071,0.079262,0.06566,0.084659,0.077693,0.083895,0.074268
2,0.090092,0.096285,0.139677,0.104748,0.133108,0.151043,0.097712,0.112606,0.135649,0.134974,...,0.111523,0.113648,0.097211,0.141448,0.125629,0.113515,0.117308,0.116922,0.098197,0.114874
3,0.07596,0.064099,0.083428,0.087106,0.102031,0.052358,0.086453,0.102556,0.079781,0.089237,...,0.082835,0.095963,0.071834,0.09517,0.098809,0.09053,0.063053,0.077389,0.069769,0.084671
4,0.083474,0.102457,0.140833,0.110944,0.089159,0.095262,0.067513,0.091835,0.058474,0.128326,...,0.086243,0.085643,0.066898,0.089236,0.050995,0.121051,0.084043,0.085337,0.10678,0.11403
5,0.083868,0.086151,0.0948,0.073646,0.077763,0.087105,0.067724,0.072028,0.063767,0.092906,...,0.087387,0.081468,0.067817,0.079202,0.070016,0.084416,0.071765,0.081914,0.085757,0.082666
6,0.128971,0.096708,0.058684,0.111118,0.085349,0.085278,0.116048,0.085294,0.108134,0.08539,...,0.10687,0.101458,0.113281,0.080487,0.130594,0.106892,0.107672,0.100148,0.114705,0.099877
7,0.053946,0.075083,0.042138,0.072806,0.035023,0.040004,0.057262,0.066159,0.045329,0.027985,...,0.071116,0.061224,0.066049,0.052363,0.061641,0.060721,0.061483,0.071095,0.086195,0.051916
8,0.052322,0.015902,0.02435,0.031015,0.033329,0.026466,0.054689,0.016949,0.050235,0.015125,...,0.019312,0.046001,0.049742,0.026661,0.119795,0.017319,0.012899,0.01899,0.012866,0.053849
9,0.260536,0.275053,0.226747,0.242544,0.216749,0.246032,0.262566,0.275598,0.267055,0.260551,...,0.248365,0.227687,0.268274,0.243584,0.180824,0.220555,0.286174,0.266451,0.200833,0.226047


In [33]:
filtered_celltypefrac_BLADE.columns = df_TCGA_shared.columns

In [34]:
filtered_celltypefrac_BLADE.index = df_shared_mean.columns

In [37]:
filtered_celltypefrac_BLADE

Unnamed: 0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
B cell,0.104603,0.100206,0.122808,0.108579,0.123022,0.125823,0.098201,0.09994,0.098655,0.117692,...,0.094491,0.108451,0.102236,0.105779,0.082436,0.119341,0.110944,0.104061,0.141004,0.097803
Dendritic,0.066228,0.088058,0.066535,0.057493,0.104468,0.090629,0.091832,0.077035,0.092922,0.047814,...,0.091859,0.078456,0.096658,0.086071,0.079262,0.06566,0.084659,0.077693,0.083895,0.074268
Endothelial,0.090092,0.096285,0.139677,0.104748,0.133108,0.151043,0.097712,0.112606,0.135649,0.134974,...,0.111523,0.113648,0.097211,0.141448,0.125629,0.113515,0.117308,0.116922,0.098197,0.114874
Fibroblast,0.07596,0.064099,0.083428,0.087106,0.102031,0.052358,0.086453,0.102556,0.079781,0.089237,...,0.082835,0.095963,0.071834,0.09517,0.098809,0.09053,0.063053,0.077389,0.069769,0.084671
Macrophage,0.083474,0.102457,0.140833,0.110944,0.089159,0.095262,0.067513,0.091835,0.058474,0.128326,...,0.086243,0.085643,0.066898,0.089236,0.050995,0.121051,0.084043,0.085337,0.10678,0.11403
Mast,0.083868,0.086151,0.0948,0.073646,0.077763,0.087105,0.067724,0.072028,0.063767,0.092906,...,0.087387,0.081468,0.067817,0.079202,0.070016,0.084416,0.071765,0.081914,0.085757,0.082666
,0.128971,0.096708,0.058684,0.111118,0.085349,0.085278,0.116048,0.085294,0.108134,0.08539,...,0.10687,0.101458,0.113281,0.080487,0.130594,0.106892,0.107672,0.100148,0.114705,0.099877
T cell,0.053946,0.075083,0.042138,0.072806,0.035023,0.040004,0.057262,0.066159,0.045329,0.027985,...,0.071116,0.061224,0.066049,0.052363,0.061641,0.060721,0.061483,0.071095,0.086195,0.051916
myocyte,0.052322,0.015902,0.02435,0.031015,0.033329,0.026466,0.054689,0.016949,0.050235,0.015125,...,0.019312,0.046001,0.049742,0.026661,0.119795,0.017319,0.012899,0.01899,0.012866,0.053849
tumor,0.260536,0.275053,0.226747,0.242544,0.216749,0.246032,0.262566,0.275598,0.267055,0.260551,...,0.248365,0.227687,0.268274,0.243584,0.180824,0.220555,0.286174,0.266451,0.200833,0.226047


In [38]:
filtered_celltypefrac_BLADE.to_csv("/home/cke/BLADE/data/filtered_celltypefrac_BLADE_PuramTCGA.csv")