In [2]:
import sys, os
from Deconvolution.BLADE import Framework
import numpy as np
from numpy import transpose as t
import itertools
import pickle
from scipy.optimize import nnls
from sklearn.svm import SVR
from sklearn.svm import NuSVR

from sklearn.metrics import mean_squared_error as mse
import pandas as pd

# modules for visualization
import qgrid
from matplotlib import pyplot as plt
import seaborn as sns

### Run BLADE with TCGA bulk and Puram scRNA-seq reference


#### Application of deconvolution methods

From here, we will apply the following three methods for further performance comparison:
1. BLADE (estimation of cellular fraction + group-mode/high-resolution-mode purification)
2. NNLS (estimation of fraction)
3. SVR followed by NNLS (estimation of fraction + group-mode purification) - similar to CIBERSORTx


##### 1. Application of BLADE

These are the key parameters used in BLADE (note that there is default settings of these parameters, if not specified):
- Hyperparameters (`hyperpars`): `Alpha`, `Alpha0`, `Kappa0` and `SigmaY`, each of which can be defined as a list of options. BLADE takes an empirical Bayes approach to find the optimal parameter set given the all possible combinations. 
- `Nrep`: Number of repeat for evaluating each parameter configuration.
- `Nrepfinal`: Number of repeated optimizations for the final parameter set.
- `Njob`: Number of parallel jobs.

In [10]:
hyperpars = {
    'Alpha': [1, 10],
    'Alpha0': [0.1, 1, 5],
    'Kappa0': [1, 0.5, 0.1],
    'SY': [1,0.3,0.5],
}

Nrep=3
Nrepfinal=10
Njob=10

In [11]:
# read in marker genes, highly variable and top 100 DEGs for each cell type
marker_genes = pd.read_csv("/home/cke/Puram/top100markers_de_cor.txt",header=None).iloc[0,:]

In [12]:
# df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std.tsv",sep='\t',index_col=0)
# df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean.tsv",sep='\t',index_col=0)

# merged all tumor cell types
df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std_simple.tsv",sep='\t',index_col=0)
df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean_simple.tsv",sep='\t',index_col=0)

In [13]:
df_Puram_std

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000121410,1.083922,0.818420,0.399890,0.801904,0.564758,0.838012,0.614700,0.668682,0.000000,0.544478
ENSG00000268895,0.511951,0.016481,0.386574,0.758462,0.462413,1.065588,0.401408,0.406487,0.000000,0.499750
ENSG00000148584,0.006775,0.018113,0.035513,0.017553,0.003137,0.030807,0.024420,0.022276,0.000000,0.012275
ENSG00000175899,0.108961,0.651788,3.116251,3.586992,2.773042,0.804513,2.095236,0.318067,2.588845,0.313355
ENSG00000245105,0.089944,0.028935,0.747009,0.484618,0.415440,0.724833,0.226807,0.407835,0.000000,0.324657
...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,0.362794,0.374291,0.219057,0.196687,0.118434,0.137730,0.457341,0.240729,0.266944,0.254686
ENSG00000162378,0.457661,0.168491,0.897044,0.904963,0.659679,0.528316,0.632295,0.442593,0.147025,0.536564
ENSG00000159840,0.923420,2.148970,2.143931,2.233748,2.064606,1.246275,1.610354,1.907724,1.370751,1.663473
ENSG00000074755,0.630889,0.826259,0.918515,0.843143,0.451562,0.614904,0.610981,0.911177,0.707439,0.588526


In [14]:
df_Puram_std_filtered = df_Puram_std.loc[marker_genes,:]
df_Puram_std_filtered

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000134285,2.081012,0.872949,2.350889,1.810899,1.354811,0.897654,2.458720,1.861098,0.000000,1.773220
ENSG00000026751,2.875747,1.602404,0.275670,0.291092,2.199984,0.170209,0.954702,1.587335,0.373973,0.486743
ENSG00000206560,2.432494,1.158146,1.566233,1.222124,0.722855,1.903457,0.814598,1.658128,0.601143,0.541751
ENSG00000118363,1.991783,1.653282,2.262542,2.355330,1.776896,2.156453,2.201362,2.385229,1.594664,1.500061
ENSG00000135046,1.265247,2.911882,3.481683,3.414719,2.245746,2.300987,3.762640,4.308938,3.232415,2.400389
...,...,...,...,...,...,...,...,...,...,...
ENSG00000106853,0.000000,1.309926,1.655642,2.132211,1.390806,0.488696,1.674916,0.014792,1.965746,2.414095
ENSG00000108826,1.788428,1.617861,2.469836,2.352559,2.205044,1.770743,2.638541,1.674134,2.368219,2.186814
ENSG00000139514,1.110492,0.860129,0.837239,0.567299,0.745258,0.004291,1.462706,0.778590,0.914472,1.517481
ENSG00000164904,0.319172,0.025973,1.228673,1.251642,0.746752,0.020970,1.542731,0.043001,1.562951,1.610443


In [15]:
df_Puram_mean

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000121410,0.531012,0.306200,0.077734,0.256194,0.219202,0.305957,0.151037,0.124142,0.000000,0.197286
ENSG00000268895,0.091276,0.002308,0.054535,0.153220,0.078708,0.238673,0.057073,0.036260,0.000000,0.112074
ENSG00000148584,0.000998,0.005308,0.004459,0.002982,0.000482,0.007019,0.007832,0.007393,0.000000,0.002747
ENSG00000175899,0.021167,0.137365,6.034123,5.435018,2.885008,0.259597,0.708786,0.079807,3.576743,0.031501
ENSG00000245105,0.007657,0.004052,0.171338,0.077473,0.057223,0.105998,0.021405,0.030704,0.000000,0.038320
...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,0.206693,0.230500,0.215373,0.172395,0.091906,0.180291,0.333693,0.335935,0.180180,0.134803
ENSG00000162378,0.223046,0.182567,0.487771,0.448538,0.316599,0.268201,0.374778,0.341616,0.115098,0.285960
ENSG00000159840,0.215561,1.949729,1.787872,1.973410,2.810183,0.403501,0.868257,0.814331,0.739055,1.611830
ENSG00000074755,0.165944,0.254342,0.334769,0.280260,0.149263,0.143537,0.181321,0.233642,0.357566,0.250754


In [16]:
df_Puram_mean_filtered = df_Puram_mean.loc[marker_genes,:]
df_Puram_mean_filtered

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000134285,7.519426,0.122237,1.358591,0.727107,0.454236,0.173548,1.422514,0.599019,0.000000,1.226526
ENSG00000026751,4.939985,0.767954,0.121721,0.119129,1.583630,0.132358,0.344129,0.706510,0.203738,0.141863
ENSG00000206560,3.455216,0.455401,0.887737,0.471214,0.228186,1.286028,0.207882,0.676300,0.174943,0.168546
ENSG00000118363,5.316448,0.893145,2.520081,2.254201,2.850228,1.566486,2.211636,1.431471,0.972571,3.112495
ENSG00000135046,0.235502,1.695405,5.480532,4.403522,6.098593,8.105830,4.983282,4.046035,2.160422,6.438264
...,...,...,...,...,...,...,...,...,...,...
ENSG00000106853,0.000000,0.466676,0.671437,1.033917,0.616116,0.044612,0.683736,0.000584,0.980916,2.830673
ENSG00000108826,0.609314,0.523410,1.324534,1.262678,1.524344,0.580507,1.899278,0.457255,1.328111,3.499661
ENSG00000139514,0.439953,0.245679,0.293729,0.128405,0.227753,0.000535,0.751983,0.153971,0.446848,1.407189
ENSG00000164904,0.045150,0.005946,0.510706,0.461885,0.254827,0.006492,0.856042,0.010484,0.805236,1.750448


In [17]:
# GET LOG SCALE OF MEAN COUNTS, LOG2(MEAN_COUNT+1)
df_Puram_mean_log2 = np.log2(df_Puram_mean_filtered+1)

In [18]:
df_Puram_mean_log2

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000134285,3.090756,0.166378,1.237925,0.788357,0.540261,0.230877,1.276505,0.677187,0.000000,1.154795
ENSG00000026751,2.570459,0.822081,0.165713,0.162376,1.369399,0.179330,0.426672,0.771049,0.267522,0.191389
ENSG00000206560,2.155496,0.541417,0.916658,0.557007,0.296529,1.192843,0.272479,0.745281,0.232590,0.224715
ENSG00000118363,2.659113,0.920785,1.815609,1.702303,1.944944,1.359795,1.683308,1.281829,0.980077,2.040014
ENSG00000135046,0.305097,1.430502,2.696112,2.433900,2.827533,3.186791,2.580937,2.335150,1.660117,2.894966
...,...,...,...,...,...,...,...,...,...,...
ENSG00000106853,0.000000,0.552551,0.741089,1.024261,0.692531,0.062967,0.751666,0.000842,0.986168,1.937598
ENSG00000108826,0.686446,0.607304,1.216942,1.178031,1.335908,0.660387,1.535694,0.543253,1.219160,2.169816
ENSG00000139514,0.526021,0.316932,0.371535,0.174285,0.296021,0.000771,0.808989,0.206607,0.532913,1.267350
ENSG00000164904,0.063710,0.008553,0.595223,0.547829,0.327488,0.009335,0.892229,0.015047,0.852188,1.459667


In [8]:
df_TCGA = pd.read_csv("/home/cke/TCGA-HNSC.htseq_counts_exp2.tsv",sep='\t',index_col=0)

In [9]:
df_TCGA

Unnamed: 0_level_0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,2237.0,2740.0,2686.0,2086.0,10167.0,1154.0,1978.0,1930.0,1066.0,2538.0,...,3340.0,929.0,1707.0,2218.0,2537.0,492.0,2741.0,8492.0,770.0,923.0
ENSG00000000005,2.0,0.0,0.0,1.0,9.0,6.0,1.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,41.0,0.0,0.0,3.0,1.0,0.0
ENSG00000000419,1606.0,1691.0,1649.0,2333.0,3021.0,2766.0,1762.0,1668.0,1760.0,1268.0,...,1388.0,2332.0,1926.0,1574.0,1171.0,976.0,1952.0,1578.0,1569.0,1183.0
ENSG00000000457,1063.0,803.0,917.0,1288.0,537.0,527.0,482.0,671.0,600.0,1066.0,...,608.0,472.0,410.0,852.0,557.0,376.0,1003.0,818.0,620.0,607.0
ENSG00000000460,1208.0,317.0,402.0,1105.0,459.0,747.0,331.0,476.0,874.0,950.0,...,206.0,386.0,210.0,729.0,127.0,363.0,1103.0,281.0,591.0,344.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000276543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000277120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSGR0000280767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
merge_genes_mean = pd.merge(df_Puram_mean_log2,df_TCGA,left_index=True,right_index=True,how='inner')
merge_genes_std = pd.merge(df_Puram_std_filtered,df_TCGA,left_index=True,right_index=True,how='inner')
#21706 genes in common
# df_TCGA_shared = merge_genes_mean.iloc[:,24:]
# df_shared_mean = merge_genes_mean.iloc[:,:24]
# df_shared_std = merge_genes_std.iloc[:,:24]

#simple tumor cell type setup
df_TCGA_shared = merge_genes_mean.iloc[:,10:]
df_shared_mean = merge_genes_mean.iloc[:,:10]
df_shared_std = merge_genes_std.iloc[:,:10]

In [20]:
df_TCGA_shared

Unnamed: 0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
ENSG00000002549,1761.0,11512.0,23733.0,2595.0,6249.0,14539.0,1664.0,3509.0,4709.0,5298.0,...,2942.0,8907.0,4001.0,3977.0,2302.0,14008.0,7817.0,4122.0,2865.0,5146.0
ENSG00000002586,6575.0,6882.0,18744.0,5821.0,16307.0,9342.0,6421.0,23114.0,14804.0,16179.0,...,7883.0,21783.0,7725.0,15481.0,7301.0,14532.0,3594.0,11525.0,8249.0,5859.0
ENSG00000002586,6575.0,6882.0,18744.0,5821.0,16307.0,9342.0,6421.0,23114.0,14804.0,16179.0,...,7883.0,21783.0,7725.0,15481.0,7301.0,14532.0,3594.0,11525.0,8249.0,5859.0
ENSG00000003436,225.0,243.0,1435.0,680.0,2224.0,1251.0,210.0,327.0,628.0,2468.0,...,475.0,708.0,229.0,909.0,979.0,2948.0,294.0,893.0,250.0,737.0
ENSG00000004059,12456.0,8068.0,8825.0,6761.0,9777.0,10393.0,9682.0,6069.0,6576.0,7514.0,...,5819.0,8226.0,5243.0,7286.0,4927.0,10226.0,8670.0,10221.0,5045.0,3192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271043,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
ENSG00000276409,6.0,6.0,71.0,25.0,138.0,88.0,12.0,1.0,38.0,24.0,...,52.0,27.0,0.0,61.0,237.0,31.0,3.0,90.0,16.0,27.0
ENSG00000277443,15322.0,6485.0,22969.0,13194.0,12459.0,14893.0,6847.0,16878.0,10570.0,16299.0,...,8519.0,10405.0,17090.0,6525.0,3838.0,20714.0,14171.0,8859.0,3486.0,4628.0
ENSG00000280213,71.0,14.0,12.0,80.0,7.0,41.0,5.0,24.0,9.0,45.0,...,29.0,23.0,110.0,63.0,124.0,13.0,92.0,113.0,30.0,14.0


In [13]:
df_shared_mean

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000003402,3.965319,5.158222,3.034486,2.156816,2.896742,1.691081,2.324375,2.891413,2.606933,2.534943
ENSG00000003436,0.004004,0.223417,3.744232,2.499596,0.149699,0.003945,0.438807,0.012854,1.302279,0.312333
ENSG00000003436,0.004004,0.223417,3.744232,2.499596,0.149699,0.003945,0.438807,0.012854,1.302279,0.312333
ENSG00000004399,0.000000,0.549420,1.687792,0.472600,0.767809,0.212628,0.121773,0.074112,0.000000,0.066365
ENSG00000004468,3.556487,0.332463,0.155182,0.062074,0.789517,0.272665,0.488337,0.137704,0.000000,0.806148
...,...,...,...,...,...,...,...,...,...,...
ENSG00000267041,0.459932,0.508493,0.573710,0.467061,0.277953,0.487566,0.664987,0.791584,0.557444,0.305656
ENSG00000271503,1.423306,1.932294,1.663294,1.551110,1.769293,1.809516,1.999731,5.516435,1.619902,1.179146
ENSG00000272573,0.053286,0.000000,0.062059,2.373150,0.073171,0.055892,0.075262,0.086315,4.939921,0.038955
ENSG00000276975,0.193497,0.232019,0.228565,0.187650,0.109833,0.205785,0.316925,0.368559,0.203035,0.115146


In [14]:
df_shared_std

Unnamed: 0,B cell,Dendritic,Endothelial,Fibroblast,Macrophage,Mast,NA,T cell,myocyte,tumor
ENSG00000003402,2.203674,1.620045,1.906755,1.597623,1.914676,1.295219,1.716956,2.026647,1.608838,1.423351
ENSG00000003436,0.024432,0.897956,3.062287,2.653942,0.725784,0.022833,1.362985,0.189990,2.006773,0.984210
ENSG00000003436,0.024432,0.897956,3.062287,2.653942,0.725784,0.022833,1.362985,0.189990,2.006773,0.984210
ENSG00000004399,0.000000,1.134139,1.784824,1.075307,1.042394,0.785650,0.585981,0.546017,0.000000,0.326545
ENSG00000004468,2.985176,1.135200,0.893962,0.538615,1.682291,1.085830,1.651405,0.875682,0.000000,1.437999
...,...,...,...,...,...,...,...,...,...,...
ENSG00000267041,0.333189,0.341332,0.512605,0.283333,0.267111,0.305385,0.692498,0.380377,0.456139,0.333618
ENSG00000271503,0.850516,1.243896,0.933702,0.815524,1.504521,0.950195,1.444765,3.276136,1.334287,0.898732
ENSG00000272573,0.378618,0.000000,0.461570,3.130770,0.411163,0.590576,0.614152,0.650731,2.526449,0.303349
ENSG00000276975,0.139156,0.159220,0.181959,0.141089,0.105930,0.139984,0.398076,0.178212,0.185654,0.120230


Given the configuration above, BLADE is applied to each of the simulation dataset created previously.  

BLADE produce several outcomes:
- `final_obj`: final BLADE object with optimized variational parameters
- `best_obj`: BLADE object trained with the best parameter set found by the Empirical Bayes framework. Empirical Bayes framework is applied after selecting a subset of samples (5 samples; indicated by `Ind_sample` below), and thus the outcome contains only 5 samples. If `Nsample` <= 5, `final_obj` is identical to `best_obj`.
- `best_set`: Best parameter set defined by Empirical Bayes framework.
- `outs`: Outcome of BLADE for every possible combination of hyperparameters, used in the Empirical Bayes framework. 


- There are nan in mean and std matrix! NAs are filled with 0?

full tumor type setup:
- ngenes = 21706 common genes
- ncells = 24, including all 16 tumor types
- nsample = 546


simple tumor type setup:
- ngenes = 21706 common genes
- ncells = 10, all tumor types are merged, including one NA type?
- nsample = 546
- marker genes = 900 (including 9 genes not shared)


In [15]:
df_TCGA_shared.to_numpy().shape

(891, 546)

In [None]:
Y = df_TCGA_shared.to_numpy()
mean = df_shared_mean.to_numpy() 
sd = df_shared_std.to_numpy()

outfile = './BLADE/data/PuramTCGA_BLADE.pickle'

final_obj, best_obj, best_set, outs = Framework(
    mean, sd, Y,
    Alphas=hyperpars['Alpha'], Alpha0s=hyperpars['Alpha0'], 
    Kappa0s=hyperpars['Kappa0'], SYs=hyperpars['SY'],
    Nrep=Nrep, Njob=Njob, Nrepfinal=Nrepfinal)

pickle.dump(
    {
        'final_obj': final_obj,
        'best_obj': best_obj,
        'best_set': best_set,
        'outs' : outs
    }, open(outfile, 'wb')
    )

all of 891 genes are used for optimization.
All samples are used during the optimization.
Initialization with Support vector regression


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    1.4s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    1.5s
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.7s
[Parallel(n_jobs=10)]: Done  41 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    2.0s
[Parallel(n_jobs=10)]: Done  65 tasks      | elapsed:    2.2s
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:    2.4s
[Parallel(n_jobs=10)]: Done  93 tasks      | elapsed:    2.7s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    2.9s
[Parallel(n_jobs=10)]: Done 125 tasks      | elapsed:    3.1s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:    3.4s
[Parallel(n_jobs=10)]: Done 161 tasks      | elapsed:    3.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    3.9s
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed:  

No feature filtering is done (fsel = 0)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, 

  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed: 13.4min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed: 73.0min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  r

### test run with a small subset of scrna reference, take randomly 1000 genes from both ref and bulk counts
> reduced hyperparam combinations

In [None]:
hyperpars_test = {
    'Alpha': [1, 10],
    'Alpha0': [0.1, 1, 5],
    'Kappa0': [1, 0.5, 0.1],
    'SY': [1,0.3,0.5],
}

Nrep=3
Nrepfinal=10
Njob=10

df_mean_sample = df_shared_mean.sample(n=2000)
df_std_sample = df_shared_std[df_shared_std.index.isin(df_mean_sample.index)]
Y_sample = df_TCGA_shared[df_TCGA_shared.index.isin(df_mean_sample.index)]

Y = Y_sample.to_numpy()
mean = df_mean_sample.to_numpy() 
sd = df_std_sample.to_numpy()

outfile = './BLADE/data/PuramTCGA_BLADE_Sample2000.pickle'

final_obj, best_obj, best_set, outs = Framework(
    mean, sd, Y,
    Alphas=hyperpars_test['Alpha'], Alpha0s=hyperpars_test['Alpha0'], 
    Kappa0s=hyperpars_test['Kappa0'], SYs=hyperpars_test['SY'],
    Nrep=Nrep, Njob=Njob, Nrepfinal=Nrepfinal)

pickle.dump(
    {
        'final_obj': final_obj,
        'best_obj': best_obj,
        'best_set': best_set,
        'outs' : outs
    }, open(outfile, 'wb')
    )


all of 2000 genes are used for optimization.
All samples are used during the optimization.
Initialization with Support vector regression


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    7.1s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    8.0s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    8.7s
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    8.9s
[Parallel(n_jobs=10)]: Done  41 tasks      | elapsed:   10.4s
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:   11.2s
[Parallel(n_jobs=10)]: Done  65 tasks      | elapsed:   12.0s
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:   12.7s
[Parallel(n_jobs=10)]: Done  93 tasks      | elapsed:   14.1s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:   15.0s
[Parallel(n_jobs=10)]: Done 125 tasks      | elapsed:   16.5s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:   17.8s
[Parallel(n_jobs=10)]: Done 161 tasks      | elapsed:   19.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   20.4s
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed:  

No feature filtering is done (fsel = 0)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, B0, Ngene, Ncell, Nsample)
  g_Exp = g_Exp_Beta(Nu, Omega, Beta, 

  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed: 61.8min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed: 166.9min
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
  return -self.Nsample*np.sum(np.log(Omega))
  return PX+PY+PF-QX-QF
[Parallel(n_jobs=10)]: Done  21 tasks      | el

## Results

In [2]:
BLADE_out = pickle.load(open("/home/cke/BLADE/data/Puramfiltered_TCGA_corDEmarkers_BLADEout.pickle", 'rb'))

In [3]:
obj = BLADE_out['final_obj']
    
outcomes = {
    'BLADE': {
        'Fraction': t(obj.ExpF(obj.Beta)), 
        'Signature': np.mean(obj.Nu, 0), #group mode purification
        'HighRes': obj.Nu                #highresolution mode purification
    }}

In [4]:
obj

<Deconvolution.BLADE.BLADE at 0x7f884ae146d0>

In [5]:
outcomes['BLADE']['HighRes']

array([[[1.65999711, 2.41340815, 1.92168711, ..., 0.9479235 ,
         1.21087763, 1.80333423],
        [1.34301939, 0.92977145, 2.60227063, ..., 1.97223322,
         1.91973334, 2.28399776],
        [1.34301939, 0.92977145, 2.60227063, ..., 1.97223322,
         1.91973334, 2.28399776],
        ...,
        [0.74511718, 2.04816611, 1.53867464, ..., 0.10880362,
         0.00708967, 1.59158936],
        [0.36805438, 0.42242647, 0.48631699, ..., 0.60441956,
         0.42627705, 0.27063994],
        [0.8736406 , 0.79991105, 0.98356807, ..., 1.20515078,
         0.78776488, 0.77316287]],

       [[1.66009543, 2.41441201, 1.92183368, ..., 0.9480201 ,
         1.2108765 , 1.80262034],
        [1.34303971, 0.92970763, 2.60226148, ..., 1.9722637 ,
         1.91971061, 2.28363554],
        [1.34303971, 0.92970763, 2.60226148, ..., 1.9722637 ,
         1.91971061, 2.28363554],
        ...,
        [0.74512207, 2.04820426, 1.53870277, ..., 0.10880898,
         0.00709685, 1.59199642],
        [0.3

In [6]:
filtered_celltypefrac_BLADE = pd.DataFrame(outcomes['BLADE']['Fraction'])
filtered_celltypefrac_BLADE

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,536,537,538,539,540,541,542,543,544,545
0,0.016556,0.015301,0.039342,0.014119,0.013129,0.054413,0.013497,0.011429,0.015393,0.017912,...,0.016356,0.029524,0.015004,0.01676,0.029606,0.048992,0.030525,0.018896,0.018126,0.02413
1,0.063403,0.039511,0.061691,0.060972,0.027258,0.046588,0.0526,0.035493,0.03367,0.085479,...,0.041012,0.040022,0.064081,0.037648,0.039909,0.067665,0.048958,0.06439,0.053089,0.052805
2,0.043757,0.059522,0.123503,0.079168,0.125399,0.114573,0.047118,0.082173,0.085627,0.07907,...,0.078544,0.086754,0.063521,0.083358,0.05638,0.095378,0.049797,0.071475,0.093179,0.082791
3,0.028534,0.023768,0.115462,0.020689,0.15515,0.0933,0.025856,0.058014,0.073275,0.030229,...,0.030917,0.127889,0.02986,0.034178,0.06899,0.118928,0.048317,0.043705,0.03775,0.075082
4,0.069529,0.088064,0.151573,0.093153,0.086169,0.098878,0.048804,0.137143,0.037015,0.108271,...,0.094108,0.101086,0.045451,0.102387,0.063296,0.103254,0.066117,0.102964,0.111042,0.116849
5,0.01593,0.015115,0.011309,0.013378,0.013959,0.024312,0.012675,0.01437,0.011865,0.013871,...,0.015334,0.012556,0.013653,0.014415,0.016726,0.011495,0.012722,0.014948,0.013133,0.012352
6,0.214287,0.20505,0.146397,0.194647,0.15425,0.174789,0.203193,0.183463,0.132944,0.197712,...,0.234678,0.181655,0.227009,0.188126,0.137799,0.184807,0.222996,0.227656,0.189634,0.179838
7,0.029148,0.03346,0.050251,0.059932,0.01853,0.039921,0.050597,0.037408,0.163357,0.045866,...,0.023175,0.029407,0.027207,0.022811,0.040066,0.066358,0.034508,0.014687,0.044785,0.059869
8,0.031785,0.049646,0.022183,0.03498,0.076023,0.033535,0.060258,0.022526,0.070095,0.033317,...,0.040727,0.026167,0.029668,0.047236,0.304943,0.039481,0.023455,0.067743,0.028611,0.027197
9,0.487072,0.470564,0.278289,0.428963,0.330134,0.319692,0.485401,0.417981,0.376759,0.388274,...,0.425148,0.364939,0.484545,0.453081,0.242285,0.263642,0.462605,0.373537,0.410651,0.369086


In [21]:
filtered_celltypefrac_BLADE.columns = df_TCGA_shared.columns

In [22]:
filtered_celltypefrac_BLADE.index = df_shared_mean.columns

In [31]:
list_ind = filtered_celltypefrac_BLADE.index.tolist()
list_ind[6] = 'other'
filtered_celltypefrac_BLADE.index=list_ind

In [34]:
filtered_celltypefrac_BLADE

Unnamed: 0,TCGA-BB-4224-01A,TCGA-H7-7774-01A,TCGA-CV-6943-01A,TCGA-CN-5374-01A,TCGA-CQ-6227-01A,TCGA-CV-6959-01A,TCGA-F7-A61V-01A,TCGA-CV-7413-01A,TCGA-CV-7247-01A,TCGA-CR-5249-01A,...,TCGA-CV-6960-11A,TCGA-CV-A464-01A,TCGA-C9-A47Z-01A,TCGA-CN-6010-01A,TCGA-WA-A7GZ-11A,TCGA-CV-7235-01A,TCGA-CX-7086-01A,TCGA-CV-6935-11A,TCGA-P3-A6SW-01A,TCGA-HD-A6HZ-01A
B cell,0.016556,0.015301,0.039342,0.014119,0.013129,0.054413,0.013497,0.011429,0.015393,0.017912,...,0.016356,0.029524,0.015004,0.01676,0.029606,0.048992,0.030525,0.018896,0.018126,0.02413
Dendritic,0.063403,0.039511,0.061691,0.060972,0.027258,0.046588,0.0526,0.035493,0.03367,0.085479,...,0.041012,0.040022,0.064081,0.037648,0.039909,0.067665,0.048958,0.06439,0.053089,0.052805
Endothelial,0.043757,0.059522,0.123503,0.079168,0.125399,0.114573,0.047118,0.082173,0.085627,0.07907,...,0.078544,0.086754,0.063521,0.083358,0.05638,0.095378,0.049797,0.071475,0.093179,0.082791
Fibroblast,0.028534,0.023768,0.115462,0.020689,0.15515,0.0933,0.025856,0.058014,0.073275,0.030229,...,0.030917,0.127889,0.02986,0.034178,0.06899,0.118928,0.048317,0.043705,0.03775,0.075082
Macrophage,0.069529,0.088064,0.151573,0.093153,0.086169,0.098878,0.048804,0.137143,0.037015,0.108271,...,0.094108,0.101086,0.045451,0.102387,0.063296,0.103254,0.066117,0.102964,0.111042,0.116849
Mast,0.01593,0.015115,0.011309,0.013378,0.013959,0.024312,0.012675,0.01437,0.011865,0.013871,...,0.015334,0.012556,0.013653,0.014415,0.016726,0.011495,0.012722,0.014948,0.013133,0.012352
other,0.214287,0.20505,0.146397,0.194647,0.15425,0.174789,0.203193,0.183463,0.132944,0.197712,...,0.234678,0.181655,0.227009,0.188126,0.137799,0.184807,0.222996,0.227656,0.189634,0.179838
T cell,0.029148,0.03346,0.050251,0.059932,0.01853,0.039921,0.050597,0.037408,0.163357,0.045866,...,0.023175,0.029407,0.027207,0.022811,0.040066,0.066358,0.034508,0.014687,0.044785,0.059869
myocyte,0.031785,0.049646,0.022183,0.03498,0.076023,0.033535,0.060258,0.022526,0.070095,0.033317,...,0.040727,0.026167,0.029668,0.047236,0.304943,0.039481,0.023455,0.067743,0.028611,0.027197
tumor,0.487072,0.470564,0.278289,0.428963,0.330134,0.319692,0.485401,0.417981,0.376759,0.388274,...,0.425148,0.364939,0.484545,0.453081,0.242285,0.263642,0.462605,0.373537,0.410651,0.369086


In [33]:
# ignore this block and above
# use code in runMuSiC to store and access celltypefrac file
filtered_celltypefrac_BLADE.to_csv("/home/cke/BLADE/data/filtered_celltypefrac_BLADE_decor_PuramTCGA.csv")