In [1]:
import anndata as ad
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import math
import pandas as pd
import random
from scipy import stats
import torch
import seaborn as sns
import sys
sys.path.append('..')
from VAE.VAE_model import VAE

In [2]:
def load_VAE(
    checkpoint_path='/data1/lep/Workspace/guided-diffusion/VAE/checkpoint/muris_all/model_seed=0_step=800000.pt',
    num_genes=18996,
    device='cuda',
):
    """Build the VAE and restore its weights from a checkpoint.

    Calling ``load_VAE()`` with no arguments uses the same checkpoint, gene
    count and device as before; the parameters only exist so that other
    datasets/checkpoints can reuse this helper.

    Params:
        checkpoint_path: path of the ``state_dict`` file passed to ``torch.load``.
        num_genes: forwarded to ``VAE(num_genes=...)``; must match the checkpoint.
        device: forwarded to ``VAE(device=...)`` and used as ``map_location``.
    Return:
        The ``VAE`` instance with the checkpoint weights loaded.
    """
    autoencoder = VAE(
        num_genes=num_genes,
        device=device,
        seed=0,
        hparams="",
        decoder_activation='ReLU',
    )
    # map_location remaps the stored tensors onto `device`. Without it,
    # torch.load tries to restore each tensor onto the device index it was
    # saved from, which fails when that index is not visible in this process
    # (CUDA_VISIBLE_DEVICES is restricted to a single GPU in this notebook).
    state_dict = torch.load(checkpoint_path, map_location=device)
    autoencoder.load_state_dict(state_dict)
    # NOTE(review): the model is returned without calling .eval(); confirm
    # whether the VAE has train-time-only layers that should be disabled here.
    return autoencoder

real data

In [3]:
# Load the real dataset and apply basic quality filters in place.
adata = sc.read_h5ad('/data1/lep/Workspace/guided-diffusion/data/tabula_muris/all.h5ad')
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=10)
sc.pp.filter_genes(adata, min_cells=3)
# Captured after filtering; a later cell assigns these names to the decoded
# generated data so both matrices share the same gene axis.
gene_names = adata.var_names

# Scale every cell to a total of 1e4.
# NOTE(review): no log1p is applied here, whereas the CellTypist cell applies
# normalize_total + log1p to the decoded data -- confirm the real and decoded
# matrices compared below are on the same scale.
sc.pp.normalize_total(adata, target_sum=1e4)
# Densify the matrix (.toarray() implies adata.X is sparse). The trailing
# slice is a disabled option to keep only the first 10000 cells.
cell_data = adata.X.toarray()#[:10000]

# Displayed as the cell output: (n_cells, n_genes).
cell_data.shape

  utils.warn_names_duplicates("obs")


(57004, 18996)

generated data

In [4]:
# the generated data path
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load .npz files produced by a trusted pipeline.
npzfile = np.load('/data1/lep/Workspace/guided-diffusion/data/new_version/muris_all.npz', allow_pickle=True)

cell_gen_all = npzfile['cell_gen']

autoencoder = load_VAE()
# Inference only: no_grad stops autograd from recording the decoder pass, so
# no graph/activations are kept alive for the (n_cells x n_genes) output.
# presumably 'cell_gen' holds latent vectors that the VAE decodes -- verify.
with torch.no_grad():
    cell_gen_all = (
        autoencoder(torch.tensor(cell_gen_all).cuda(), return_decoded=True)
        .detach()
        .cpu()
        .numpy()
    )
ori = ad.AnnData(cell_gen_all, dtype=np.float32)
cell_gen = ori.X
# Displayed as the cell output: (n_generated_cells, n_genes).
cell_gen.shape

(57000, 18996)

correlation

In [5]:
# Per-gene mean expression over all real cells and over all generated cells;
# both correlations below compare these two gene-length vectors.
real_gene_means = cell_data.mean(axis=0)
gen_gene_means = cell_gen.mean(axis=0)

spearman_rho = stats.spearmanr(real_gene_means, gen_gene_means).correlation
print('spearman=', spearman_rho)

# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the real-vs-gen entry.
pearson_r = np.corrcoef(real_gene_means, gen_gene_means)[0, 1]
print('pearson=', pearson_r)

spearman= 0.9324554384174221
pearson= 0.9880028185723689


CellTypist classification accuracy for conditional generation

In [7]:
import celltypist

# Expected label for each conditional-generation file: the cells loaded from
# muris_all2_{i}.npz are scored against cato[i].
cato = ['Bladder', 'Heart_and_Aorta', 'Kidney', 'Limb_Muscle', 'Liver',
       'Lung', 'Mammary_Gland', 'Marrow', 'Spleen', 'Thymus', 'Tongue',
       'Trachea']

N_CELLS_PER_TISSUE = 1000   # generated cells evaluated per tissue
DECODE_BATCH_SIZE = 500     # rows sent through the VAE decoder at once

rf = []  # NOTE(review): never filled or read in the visible notebook
diffu_acc = []

# The checkpoint is the same for every tissue, so load it once instead of
# once per loop iteration.
autoencoder = load_VAE()

for i, tissue in enumerate(cato):
    # SECURITY: allow_pickle=True can execute code from the file; trusted inputs only.
    npzfile = np.load('/data1/lep/Workspace/guided-diffusion/data/new_version/muris_all2_'+str(i)+'.npz', allow_pickle=True)
    cell_gen_all = npzfile['cell_gen'][:N_CELLS_PER_TISSUE]

    # Decode in batches derived from the actual row count. The previous fixed
    # range(18) sent 16 empty slices through the decoder for a 1000-row array.
    batch = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for start in range(0, len(cell_gen_all), DECODE_BATCH_SIZE):
            latent = torch.tensor(cell_gen_all[start:start + DECODE_BATCH_SIZE]).cuda()
            batch.append(autoencoder(latent, return_decoded=True).cpu().detach().numpy())
    cell_gen_all = np.concatenate(batch)
    ori = ad.AnnData(cell_gen_all, dtype=np.float32)
    ori.var_names = gene_names

    sc.pp.normalize_total(ori, 1e4)
    sc.pp.log1p(ori)
    predictions = celltypist.annotate(ori, model = '/data1/lep/Workspace/guided-diffusion/checkpoint/celltypist_muris_all_re2.pkl')
    # Fraction of cells predicted as the conditioned tissue. Dividing by the
    # number of cells actually scored keeps the value correct even if a file
    # holds fewer than N_CELLS_PER_TISSUE rows.
    acc = (predictions.predicted_labels.squeeze(1).values == tissue).sum() / ori.n_obs
    diffu_acc.append(acc)
    print(acc)

🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.892


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.586


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.829


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.777


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.77


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.807


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.811


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.789


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.982


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.88


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.964


🔬 Input data has 1000 cells and 18996 genes
🔗 Matching reference genes in the model
🧬 18996 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


0.93


MMD (maximum mean discrepancy between real and generated cells)

In [8]:
import torch

def _pairwise_squared_distances(points, chunk_rows=512):
    """Return the (N, N) matrix of squared Euclidean distances between rows.

    Rows are processed ``chunk_rows`` at a time so the temporary difference
    tensor is (chunk_rows, N, D) instead of (N, N, D).
    """
    n_points = int(points.size(0))
    if n_points == 0:
        return points.new_zeros((0, 0))
    row_blocks = []
    for start in range(0, n_points, chunk_rows):
        block = points[start:start + chunk_rows]
        # diff[i, j, :] = points[j] - block[i]
        diff = points.unsqueeze(0) - block.unsqueeze(1)
        row_blocks.append((diff ** 2).sum(2))
    return torch.cat(row_blocks, dim=0)


def guassian_kernel(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
    '''
    Build the multi-bandwidth Gaussian kernel matrix K over the rows of
    ``source`` stacked on top of the rows of ``target``.

    Each kernel is exp(-d^2 / b_i), where d^2 is the squared Euclidean
    distance between two rows and b_i = b_0 * kernel_mul**i for
    i = 0 .. kernel_num-1, with b_0 = base / kernel_mul**(kernel_num // 2).

    Params:
        source: source-domain data, 2-D tensor (n rows)
        target: target-domain data, 2-D tensor (m rows, same width as source)
        kernel_mul: ratio between consecutive bandwidths
        kernel_num: number of Gaussian kernels that are summed
        fix_sigma: base bandwidth; when falsy (None or 0) the base is the
            sum of all pairwise squared distances divided by (N^2 - N),
            with N = n + m
    Return:
        (n + m, n + m) tensor: the element-wise sum of the kernel matrices
    '''
    n_samples = int(source.size()[0]) + int(target.size()[0])
    total = torch.cat([source, target], dim=0)

    L2_distance = _pairwise_squared_distances(total)

    if fix_sigma:
        bandwidth = fix_sigma
    else:
        # detach(): the data-driven bandwidth is treated as a constant.
        bandwidth = torch.sum(L2_distance.detach()) / (n_samples**2 - n_samples)

    # Out-of-place division: an in-place `/=` here would modify the caller's
    # `fix_sigma` object when it is a tensor.
    bandwidth = bandwidth / (kernel_mul ** (kernel_num // 2))
    bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)]

    kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list]

    return sum(kernel_val)

def mmd_rbf(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
    """Maximum mean discrepancy between two samples under a multi-bandwidth
    Gaussian kernel.

    Params:
        source: 2-D tensor, one sample per row
        target: 2-D tensor with the same width as ``source``; the number of
            rows may differ from ``source``
        kernel_mul, kernel_num, fix_sigma: forwarded to ``guassian_kernel``
    Return:
        0-dim tensor: mean(K_xx) + mean(K_yy) - mean(K_xy) - mean(K_yx)
    """
    n_source = int(source.size()[0])
    kernels = guassian_kernel(source, target,
        kernel_mul=kernel_mul, kernel_num=kernel_num, fix_sigma=fix_sigma)

    # Rows/columns [0, n_source) belong to `source`, the rest to `target`.
    XX = kernels[:n_source, :n_source]
    YY = kernels[n_source:, n_source:]
    XY = kernels[:n_source, n_source:]
    YX = kernels[n_source:, :n_source]
    # Average each block separately. Adding the blocks element-wise first only
    # works when both samples have the same number of rows; with equal sizes
    # the two forms give the same value.
    loss = XX.mean() + YY.mean() - XY.mean() - YX.mean()
    return loss

In [9]:
# Stack the real cells on top of the generated cells in one AnnData and tag
# every row through its obs name, so the two groups can be selected later.
n_real, n_gen = cell_data.shape[0], cell_gen.shape[0]
adata = ad.AnnData(np.concatenate((cell_data, cell_gen), axis=0), dtype=np.float32)
adata.obs_names = ["true_Cell"] * n_real + ["gen_Cell"] * n_gen

In [10]:
# NOTE(review): torch.autograd.Variable is a deprecated wrapper; plain tensors
# behave the same in current PyTorch. Kept here to leave behaviour untouched.
from torch.autograd import Variable

# Joint PCA over real + generated cells, so both groups share one embedding.
sc.tl.pca(adata, svd_solver='arpack')

is_real = adata.obs_names == 'true_Cell'
is_gen = adata.obs_names == 'gen_Cell'

# NOTE(review): this keeps the FIRST 10000 rows of each group, not a random
# sample -- confirm the row order does not bias the estimate.
real = adata[is_real].obsm['X_pca'][:10000]
gen = adata[is_gen].obsm['X_pca'][:10000]

X, Y = Variable(torch.Tensor(real)), Variable(torch.Tensor(gen))
print(mmd_rbf(X, Y))

tensor(0.0629)


scib (iLISI graph score between real and generated cells)

In [11]:
import scib

# Rebuild the combined real + generated AnnData from scratch and label each
# row's origin in obs['batch'], which is the key scib's iLISI reads.
source_labels = ["true_Cell"] * cell_data.shape[0] + ["gen_Cell"] * cell_gen.shape[0]
adata = ad.AnnData(np.concatenate((cell_data, cell_gen), axis=0), dtype=np.float32)
adata.obs['batch'] = pd.Categorical(source_labels)

# kNN graph used by the graph-based iLISI score.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)

# Bare expression so the score is shown as the cell output.
scib.me.ilisi_graph(adata, batch_key="batch", type_="knn")

         Falling back to preprocessing with `sc.pp.pca` and default params.


0.8841768000117711