In [1]:
import os
import sys
os.environ["OMP_NUM_THREADS"] = "11"
os.environ["OPENBLAS_NUM_THREADS"] = "8" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "11" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "8" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "11" # export NUMEXPR_NUM_THREADS=6
os.environ["NUMBA_CACHE_DIR"]='/tmp/numba_cache'
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse as sps
import h5py

import scvi
import scanpy as sc
import scipy.io as sio

import matplotlib.pyplot as plt
from os.path import join

Global seed set to 0


In [2]:
import scipy
import copy
import gc
def pearson_mat(X0, Y0):
    X, Y = copy.deepcopy(X0), copy.deepcopy(Y0)
    X = (X - X.mean(axis=0))
    X /= (scipy.linalg.norm(X, axis=0, ord=2) + 1e-12)
    Y = (Y - Y.mean(axis=0))
    Y /= (scipy.linalg.norm(Y, axis=0, ord=2) + 1e-12)
    res = (X * Y).sum(axis=0)
    del X, Y
    gc.collect()
    return res

def pearson_mat_axis1(X0, Y0):
    X, Y = copy.deepcopy(X0), copy.deepcopy(Y0)
    X = (X - X.mean(axis=1, keepdims=True))
    X /= (scipy.linalg.norm(X, axis=1, ord=2, keepdims=True) + 1e-12)
    Y = (Y - Y.mean(axis=1, keepdims=True))
    Y /= (scipy.linalg.norm(Y, axis=1, ord=2, keepdims=True) + 1e-12)
    res = (X * Y).sum(axis=1)
    del X, Y
    gc.collect()
    return res

def eval_PearSpear_AlongGene(X, Y):
    pears = pearson_mat(X, Y)
    spears = []
    for gi in range(X.shape[1]):
        spears.append(scipy.stats.spearmanr(X[:, gi], Y[:, gi])[0])
    return pears, spears

def eval_PearSpear_AlongCell(X, Y):
    pear_alongcell = pearson_mat_axis1(X, Y)
    spear_alongcell = []
    for ci in range(X.shape[0]):
        spear_alongcell.append(scipy.stats.spearmanr(X[ci, ], Y[ci, ])[0])
    return pear_alongcell, spear_alongcell
        
def eval_imputation_flatten(x, y):
    pearson_r, pearson_p = scipy.stats.pearsonr(x, y)
    print(f"Found pearson's correlation/p of {pearson_r:.4f}/{pearson_p:.4g}")
    spearman_corr, spearman_p = scipy.stats.spearmanr(x, y)
    print(f"Found spearman's collelation/p of {spearman_corr:.4f}/{spearman_p:.4g}")
    rmse = np.sqrt(np.mean((x - y)**2))
    print(f"Found rmse {rmse:.4f}")
    return pearson_r, spearman_corr, rmse

In [3]:
data_dir = "/home/yanxh/data/Seurat_demo_data/bm_cite"

X_count = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_count.mtx')).T)
X = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_norm.mtx')).T)
Y_count = sps.csr_matrix(sio.mmread(join(data_dir, 'adt_mat_count.mtx')).T)
Y = sps.csr_matrix(sio.mmread(join(data_dir, 'adt_mat_norm.mtx')).T)

rna_names = pd.read_csv(join(data_dir, 'gene_names.csv'))['x'].to_numpy()
adt_names = pd.read_csv(join(data_dir, 'adt_names.csv'))['x'].to_numpy()

cell_names = pd.read_csv(join(data_dir, 'cell_names.csv'))['x'].to_numpy()
meta_data = pd.read_csv(join(data_dir, 'metadata.csv'), index_col=0)
meta_data['batch'] = meta_data['donor'].to_numpy()

# select hvg
ad_rna = sc.AnnData(X, obs=meta_data.loc[cell_names])
sc.pp.highly_variable_genes(ad_rna, n_top_genes=5000)
hvg_idx = np.where(ad_rna.var.highly_variable)[0]

train_idx = np.where((meta_data.batch=='batch1').to_numpy())[0]
test_idx  = np.where((meta_data.batch=='batch2').to_numpy())[0]

X = X[:, hvg_idx].copy()
X_count = X_count[:, hvg_idx].copy()

In [4]:
adata = sc.AnnData(X[train_idx].A)
adata.obsm['ADT'] = Y_count[train_idx].A
adata.layers["counts"] = X_count[train_idx].A
# adata.obs['batch'] = np.r_[gex.obs.batch.to_numpy(), gex_test.obs.batch.to_numpy()]
adata.raw = adata

In [5]:
scvi.model.TOTALVI.setup_anndata(
    adata,
#     batch_key="batch",          # set batch hurts performance badly
    layer="counts",
    protein_expression_obsm_key="ADT"
)

vae = scvi.model.TOTALVI(adata, 
                         n_latent=32,
                         latent_distribution="normal")

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Computing empirical prior initialization for protein background.                                          


In [6]:
if False:
    vae.train(
       train_size=0.9,
       early_stopping=15,
       reduce_lr_on_plateau=False
    )
    vae.save('./TotalVI_checkpoint/bm-cite', overwrite=True)

    vae = scvi.model.TOTALVI.load('./TotalVI_checkpoint/bm-cite', adata)
else:
    vae = scvi.model.TOTALVI.load('./TotalVI_checkpoint/bm-cite', adata)

[34mINFO    [0m Computing empirical prior initialization for protein background.                                          


## ADT->RNA

In [7]:
adata_test = sc.AnnData(np.zeros_like(X[test_idx].A))
adata_test.obsm['ADT'] = Y_count[test_idx].A
adata_test.layers['counts'] = np.zeros_like(X_count[test_idx].A)

X_hat, _ = vae.get_normalized_expression(
    adata=adata_test, 
    n_samples=25,
    return_mean=True,
)

X_hat = np.log1p(X_hat * 1e4)

[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             


In [9]:
x, y = X[test_idx].A, X_hat.values
pr, sr, rmse = eval_imputation_flatten(x.flatten(), y.flatten())

pear_along_gene, spear_along_gene = eval_PearSpear_AlongGene(x, y)
gene_pear_along_cell, gene_spear_along_cell = eval_PearSpear_AlongCell(x, y)

np.mean(pear_along_gene), np.mean(spear_along_gene), np.mean(gene_pear_along_cell), np.mean(gene_spear_along_cell)

Found pearson's correlation/p of 0.5659/0
Found spearman's collelation/p of 0.2414/0
Found rmse 0.6039




(0.07271284787315001, nan, 0.5656557576370216, 0.23602405413260327)

# RNA->ADT

In [8]:
adata_test = sc.AnnData(X[test_idx].A)
adata_test.obsm['ADT'] = np.zeros_like(Y_count[test_idx].A)
adata_test.layers['counts'] = X_count[test_idx].A

In [9]:
_, Y_hat = vae.get_normalized_expression(
    adata=adata_test,
    n_samples=25,
    include_protein_background=True,
    return_mean=True,
)

Y_hat = np.log1p(Y_hat.values * 1e4)
Y_test = np.log1p(Y_count[test_idx].A/np.sum(Y_count[test_idx].A, axis=-1, keepdims=True)*1e4)

[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             
[34mINFO    [0m Found batches with missing protein expression                                                             


In [12]:
pr, sr, rmse = eval_imputation_flatten(Y_test.flatten(), Y_hat.flatten())

pear_along_adt, spear_along_adt = eval_PearSpear_AlongGene(Y_test, Y_hat)
adt_pear_along_cell, adt_spear_along_cell = eval_PearSpear_AlongCell(Y_test, Y_hat)

np.mean(pear_along_adt), np.mean(spear_along_adt), np.mean(adt_pear_along_cell), np.mean(adt_spear_along_cell)

Found pearson's correlation/p of 0.7289/0
Found spearman's collelation/p of 0.6910/0
Found rmse 7.3716


(0.39558970779140684,
 0.32406326067516533,
 0.7564383713584866,
 0.7230228926980063)

In [19]:
save_dir = '/home/yanxh/gitrepo/multi-omics-matching/Visualization/outputs/imputation'

adt_metcs = np.vstack([pear_along_adt, spear_along_adt]).T
_df1 = pd.DataFrame(adt_metcs, index=adt_names, columns=['pear', 'spear'])
_df1.to_csv(join(save_dir, 'TotalVI_bm-cite_along_adt.csv'))

gene_metcs = np.vstack([pear_along_gene, spear_along_gene]).T
_df2 = pd.DataFrame(gene_metcs, index=rna_names[hvg_idx], columns=['pear', 'spear'])
_df2.to_csv(join(save_dir, 'TotalVI_bm-cite_along_gene.csv'))