In [1]:
import os
import sys
os.environ["OMP_NUM_THREADS"] = "11"
os.environ["OPENBLAS_NUM_THREADS"] = "8" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "11" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "8" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "11" # export NUMEXPR_NUM_THREADS=6
os.environ["NUMBA_CACHE_DIR"]='/tmp/numba_cache'
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse as sps
import h5py

import scvi
import scanpy as sc
import scipy.io as sio

import matplotlib.pyplot as plt
from os.path import join

Global seed set to 0


In [2]:
import scipy
from sklearn.metrics import roc_auc_score
def pearson_mat(X, Y):
    X = (X - X.mean(axis=0))
    X /= (scipy.linalg.norm(X, axis=0, ord=2) + 1e-12)
    Y = (Y - Y.mean(axis=0))
    Y /= (scipy.linalg.norm(Y, axis=0, ord=2) + 1e-12)
    return (X * Y).sum(axis=0)

def eval_pearRmse_AlongGene(X, Y):
    pear = pearson_mat(X, Y)
    rmse = np.sqrt(np.mean((X-Y)**2, axis=0))
    return pear, rmse

def eval_spear_AlongGene(X, Y):
    spears = []
    for gi in range(X.shape[1]):
        spears.append(scipy.stats.spearmanr(X[:, gi], Y[:, gi])[0])
    return spears

def eval_aucRmse_AlongPeak(X, Y):
    aucs, rmses = [], []
    for pi in range(X.shape[1]):
        aucs.append(roc_auc_score(X[:, pi], Y[:, pi]))
        rmses.append(
            np.sqrt(np.mean((X[:, pi] - Y[:, pi])**2))
        )
    return aucs, rmses

def eval_imputation_flatten(x, y):
    pearson_r, pearson_p = scipy.stats.pearsonr(x, y)
    print(f"Found pearson's correlation/p of {pearson_r:.4f}/{pearson_p:.4g}")
    spearman_corr, spearman_p = scipy.stats.spearmanr(x, y)
    print(f"Found spearman's collelation/p of {spearman_corr:.4f}/{spearman_p:.4g}")
    rmse = np.sqrt(np.mean((x - y)**2))
    print(f"Found rmse {rmse:.4f}")
    return pearson_r, spearman_corr, rmse

In [3]:
data_dir = "/home/sda1/yanxh/data/Seurat_demo_data/pbmc_multiome"

# print('Reading `mtx` files...')
X = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_norm.mtx')).T)
Y = sps.csr_matrix(sio.mmread(join(data_dir, 'atac_mat_norm.mtx')).T)
# X_count = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_count.mtx')).T)

rna_names = pd.read_csv(join(data_dir, 'gene_names.csv'))['x'].to_numpy()
atac_names = pd.read_csv(join(data_dir, 'atac_names.csv'))['x'].to_numpy()

cell_names = pd.read_csv(join(data_dir, 'cell_names.csv'))['x'].to_numpy()
meta_data = pd.read_csv(join(data_dir, 'metadata.csv'), index_col=0)

train_idx = pd.read_csv(join(data_dir, 'train_idx.csv'))['0'].to_numpy()
test_idx  = pd.read_csv(join(data_dir, 'test_idx.csv'))['0'].to_numpy()

# select hvg and hvp
ad_rna = sc.AnnData(X, obs=meta_data.loc[cell_names])
sc.pp.highly_variable_genes(ad_rna, n_top_genes=5000)
hvg_idx = np.where(ad_rna.var.highly_variable)[0]

# pick peak startwith chr1-23
valid_atac_idx = [
    _ for _ in range(len(atac_names)) 
    if atac_names[_].startswith('chr') and 
    not atac_names[_].startswith('chrX-') and 
    not atac_names[_].startswith('chrY-')
]
valid_atac_names = atac_names[valid_atac_idx]
Y = Y[:, valid_atac_idx]

hvp_idx = np.argsort(Y.sum(axis=0).A1)[-20000:]
hvp_names = valid_atac_names[hvp_idx]

In [4]:
mult_X = X[train_idx][:, hvg_idx].A
mult_Y = (Y[train_idx][:, hvp_idx]>0).A.astype('float32')

single_X = X[test_idx][:, hvg_idx].A
single_Y = (Y[test_idx][:, hvp_idx] > 0).A.astype('float32')

# ATAC->RNA

In [5]:
adata_paired = sc.AnnData(np.c_[mult_X, mult_Y])
# adata_paired.obs['batch'] = 'batch1'
adata_paired.var['modality'] = ['gene']*hvg_idx.size + ['peak']*hvp_idx.size

adata_atac = sc.AnnData(np.c_[np.zeros_like(single_X), single_Y])
# adata_atac.obs['batch'] = 'batch2'
adata_atac.var['modality'] = ['gene']*hvg_idx.size + ['peak']*hvp_idx.size

adata = scvi.data.organize_multiome_anndatas(adata_paired, adata_atac)

import gc
del adata_atac, adata_paired
gc.collect()


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


366

In [6]:
scvi.model.MULTIVI.setup_anndata(
    adata, 
    batch_key="modality",
#     continuous_covariate_keys=['batch']
)

vae = scvi.model.MULTIVI(
    adata, n_latent=32, gene_likelihood='nb',
    n_genes=hvg_idx.size,
    n_regions=hvp_idx.size,
)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [7]:
# vae.train(
#    train_size=0.9,
#    early_stopping=15
# )

# vae.save('./MultiVI_checkpoint/pbmc-mult/atac2rna', overwrite=True)

vae = scvi.model.MULTIVI.load('./MultiVI_checkpoint/pbmc-mult/atac2rna', adata)

In [8]:
imputed_expr = vae.get_normalized_expression()

X_hat = np.log1p(imputed_expr.iloc[train_idx.size:].values * 1e4)

  x = torch.where(mask_expr.T, x_expr.T, x_acc.T).T


In [27]:
pr, sr, rmse = eval_imputation_flatten(single_X.flatten(), X_hat.flatten())
pear_along_gene, rmse_along_gene = eval_pearRmse_AlongGene(single_X, X_hat)
spear_along_gene = eval_spear_AlongGene(single_X, X_hat)

np.mean(pear_along_gene), np.mean(spear_along_gene) #, np.mean(rmse_along_gene)

Found pearson's correlation/p of 0.5538/0
Found spearman's collelation/p of 0.3408/0
Found rmse 0.8116




(0.14448559890430912, nan)

# RNA->ATAC

In [9]:
adata_paired = sc.AnnData(np.c_[mult_X, mult_Y])
# adata_paired.obs['batch'] = 'batch1'
adata_paired.var['modality'] = ['gene']*hvg_idx.size + ['peak']*hvp_idx.size

adata_rna = sc.AnnData(np.c_[single_X, np.zeros_like(single_Y)])
# adata_atac.obs['batch'] = 'batch2'
adata_rna.var['modality'] = ['gene']*hvg_idx.size + ['peak']*hvp_idx.size

adata = scvi.data.organize_multiome_anndatas(adata_paired, adata_rna)

import gc
del adata_rna, adata_paired
gc.collect()


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


293

In [10]:
scvi.model.MULTIVI.setup_anndata(
    adata, 
    batch_key="modality",
#     continuous_covariate_keys=['batch']
)

vae = scvi.model.MULTIVI(
    adata, n_latent=32, gene_likelihood='nb',
    n_genes=hvg_idx.size,
    n_regions=hvp_idx.size,
)

# vae.train(
#    train_size=0.9,
#    early_stopping=15
# )
# vae.save('./MultiVI_checkpoint/pbmc-mult/rna2atac', overwrite=True)

In [11]:
vae = scvi.model.MULTIVI.load('./MultiVI_checkpoint/pbmc-mult/rna2atac', adata)

In [12]:
imputed_expr = vae.get_accessibility_estimates()
Y_hat = imputed_expr.iloc[train_idx.size:].values

In [17]:
auc = roc_auc_score(single_Y.flatten(), Y_hat.flatten())

auc_along_peak, rmse_along_peak = eval_aucRmse_AlongPeak(single_Y, Y_hat)
auc, np.mean(auc_along_peak) #, np.mean(rmse_along_peak)

(0.7698606149294867, 0.6465899227375057)

In [41]:
save_dir = '/home/yanxh/gitrepo/multi-omics-matching/Visualization/outputs/imputation'

gene_metcs = np.vstack([pear_along_gene, spear_along_gene]).T
_df1 = pd.DataFrame(gene_metcs, index=rna_names[hvg_idx], columns=['pear', 'spear'])
_df1.to_csv(join(save_dir, 'MultiVI_pbmc-mult_along_gene.csv'))

peak_metcs = auc_along_peak
_df2 = pd.DataFrame(peak_metcs, index=hvp_names, columns=['auc'])
_df2.to_csv(join(save_dir, 'MultiVI_pbmc-mult_along_peak.csv'))