# BM-CITE

In [1]:
import os
import sys
os.environ["OMP_NUM_THREADS"] = "11"
os.environ["OPENBLAS_NUM_THREADS"] = "8" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "11" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "8" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "11" # export NUMEXPR_NUM_THREADS=6
os.environ["NUMBA_CACHE_DIR"]='/tmp/numba_cache'
import numpy as np
import pandas as pd
import scipy 
import scipy.io as sio
import scipy.sparse as sps
import h5py
from types import SimpleNamespace

import tensorflow as tf
import matplotlib.pyplot as plt
import scanpy as sc

from os.path import join
import sys

os.environ["CUDA_VISIBLE_DEVICES"] = '1'
# physical_devices = tf.config.list_physical_devices('GPU')
# try:
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)
# except:
#     # Invalid device or cannot modify virtual devices once initialized.
#     pass

In [2]:
import scipy
import copy
import gc
def pearson_mat(X0, Y0):
    X, Y = copy.deepcopy(X0), copy.deepcopy(Y0)
    X = (X - X.mean(axis=0))
    X /= (scipy.linalg.norm(X, axis=0, ord=2) + 1e-12)
    Y = (Y - Y.mean(axis=0))
    Y /= (scipy.linalg.norm(Y, axis=0, ord=2) + 1e-12)
    res = (X * Y).sum(axis=0)
    del X, Y
    gc.collect()
    return res

def pearson_mat_axis1(X0, Y0):
    X, Y = copy.deepcopy(X0), copy.deepcopy(Y0)
    X = (X - X.mean(axis=1, keepdims=True))
    X /= (scipy.linalg.norm(X, axis=1, ord=2, keepdims=True) + 1e-12)
    Y = (Y - Y.mean(axis=1, keepdims=True))
    Y /= (scipy.linalg.norm(Y, axis=1, ord=2, keepdims=True) + 1e-12)
    res = (X * Y).sum(axis=1)
    del X, Y
    gc.collect()
    return res

def eval_PearSpear_AlongGene(X, Y):
    pears = pearson_mat(X, Y)
    spears = []
    for gi in range(X.shape[1]):
        spears.append(scipy.stats.spearmanr(X[:, gi], Y[:, gi])[0])
    return pears, spears

def eval_PearSpear_AlongCell(X, Y):
    pear_alongcell = pearson_mat_axis1(X, Y)
    spear_alongcell = []
    for ci in range(X.shape[0]):
        spear_alongcell.append(scipy.stats.spearmanr(X[ci, ], Y[ci, ])[0])
    return pear_alongcell, spear_alongcell
        
def eval_imputation_flatten(x, y):
    pearson_r, pearson_p = scipy.stats.pearsonr(x, y)
    print(f"Found pearson's correlation/p of {pearson_r:.4f}/{pearson_p:.4g}")
    spearman_corr, spearman_p = scipy.stats.spearmanr(x, y)
    print(f"Found spearman's collelation/p of {spearman_corr:.4f}/{spearman_p:.4g}")
    rmse = np.sqrt(np.mean((x - y)**2))
    print(f"Found rmse {rmse:.4f}")
    return pearson_r, spearman_corr, rmse

# Load data

In [3]:
data_dir = "/home/yanxh/data/Seurat_demo_data/bm_cite"

X = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_norm.mtx')).T)
Y = sps.csr_matrix(sio.mmread(join(data_dir, 'adt_mat_norm.mtx')).T)

rna_names = pd.read_csv(join(data_dir, 'gene_names.csv'))['x'].to_numpy()
adt_names = pd.read_csv(join(data_dir, 'adt_names.csv'))['x'].to_numpy()

cell_names = pd.read_csv(join(data_dir, 'cell_names.csv'))['x'].to_numpy()
meta_data = pd.read_csv(join(data_dir, 'metadata.csv'), index_col=0)
meta_data['batch'] = meta_data['donor'].to_numpy()

# select hvg
ad_rna = sc.AnnData(X, obs=meta_data.loc[cell_names])
sc.pp.highly_variable_genes(ad_rna, n_top_genes=5000)
hvg_idx = np.where(ad_rna.var.highly_variable)[0]

train_idx = np.where((meta_data.batch=='batch1').to_numpy())[0]
test_idx  = np.where((meta_data.batch=='batch2').to_numpy())[0]

  ad_rna = sc.AnnData(X, obs=meta_data.loc[cell_names])


In [4]:
mult_X = X[train_idx][:, hvg_idx].A
mult_Y = Y[train_idx].A
single_X  = X[test_idx][:, hvg_idx].A
single_Y  = Y[test_idx].A

n_mult, n_rna, n_adt = mult_X.shape[0], single_X.shape[0], single_Y.shape[0]
m_rna, m_adt = mult_X.shape[1], mult_Y.shape[1]

In [5]:
mult_data = np.hstack([mult_X, mult_Y])

In [6]:
dim_input_arr = [m_rna, m_adt]
config = {
    'dim_input_arr': dim_input_arr,
    'dimensions':[256], 
    'dim_latent':32,
    'dim_block': np.array(dim_input_arr),
    'dist_block':['NB','NB'], 
    'dim_block_enc':np.array([256, 128]),
    'dim_block_dec':np.array([256, 128]),
    'dim_block_embed':np.array([32, 16]),
    
    'block_names':np.array(['rna', 'adt']),
    'uni_block_names':np.array(['rna','adt']),
    
    'beta_kl':1.,
    'beta_unobs':2./3.,
    'beta_modal':np.array([0.15,0.85]),
    'beta_reverse':0.5,

    "p_feat" : 0.2,
    "p_modal" : np.ones(2)/2,
    
}
config = SimpleNamespace(**config)
n_samples = 50

In [7]:
from functools import partial
from scVAEIT.VAEIT import scVAEIT
# model = scVAEIT(config, mosaic_data, masks, batch_ids)  # for integration
model = scVAEIT(config, mult_data)

2023-10-01 18:07:26.388309: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-01 18:07:26.806834: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22306 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:b3:00.0, compute capability: 8.6


In [8]:
if False:
    model.train(
        valid=False, num_epoch=500, batch_size=512, save_every_epoch=50,
        verbose=True, checkpoint_dir='./checkpoint/bm-cite-impute'
    )

We skip the training process here and load the model checkpoint directly.

```
model.train(
        valid=False, num_epoch=500, batch_size=512, save_every_epoch=50,
        verbose=True, checkpoint_dir=path_root+'checkpoint/')
```        

In [9]:
# load the model and ensure it is loaded successfully
checkpoint = tf.train.Checkpoint(net=model.vae)
epoch = 10
status = checkpoint.restore('checkpoint/bm-cite-impute/ckpt-{}'.format(epoch))

# one-step forward?
model.vae(tf.zeros((1,np.sum(model.vae.config.dim_input_arr))),
          tf.zeros((1,np.sum(model.vae.config.dim_input_arr))),
          tf.zeros((1,np.sum(model.batches.shape[1]))), 
          pre_train=True, L=1, training=False)
print(status)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus object at 0x7f8b25cc3880>


2023-10-01 18:07:29.263446: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [10]:
dataset_test = tf.data.Dataset.from_tensor_slices((
    np.hstack([single_X, single_Y]),
    model.cat_enc.transform(np.zeros((test_idx.size, 1))).toarray().astype(np.float32),
    np.zeros(test_idx.size).astype(np.int32)
)).batch(512).prefetch(tf.data.experimental.AUTOTUNE)

# adt->rna
mask_rna = np.zeros((1, m_rna+m_adt), dtype=np.float32)
mask_rna[:,:m_rna] = -1.
recon = model.vae.get_recon(dataset_test, mask_rna)
X_hat = recon[:, :m_rna]

mask_adt = np.zeros((1, m_rna+m_adt), dtype=np.float32)
mask_adt[:, m_rna:] = -1.
recon = model.vae.get_recon(dataset_test, mask_adt)
Y_hat = recon[:, m_rna:]

In [76]:
# batch2: rna->adt
pear1, spear1, rmse = eval_imputation_flatten(single_Y.flatten(), Y_hat.flatten())
pear_along_adt, spear_along_adt = eval_PearSpear_AlongGene(single_Y, Y_hat)
adt_pear_along_cell, adt_spear_along_cell = eval_PearSpear_AlongCell(single_Y, Y_hat)

np.mean(pear_along_adt), np.mean(spear_along_adt), np.mean(adt_pear_along_cell), np.mean(adt_spear_along_cell)

Found pearson's correlation/p of 0.9455/0
Found spearman's collelation/p of 0.9220/0
Found rmse 0.3309


(0.8338684714571624,
 0.6725610199600964,
 0.9473096467951613,
 0.9163671292480389)

In [78]:
# batch3: adt->rna
pear1, spear1, rmse = eval_imputation_flatten(single_X.flatten(), X_hat.flatten())
pear_along_gene, spear_along_gene = eval_PearSpear_AlongGene(single_X, X_hat)
gene_pear_along_cell, gene_spear_along_cell = eval_PearSpear_AlongCell(single_X, X_hat)

np.mean(pear_along_gene), np.mean(spear_along_gene), np.mean(gene_pear_along_cell), np.mean(gene_spear_along_cell)

Found pearson's correlation/p of 0.7605/0
Found spearman's collelation/p of 0.2626/0
Found rmse 0.2658


(0.1285874402664802, nan, 0.764599843625379, 0.2548823767857744)

In [32]:
save_dir = '/home/yanxh/gitrepo/multi-omics-matching/Visualization/outputs/imputation'

adt_metcs = np.vstack([pear_along_adt, spear_along_adt]).T
_df1 = pd.DataFrame(adt_metcs, index=adt_names, columns=['pear', 'spear'])
_df1.to_csv(join(save_dir, 'scVAEIT_bm-cite_along_adt.csv'))

gene_metcs = np.vstack([pear_along_gene, spear_along_gene]).T
_df2 = pd.DataFrame(gene_metcs, index=rna_names[hvg_idx], columns=['pear', 'spear'])
_df2.to_csv(join(save_dir, 'scVAEIT_bm-cite_along_gene.csv'))

In [80]:
save_dir = '/home/yanxh/gitrepo/multi-omics-matching/Visualization/outputs/imputation'

_metcs = np.vstack([adt_pear_along_cell, adt_spear_along_cell, gene_pear_along_cell, gene_spear_along_cell]).T
_df1 = pd.DataFrame(_metcs, index=cell_names[test_idx], columns=['adt_pear', 'adt_spear', 'gene_pear', 'gene_spear'])
_df1.to_csv(join(save_dir, 'scVAEIT_bm-cite_along_cell.csv'))


In [33]:
np.save('./checkpoint/bm-cite-impute/X_test_imputed.npy', X_hat)
np.save('./checkpoint/bm-cite-impute/Y_test_imputed.npy', Y_hat)