In [1]:
%load_ext autoreload
%autoreload 2

import os
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import csv
import gzip
import scipy.io

import scipy.sparse as sps

from os.path import join
from sklearn.decomposition import PCA, IncrementalPCA

# Make only GPU 0 visible through the CUDA_VISIBLE_DEVICES environment variable.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Seed NumPy's global random state; no other library's RNG is seeded in this cell.
np.random.seed(seed=1234)

# scanpy logging level 3, then print the package-version banner for provenance.
sc.settings.verbosity = 3
sc.logging.print_header()

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.3 scipy==1.9.3 pandas==1.5.1 scikit-learn==1.1.2 statsmodels==0.13.2 python-igraph==0.10.2 louvain==0.8.0 pynndescent==0.5.7


In [2]:
# Folder that holds the cached ATAC AnnData file.
# NOTE(review): absolute, user-specific path -- adjust when running on another machine.
data_root = '/home/yanxh/data/MCA/scjoint/data_atlas'

# The RNA object is read from ./cache (relative to the working directory),
# the ATAC object from under data_root. The two reads are independent.
adata_rna_facs = sc.read_h5ad('./cache/adata_rna_facs.h5ad')
adata_atac = sc.read_h5ad(os.path.join(data_root, 'adata_atac_cache.h5ad'))

In [4]:
# Per-cell annotation tables (.obs) of the two datasets.
meta_rna, meta_atac = adata_rna_facs.obs, adata_atac.obs

# Stack them row-wise, RNA rows first and ATAC rows second.
meta = pd.concat([meta_rna, meta_atac], axis="index")

# Integration using Portal

In [5]:
import portal

# Specify the GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Create a folder for saving results.
# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race of `if not os.path.exists(...)`. If the path already exists but is not a
# directory, FileExistsError is raised instead of being silently skipped.
result_path = "./result"
os.makedirs(result_path, exist_ok=True)

In [8]:
## standard portal pipeline

# Build the model: 3000 training steps, with the cosine / AE / LA loss weights
# set to 10 and the GAN loss weight set to 1.
model = portal.model.Model(
    training_steps=3000,
    lambdacos=10.,
    lambdaAE=10.,
    lambdaLA=10.,
    lambdaGAN=1.0,
)

# Preprocess and run PCA; RNA is passed first, ATAC second.
model.preprocess(adata_rna_facs, adata_atac, hvg_num=4000, norm_pca=False)

# Train the model.
model.train()

# Compute the integrated latent representation of the cells (read later as model.latent).
model.eval()

# Optional UMAP of the integrated space (disabled):
# portal.utils.plot_UMAP(model.latent, meta, colors=["domain", "cell_type"], save=False, result_path=result_path)

Finding highly variable genes...
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes




--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
Normalizing and scaling...
normalizing counts per cell
    finished (0:00:00)


  hvg_total = hvg_A & hvg_B


... as `zero_center=True`, sparse input is densified and may lead to large memory consumption


  view_to_actual(adata)


normalizing counts per cell
    finished (0:00:00)


  view_to_actual(adata)


... as `zero_center=True`, sparse input is densified and may lead to large memory consumption


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


Dimensionality reduction via PCA...
Begining time:  Tue Mar  7 11:56:40 2023
step 0, loss_D=9.284319, loss_GAN=2.951206, loss_AE=222.320526, loss_cos=19.603338, loss_LA=163.271332
step 200, loss_D=1.674860, loss_GAN=5.944926, loss_AE=9.162988, loss_cos=6.147538, loss_LA=2.821856
step 400, loss_D=1.793508, loss_GAN=4.827241, loss_AE=5.746913, loss_cos=5.099998, loss_LA=1.568431
step 600, loss_D=1.981043, loss_GAN=4.464570, loss_AE=4.823460, loss_cos=4.676253, loss_LA=1.075558
step 800, loss_D=1.645749, loss_GAN=4.494967, loss_AE=4.126327, loss_cos=4.827905, loss_LA=0.883243
step 1000, loss_D=1.589784, loss_GAN=4.762619, loss_AE=4.002243, loss_cos=4.158810, loss_LA=0.765795
step 1200, loss_D=1.626206, loss_GAN=4.761108, loss_AE=3.588576, loss_cos=4.731690, loss_LA=0.701957
step 1400, loss_D=1.439348, loss_GAN=4.873697, loss_AE=3.418447, loss_cos=4.043716, loss_LA=0.526594
step 1600, loss_D=1.439572, loss_GAN=4.758642, loss_AE=3.113201, loss_cos=4.384361, loss_LA=0.480056
step 1800, loss_

In [12]:
from portal.knn_classifier import knn_classifier_top_k, faiss_knn, knn_classifier_prob_concerto
# Ground-truth cell-type labels of the RNA (reference) and ATAC (query) cells.
rna_lab = np.array(adata_rna_facs.obs.cell_type.values)
atac_lab = np.array(adata_atac.obs.cell_type.values)

# model.latent is split by position: the first len(rna_lab) rows are taken as RNA
# and the remainder as ATAC. Fail early with a clear message if the row count
# does not match, instead of silently mis-assigning cells.
n_rna = len(rna_lab)
assert len(model.latent) == n_rna + len(atac_lab), (
    "model.latent row count does not equal #RNA cells + #ATAC cells"
)
feat_A, feat_B = model.latent[:n_rna], model.latent[n_rna:]
# Optional per-row normalisation (disabled):
# feat_A, feat_B = normalize(feat_A, axis=1), normalize(feat_B, axis=1)

# knn_classifier: transfer RNA labels onto the ATAC cells with knn=30.
# atac_pred is compared with atac_lab below; atac_prob is returned alongside it
# (presumably a per-cell confidence -- TODO confirm against portal.knn_classifier).
atac_pred, atac_prob = knn_classifier_prob_concerto(feat_A, feat_B, rna_lab, n_sample=None, knn=30, num_chunks=100)

# ATAC cells whose true label also occurs among the RNA labels.
# np.isin replaces the legacy np.in1d (deprecated in NumPy 2.0); for 1-D input
# the result is the same.
shr_mask = np.isin(atac_lab, np.unique(rna_lab))

# Label-transfer accuracy restricted to the shared cell types (displayed as the cell output).
(np.ravel(atac_pred)[shr_mask] == atac_lab[shr_mask]).mean()

0.7581779962823342

In [8]:
from portal.metrics import osr_evaluator

# Open-set score: the complement of the value returned as atac_prob by the kNN classifier.
open_score = 1 - atac_prob

# Cells whose true label is shared with the RNA data (shr_mask is True):
# predictions, ground truth and open-set scores.
kn_data_pr, kn_data_gt = atac_pred[shr_mask], atac_lab[shr_mask]
kn_data_open_score = open_score[shr_mask]

# Cells whose true label is absent from the RNA data (shr_mask is False).
unk_data_open_score = open_score[~shr_mask]

# Evaluate, then display the four returned metrics as the cell output.
closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(
    kn_data_pr,
    kn_data_gt,
    kn_data_open_score,
    unk_data_open_score,
)
closed_acc, os_auroc, os_aupr, oscr

close_acc= 0.7533
AUROC= 0.7785
AUPR= 0.5925
OSCR= 0.6267


(0.7533137605753696, 0.7785080899484507, 0.5925281976095327, 0.626651700613932)