In [1]:
import os
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import csv
import gzip
import scipy.io

import scipy.sparse as sps

from os.path import join
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import normalize

# Restrict CUDA-aware libraries to GPU index 0.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# Seed NumPy's legacy global RNG so NumPy-driven randomness is repeatable.
np.random.seed(1234)

# Verbosity 3 enables scanpy's hint-level messages; print_header() logs the
# versions of the main packages in use.
sc.settings.verbosity = 3
sc.logging.print_header()

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.3 scipy==1.9.3 pandas==1.5.1 scikit-learn==1.1.2 statsmodels==0.13.2 python-igraph==0.10.2 louvain==0.8.0 pynndescent==0.5.7


In [2]:
# Root directory that holds the datasets. Set the PORTAL_DATA_ROOT environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing setups keep working unchanged.
data_root = os.environ.get('PORTAL_DATA_ROOT', '/home/yanxh/data')

In [3]:
# RNA modality: read the cached AnnData, scale every cell to a total of 1e4
# counts, then apply log1p. Both scanpy calls modify the object in place.
adata_rna = sc.read_h5ad(join(data_root, 'CITE-ASAP/adata_rna_cache.h5ad'))
sc.pp.normalize_total(adata_rna, target_sum=1e4)
sc.pp.log1p(adata_rna)

# ATAC modality: same two-step normalisation as for RNA.
adata_atac = sc.read_h5ad(join(data_root, 'CITE-ASAP/adata_atac_cache.h5ad'))
sc.pp.normalize_total(adata_atac, target_sum=1e4)
sc.pp.log1p(adata_atac)

# Show both objects as the cell output.
(adata_rna, adata_atac)

normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)


(AnnData object with n_obs × n_vars = 4644 × 17441
     obs: 'cell_type', 'data_type'
     uns: 'log1p',
 AnnData object with n_obs × n_vars = 4502 × 17441
     obs: 'cell_type', 'data_type'
     uns: 'log1p')

In [4]:
# ADT feature matrices for the CITE-seq and ASAP-seq controls, loaded as SciPy
# sparse matrices. Per the original note they are already normalised, so no
# further scaling is applied here.
cite_adt = sps.load_npz(
    join(data_root, 'CITE-ASAP/citeseq_control_adt.npz')
)
asap_adt = sps.load_npz(
    join(data_root, 'CITE-ASAP/asapseq_control_adt.npz')
)

# Display (n_cells, n_features) for each matrix.
(cite_adt.shape, asap_adt.shape)

((4644, 227), (4502, 227))

In [8]:
# Per-cell annotation tables of the two modalities.
meta_rna, meta_atac = adata_rna.obs, adata_atac.obs

# Stack them row-wise, RNA cells first and ATAC cells second.
meta = pd.concat((meta_rna, meta_atac), axis=0)

# Integration using Portal

In [9]:
# Pin the visible GPU before importing portal, so the setting is already in
# place if the import triggers any GPU initialisation.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import portal

# Folder for saving results. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of testing os.path.exists() first.
result_path = "./result"
os.makedirs(result_path, exist_ok=True)

In [10]:
## Standard Portal pipeline: build the model, preprocess, train, embed.

model = portal.model.Model(
    training_steps=2000,
    lambdacos=10.,
    lambdaAE=10.,
    lambdaLA=10.,
    lambdaGAN=1.0,
)

# Preprocessing and PCA on the top 4000 highly variable genes.
# NOTE(review): norm=False is presumably chosen because both AnnData objects
# were already normalised and log-transformed when loaded; confirm against
# the portal documentation.
model.preprocess(
    adata_rna,
    adata_atac,
    norm=False,
    hvg_num=4000,
    norm_pca=False,
)

# Train the model.
model.train()

# Compute the integrated latent representation of the cells (model.latent).
model.eval()

# Optional UMAP of the latent space, left disabled:
# portal.utils.plot_UMAP(model.latent, meta, colors=["data_type", "cell_type"], save=False, result_path=result_path)

Finding highly variable genes...
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes




--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
Normalizing and scaling...
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
Dimensionality reduction via PCA...


  hvg_total = hvg_A & hvg_B
  view_to_actual(adata)
  view_to_actual(adata)
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


Begining time:  Wed Mar 15 18:12:56 2023
step 0, loss_D=9.025234, loss_GAN=2.861747, loss_AE=164.344208, loss_cos=20.485653, loss_LA=123.485123
step 200, loss_D=3.032115, loss_GAN=3.264146, loss_AE=11.616183, loss_cos=7.294182, loss_LA=2.559832
step 400, loss_D=2.388177, loss_GAN=2.580564, loss_AE=7.482092, loss_cos=5.430686, loss_LA=1.181703
step 600, loss_D=2.475388, loss_GAN=2.731363, loss_AE=6.651053, loss_cos=5.141285, loss_LA=0.862355
step 800, loss_D=2.173625, loss_GAN=2.880960, loss_AE=5.651341, loss_cos=4.417048, loss_LA=0.738611
step 1000, loss_D=2.592792, loss_GAN=2.744187, loss_AE=5.678359, loss_cos=4.715260, loss_LA=0.694730
step 1200, loss_D=2.355566, loss_GAN=2.743433, loss_AE=5.313591, loss_cos=4.643137, loss_LA=0.529457
step 1400, loss_D=2.150003, loss_GAN=3.114131, loss_AE=5.138807, loss_cos=4.632844, loss_LA=0.465213
step 1600, loss_D=2.197693, loss_GAN=3.127018, loss_AE=5.000278, loss_cos=4.786707, loss_LA=0.415012
step 1800, loss_D=2.322793, loss_GAN=3.164012, loss

In [11]:
from portal.knn_classifier import knn_classifier_top_k, faiss_knn, knn_classifier_prob_concerto

# Cell-type labels of each modality as plain NumPy arrays.
rna_lab = np.array(adata_rna.obs.cell_type.values)
atac_lab = np.array(adata_atac.obs.cell_type.values)

# Split the joint latent matrix at len(rna_lab): the leading rows are used as
# the RNA embeddings (feat_A) and the remaining rows as the ATAC embeddings
# (feat_B).
feat_A, feat_B = model.latent[:len(rna_lab)], model.latent[len(rna_lab):]
# Optional row-wise L2 normalisation of the embeddings, left disabled:
# feat_A, feat_B = normalize(feat_A, axis=1), normalize(feat_B, axis=1)

# kNN classifier: RNA embeddings and labels act as the reference, and a label
# plus a score is returned for every ATAC cell.
# NOTE(review): the exact meaning of atac_prob is defined by
# knn_classifier_prob_concerto -- confirm before interpreting it.
atac_pred, atac_prob = knn_classifier_prob_concerto(feat_A, feat_B, rna_lab, n_sample=None, knn=30, num_chunks=100)

# Mask of ATAC cells whose true label also occurs among the RNA labels.
# np.isin is NumPy's recommended replacement for the legacy np.in1d and gives
# the same result for 1-D input.
shr_mask = np.isin(atac_lab, np.unique(rna_lab))

# Accuracy restricted to the shared cell types (displayed as the cell output).
(np.ravel(atac_pred)[shr_mask] == atac_lab[shr_mask]).mean()

0.8352177050757758

In [12]:
from portal.metrics import osr_evaluator

# Open-set score: the complement of the kNN score, so a low classifier score
# yields a high open-set score.
open_score = 1 - atac_prob

# "Known" ATAC cells, i.e. those whose true label also occurs in the RNA data.
kn_data_gt = atac_lab[shr_mask]
kn_data_pr = atac_pred[shr_mask]
kn_data_open_score = open_score[shr_mask]

# "Unknown" ATAC cells: every cell outside the shared-label mask.
unk_data_open_score = open_score[~shr_mask]

closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(
    kn_data_pr,
    kn_data_gt,
    kn_data_open_score,
    unk_data_open_score,
)

# Display the four metrics as the cell output.
(closed_acc, os_auroc, os_aupr, oscr)

close_acc= 0.8352
AUROC= 0.5107
AUPR= 0.0730
OSCR= 0.4572


(0.8352177050757758,
 0.5106978625193057,
 0.07299037625711248,
 0.45716043830382164)