In [15]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import scnym
import time
import os
from os.path import join
import scipy.sparse as sps

In [16]:
# --- Experiment configuration ---------------------------------------------
# When True, the loading cell replaces both expression matrices by their
# 0/1 "is expressed" indicator before normalisation.
binz = False
# Name handed to scNym as `config=` when training; it is also embedded in the
# output directory name.
new_ident = 'no_new_identity'
# Short tag used as the prefix of the scNym output directory.
exp_id = 'PBMCMultome'

In [24]:
# NOTE(review): machine-specific absolute path; point this at the local copy of
# the PBMC 10x multiome data before running on another machine.
data_root = '/home/yxh/gitrepo/multi-omics/scJoint-main/data/pbmc_10x'

# Load the three AnnData objects: ATAC, RNA, and the ATAC-derived object stored
# under ATAC_GAM.
adata_atac = sc.read_h5ad(join(data_root, 'ATAC/adata_atac.h5ad'))
adata_rna = sc.read_h5ad(join(data_root, 'RNA/adata_rna.h5ad'))
adata_atac_gam = sc.read_h5ad(join(data_root, 'ATAC_GAM/adata_atac_gam.h5ad'))

# Restrict both modalities to the genes they have in common so that the
# training (RNA) and target (ATAC_GAM) matrices share one feature space.
gene_share = adata_atac_gam.var_names.intersection(adata_rna.var_names)

# Subsetting an AnnData returns a *view*. Both objects are modified below
# (`.obs` and possibly `.X`), which would otherwise trigger an implicit copy and
# the "Trying to set attribute `.obs` of view, copying." warning. Copy
# explicitly so the objects are independent of their parents from the start.
test_adata = adata_atac_gam[:, gene_share].copy()
train_adata = adata_rna[:, gene_share].copy()

# Keep the original annotations: `cell_type` on the target set is overwritten
# later, and the evaluation needs the ground truth.
test_adata.obs['cell_type_bkp'] = test_adata.obs.cell_type.values
train_adata.obs['cell_type_bkp'] = train_adata.obs.cell_type.values

if binz:
    # Optional binarisation: keep only whether a gene is detected (> 0).
    train_adata.X = (train_adata.X>0).astype('float32')
    test_adata.X  = (test_adata.X>0).astype('float32')

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [25]:
# Report (n_cells, n_genes) for each split; `.shape` is the 2-tuple consumed by
# the two `%d` placeholders.
for split_adata, report_template in (
    (train_adata, '%d cells, %d genes in the training set.'),
    (test_adata, '%d cells, %d genes in the target set.'),
):
    print(report_template % split_adata.shape)

10412 cells, 18353 genes in the training set.
10412 cells, 18353 genes in the target set.


In [26]:
# Apply the same preprocessing to both splits, in place:
# scale every cell to a total of 1e6, then take log(1 + x).
for split_adata in (train_adata, test_adata):
    sc.pp.normalize_total(split_adata, target_sum=1e6)
    sc.pp.log1p(split_adata)

In [27]:
# Hide the target labels from the model: every test cell gets the target data
# token "Unlabeled" (the true labels remain in `cell_type_bkp`).
test_adata.obs["cell_type"] = "Unlabeled"

# scNym takes one object holding both the labelled and the unlabelled cells.
# `concatenate` is called on the training set first, then the test set; the
# later copy-back cell looks cells up by the '-0' (train) / '-1' (test)
# suffixes on the merged obs names.
adata = train_adata.concatenate(test_adata)

# Collect the training arguments first, then launch the run.
model_out_dir = f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}"
train_kwargs = dict(
    adata=adata,
    task="train",
    groupby="cell_type",
    config=new_ident,
    out_path=model_out_dir,
)
scnym.api.scnym_api(**train_kwargs)

In [28]:
# Run inference with the model saved by the training cell (same directory
# naming scheme). The later copy-back cell reads `scNym` and
# `scNym_confidence` from `adata.obs`.
trained_model_dir = f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}"
scnym.api.scnym_api(adata=adata, task='predict', trained_model=trained_model_dir)

CUDA compute device found.


Finding cell types:   0%|                                                                                        | 0/21 [00:00<?, ?it/s]

Loaded model predicting 19 classes from 18353 features
['CD14 Mono' 'CD16 Mono' 'CD4 Naive' 'CD4 TCM' 'CD4 TEM' 'CD8 Naive'
 'CD8 TEM_1' 'CD8 TEM_2' 'HSPC' 'Intermediate B' 'MAIT' 'Memory B' 'NK'
 'Naive B' 'Plasma' 'Treg' 'cDC' 'gdT' 'pDC']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|███████████████████████████████████████████████████████████████████████████████| 21/21 [00:04<00:00,  4.87it/s]


Extracting model embeddings...


In [29]:
# copy scNym predictions to the original test data embedding
test_adata.obs['scNym'] = np.array(adata.obs.loc[[x + '-1' for x in test_adata.obs_names], 'scNym'])  
train_adata.obs['scNym'] = np.array(adata.obs.loc[[x + '-0' for x in train_adata.obs_names], 'scNym'])

test_adata.obs['max_prob'] = np.array(adata.obs.loc[[x + '-1' for x in test_adata.obs_names], 'scNym_confidence'])
train_adata.obs['max_prob'] = np.array(adata.obs.loc[[x + '-0' for x in train_adata.obs_names], 'scNym_confidence'])

In [30]:
from metrics import osr_evaluator

# Open-set evaluation of the scNym predictions on the target (test) cells.
#
# A test cell is "known" when its true label also occurs among the training
# labels; all other test cells are "unknown".
# `np.isin` replaces `np.in1d`, which is deprecated as of NumPy 2.0; for these
# 1-D inputs the resulting boolean mask is the same.
shr_mask = np.isin(test_adata.obs.cell_type_bkp, train_adata.obs.cell_type.unique())

# Open-set score: the less confident the prediction, the higher the score.
open_score = 1 - test_adata.obs['max_prob']

# Predictions, ground truth and open-set scores restricted to the known cells.
kn_data_pr = np.array(test_adata.obs['scNym'])[shr_mask]
kn_data_gt = np.array(test_adata.obs['cell_type_bkp'])[shr_mask]
kn_data_open_score = open_score[shr_mask]

# Open-set scores of the cells whose true label is absent from training.
unk_data_open_score = open_score[np.logical_not(shr_mask)]

# NOTE(review): the recorded output shows -1 for the three open-set metrics;
# presumably `osr_evaluator` returns -1 when there are no unknown cells to
# score -- confirm against its implementation.
closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(kn_data_pr, kn_data_gt, kn_data_open_score, unk_data_open_score)
closed_acc, os_auroc, os_aupr, oscr

close_acc= 0.7425


(0.7425086438724549, -1, -1, -1)