In [1]:
import os
import time
from os.path import join

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sps
import scnym
import torch

In [2]:
# Experiment tag; it is embedded in the scNym output directory name.
exp_id = "CITE-ASAP"

# When True, both count matrices are binarized (X > 0) before normalization.
binz = False

# Name of the scNym configuration passed as `config=` when training.
# Alternative value: "no_new_identity".
new_ident = "new_identity_discovery"

In [3]:

# Root folder of the scJoint-formatted datasets. The original absolute path stays
# the default; set the SCJOINT_DATA_ROOT environment variable to run elsewhere.
data_root = os.environ.get(
    'SCJOINT_DATA_ROOT',
    '/home/yxh/gitrepo/multi-omics/scJoint-main/data',
)


def _cpm_log1p(adata):
    """Scale each cell to a total of 1e6, then apply log1p; `adata` is modified in place."""
    sc.pp.normalize_total(adata, target_sum=1e6)
    sc.pp.log1p(adata)


def _append_protein_features(adata, adt):
    """Return a new AnnData with X = [adata.X | adt.X] (CSR) that reuses `adata.obs`.

    Raises:
        ValueError: if the two inputs do not have the same number of cells.
    """
    if adata.n_obs != adt.n_obs:
        raise ValueError(
            'cannot append protein features: %d cells vs %d ADT rows'
            % (adata.n_obs, adt.n_obs)
        )
    # NOTE(review): rows are paired purely by position; the .npz files carry no
    # cell names, so matching row order cannot be verified here.
    return sc.AnnData(sps.csr_matrix(sps.hstack([adata.X, adt.X])), obs=adata.obs)


train_adata = sc.read_h5ad(join(data_root, 'CITE-ASAP/adata_rna_cache.h5ad'))
test_adata = sc.read_h5ad(join(data_root, 'CITE-ASAP/adata_atac_cache.h5ad'))
# Back up the true target labels; `cell_type` is overwritten before training.
test_adata.obs['cell_type_bkp'] = test_adata.obs.cell_type.values

if binz:
    train_adata.X = (train_adata.X>0).astype('float32')
    test_adata.X  = (test_adata.X>0).astype('float32')

# preprocess datasets
_cpm_log1p(train_adata)
_cpm_log1p(test_adata)

# protein (ADT) matrices, one per modality
cite_adt = sc.AnnData(sps.load_npz(join(data_root, 'CITE-ASAP/citeseq_control_adt.npz')))
asap_adt = sc.AnnData(sps.load_npz(join(data_root, 'CITE-ASAP/asapseq_control_adt.npz')))
_cpm_log1p(cite_adt)  # for some reason, need to be renormalized
_cpm_log1p(asap_adt)

# concat protein features
train_adata = _append_protein_features(train_adata, cite_adt)
test_adata  = _append_protein_features(test_adata, asap_adt)

In [4]:
# Report the (cells, features) shape of each split.
for template, split in (
    ('%d cells, %d genes in the training set.', train_adata),
    ('%d cells, %d genes in the target set.', test_adata),
):
    print(template % split.shape)

4644 cells, 17668 genes in the training set.
4502 cells, 17668 genes in the target set.


In [5]:
# Per-cell annotation tables of the two modalities.
meta_rna, meta_atac = train_adata.obs, test_adata.obs

# Stack them row-wise (RNA cells first, then ATAC cells) into a new frame.
meta = pd.concat((meta_rna, meta_atac), axis="index")

In [6]:
# Training is stochastic (the closing note of this notebook reports accuracy
# varying between runs), so pin the NumPy and PyTorch seeds before training.
# NOTE(review): GPU kernels may still be nondeterministic, so exact
# repeatability on CUDA is not guaranteed.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# set test data cells to the target data token `"Unlabeled"`
# (the true labels stay available in `test_adata.obs['cell_type_bkp']`)
test_adata.obs["cell_type"] = "Unlabeled"
# concatenate training and test data into a single object
# for scNym; training cells come first, target cells second
adata = train_adata.concatenate(test_adata)

# Train on the labeled cells and use the "Unlabeled" cells as the target set;
# outputs are written under `out_path`.
scnym.api.scnym_api(
    adata=adata,
    task="train",
    groupby="cell_type",
    config=new_ident,
    out_path=f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}",
)

CUDA compute device found.
4502 unlabeled observations found.
Using unlabeled data as a target set for semi-supervised, adversarial training.

training examples:  (4644, 17668)
target   examples:  (4502, 17668)
X:  (4644, 17668)
y:  (4644,)
Not weighting classes and not balancing classes.
Found 2 unique domains.
Using MixMatch for semi-supervised learning
Scaling ICL over 100 epochs, 0 epochs for burn in.
Scaling ICL over 20 epochs, 0 epochs for burn in.
Using a Domain Adaptation Loss.
Training...
Saving best model weights...
Saved best weights.
Trainer has a `dan_criterion`.
Saving DAN weights...
Saving best model weights...
Saved best weights.
Trainer has a `dan_criterion`.
Saving DAN weights...
Saving best model weights...
Saved best weights.
Trainer has a `dan_criterion`.
Saving DAN weights...
>>>>>
Early stopping at epoch 67
>>>>>
Training complete.

Evaluating model.
EVAL LOSS:  0.326552614569664
EVAL ACC :  0.9010752688172043
Predictions | Labels
[[5 1]
 [6 6]
 [5 5]
 [1 1]
 [6 

In [7]:
# Directory written by the training cell; it holds the trained model.
model_dir = f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}"

# Annotate every cell in `adata` with the trained model. The next cell reads
# the resulting `scNym` and `scNym_confidence` columns from `adata.obs`.
scnym.api.scnym_api(adata=adata, task='predict', trained_model=model_dir)

CUDA compute device found.


Finding cell types:  11%|█████████                                                                        | 1/9 [00:00<00:01,  5.90it/s]

Loaded model predicting 7 classes from 17668 features
['B' 'Effector CD4+ T' 'Effector CD8+ T' 'Monocytes' 'NK' 'Naive CD4+ T'
 'Naive CD8+ T']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|█████████████████████████████████████████████████████████████████████████████████| 9/9 [00:01<00:00,  5.31it/s]


Extracting model embeddings...


In [8]:
# Copy scNym predictions from the concatenated object back onto each split.
# The concatenated `adata` is looked up by the original obs name plus a batch
# suffix: '-0' for the training cells and '-1' for the target cells.
for split_adata, batch_suffix in ((test_adata, '-1'), (train_adata, '-0')):
    merged_names = [name + batch_suffix for name in split_adata.obs_names]
    split_preds = adata.obs.loc[merged_names, ['scNym', 'scNym_confidence']]
    # predicted label and its confidence, stored as plain arrays in split order
    split_adata.obs['scNym'] = np.array(split_preds['scNym'])
    split_adata.obs['max_prob'] = np.array(split_preds['scNym_confidence'])

In [9]:
from metrics import osr_evaluator

# True for target cells whose real label also occurs among the training labels
# ("known" classes); False for cells of classes absent from the training set.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0; for 1-D inputs
# both return the same boolean mask.
shr_mask = np.isin(
    np.asarray(test_adata.obs.cell_type_bkp),
    np.asarray(train_adata.obs.cell_type.unique()),
)

# Open-set score: the lower the prediction confidence, the higher the score.
open_score = 1 - test_adata.obs['max_prob']

# Predictions, ground truth and open-set scores restricted to known-class cells.
kn_data_pr = np.array(test_adata.obs['scNym'])[shr_mask]
kn_data_gt = np.array(test_adata.obs['cell_type_bkp'])[shr_mask]
kn_data_open_score = open_score[shr_mask]

# Open-set scores of the cells whose class is absent from the training set.
unk_data_open_score = open_score[np.logical_not(shr_mask)]

# NOTE(review): `osr_evaluator` comes from the project-local `metrics` module;
# its four return values are reported as close_acc / AUROC / AUPR / OSCR.
closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(kn_data_pr, kn_data_gt, kn_data_open_score, unk_data_open_score)
closed_acc, os_auroc, os_aupr, oscr

close_acc= 0.6906
AUROC= 0.6454
AUPR= 0.1564
OSCR= 0.4785


(0.6906422901130623, 0.645433056865842, 0.1563533099832937, 0.4784812068346388)

### We found that scNym's performance was unstable across runs: overall accuracy ranged within [0.65, 0.75].