In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import scnym
import time
import datetime
import os
from os.path import join
import scipy.sparse as sps

In [2]:
# --- Run configuration -------------------------------------------------------
# Experiment tag; it is embedded in the scNym output directory name.
exp_id = "HumanFetal"
# When True, the train/test matrices are binarized (X > 0) right after loading.
binz = False
# Name of the scNym configuration; passed as `config=` to the training call and
# also embedded in the output directory name.
new_ident = "no_new_identity"

In [3]:
# Root directory holding the ATAC/ and RNA/ sub-folders for this dataset.
data_root = '/home/yxh/data/HumanFetal'

# Load the two modalities from their .h5ad files.
atac_h5ad_path = join(data_root, 'ATAC/adata_atac.h5ad')
rna_h5ad_path = join(data_root, 'RNA/adata_rna_sampled.h5ad')
adata_atac = sc.read_h5ad(atac_h5ad_path)
adata_rna = sc.read_h5ad(rna_h5ad_path)

# The RNA annotation lives in `Main_cluster_name`; expose it under the common
# `cell_type` column name used by the rest of the notebook.
adata_rna.obs['cell_type'] = adata_rna.obs['Main_cluster_name'].values

# Work on copies so the freshly loaded objects stay untouched:
# ATAC is the (target) test set, RNA is the (labelled) training set.
test_adata = adata_atac.copy()
train_adata = adata_rna.copy()

# Keep a backup of the labels; `cell_type` on the test set is overwritten
# with "Unlabeled" before training.
for labelled_split in (test_adata, train_adata):
    labelled_split.obs['cell_type_bkp'] = labelled_split.obs['cell_type'].values

# Optional binarization: replace every entry of X by 1.0 if it is > 0, else 0.0.
if binz:
    for binarized_split in (train_adata, test_adata):
        binarized_split.X = (binarized_split.X > 0).astype('float32')

In [4]:
# Report (n_cells, n_genes) for both splits as a quick sanity check.
n_train_cells, n_train_genes = train_adata.shape
n_test_cells, n_test_genes = test_adata.shape
print('%d cells, %d genes in the training set.' % (n_train_cells, n_train_genes))
print('%d cells, %d genes in the target set.' % (n_test_cells, n_test_genes))

433695 cells, 22121 genes in the training set.
656074 cells, 22121 genes in the target set.


In [5]:
# Preprocess both datasets identically (in place): scale every cell to a total
# of 1e6 counts, then apply log1p.
# NOTE: this cell mutates `train_adata` / `test_adata`; re-running it would
# normalize already-normalized data.
for split_to_normalize in (train_adata, test_adata):
    sc.pp.normalize_total(split_to_normalize, target_sum=1e6)
    sc.pp.log1p(split_to_normalize)

In [6]:
# Mark every test (target) cell with the token "Unlabeled" in the `cell_type`
# column that is used as `groupby` below. The true labels remain available in
# `cell_type_bkp`.
test_adata.obs["cell_type"] = "Unlabeled"

# scNym takes a single object: stack training and test cells together.
# Later cells look the rows up again via a '-0' (train) / '-1' (test) suffix
# on the obs names.
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases
# in favor of `anndata.concat` — confirm before upgrading anndata.
adata = train_adata.concatenate(test_adata)

# Wall-clock timer starts immediately before training; it is read in a later cell.
start_time = datetime.datetime.now()

# Output directory for the trained model; the prediction cell rebuilds the same path.
scnym_out_dir = f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}"
scnym.api.scnym_api(
    adata=adata,
    task="train",
    groupby="cell_type",
    config=new_ident,
    out_path=scnym_out_dir,
)

CUDA compute device found.
656074 unlabeled observations found.
Using unlabeled data as a target set for semi-supervised, adversarial training.

training examples:  (433695, 22121)
target   examples:  (656074, 22121)
X:  (433695, 22121)
y:  (433695,)
Not weighting classes and not balancing classes.
Found 2 unique domains.
Using MixMatch for semi-supervised learning
Scaling ICL over 100 epochs, 0 epochs for burn in.
Scaling ICL over 20 epochs, 0 epochs for burn in.
Using a Domain Adaptation Loss.
Training...
Saving best model weights...
Saved best weights.
Trainer has a `dan_criterion`.
Saving DAN weights...
>>>>>
Early stopping at epoch 61
>>>>>
Training complete.

Evaluating model.
EVAL LOSS:  1.271226515489466
EVAL ACC :  0.9451694719852433
Predictions | Labels
[[41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]
 [41 41]]


In [7]:
# Run prediction on the concatenated object with the model saved by the
# training cell (same directory naming scheme). The following cells read the
# results from `adata.obs['scNym']` and `adata.obs['scNym_confidence']`.
trained_model_dir = f"./scnym_outputs/{exp_id}_binz={binz}_{new_ident}"
scnym.api.scnym_api(
    adata=adata,
    task='predict',
    trained_model=trained_model_dir,
)

CUDA compute device found.


Finding cell types:   0%|                                                                                      | 0/1065 [00:00<?, ?it/s]

Loaded model predicting 54 classes from 22121 features
['Acinar cells' 'Adrenocortical cells' 'Antigen presenting cells'
 'Astrocytes' 'Bronchiolar and alveolar epithelial cells' 'Cardiomyocytes'
 'Chromaffin cells' 'Ciliated epithelial cells' 'Ductal cells' 'ENS glia'
 'ENS neurons' 'Endocardial cells' 'Epicardial fat cells' 'Erythroblasts'
 'Excitatory neurons' 'Extravillous trophoblasts' 'Ganglion cells'
 'Goblet cells' 'Granule neurons' 'Hematopoietic stem cells'
 'Hepatoblasts' 'IGFBP1_DKK1 positive cells' 'Inhibitory neurons'
 'Intestinal epithelial cells' 'Islet endocrine cells'
 'Limbic system neurons' 'Lymphatic endothelial cells' 'Lymphoid cells'
 'Megakaryocytes' 'Mesangial cells' 'Mesothelial cells'
 'Metanephric cells' 'Myeloid cells' 'Neuroendocrine cells'
 'PAEP_MECOM positive cells' 'Parietal and chief cells'
 'Photoreceptor cells' 'Purkinje neurons' 'Retinal pigment cells'
 'Retinal progenitors and Muller glia' 'SKOR2_NPSR1 positive cells'
 'Satellite cells' 'Schwann c

Finding cell types: 100%|███████████████████████████████████████████████████████████████████████████| 1065/1065 [03:53<00:00,  4.55it/s]


Extracting model embeddings...


In [8]:
# Elapsed wall-clock time in seconds since `start_time`, which was taken just
# before training. The figure therefore covers training + prediction, plus any
# idle time between running those cells.
end_time = datetime.datetime.now()
elapsed_seconds = (end_time - start_time).total_seconds()
print('time cost ', elapsed_seconds)

time cost  10059.104811


In [8]:
# Copy scNym's per-cell outputs from the concatenated object back onto the
# separate train/test objects. Rows of `adata` are looked up by the original
# obs name plus a '-0' (train) or '-1' (test) suffix.
test_rows = [obs_name + '-1' for obs_name in test_adata.obs_names]
train_rows = [obs_name + '-0' for obs_name in train_adata.obs_names]

# Predicted label ...
test_adata.obs['scNym'] = np.array(adata.obs.loc[test_rows, 'scNym'])
train_adata.obs['scNym'] = np.array(adata.obs.loc[train_rows, 'scNym'])

# ... and the confidence reported alongside it, stored as `max_prob`.
test_adata.obs['max_prob'] = np.array(adata.obs.loc[test_rows, 'scNym_confidence'])
train_adata.obs['max_prob'] = np.array(adata.obs.loc[train_rows, 'scNym_confidence'])

In [9]:
# Open-set evaluation of the scNym predictions on the test (ATAC) cells.
# `osr_evaluator` comes from the project-local `metrics` module.
from metrics import osr_evaluator

# Boolean mask over test cells: True where the ground-truth label also occurs
# among the training labels ("shared"/known classes), False otherwise.
# `np.isin` replaces `np.in1d`, which is deprecated as of NumPy 2.0; for 1-D
# input the result is identical.
shr_mask = np.isin(test_adata.obs.cell_type_bkp, train_adata.obs.cell_type.unique())

# test_acc = (test_adata.obs.scNym[shr_mask] == test_adata.obs.cell_type_bkp[shr_mask]).mean()
# train_acc = (train_adata.obs.scNym == train_adata.obs.cell_type).mean()

# Open-set score: one minus the prediction confidence, so a higher score means
# the model was less confident about the cell.
open_score = 1 - test_adata.obs['max_prob']

# Known-class cells: predictions, ground truth and open-set scores.
kn_data_pr = np.array(test_adata.obs['scNym'])[shr_mask]
kn_data_gt = np.array(test_adata.obs['cell_type_bkp'])[shr_mask]
kn_data_open_score = open_score[shr_mask]

# Cells whose ground-truth label never appears in the training set.
unk_data_open_score = open_score[~shr_mask]

# NOTE(review): the evaluator's return values are taken here to be
# (closed-set accuracy, open-set AUROC, open-set AUPR, OSCR) based on the
# variable names only — confirm against `metrics.osr_evaluator`.
closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(kn_data_pr, kn_data_gt, kn_data_open_score, unk_data_open_score)
closed_acc, os_auroc, os_aupr, oscr

close_acc= 0.4893


(0.4892816968817542, -1, -1, -1)