In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import datetime
sys.path.append("../")
from concerto_function5_3 import *
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import scanpy as sc
import scipy.sparse as sps
import matplotlib.pyplot as plt
from metrics import osr_evaluator

from os.path import join

In [2]:
# Optional GPU setup: select one available GPU on a multi-GPU machine.
# The notebook can also run directly on the CPU without executing this cell.
import tensorflow as tf
# Expose only device index 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0' 
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    # Let TensorFlow grow its GPU memory allocation on demand rather than reserving it up front.
    tf.config.experimental.set_memory_growth(gpu, True) 

In [3]:
# Experiment identifier; also used below as the dataset folder name and later
# as the sub-folder for TFRecords and weights.
exp_id = 'HumanFetal_50k'

# obs columns used throughout the notebook: modality label and cell-type label.
batch_key = 'domain'
type_key = 'cell_type'

# Root folder that contains the dataset directory. The default is the original
# author's location; set CONCERTO_DATA_ROOT to run the notebook elsewhere
# without editing this cell.
data_root = os.environ.get('CONCERTO_DATA_ROOT', '/home/yanxh/data')
data_dir = join(data_root, exp_id)

adata_rna = sc.read_h5ad(join(data_dir, 'RNA', 'adata_rna_sampled.h5ad'))
adata_atac = sc.read_h5ad(join(data_dir, 'ATAC', 'adata_atac.h5ad'))

# Tag every cell with its modality so RNA and ATAC can be separated again
# after the two objects are concatenated.
adata_rna.obs[batch_key] = 'RNA'
adata_atac.obs[batch_key] = 'ATAC'

# The RNA annotation is stored under 'Main_cluster_name'; expose it under the
# shared label column.
adata_rna.obs[type_key] = adata_rna.obs['Main_cluster_name'].values

# Later cells read the same label column from the ATAC cells, so fail here
# with a clear message instead of a KeyError far downstream.
if type_key not in adata_atac.obs.columns:
    raise KeyError(
        f"adata_atac.obs has no '{type_key}' column; it is required to evaluate the query cells"
    )

In [4]:
# Stack the RNA and ATAC cells into one AnnData so both modalities go through the same preprocessing call.
adata_all = sc.concat([adata_rna, adata_atac])
# Display the merged object (the recorded run shows 50000 cells x 22121 features).
adata_all

AnnData object with n_obs × n_vars = 50000 × 22121
    obs: 'batch', 'domain', 'cell_type'

# Preprocess

In [5]:
# Start the wall-clock timer for the preprocessing stage.
st_time = datetime.datetime.now()

In [6]:
# Preprocess the merged object. Per the original note this filters cells and
# runs normalize_total, with optional HVG selection and no scaling; HVG
# selection is disabled here (is_hvg=False), and min_features=0 /
# n_top_features=None are passed through unchanged.
adata = preprocessing_rna(adata_all,
                          min_features=0,
                          n_top_features=None,
                          is_hvg=False,
                          batch_key=batch_key)

# Split back into the labelled reference (RNA) and the query (ATAC).
adata_ref = adata[adata.obs[batch_key] == 'RNA']
adata_query = adata[adata.obs[batch_key] == 'ATAC']

# shr_mask[i] is True when query cell i carries a label that also occurs in the
# reference ("shared"/known class); False marks query-only labels.
# np.isin replaces the deprecated np.in1d and gives the same result for 1-D input.
shr_mask = np.isin(adata_query.obs[type_key], adata_ref.obs[type_key].unique())
# Ground-truth labels of the query cells as a plain array, aligned with shr_mask.
atac_lab = np.array(adata_query.obs[type_key].values)

# Base directory under which TFRecords and model weights are written.
save_path = './'
# Optional: persist the split objects for reuse.
# if not os.path.exists(save_path):
#     os.makedirs(save_path)
# adata_ref.write_h5ad(save_path + 'adata_ref.h5ad')
# adata_query.write_h5ad(save_path + 'adata_query.h5ad')

Processed dataset shape: (50000, 22121)


In [7]:
# Stop the preprocessing timer and report the elapsed wall-clock time in seconds.
ed_time = datetime.datetime.now()

pp_cost = (ed_time-st_time).total_seconds()
print('pp cost ', pp_cost)

pp cost  7.655622


In [8]:
# Restart the wall-clock timer for the TFRecord-writing stage.
st_time = datetime.datetime.now()

In [9]:
# Serialize the reference (RNA) cells to TFRecord files under
# <save_path>/tfrecord/<exp_id>/ref_tf/. The return value is kept as the
# reference TFRecord location (presumably the directory written -- verify in
# concerto_make_tfrecord_supervised).
ref_tf_path = concerto_make_tfrecord_supervised(
    adata_ref,
    tf_path=save_path + f'tfrecord/{exp_id}/ref_tf/',
    batch_col_name=batch_key,
    label_col_name=type_key,
)

# Same for the query (ATAC) cells, written to a sibling query_tf/ directory.
query_tf_path = concerto_make_tfrecord_supervised(
    adata_query,
    tf_path=save_path + f'tfrecord/{exp_id}/query_tf/',
    batch_col_name=batch_key,
    label_col_name=type_key,
)

counter: 10000 shape: (22121,), batch: 0
[1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
[9.99999975e-06 9.99999975e-06 9.99999975e-06 ... 9.99999975e-06
 9.99999975e-06 9.99999975e-06]
batchs:  ['RNA']
counter: 20000 shape: (22121,), batch: 0
[1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
[9.99999975e-06 9.99999975e-06 9.99999975e-06 ... 9.99999975e-06
 9.99999975e-06 9.99999975e-06]
batchs:  ['RNA']
counter: 10000 shape: (22121,), batch: 0
[1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
[9.99999975e-06 9.99999975e-06 9.99999975e-06 ... 9.99999975e-06
 9.99999975e-06 9.99999975e-06]
batchs:  ['ATAC']
counter: 20000 shape: (22121,), batch: 0
[1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
[9.99999975e-06 9.99999975e-06 9.99999975e-06 ... 9.99999975e-06
 9.99999975e-06 9.99999975e-06]
batchs:  ['ATAC']
counter: 30000 shape: (22121,), batch: 0
[1.000000e-05 1.000000e-05 1.000000e-05 ... 1.000000e-05 7.105235e-01
 1.000000e-05]
[9.99999975e-06 9.99999975e-06 9.99999975e-06 ... 9.99999975

In [10]:
# Stop the TFRecord timer and report the elapsed wall-clock time in seconds.
ed_time = datetime.datetime.now()

rec_cost = (ed_time-st_time).total_seconds()
print('rec cost ', rec_cost)

rec cost  592.621344


In [11]:
# Restart the wall-clock timer for the training stage.
st_time = datetime.datetime.now()

In [12]:
# Train on the RNA reference and ATAC query TFRecords. If you don't want to train the model,
# you can skip this cell, load already-trained weights from weight_path and test directly.
# The paths are rebuilt from save_path/exp_id, so this cell does not depend on the return
# values of the TFRecord-writing cell.
weight_path = save_path + f'weight/{exp_id}/'
ref_tf_path = save_path + f'tfrecord/{exp_id}/ref_tf/'
query_tf_path = save_path + f'tfrecord/{exp_id}/query_tf/'

# The call is the last expression of the cell, so its return value is displayed
# (the recorded run shows './weight/HumanFetal_50k/').
# NOTE(review): 'epoch_pretrain': 1 and 'epoch_classifier': 10 presumably set the number of
# pretraining and classifier epochs -- confirm in concerto_train_inter_supervised_uda2.
concerto_train_inter_supervised_uda2(ref_tf_path, query_tf_path, weight_path,
                                     super_parameters={'batch_size': 128, 'epoch_pretrain': 1,'epoch_classifier': 10, 'lr': 1e-4,'drop_rate': 0.1})

./tfrecord/HumanFetal_50k/ref_tf/tf_0.tfrecord
Epoch 1, step 5, simclr loss: 10.4744.
Epoch 1, step 10, simclr loss: 10.0509.
Epoch 1, step 15, simclr loss: 9.6125.
Epoch 1, step 20, simclr loss: 9.0899.
Epoch 1, step 25, simclr loss: 8.5401.
Epoch 1, step 30, simclr loss: 7.9691.
Epoch 1, step 35, simclr loss: 7.3916.
Epoch 1, step 40, simclr loss: 6.8161.
Epoch 1, step 45, simclr loss: 6.2730.
Epoch 1, step 50, simclr loss: 5.7423.
Epoch 1, step 55, simclr loss: 5.2429.
Epoch 1, step 60, simclr loss: 4.7736.
Epoch 1, step 65, simclr loss: 4.3378.
Epoch 1, step 70, simclr loss: 3.9373.
Epoch 1, step 75, simclr loss: 3.5794.
Epoch 1, step 80, simclr loss: 3.2663.
Epoch 1, step 85, simclr loss: 2.9830.
Epoch 1, step 90, simclr loss: 2.7295.
Epoch 1, step 95, simclr loss: 2.5057.
Epoch 1, step 100, simclr loss: 2.2989.
Epoch 1, step 105, simclr loss: 2.1126.
Epoch 1, step 110, simclr loss: 1.9410.
Epoch 1, step 115, simclr loss: 1.7849.
Epoch 1, step 120, simclr loss: 1.6396.
Epoch 1, st

'./weight/HumanFetal_50k/'

In [13]:
# Stop the training timer and report the elapsed wall-clock time in seconds.
ed_time = datetime.datetime.now()

train_cost = (ed_time-st_time).total_seconds()
print('train cost ', train_cost)

train cost  7636.278284


In [14]:
# Test: run the trained model on the reference (RNA) and query (ATAC) TFRecords,
# then score the query cells with a kNN classifier on the returned features.
weight_path = save_path + f'weight/{exp_id}/'
ref_tf_path = save_path + f'tfrecord/{exp_id}/ref_tf/'
query_tf_path = save_path + f'tfrecord/{exp_id}/query_tf/'

# NOTE(review): 'epoch' presumably selects which saved checkpoint to load; only 4 is
# evaluated although training used 'epoch_classifier': 10 -- confirm this is intended.
for epoch in [4]:
    # Time only the inference call.
    st_time = datetime.datetime.now()
    results = concerto_test_inter_supervised2(weight_path, ref_tf_path, query_tf_path,
                                         super_parameters = {'batch_size': 64, 'epoch': epoch, 'lr': 1e-5,'drop_rate': 0.1})
    ed_time = datetime.datetime.now()

    test_cost = (ed_time-st_time).total_seconds()
    print('test cost ', test_cost)
    
    # kNN classifier (k=30): label each query cell from the reference cells, using the
    # 'source_feature' / 'target_feature' arrays returned by the test call.
    query_neighbor, query_prob = knn_classifier(results['source_feature'],
                                           results['target_feature'],
                                           adata_ref,
                                           adata_ref.obs_names,
                                           column_name=type_key,
                                           k=30)
    # Open-set score: one minus the returned probability.
    open_score = 1 - query_prob

    # Known-class subset: query cells whose true label also occurs in the reference.
    kn_data_pr = query_neighbor[shr_mask]
    kn_data_gt = atac_lab[shr_mask]
    kn_data_open_score = open_score[shr_mask]

    # Unknown-class subset: query cells whose true label is absent from the reference.
    unk_data_open_score = open_score[np.logical_not(shr_mask)]

    # NOTE(review): the recorded run printed -1 for os_auroc / os_aupr / oscr; presumably a
    # placeholder used when there are no unknown-class cells -- verify in metrics.osr_evaluator.
    closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(kn_data_pr, kn_data_gt, kn_data_open_score, unk_data_open_score)
    print(closed_acc, os_auroc, os_aupr, oscr)


test cost  610.359155
30000 300
close_acc= 0.2099
0.2099 -1 -1 -1


# Neural classifier

In [13]:
# Take the neural classifier's outputs for the query cells from the `results`
# dict produced by the evaluation cell. This rebinds `query_prob`, replacing
# the kNN value computed there.
# NOTE(review): the recorded execution counts repeat (13, 14), so this cell was
# run out of sequence; `results` must exist in the kernel before running it.
query_pred = results['target_pred']
query_prob = results['target_prob']

In [14]:
# Open-set score: one minus the classifier's probability.
open_score = 1 - query_prob

# Known-class subset: query cells whose true label also occurs in the reference.
kn_data_pr, kn_data_gt, kn_data_open_score = (
    query_pred[shr_mask],
    atac_lab[shr_mask],
    open_score[shr_mask],
)

# Unknown-class subset: the complement of the shared-label mask.
unk_data_open_score = open_score[~shr_mask]

# NOTE(review): the recorded run returned -1 for the three open-set metrics;
# presumably a placeholder when there are no unknown-class cells -- verify in
# metrics.osr_evaluator.
closed_acc, os_auroc, os_aupr, oscr = osr_evaluator(
    kn_data_pr, kn_data_gt, kn_data_open_score, unk_data_open_score
)
# Last expression of the cell: display the metrics tuple.
closed_acc, os_auroc, os_aupr, oscr

close_acc= 0.1532


(0.15316666666666667, -1, -1, -1)