In [1]:
import warnings 
warnings.simplefilter('ignore')

import scanpy as sc
import anndata
import os
import muon as mu
import scparadise
import pandas as pd
import TOSICA
import numpy as np

In [2]:
# Move to the project root so the relative 'PBMC_3p_CITE/...' paths used below resolve.
# Set the SCPARADISE_WORKDIR environment variable to run on another machine;
# when it is unset the original location is used.
os.chdir(os.environ.get('SCPARADISE_WORKDIR', '/mnt/c/Users/vadim/scRNA/scParadise/scripts_article'))

In [3]:
# Load the unintegrated PBMC MuData object from disk
# (raw counts are taken from its RNA modality and normalized in the next cell)
mdata = mu.read_h5mu('PBMC_3p_CITE/human_PBMC_mdata_unintegrated.h5mu')

In [4]:
# Rebuild a lean AnnData from the RNA modality with the raw counts as X
adata = mdata.mod['rna'].copy()
del mdata  # only the RNA modality is used from here on
adata = anndata.AnnData(X = adata.layers['counts'],
                        obs = adata.obs,
                        var = adata.var,
                        obsm = adata.obsm)


def _subset_samples(source, samples):
    """Return a copy of `source` restricted to cells whose 'orig.ident' is in `samples`.

    Raises:
        ValueError: if no cell matches, so a mistyped sample ID fails here
            instead of silently producing an empty dataset.
    """
    subset = source[source.obs['orig.ident'].isin(samples)].copy()
    if subset.n_obs == 0:
        raise ValueError(f'No cells found for samples {samples}')
    return subset


# Samples that form the first reference (training) dataset
lst_reference_1 = ['P8_0', 'P7_0', 'P2_0', 'P6_0']
# Samples that form the second reference (training) dataset
lst_reference_2 = ['P1_0', 'P5_0', 'P3_0', 'P4_0']
# Two training sets of four `_0` samples each, taken from the unintegrated object
adata_train_1 = _subset_samples(adata, lst_reference_1)
adata_train_2 = _subset_samples(adata, lst_reference_2)

# Eight test sets, each built from a pair of held-out samples
lst_test_samples = [
    ['P1_3', 'P3_3'],
    ['P1_7', 'P8_3'],
    ['P2_3', 'P4_7'],
    ['P2_7', 'P6_3'],
    ['P3_7', 'P7_3'],
    ['P4_3', 'P7_7'],
    ['P5_3', 'P8_7'],
    ['P5_7', 'P6_7'],
]
(adata_test_1, adata_test_2, adata_test_3, adata_test_4,
 adata_test_5, adata_test_6, adata_test_7, adata_test_8) = [
    _subset_samples(adata, pair) for pair in lst_test_samples
]

# Normalize and log-transform every dataset in place
for dataset in [adata_train_1, adata_train_2,
                adata_test_1, adata_test_2, adata_test_3, adata_test_4,
                adata_test_5, adata_test_6, adata_test_7, adata_test_8]:
    dataset.layers['counts'] = dataset.X.copy()  # keep the raw counts before X is overwritten
    sc.pp.normalize_total(dataset, target_sum=None)
    sc.pp.log1p(dataset)
    dataset.raw = dataset  # snapshot of the log-normalized data before any gene subsetting

# Restrict the first training set to the genes listed in the `genes` column of the CSV
genes = pd.read_csv('PBMC_3p_CITE/genes_for_AI.csv')
adata_train_1 = adata_train_1[:, genes.genes].copy()

In [5]:
# Report folder names, one per test dataset (named after the two samples it contains)
lst_test_folders = [
    'P1_3_P3_3',
    'P1_7_P8_3',
    'P2_3_P4_7',
    'P2_7_P6_3',
    'P3_7_P7_3',
    'P4_3_P7_7',
    'P5_3_P8_7',
    'P5_7_P6_7',
]
# Test datasets, listed in the same order as lst_test_folders
lst_test_adatas = [
    adata_test_1, adata_test_2, adata_test_3, adata_test_4,
    adata_test_5, adata_test_6, adata_test_7, adata_test_8,
]
# Annotation columns to train on and evaluate against
lst_annotations = ['celltype_l3', 'celltype_l2', 'celltype_l1']

In [6]:
# Create one report folder per test dataset.
# exist_ok=True keeps the cell re-runnable: without it a second run raises FileExistsError.
for folder in lst_test_folders:
    os.makedirs(os.path.join('PBMC_3p_CITE/TOSICA_test/reports', folder), exist_ok=True)

In [7]:
# Number of training epochs per annotation level
N_EPOCHS = 3
# Checkpoint used for prediction. The original run paired epochs = 3 with 'model-2.pth',
# so this is presumably the file written by the last (zero-based) epoch — TODO confirm
# against TOSICA. Deriving it from N_EPOCHS keeps the two values in sync.
model_weight_path = f'PBMC_3p_CITE/model-{N_EPOCHS - 1}.pth'

for annotation in lst_annotations:
    # Train one TOSICA model per annotation level on the first reference set
    TOSICA.train(adata_train_1,
                 gmt_path = 'human_immune',
                 label_name = annotation,
                 epochs = N_EPOCHS,
                 project = 'PBMC_3p_CITE')
    report_name = f'report_test_TOSICA_{annotation}.csv'
    for adata_test, folder in zip(lst_test_adatas, lst_test_folders):
        # Use the same gene subset as the training data; work on a copy so the
        # test dataset in lst_test_adatas stays untouched for the next annotation level
        test_subset = adata_test[:, genes.genes].copy()
        predicted = TOSICA.pre(test_subset,
                               model_weight_path = model_weight_path,
                               project = 'PBMC_3p_CITE')
        # Compare the 'Prediction' column with the reference annotation and save the report
        scparadise.scnoah.report_classif_full(predicted,
                                              celltype = annotation,
                                              pred_celltype = 'Prediction',
                                              report_name = report_name,
                                              save_path = os.path.join('PBMC_3p_CITE/TOSICA_test/reports/', folder),
                                              save_report = True)

cuda:0
Mask loaded!
Model builded!


[train epoch 0] loss: 1.342, acc: 0.586: 100%|█████████████████████████| 27342/27342 [05:21<00:00, 85.11it/s]
[valid epoch 0] loss: 0.245, acc: 0.923: 100%|████████████████████████| 11718/11718 [01:08<00:00, 170.35it/s]
[train epoch 1] loss: 0.231, acc: 0.930: 100%|█████████████████████████| 27342/27342 [05:23<00:00, 84.44it/s]
[valid epoch 1] loss: 0.091, acc: 0.971: 100%|████████████████████████| 11718/11718 [01:10<00:00, 167.23it/s]
[train epoch 2] loss: 0.077, acc: 0.982: 100%|█████████████████████████| 27342/27342 [05:30<00:00, 82.81it/s]
[valid epoch 2] loss: 0.024, acc: 0.993: 100%|████████████████████████| 11718/11718 [01:20<00:00, 146.41it/s]


Training finished!
cuda:0
0
10000
10823
Successfully saved report

cuda:0
0
10000
14926
Successfully saved report

cuda:0
0
10000
11647
Successfully saved report

cuda:0
0
10000
13035
Successfully saved report

cuda:0
0
10000
13123
Successfully saved report

cuda:0
0
10000
14516
Successfully saved report

cuda:0
0
10000
14983
Successfully saved report

cuda:0
0
10000
14865
Successfully saved report

cuda:0
Mask loaded!
Model builded!


[train epoch 0] loss: 1.236, acc: 0.577: 100%|█████████████████████████| 12816/12816 [02:40<00:00, 79.78it/s]
[valid epoch 0] loss: 0.101, acc: 0.970: 100%|██████████████████████████| 5492/5492 [00:34<00:00, 157.26it/s]
[train epoch 1] loss: 0.123, acc: 0.969: 100%|█████████████████████████| 12816/12816 [02:40<00:00, 79.78it/s]
[valid epoch 1] loss: 0.052, acc: 0.985: 100%|██████████████████████████| 5492/5492 [00:32<00:00, 170.96it/s]
[train epoch 2] loss: 0.072, acc: 0.983: 100%|█████████████████████████| 12816/12816 [02:34<00:00, 83.06it/s]
[valid epoch 2] loss: 0.034, acc: 0.990: 100%|██████████████████████████| 5492/5492 [00:31<00:00, 176.32it/s]


Training finished!
cuda:0
0
10000
10823
Successfully saved report

cuda:0
0
10000
14926
Successfully saved report

cuda:0
0
10000
11647
Successfully saved report

cuda:0
0
10000
13035
Successfully saved report

cuda:0
0
10000
13123
Successfully saved report

cuda:0
0
10000
14516
Successfully saved report

cuda:0
0
10000
14983
Successfully saved report

cuda:0
0
10000
14865
Successfully saved report

cuda:0
Mask loaded!
Model builded!


[train epoch 0] loss: 0.782, acc: 0.671: 100%|███████████████████████████| 6119/6119 [01:10<00:00, 86.26it/s]
[valid epoch 0] loss: 0.114, acc: 0.982: 100%|██████████████████████████| 2622/2622 [00:14<00:00, 177.04it/s]
[train epoch 1] loss: 0.095, acc: 0.983: 100%|███████████████████████████| 6119/6119 [01:12<00:00, 84.71it/s]
[valid epoch 1] loss: 0.042, acc: 0.989: 100%|██████████████████████████| 2622/2622 [00:14<00:00, 176.75it/s]
[train epoch 2] loss: 0.042, acc: 0.990: 100%|███████████████████████████| 6119/6119 [01:11<00:00, 85.98it/s]
[valid epoch 2] loss: 0.023, acc: 0.994: 100%|██████████████████████████| 2622/2622 [00:14<00:00, 176.73it/s]


Training finished!
cuda:0
0
10000
10823
Successfully saved report

cuda:0
0
10000
14926
Successfully saved report

cuda:0
0
10000
11647
Successfully saved report

cuda:0
0
10000
13035
Successfully saved report

cuda:0
0
10000
13123
Successfully saved report

cuda:0
0
10000
14516
Successfully saved report

cuda:0
0
10000
14983
Successfully saved report

cuda:0
0
10000
14865
Successfully saved report



In [11]:
pip list

Package                   Version
------------------------- --------------
absl-py                   2.1.0
aiohappyeyeballs          2.3.4
aiohttp                   3.10.1
aiosignal                 1.3.1
alembic                   1.13.2
anndata                   0.10.8
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
array_api_compat          1.8
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
async-timeout             4.0.3
attrs                     24.2.0
Babel                     2.15.0
beautifulsoup4            4.12.3
bleach                    6.1.0
cached-property           1.5.2
cell-gears                0.0.2
certifi                   2024.7.4
cffi                      1.17.0
charset-normalizer        3.3.2
chex                      0.1.86
click                     8.1.7
cloudpickle               3.1.1
colorlog                  6.8.2
comm                      0.2.2
contextlib2       