In [1]:
import scanpy as sc
import anndata
import muon as mu
import scparadise
import pandas as pd
import warnings 
import torch
import os
warnings.simplefilter('ignore')

  from optuna import progress_bar as pbar_module


In [8]:
# Read the unintegrated MuData object from disk; the cells below use its
# 'rna' and 'adt' modalities.
mdata = mu.read_h5mu(
    '/mnt/c/Users/vadim/Desktop/R/PBMC_ref/CITEseq/3p/mdata_unintegrated.h5mu'
)

In [9]:
# Restrict the RNA modality to the marker genes listed in the 'genes' column
# of genes_for_AI.csv, then let the MuData container refresh itself.
genes = pd.read_csv(
    '/mnt/c/Users/vadim/scRNA/scParadise/scAdam/PBMC/3p/genes_for_AI.csv'
)
mdata.mod['rna'] = mdata.mod['rna'][:, genes['genes']].copy()
mdata.update()

In [10]:
# Samples kept in the reference (training) dataset: 'P1_0' through 'P8_0'
lst_reference = [f'P{donor}_0' for donor in range(1, 9)]

In [11]:
# Build mdata_train: the cells of the unintegrated mdata object (8 donors,
# 24 samples) whose 'orig.ident' is one of the 8 reference samples.
mdata_train = mdata[
    mdata.obs['orig.ident'].isin(lst_reference)
].copy()

In [17]:
# Train an scEve model on mdata_train using the 'rna' and 'adt' modalities,
# capped at 200 epochs and evaluated with RMSE.
scparadise.sceve.train(
    mdata_train,
    path='/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/',
    rna_modality_name='rna',
    protein_modality_name='adt',
    detailed_annotation='celltype_l3',
    model_name='model_small_counts',
    max_epochs=200,
    eval_metric=['rmse'],
)

Successfully saved genes names for training model

Successfully saved proteins names for training model

Train dataset contains: 47782 cells, it is 90.0 % of input dataset
Test dataset contains: 5310 cells, it is 10.0 % of input dataset

Accelerator: cuda
Start training
epoch 0  | loss: 9767.29418| train_rmse: 107.13166809082031| valid_rmse: 103.86260223388672|  0:00:02s
epoch 1  | loss: 7849.0716| train_rmse: 94.767822265625| valid_rmse: 91.11871337890625|  0:00:05s
epoch 2  | loss: 5936.3745| train_rmse: 83.66477966308594| valid_rmse: 79.79672241210938|  0:00:07s
epoch 3  | loss: 4476.73814| train_rmse: 72.63130187988281| valid_rmse: 68.64192199707031|  0:00:10s
epoch 4  | loss: 3605.0523| train_rmse: 67.84993743896484| valid_rmse: 63.72526931762695|  0:00:13s
epoch 5  | loss: 3322.33961| train_rmse: 65.81279754638672| valid_rmse: 62.007720947265625|  0:00:15s


KeyboardInterrupt: 

In [5]:
# Paired test samples: each entry joins two sample IDs with '_'
# (e.g. 'P1_3_P3_3' pairs 'P1_3' with 'P3_3').
lst_test = [
    'P1_3_P3_3',
    'P1_7_P8_3',
    'P2_3_P4_7',
    'P2_7_P6_3',
    'P3_7_P7_3',
    'P4_3_P7_7',
    'P5_3_P8_7',
    'P5_7_P6_7',
]

In [8]:
# Evaluate the pretrained scEve model on every paired test sample and save one
# regression report per pair.
# NOTE(review): this loads '.../scparadise/model_small', while the training cell
# in this notebook uses model_name = 'model_small_counts' -- confirm which model
# is meant to be evaluated.
for folder in lst_test:
    # A pair name such as 'P1_3_P3_3' holds two sample IDs ('P1_3' and 'P3_3').
    # Split on '_' rather than slicing fixed character positions, so that IDs of
    # a different length (e.g. 'P10_3') are still parsed correctly.
    tokens = folder.split('_')
    paired_samples = ['_'.join(tokens[:2]), '_'.join(tokens[2:])]
    # Measured ADT and RNA data of the two paired samples, taken from the
    # unintegrated mdata object (8 donors, 24 samples)
    adata_adt_test = mdata.mod['adt'][mdata.mod['adt'].obs['orig.ident'].isin(paired_samples)].copy()
    adata_rna_test = mdata.mod['rna'][mdata.mod['rna'].obs['orig.ident'].isin(paired_samples)].copy()
    # Predict surface proteins from RNA using the pretrained scEve model
    adata_pred_adt_test = scparadise.sceve.predict(adata_rna_test,
                                                   path_model = '/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/model_small',
                                                   return_mdata = False)
    # Compare measured vs. predicted proteins and save the regression report
    scparadise.scnoah.report_reg(adata_prot = adata_adt_test,
                                 adata_pred_prot = adata_pred_adt_test,
                                 report_name = folder + '_rmse_report_regression.csv',
                                 save_path = '/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/reports_small/',
                                 save_report = True)

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Succ

In [6]:
# Run scEve hyperparameter tuning on mdata_train with a fixed random_state,
# scoring trials by RMSE.
scparadise.sceve.tune(
    mdata_train,
    path='/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/models/',
    rna_modality_name='rna',
    protein_modality_name='adt',
    detailed_annotation='celltype_l3',
    random_state=42,
    model_name='model_regression_tuning_small',
    eval_metric=['rmse'],
)

Successfully saved genes names for training model

Successfully saved proteins names for training model

Accelerator: cuda



[I 2024-08-11 11:35:16,891] A new study created in RDB with name: model_regression_tuning_small_test


Fold 1:
Stop training because you reached max_epochs = 100 with best_epoch = 98 and best_valid_rmse = 0.3267500102519989

Fold 2:
Stop training because you reached max_epochs = 100 with best_epoch = 97 and best_valid_rmse = 0.328029990196228

Fold 3:
Stop training because you reached max_epochs = 100 with best_epoch = 94 and best_valid_rmse = 0.3267199993133545

Fold 4:
Stop training because you reached max_epochs = 100 with best_epoch = 98 and best_valid_rmse = 0.32802000641822815


[I 2024-08-11 11:53:38,212] Trial 0 finished with value: 0.32737844437360764 and parameters: {'n_d': 8, 'n_a': 8, 'n_steps': 3, 'n_shared': 2, 'cat_emb_dim': 1, 'n_independent': 1, 'gamma': 1.3, 'momentum': 0.02, 'lr': 0.01, 'mask_type': 'entmax', 'lambda_sparse': 0.001, 'patience': 10, 'max_epochs': 100, 'virtual_batch_size': 128, 'batch_size': 1024}. Best is trial 0 with value: 0.32737844437360764.



Fold 1:

Early stopping occurred at epoch 22 with best_epoch = 2 and best_valid_rmse = 0.3808700144290924

Fold 2:
Stop training because you reached max_epochs = 25 with best_epoch = 24 and best_valid_rmse = 0.3833799958229065

Fold 3:
Stop training because you reached max_epochs = 25 with best_epoch = 7 and best_valid_rmse = 0.4031600058078766

Fold 4:
Stop training because you reached max_epochs = 25 with best_epoch = 14 and best_valid_rmse = 0.4408999979496002


[I 2024-08-11 12:11:46,426] Trial 1 finished with value: 0.4020775035023689 and parameters: {'n_d': 52, 'n_a': 124, 'n_steps': 8, 'n_shared': 6, 'cat_emb_dim': 2, 'n_independent': 2, 'gamma': 1.0580836121681996, 'momentum': 0.34780869685222476, 'lr': 0.3005973943704301, 'mask_type': 'entmax', 'lambda_sparse': 0.04147225000481637, 'patience': 20, 'max_epochs': 25, 'virtual_batch_size': 150, 'batch_size': 450}. Best is trial 0 with value: 0.32737844437360764.



Fold 1:
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_valid_rmse = 0.3650699853897095

Fold 2:
Stop training because you reached max_epochs = 5 with best_epoch = 3 and best_valid_rmse = 0.37130001187324524

Fold 3:
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_valid_rmse = 0.35995998978614807

Fold 4:
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_valid_rmse = 0.37646999955177307


[I 2024-08-11 12:13:10,443] Trial 2 finished with value: 0.36819906532764435 and parameters: {'n_d': 44, 'n_a': 72, 'n_steps': 5, 'n_shared': 3, 'cat_emb_dim': 7, 'n_independent': 2, 'gamma': 1.2921446485352182, 'momentum': 0.15288111888453979, 'lr': 0.22808938511009624, 'mask_type': 'entmax', 'lambda_sparse': 0.0024428866967349987, 'patience': 15, 'max_epochs': 5, 'virtual_batch_size': 350, 'batch_size': 1050}. Best is trial 0 with value: 0.32737844437360764.



Fold 1:

Early stopping occurred at epoch 31 with best_epoch = 21 and best_valid_rmse = 0.34685999155044556

Fold 2:

Early stopping occurred at epoch 24 with best_epoch = 14 and best_valid_rmse = 0.36215001344680786

Fold 3:
Stop training because you reached max_epochs = 70 with best_epoch = 62 and best_valid_rmse = 0.32892999053001404

Fold 4:

Early stopping occurred at epoch 56 with best_epoch = 46 and best_valid_rmse = 0.32739999890327454


[I 2024-08-11 12:32:02,140] Trial 3 finished with value: 0.34133338928222656 and parameters: {'n_d': 16, 'n_a': 124, 'n_steps': 10, 'n_shared': 9, 'cat_emb_dim': 4, 'n_independent': 1, 'gamma': 1.6842330265121568, 'momentum': 0.18165947255844453, 'lr': 0.06110691359890494, 'mask_type': 'entmax', 'lambda_sparse': 0.02845958019429148, 'patience': 10, 'max_epochs': 70, 'virtual_batch_size': 200, 'batch_size': 1200}. Best is trial 0 with value: 0.32737844437360764.



Fold 1:
Stop training because you reached max_epochs = 30 with best_epoch = 29 and best_valid_rmse = 0.37070000171661377

Fold 2:
Stop training because you reached max_epochs = 30 with best_epoch = 28 and best_valid_rmse = 0.36403998732566833

Fold 3:
Stop training because you reached max_epochs = 30 with best_epoch = 26 and best_valid_rmse = 0.3646700084209442


[I 2024-08-11 12:39:35,212] Trial 4 pruned. 


Fold 1:
Stop training because you reached max_epochs = 10 with best_epoch = 7 and best_valid_rmse = 0.39259999990463257

Fold 2:
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_valid_rmse = 0.38025999069213867

Fold 3:
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_rmse = 0.38387998938560486


[I 2024-08-11 12:41:41,924] Trial 5 pruned. 


Fold 1:

Early stopping occurred at epoch 44 with best_epoch = 24 and best_valid_rmse = 0.38931000232696533

Fold 2:

Early stopping occurred at epoch 33 with best_epoch = 13 and best_valid_rmse = 0.4030199944972992

Fold 3:

Early stopping occurred at epoch 36 with best_epoch = 16 and best_valid_rmse = 0.396699994802475


[I 2024-08-11 12:45:03,398] Trial 6 pruned. 


Fold 1:
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_rmse = 0.3939400017261505


[I 2024-08-11 12:45:30,069] Trial 7 pruned. 


Fold 1:

Early stopping occurred at epoch 18 with best_epoch = 13 and best_valid_rmse = 0.4281400144100189


[I 2024-08-11 12:46:21,554] Trial 8 pruned. 


Fold 1:
Stop training because you reached max_epochs = 30 with best_epoch = 26 and best_valid_rmse = 0.3778899908065796


[I 2024-08-11 12:47:52,316] Trial 9 pruned. 


Fold 1:

Early stopping occurred at epoch 17 with best_epoch = 2 and best_valid_rmse = 0.374099999666214

Fold 2:

Early stopping occurred at epoch 28 with best_epoch = 13 and best_valid_rmse = 0.36226001381874084

Fold 3:


[W 2024-08-11 12:55:44,138] Trial 10 failed with parameters: {'n_d': 100, 'n_a': 8, 'n_steps': 3, 'n_shared': 5, 'cat_emb_dim': 6, 'n_independent': 6, 'gamma': 1.4124969064691433, 'momentum': 0.01424216302093504, 'lr': 0.1399788137781406, 'mask_type': 'sparsemax', 'lambda_sparse': 0.00011936089141726931, 'patience': 15, 'max_epochs': 95, 'virtual_batch_size': 100, 'batch_size': 200} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/vadim/anaconda3/envs/scrna/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/home/vadim/anaconda3/envs/scrna/lib/python3.9/site-packages/scparadise/sceve.py", line 941, in objective
    score = train_params(
  File "/home/vadim/anaconda3/envs/scrna/lib/python3.9/site-packages/scparadise/sceve.py", line 669, in train_params
    clf.fit(
  File "/home/vadim/anaconda3/envs/scrna/lib/python3.9/site-packages/pytorch_tabnet/abstract_model.py

KeyboardInterrupt: 

In [12]:
# Train an scEve model on mdata_train via train_tuned, reporting MSE, MAE and RMSE.
# NOTE(review): path_tuned points at 'model_regression_tuning_v3', whereas the
# tuning cell in this notebook uses model_name = 'model_regression_tuning_small'
# -- confirm this is the intended tuning result.
scparadise.sceve.train_tuned(
    mdata_train,
    path='/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/',
    path_tuned='/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/models/model_regression_tuning_v3/',
    rna_modality_name='rna',
    protein_modality_name='adt',
    detailed_annotation='celltype_l3',
    model_name='model_tuned_final',
    eval_metric=['mse', 'mae', 'rmse'],
)

Successfully saved genes names for training model

Successfully saved proteins names for training model

Train dataset contains: 144909 cells, it is 90.0 % of input dataset
Test dataset contains: 16101 cells, it is 10.0 % of input dataset

Accelerator: cuda
Start training with following hyperparameters: {'n_d': 116, 'n_a': 84, 'n_steps': 4, 'n_shared': 7, 'cat_emb_dim': 5, 'n_independent': 1, 'gamma': 1.182694264124915, 'momentum': 0.3460221802550422, 'optimizer_params': {'lr': 0.022294899647574695}, 'mask_type': 'entmax', 'lambda_sparse': 0.0005581306441453191, 'patience': 20, 'max_epochs': 80, 'virtual_batch_size': 100, 'batch_size': 300, 'device_name': 'cuda'}

epoch 0  | loss: 0.23553 | train_mse: 0.1156499981880188| train_mae: 0.2520900070667267| train_rmse: 0.34007999300956726| valid_mse: 0.11585000157356262| valid_mae: 0.2524299919605255| valid_rmse: 0.3403699994087219|  0:00:22s
epoch 1  | loss: 0.08591 | train_mse: 0.10916999727487564| train_mae: 0.24507999420166016| train_rms

In [9]:
# Evaluate the tuned scEve model on every paired test sample and save one
# regression report per pair.
# NOTE(review): this loads '.../scparadise/model_small_tuned/', while the
# train_tuned cell in this notebook uses model_name = 'model_tuned_final' --
# confirm which model is meant to be evaluated.
for folder in lst_test:
    # A pair name such as 'P1_3_P3_3' holds two sample IDs ('P1_3' and 'P3_3').
    # Split on '_' rather than slicing fixed character positions, so that IDs of
    # a different length (e.g. 'P10_3') are still parsed correctly.
    tokens = folder.split('_')
    paired_samples = ['_'.join(tokens[:2]), '_'.join(tokens[2:])]
    # Measured ADT and RNA data of the two paired samples, taken from the
    # unintegrated mdata object (8 donors, 24 samples)
    adata_adt_test = mdata.mod['adt'][mdata.mod['adt'].obs['orig.ident'].isin(paired_samples)].copy()
    adata_rna_test = mdata.mod['rna'][mdata.mod['rna'].obs['orig.ident'].isin(paired_samples)].copy()
    # Predict surface proteins from RNA using the pretrained scEve model
    adata_pred_adt_test = scparadise.sceve.predict(adata_rna_test,
                                                   path_model = '/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/model_small_tuned/',
                                                   return_mdata = False)
    # Compare measured vs. predicted proteins and save the regression report
    scparadise.scnoah.report_reg(adata_prot = adata_adt_test,
                                 adata_pred_prot = adata_pred_adt_test,
                                 report_name = folder + '_rmse_report_regression_tuned_small.csv',
                                 save_path = '/mnt/c/Users/vadim/scRNA/scParadise/scEVE/PBMC/3p/scparadise/reports_small/',
                                 save_report = True)

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of proteins used for training model

Succ

In [2]:
# Record the package versions of the kernel's environment for reproducibility.
# The explicit %pip magic is used because a bare `pip list` relies on IPython
# automagic, which can be disabled and only applies to single-line cells.
%pip list

Package                       Version
----------------------------- --------------
absl-py                       2.1.0
adjustText                    1.2.0
aiohappyeyeballs              2.3.6
aiohttp                       3.10.3
aiosignal                     1.3.1
alabaster                     1.0.0
alembic                       1.13.2
anndata                       0.10.8
annoy                         1.17.3
anyio                         4.4.0
appdirs                       1.4.4
argon2-cffi                   23.1.0
argon2-cffi-bindings          21.2.0
array_api_compat              1.8
arrow                         1.3.0
asttokens                     2.4.1
async-lru                     2.0.4
attrs                         24.2.0
Babel                         2.14.0
beautifulsoup4                4.12.3
bleach                        6.1.0
bokeh                         3.4.2
branca                        0.7.2
Brotli                        1.1.0
cached-property               1.5.2
cachetools