In [1]:
import warnings 
warnings.simplefilter('ignore')

import scanpy as sc
import muon as mu
import scparadise
import pandas as pd
import os

In [2]:
# Load the unintegrated MuData object (see the file name below).
# NOTE(review): the original comment called this data "normalized" - that cannot
# be confirmed from this notebook; verify against the preprocessing step.
mdata = mu.read_h5mu('PBMC_3p_CITE/mdata_unintegrated.h5mu')

In [3]:
# Restrict the RNA modality to the selected marker genes listed in the
# 'genes' column of genes_for_AI.csv.
genes = pd.read_csv('PBMC_3p_CITE/genes_for_AI.csv')
mdata.mod['rna'] = mdata.mod['rna'][:, genes.genes].copy()
# The 'rna' modality was replaced above, so refresh the MuData container
mdata.update()

In [4]:
# Samples kept in the reference (training) dataset:
# the '_0' sample of each of the 8 donors P1..P8
lst_reference = [f'P{donor}_0' for donor in range(1, 9)]

In [5]:
# Build mdata_train: keep only the cells whose 'orig.ident' label is one of the
# reference samples (8 samples of 8 donors out of the unintegrated object with
# 8 donors and 24 samples), and detach the result with a copy.
is_reference_cell = mdata.obs['orig.ident'].isin(lst_reference)
mdata_train = mdata[is_reference_cell].copy()

In [6]:
# Train default scEve model using mdata_train dataset.
# - 'rna' and 'adt' name the two modalities of mdata_train used for training.
# - 'celltype_l3' is passed as the detailed annotation column.
# - With path = '' and model_name = 'model_PBMC_scEve_default' the model is later
#   loaded through path_model = 'model_PBMC_scEve_default' in the prediction cell.
# - The training log printed below reports a 90 % / 10 % train/test split of the
#   input cells and the RMSE on both parts for every epoch.
scparadise.sceve.train(mdata_train, 
                       path = '',
                       rna_modality_name = 'rna',
                       second_modality_name = 'adt',
                       detailed_annotation = 'celltype_l3',
                       model_name = 'model_PBMC_scEve_default',
                       max_epochs= 200,
                       eval_metric=['rmse'])

Successfully saved genes names for training model

Successfully saved proteins names for training model

Train dataset contains: 47782 cells, it is 90.0 % of input dataset
Test dataset contains: 5310 cells, it is 10.0 % of input dataset

Accelerator: cuda
Start training
epoch 0  | loss: 0.4932  | train_rmse: 0.45833998918533325| valid_rmse: 0.45840001106262207|  0:00:03s
epoch 1  | loss: 0.1396  | train_rmse: 0.4065699875354767| valid_rmse: 0.40573999285697937|  0:00:07s
epoch 2  | loss: 0.11901 | train_rmse: 0.38762998580932617| valid_rmse: 0.38710999488830566|  0:00:10s
epoch 3  | loss: 0.10861 | train_rmse: 0.3716700077056885| valid_rmse: 0.3713800013065338|  0:00:13s
epoch 4  | loss: 0.10343 | train_rmse: 0.3655700087547302| valid_rmse: 0.36517998576164246|  0:00:17s
epoch 5  | loss: 0.10143 | train_rmse: 0.3630099892616272| valid_rmse: 0.36302000284194946|  0:00:20s
epoch 6  | loss: 0.10054 | train_rmse: 0.3608199954032898| valid_rmse: 0.3609299957752228|  0:00:23s
epoch 7  | loss

In [7]:
# Create list with paired test samples.
# Each entry joins two sample ids with '_' (e.g. 'P1_3_P3_3' stands for the
# samples 'P1_3' and 'P3_3'); the evaluation loop splits every entry back into
# its two ids and also uses the full entry as a prefix/suffix of the report names.
lst_test = ['P1_3_P3_3', 'P1_7_P8_3', 'P2_3_P4_7', 'P2_7_P6_3', 'P3_7_P7_3', 'P4_3_P7_7', 'P5_3_P8_7', 'P5_7_P6_7']

In [8]:
# Evaluate the trained scEve model on every pair of held-out test samples.
# For each pair two files are written into report_dir:
#   <pair>_report_regression.csv - regression report from scnoah.report_reg
#   results_pearson_<pair>.csv   - per-protein Pearson coefficient and p-value
report_dir = 'PBMC_3p_CITE/reports_model_PBMC_scEve_default'
# Make sure the output folder exists before anything is written into it, so the
# cell does not depend on the folder being created elsewhere.
os.makedirs(report_dir, exist_ok=True)

for folder in lst_test:
    # Split e.g. 'P1_3_P3_3' into ['P1_3', 'P3_3']. Splitting on '_' (instead of
    # slicing fixed character positions) stays correct for ids of any length,
    # e.g. two-digit donor numbers.
    parts = folder.split('_')
    sample_ids = ['_'.join(parts[:2]), '_'.join(parts[2:])]
    # Create adata_adt_test and adata_rna_test - 2 samples of 2 donors from the
    # unintegrated mdata object (8 donors, 24 samples)
    adata_adt_test = mdata.mod['adt'][mdata.mod['adt'].obs['orig.ident'].isin(sample_ids)].copy()
    adata_rna_test = mdata.mod['rna'][mdata.mod['rna'].obs['orig.ident'].isin(sample_ids)].copy()
    # Predict surface proteins using pretrained scEve model
    adata_pred_adt_test = scparadise.sceve.predict(adata_rna_test,
                                                   path_model = 'model_PBMC_scEve_default',
                                                   return_mdata = False)
    # Create and save regression report
    scparadise.scnoah.report_reg(adata_prot = adata_adt_test,
                                 adata_pred_prot = adata_pred_adt_test,
                                 report_name = folder + '_report_regression.csv',
                                 save_path = report_dir,
                                 save_report = True)
    # Pearson correlation between measured and predicted values, one row per protein.
    # The predicted counterpart of a protein is addressed as '<protein>_pred'.
    df_corr = pd.DataFrame(columns=['Pearson coef', 'p-value'])
    for protein in adata_adt_test.var_names.tolist():
        pearson_coef = scparadise.scnoah.pearson_coef_prot(adata_prot = adata_adt_test,
                                                           adata_pred_prot = adata_pred_adt_test,
                                                           protein = protein,
                                                           protein_pred = protein + '_pred')
        df_corr.loc[protein] = [pearson_coef['Pearson coefficient'], pearson_coef['p-value']]
    # Store p-values as float64 before writing them out
    df_corr['p-value'] = df_corr['p-value'].astype('float64')
    file = "results_pearson_" + folder + ".csv"
    df_corr.to_csv(os.path.join(report_dir, file))

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Successfully loaded model

Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded list of features used for training model

Succ

In [12]:
# Show session information at the end of the notebook so that the environment
# used for this run is recorded alongside the results.
import session_info
session_info.show()