In [11]:
import celltypist
import scanpy as sc
import anndata
import muon as mu
import scparadise
import pandas as pd
import os
import warnings 
# Ignore every Python warning raised from here on (applies to the whole kernel session).
# NOTE(review): this also hides deprecation warnings from the imported libraries — confirm that is intended.
warnings.simplefilter('ignore')

In [12]:
# Read the MuData container from disk.
# NOTE(review): the file name says "unintegrated" — confirm this is the intended input.
mdata = mu.read_h5mu(
    '/mnt/c/Users/vadim/Desktop/R/PBMC_ref/CITEseq/3p/mdata_unintegrated.h5mu'
)

In [13]:
# Keep an independent copy of the 'rna' modality, then drop the MuData object (mdata)
rna_modality = mdata.mod['rna']
adata = rna_modality.copy()
del mdata, rna_modality

In [14]:
# Rebuild the AnnData object so that X holds a copy of the 'counts' layer;
# obs, var, obsm, uns, varm and layers are passed through from the previous object.
adata = anndata.AnnData(
    X=adata.layers['counts'].copy(),
    **{
        slot: getattr(adata, slot)
        for slot in ('obs', 'var', 'obsm', 'uns', 'varm', 'layers')
    },
)

In [15]:
# Restrict the AnnData object to the marker genes listed in the 'genes' column of the CSV
genes = pd.read_csv(
    '/mnt/c/Users/vadim/scRNA/scParadise/scAdam/PBMC/3p/genes_for_AI.csv'
)
adata = adata[:, genes['genes']].copy()

In [16]:
# Scale every cell to 10,000 total counts, then apply log1p
# (per the original note, this is the operating condition of the celltypist tool).
sc.pp.normalize_total(adata, target_sum=10_000)
sc.pp.log1p(adata)



In [17]:
# Training set: cells whose 'orig.ident' is one of P1_0 ... P8_0
adata_train = adata[
    adata.obs['orig.ident'].isin([f'P{donor}_0' for donor in range(1, 9)])
].copy()
adata_train

AnnData object with n_obs × n_vars = 53092 × 635
    obs: 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'celltype_l3', 'celltype_l2', 'celltype_l1'
    var: 'gene_ids', 'feature_types', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [18]:
# Train one celltypist model per annotation level (celltype_l1, celltype_l2, celltype_l3)
# and save each of them as '<level>.pkl' in the models directory.
lst_annotations = ['celltype_l1', 'celltype_l2', 'celltype_l3']
model_dir = '/mnt/c/Users/vadim/scRNA/scParadise/scAdam/PBMC/3p/celltypist/models_small'
# Create the output directory up front so a missing folder is not discovered
# only after the first model has finished training.
os.makedirs(model_dir, exist_ok=True)
for annotation_level in lst_annotations:
    # Training step (feature_selection=False: no additional gene selection is requested)
    model = celltypist.train(adata_train, labels=annotation_level, feature_selection=False, n_jobs=12)
    # Save the model
    model.write(os.path.join(model_dir, annotation_level + '.pkl'))

🍳 Preparing data before training
🔬 Input data has 53092 cells and 635 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!
🍳 Preparing data before training
🔬 Input data has 53092 cells and 635 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!
🍳 Preparing data before training
🔬 Input data has 53092 cells and 635 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!


In [20]:
# Test datasets: each name joins two sample ids with '_' (e.g. 'P1_3' and 'P3_3')
lst_test = [
    'P1_3_P3_3',
    'P1_7_P8_3',
    'P2_3_P4_7',
    'P2_7_P6_3',
    'P3_7_P7_3',
    'P4_3_P7_7',
    'P5_3_P8_7',
    'P5_7_P6_7',
]

In [21]:
# Evaluate the saved celltypist models on every test dataset and write one
# classification report per annotation level into '<reports_root>/<folder>'.
model_dir = '/mnt/c/Users/vadim/scRNA/scParadise/scAdam/PBMC/3p/celltypist/models_small'
reports_root = '/mnt/c/Users/vadim/scRNA/scParadise/scAdam/PBMC/3p/celltypist/reports_small'
for folder in lst_test:
    # A folder name such as 'P1_3_P3_3' encodes two sample ids ('P1_3' and 'P3_3').
    # Split on '_' instead of slicing fixed character positions, so ids of a
    # different width (e.g. 'P10_3') are still parsed correctly.
    id_parts = folder.split('_')
    sample_ids = ['_'.join(id_parts[:2]), '_'.join(id_parts[2:])]
    # Create adata_test - the cells of the two samples named by the folder
    adata_test = adata[adata.obs['orig.ident'].isin(sample_ids)].copy()
    # Make sure the report directory exists before anything is saved into it
    report_dir = os.path.join(reports_root, folder)
    os.makedirs(report_dir, exist_ok=True)
    # Prediction step using pretrained models
    for i in lst_annotations:
        predictions = celltypist.annotate(adata_test,
                                          model = os.path.join(model_dir, i + '.pkl'),
                                          majority_voting = True)
        # adata_test is replaced by the annotated object, so the next level runs on
        # data that already carries the neighborhood graph built for the first level
        # (the log reports "Detected a neighborhood graph" from the second level on).
        adata_test = predictions.to_adata()
        scparadise.scnoah.report_classif_full(adata_test,
                                      celltype = i,
                                      pred_celltype = 'majority_voting',
                                      report_name = 'report_celltypist_635_' + i + '.csv',
                                      save_path = report_dir,
                                      save_report = True)

🔬 Input data has 10823 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 10823 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 10823 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14926 cells and 635 genes


Successfully saved report



🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14926 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14926 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 11647 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 11647 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 11647 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13035 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13035 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13035 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!


Successfully saved report



🔬 Input data has 13123 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13123 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13123 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14516 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14516 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14516 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14983 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14983 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14983 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14865 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14865 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 14865 cells and 635 genes
🔗 Matching reference genes in the model
🧬 635 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!


Successfully saved report

