In [1]:
# Silence every Python warning for the rest of the session. The filter is installed
# before the library imports below, so warnings raised while importing them are hidden too.
import warnings 
warnings.simplefilter('ignore')

import celltypist
import scparadise
import scanpy as sc
import muon as mu
import anndata
import pandas as pd
import os

In [2]:
# Load the annotated MuData object. Note that the file read here is the *unintegrated* one
# (mdata_unintegrated_annotated.h5mu).
mdata = mu.read_h5mu('Heart_3p_CITE/mdata_unintegrated_annotated.h5mu')

In [3]:
# Remove the cells annotated (celltype_l2) with any of the labels below,
# then keep only the RNA modality of the MuData object (mdata)
excluded_celltypes = ['classical DC1',
                      'Neutrophil',
                      'plasmacytoid DC',
                      'ILC',
                      'proliferating T']
# One boolean mask selects the same cells as five successive "!=" filters
keep_cells = ~mdata.obs['celltype_l2'].isin(excluded_celltypes)
adata = mdata[keep_cells].mod['rna'].copy()
# adata is an independent copy, so the MuData object can be released
del mdata

In [4]:
# Rebuild the AnnData object so that X is a copy of layers['counts'].
# Only the slots passed below (obs, var, obsm, uns, varm, layers) are handed to the new object;
# nothing else stored on the previous object is passed along.
adata = anndata.AnnData(
    X = adata.layers['counts'].copy(),
    obs = adata.obs,
    var = adata.var,
    obsm = adata.obsm,
    uns = adata.uns,
    varm = adata.varm,
    layers = adata.layers,
)

In [5]:
# Restrict the AnnData object to the selected marker genes
# (read from the 'genes' column of genes_for_AI.csv)
genes = pd.read_csv('Heart_3p_CITE/genes_for_AI.csv')
marker_genes = genes['genes']
adata = adata[:, marker_genes].copy()

In [6]:
# Normalize expression to 10000 counts per cell and log1p-transform it
# (operating condition of the celltypist tool).
# This runs after the gene subsetting above, so the per-cell total refers to the selected genes only.
# Both calls modify adata in place: re-running this cell transforms the data a second time.
counts_per_cell = 10000
sc.pp.normalize_total(adata, target_sum = counts_per_cell)
sc.pp.log1p(adata)

In [8]:
# Build the training dataset: cells whose 'orig.ident' is one of the samples listed below
train_samples = [12, 13, 17, 27, 28, 29, 30, 32, 39, 42]
is_train_cell = adata.obs['orig.ident'].isin(train_samples)
adata_train = adata[is_train_cell].copy()

In [10]:
# Train and save one celltypist model per annotation level (celltype_l1, celltype_l2)
lst_annotations = ['celltype_l1', 'celltype_l2']
models_dir = 'Heart_3p_CITE/models_celltypist'
# Create the output folder up front: otherwise a missing directory is only discovered
# when the model is written, i.e. after the training has already been paid for
os.makedirs(models_dir, exist_ok = True)
for i in lst_annotations:
    # Training step (feature_selection = False: train on the genes of adata_train as they are)
    model = celltypist.train(adata_train, labels = i, feature_selection = False, n_jobs = 12)
    # Save the model as <annotation level>.pkl
    model.write(os.path.join(models_dir, i + '.pkl'))

🍳 Preparing data before training
🔬 Input data has 64258 cells and 600 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!
🍳 Preparing data before training
🔬 Input data has 64258 cells and 600 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!


In [11]:
# Create lists of test datasets. Each entry is also used as the name of a report folder.
# lst_test_1: two 'orig.ident' samples evaluated together, written as '<sample>_<sample>'
# lst_test_2: a single 'orig.ident' sample per entry
lst_test_1 = ['1_6', '2_7', '4_5', '8_9']
lst_test_2 = ['41', '34', '15', '33']

In [12]:
# Evaluate the saved models on every test group and write one classification report
# per annotation level into a folder named after the group.
reports_root = 'Heart_3p_CITE/reports_model_celltypist'
models_root = 'Heart_3p_CITE/models_celltypist'
# Paired groups ('1_6') and single-sample groups ('41') go through the same loop
for folder in lst_test_1 + lst_test_2:
    report_dir = os.path.join(reports_root, folder)
    # exist_ok = True lets the cell be re-run without failing on already existing folders
    os.makedirs(report_dir, exist_ok = True)
    # '1_6' -> [1, 6], '41' -> [41]; splitting on '_' also handles multi-digit sample ids
    sample_ids = [int(sample) for sample in folder.split('_')]
    adata_test = adata[adata.obs['orig.ident'].isin(sample_ids)].copy()
    # Prediction step using pretrained models
    for i in lst_annotations:
        predictions = celltypist.annotate(adata_test,
                                          model = os.path.join(models_root, i + '.pkl'),
                                          majority_voting = True)
        # adata_test is replaced by the annotated object, so the next annotation level
        # runs on the object returned here
        adata_test = predictions.to_adata()
        scparadise.scnoah.report_classif_full(adata_test,
                                              celltype = i,
                                              pred_celltype = 'majority_voting',
                                              report_name = 'report_celltypist_' + i + '.csv',
                                              save_path = report_dir,
                                              save_report = True)

🔬 Input data has 15191 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 15191 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13765 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 13765 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 9377 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 9377 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 12603 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 12603 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6115 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6115 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 10142 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering


Successfully saved report



⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 10142 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6410 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6410 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6470 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 6470 cells and 600 genes
🔗 Matching reference genes in the model
🧬 600 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10


Successfully saved report



🗳️ Majority voting the predictions
✅ Majority voting done!


Successfully saved report



In [13]:
# Record the package versions of this session for reproducibility
import session_info
session_info.show()