# CellTypist Annotations
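This notebook loads the pilot datasets and preprocesses them (normalisation to 10,000 counts per cell, then log1p) so that CellTypist can auto-annotate every cell against its reference models. The most common predicted cell type within each of the authors' clusters is then used to re-annotate that cluster.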

In [1]:
# Importing Required Libraries
import os

import celltypist
import numpy as np
import pandas as pd
import scanpy as sc
import session_info

  .. math:: Q = \\frac{1}{m} \\sum_{ij} \\left(A_{ij} - \\frac{k_i^\mathrm{out} k_j^\mathrm{in}}{m} \\right)\\delta(\\sigma_i, \\sigma_j),
  .. math:: Q = \\sum_{ij} \\left(A_{ij} - \\gamma \\frac{k_i^\mathrm{out} k_j^\mathrm{in}}{m} \\right)\\delta(\\sigma_i, \\sigma_j),
  implementation therefore does not guarantee subpartition :math:`\gamma`-density.
  .. math:: Q = \sum_k \\lambda_k Q_k.


In [2]:
## Importing and Reading in Pilot Datasets
# Root folder shared by every dataset. Change this single line to relocate the data.
BENCH_DIR = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM")

# File Paths for Datasets
# Granja et al
file_path_two = os.path.join(BENCH_DIR, "Granja_et_al", "scRNA-hematopoiesis-Granja-2019.h5ad")
# Jardine et al
file_path_three = os.path.join(BENCH_DIR, "Jardine_et_al", "fig1b_fbm_scaled_gex_updated_dr_20210104.h5ad")
# Heimlich et al
file_path_four = os.path.join(BENCH_DIR, "Heimlich_et_al", "5e3e4027-0675-4d5f-b223-bb03a17ade71.h5ad")
# Huuhtanen et al
file_path_five = os.path.join(BENCH_DIR, "Huuhtanen_et_al", "Fig1B_cml_public_seurat_RDS.h5")
# Roy et al
file_path_six = os.path.join(BENCH_DIR, "Roy_et_al", "GSE155259_Roy_et_al.h5ad")
# Petti et al (disabled together with the matching reads below)
#file_path_seven = os.path.join(BENCH_DIR, "Petti_et_al", "Petti_508084.h5ad")
#file_path_eight = os.path.join(BENCH_DIR, "Petti_et_al", "Petti_548327.h5ad")
#file_path_nine = os.path.join(BENCH_DIR, "Petti_et_al", "Petti_721214.h5ad")
#file_path_ten = os.path.join(BENCH_DIR, "Petti_et_al", "Petti_782328.h5ad")
#file_path_eleven = os.path.join(BENCH_DIR, "Petti_et_al", "Petti_809653.h5ad")
# Simone et al
file_path_twelve = os.path.join(BENCH_DIR, "Simone_et_al", "7d821d98-5b42-4480-8173-641c1b37b237.h5ad")

# Reading in Datasets as ScanPy Objects
# Only adata_10 is loaded here. The Granja (adata_0), Jardine (adata_1),
# Heimlich (adata_2) and Roy (adata_4) sections further down use objects whose
# reads are commented out, so enable the matching line before running them.
#adata_0 = sc.read_h5ad(file_path_two)
#adata_1 = sc.read_h5ad(file_path_three)
#adata_2 = sc.read_h5ad(file_path_four)
#adata_3 = sc.read_10x_h5(file_path_five)
#adata_4 = sc.read_h5ad(file_path_six)
#adata_5 = sc.read_h5ad(file_path_seven)
#adata_6 = sc.read_h5ad(file_path_eight)
#adata_7 = sc.read_h5ad(file_path_nine)
#adata_8 = sc.read_h5ad(file_path_ten)
#adata_9 = sc.read_h5ad(file_path_eleven)
adata_10 = sc.read_h5ad(file_path_twelve)


### Granja <i> et al. </i> (2019) <i> Nature Biotechnology </i>
Normalise (10 000 Counts per Cell) <br>
Log <br>
Annotate (Immune_All_Low and Immune_All_High) <br>
Label Individual Cells then Clusters <br>
Authors' Clusters are "BioClassification" <br>

In [None]:
# Predicts Cluster Type by Most Common Cell in Cluster
# NOTE: normalize_total and log1p are called without assignment, i.e. they
# modify adata_0 in place. Re-running this cell on the same object therefore
# transforms it a second time; reload adata_0 first.

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata_0, target_sum=1e4)

# Log1p transform
sc.pp.log1p(adata_0)

# Run CellTypist Annotation on Single Cells (Immune_All_Low.pkl or Immune_All_high.pkl)
results = celltypist.annotate(adata_0, model='Immune_All_Low.pkl')

# Add predicted labels to the original AnnData object
adata_0.obs['celltypist_labels'] = results.predicted_labels.predicted_labels

# Compute the most common celltypist label in each cluster.
# observed=True: if 'BioClassification' is categorical, categories that have
# no cells are skipped. Without it they form empty groups, and
# value_counts().idxmax() raises on an empty group.
cluster_map = (
    adata_0.obs
    .groupby('BioClassification', observed=True)['celltypist_labels']
    .agg(lambda labels: labels.value_counts().idxmax())
)

# Map cluster labels to each cell
adata_0.obs['cluster_cell_type'] = adata_0.obs['BioClassification'].map(cluster_map)

In [None]:
# Cell Level Cluster Annotations in Tabular Format
# Visualise. An expression in the middle of a cell is not rendered by Jupyter,
# so the preview has to be passed to display() explicitly.
display(adata_0.obs[['BioClassification', 'cluster_cell_type']].head(n=5))

# Cluster Level Cluster Annotations in Tabular Format
# Create a DataFrame of cluster IDs and their assigned cell type
cluster_assignments = (
    adata_0.obs[['BioClassification', 'cluster_cell_type']]
    .drop_duplicates()
    .sort_values('BioClassification')
    .reset_index(drop=True)
)

# Save as CSV
# The file used to be called "..._Roy_Immune_All_Low_new.csv" although it holds
# the Granja results; it is now named after the dataset it belongs to.
# "~" is expanded at run time instead of hard-coding one user's home folder.
granja_csv = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Granja_et_al/author_vs_celltypist_Granja_Immune_All_Low_new.csv"
)
cluster_assignments.to_csv(granja_csv, index=False)

# Visualise (last expression of the cell, so it is rendered)
cluster_assignments.head(n=5)

### Jardine <i> et al.</i> (2021) <i> Nature </i>
Normalise (10 000 Counts per Cell) <br>
Log <br>
Annotate (Immune_All_Low and Immune_All_High) <br>
Label Individual Cells then Clusters <br>
Authors' Clusters are "cell.labels" <br>

In [None]:
# Predicts Cluster Type by Most Common Cell in Cluster
# NOTE: normalize_total and log1p modify adata_1 in place, so re-running this
# cell on the same object transforms it a second time; reload adata_1 first.
# NOTE(review): the Jardine input file name contains "scaled_gex". If X holds
# scaled values rather than raw counts, normalising and log-transforming it
# is not appropriate - confirm what X contains before trusting the labels.

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata_1, target_sum=1e4)

# Log1p transform
sc.pp.log1p(adata_1)

# Run CellTypist Annotation on Single Cells (Immune_All_Low.pkl or Immune_All_high.pkl)
results = celltypist.annotate(adata_1, model='Immune_All_Low.pkl')

# Add predicted labels to the original AnnData object
adata_1.obs['celltypist_labels'] = results.predicted_labels.predicted_labels

# Compute the most common celltypist label in each cluster.
# observed=True: if 'cell.labels' is categorical, categories that have no
# cells are skipped. Without it they form empty groups, and
# value_counts().idxmax() raises on an empty group.
cluster_map = (
    adata_1.obs
    .groupby('cell.labels', observed=True)['celltypist_labels']
    .agg(lambda labels: labels.value_counts().idxmax())
)

# Map cluster labels to each cell
adata_1.obs['cluster_cell_type'] = adata_1.obs['cell.labels'].map(cluster_map)

In [None]:
# Cell Level Cluster Annotations in Tabular Format
# Visualise. An expression in the middle of a cell is not rendered by Jupyter,
# so the preview has to be passed to display() explicitly.
display(adata_1.obs[['cell.labels', 'cluster_cell_type']].head(n=5))

# Cluster Level Cluster Annotations in Tabular Format
# Create a DataFrame of cluster IDs and their assigned cell type
cluster_assignments = (
    adata_1.obs[['cell.labels', 'cluster_cell_type']]
    .drop_duplicates()
    .sort_values('cell.labels')
    .reset_index(drop=True)
)

# Save as CSV
# The file name must state the model used by the Jardine annotation cell. That
# cell runs 'Immune_All_Low.pkl', so the previous "Immune_All_High" suffix
# mislabelled the results. Change both together when switching models.
# "~" is expanded at run time instead of hard-coding one user's home folder.
jardine_csv = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Jardine_et_al/author_vs_celltypist_Jardine_Immune_All_Low.csv"
)
cluster_assignments.to_csv(jardine_csv, index=False)

# Visualise (last expression of the cell, so it is rendered)
cluster_assignments.head(n=5)

### Roy <i> et al. </i> (2021) <i> Cell Reports </i>
Normalise (10 000 Counts per Cell) <br>
Log <br>
Annotate (Immune_All_Low and Immune_All_High) <br>
Label Individual Cells then Clusters <br>
Authors' Clusters are "cell_type" <br>

In [None]:
# Predicts Cluster Type by Most Common Cell in Cluster
# NOTE: normalize_total and log1p modify adata_4 in place, so re-running this
# cell on the same object transforms it a second time; reload adata_4 first.

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata_4, target_sum=1e4)

# Log1p transform
sc.pp.log1p(adata_4)

# Run CellTypist Annotation on Single Cells (Immune_All_Low.pkl or Immune_All_high.pkl)
results = celltypist.annotate(adata_4, model='Immune_All_Low.pkl')

# Add predicted labels to the original AnnData object
adata_4.obs['celltypist_labels'] = results.predicted_labels.predicted_labels

# Compute the most common celltypist label in each cluster.
# observed=True: if 'cell_type' is categorical, categories that have no cells
# are skipped. Without it they form empty groups, and
# value_counts().idxmax() raises on an empty group.
cluster_map = (
    adata_4.obs
    .groupby('cell_type', observed=True)['celltypist_labels']
    .agg(lambda labels: labels.value_counts().idxmax())
)

# Map cluster labels to each cell
adata_4.obs['cluster_cell_type'] = adata_4.obs['cell_type'].map(cluster_map)

In [None]:
# Cell Level Cluster Annotations in Tabular Format
# Visualise. An expression in the middle of a cell is not rendered by Jupyter,
# so the preview has to be passed to display() explicitly.
display(adata_4.obs[['cell_type', 'cluster_cell_type']].head(n=5))

# Cluster Level Cluster Annotations in Tabular Format
# Create a DataFrame of cluster IDs and their assigned cell type
cluster_assignments = (
    adata_4.obs[['cell_type', 'cluster_cell_type']]
    .drop_duplicates()
    .sort_values('cell_type')
    .reset_index(drop=True)
)

# Save as CSV
# "~" is expanded at run time instead of hard-coding one user's home folder.
roy_csv = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Roy_et_al/author_vs_celltypist_Roy_Immune_All_Low.csv"
)
cluster_assignments.to_csv(roy_csv, index=False)

# Visualise (last expression of the cell, so it is rendered)
cluster_assignments.head(n=5)

### De Simone <i> et al. </i> (2025) <i> Nucleic Acids Research </i>
Normalise (10 000 Counts per Cell) <br>
Log <br>
Annotate (Immune_All_Low and Immune_All_High) <br>
Label Individual Cells then Clusters <br>
Authors' Clusters are "cell_type" <br>
<b> NOT YET FIXED: CellTypist needs gene symbols (gene_name) as variable names, not Ensembl IDs </b>

In [3]:
# Predicts Cluster Type by Most Common Cell in Cluster

# Pre-flight check on the expression matrix.
# This cell previously failed inside CellTypist with "Input X contains NaN".
# Fail early, with an actionable message, when X cannot be normalised and
# log-transformed safely: log1p of a value below -1 is NaN.
# Assumes X is a dense ndarray or a scipy sparse matrix; for the latter only
# the explicitly stored entries need checking (implicit zeros are valid).
# NOTE(review): whether negative or NaN input is the actual cause of the
# earlier failure has not been confirmed; this check only rules it in or out.
stored_values = adata_10.X if isinstance(adata_10.X, np.ndarray) else adata_10.X.data
if not np.isfinite(stored_values).all() or (stored_values < 0).any():
    raise ValueError(
        "adata_10.X contains NaN/inf or negative values, so it does not hold raw counts "
        "(it may be scaled or already transformed). Use the raw count matrix before "
        "normalising for CellTypist."
    )

# NOTE: normalize_total and log1p modify adata_10 in place, so re-running this
# cell on the same object transforms it a second time; reload adata_10 first.

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata_10, target_sum=1e4)

# Log1p transform
sc.pp.log1p(adata_10)

# Backup original Ensembl IDs. Only done once: on a re-run var_names already
# hold gene symbols and would otherwise overwrite the backup.
if 'ensembl_id' not in adata_10.var.columns:
    adata_10.var['ensembl_id'] = adata_10.var_names

# Replace var_names with Gene Symbols. Plain strings are passed so the column
# name is not carried over as the index name, and duplicated symbols are made
# unique.
adata_10.var_names = adata_10.var['gene_name'].astype(str).to_numpy()
adata_10.var_names_make_unique()

# Run CellTypist Annotation on Single Cells (Immune_All_Low.pkl or Immune_All_high.pkl)
results = celltypist.annotate(adata_10, model='Immune_All_Low.pkl')

# Add predicted labels to the original AnnData object
adata_10.obs['celltypist_labels'] = results.predicted_labels.predicted_labels

# Compute the most common celltypist label in each cluster.
# observed=True: if 'cell_type' is categorical, categories that have no cells
# are skipped. Without it they form empty groups, and
# value_counts().idxmax() raises on an empty group.
cluster_map = (
    adata_10.obs
    .groupby('cell_type', observed=True)['celltypist_labels']
    .agg(lambda labels: labels.value_counts().idxmax())
)

# Map cluster labels to each cell
adata_10.obs['cluster_cell_type'] = adata_10.obs['cell_type'].map(cluster_map)

🔬 Input data has 124132 cells and 11182 genes
🔗 Matching reference genes in the model
🧬 3934 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Cell Level Cluster Annotations in Tabular Format
# Visualise. An expression in the middle of a cell is not rendered by Jupyter,
# so the preview has to be passed to display() explicitly.
display(adata_10.obs[['cell_type', 'cluster_cell_type']].head(n=5))

# Cluster Level Cluster Annotations in Tabular Format
# Create a DataFrame of cluster IDs and their assigned cell type
cluster_assignments = (
    adata_10.obs[['cell_type', 'cluster_cell_type']]
    .drop_duplicates()
    .sort_values('cell_type')
    .reset_index(drop=True)
)

# Save as CSV
# "~" is expanded at run time instead of hard-coding one user's home folder.
simone_csv = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_celltypist_Simone_Immune_All_Low.csv"
)
cluster_assignments.to_csv(simone_csv, index=False)

# Visualise (last expression of the cell, so it is rendered)
cluster_assignments.head(n=5)

### Heimlich <i> et al. </i> (2024) <i> Blood Advances </i>
Normalise (10 000 Counts per Cell) <br>
Log <br>
Annotate (Immune_All_Low and Immune_All_High) <br>
Label Individual Cells then Clusters <br>
Authors' Clusters are "cell_type" <br>
<b> TODO: using "gene_name" as the gene-symbol column (to replace the Ensembl IDs) is only a guess; find the actual column name in adata.var </b>


In [None]:
# Predicts Cluster Type by Most Common Cell in Cluster

# The gene-symbol column name is a guess for this dataset, so verify it
# before anything is modified. Otherwise the lookup fails only after X has
# already been normalised and log-transformed in place.
if 'gene_name' not in adata_2.var.columns:
    raise KeyError(
        "adata_2.var has no 'gene_name' column; available columns: "
        f"{list(adata_2.var.columns)}"
    )

# NOTE: normalize_total and log1p modify adata_2 in place, so re-running this
# cell on the same object transforms it a second time; reload adata_2 first.

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata_2, target_sum=1e4)

# Log1p transform
sc.pp.log1p(adata_2)

# Backup original Ensembl IDs. Only done once: on a re-run var_names already
# hold gene symbols and would otherwise overwrite the backup.
if 'ensembl_id' not in adata_2.var.columns:
    adata_2.var['ensembl_id'] = adata_2.var_names

# Replace var_names with Gene Symbols. Plain strings are passed so the column
# name is not carried over as the index name, and duplicated symbols are made
# unique.
adata_2.var_names = adata_2.var['gene_name'].astype(str).to_numpy()
adata_2.var_names_make_unique()

# Run CellTypist Annotation on Single Cells (Immune_All_Low.pkl or Immune_All_high.pkl)
results = celltypist.annotate(adata_2, model='Immune_All_Low.pkl')

# Add predicted labels to the original AnnData object
adata_2.obs['celltypist_labels'] = results.predicted_labels.predicted_labels

# Compute the most common celltypist label in each cluster.
# observed=True: if 'cell_type' is categorical, categories that have no cells
# are skipped. Without it they form empty groups, and
# value_counts().idxmax() raises on an empty group.
cluster_map = (
    adata_2.obs
    .groupby('cell_type', observed=True)['celltypist_labels']
    .agg(lambda labels: labels.value_counts().idxmax())
)

# Map cluster labels to each cell
adata_2.obs['cluster_cell_type'] = adata_2.obs['cell_type'].map(cluster_map)

In [None]:
# Cell Level Cluster Annotations in Tabular Format
# Visualise. An expression in the middle of a cell is not rendered by Jupyter,
# so the preview has to be passed to display() explicitly.
display(adata_2.obs[['cell_type', 'cluster_cell_type']].head(n=5))

# Cluster Level Cluster Annotations in Tabular Format
# Create a DataFrame of cluster IDs and their assigned cell type
cluster_assignments = (
    adata_2.obs[['cell_type', 'cluster_cell_type']]
    .drop_duplicates()
    .sort_values('cell_type')
    .reset_index(drop=True)
)

# Save as CSV
# This cell used to write to the Simone_et_al folder under the Simone file
# name, overwriting the Simone results with the Heimlich ones. It now writes
# to the Heimlich folder under a Heimlich file name.
# "~" is expanded at run time instead of hard-coding one user's home folder.
heimlich_csv = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Heimlich_et_al/author_vs_celltypist_Heimlich_Immune_All_Low.csv"
)
cluster_assignments.to_csv(heimlich_csv, index=False)

# Visualise (last expression of the cell, so it is rendered)
cluster_assignments.head(n=5)