# Clustering tutorial 

In [None]:
#Import relevant python libraries
import scanpy as sc
import pandas as pd

In [None]:
# Create output dir for this session
import os

prefix_output = "../Data/results/04"
# os.makedirs (os.mkdirs does not exist) also creates missing parent
# directories; exist_ok=True makes re-running this cell harmless.
os.makedirs(prefix_output, exist_ok=True)

In [None]:
# Keep DataFrame previews compact: at most 6 rows and 10 columns are shown.
pd.options.display.max_rows = 6
pd.options.display.max_columns = 10

In [None]:
import matplotlib.pyplot as plt

# Set the default figure resolution to 80 dots per inch.
plt.rcParams.update({'figure.dpi': 80})

In [None]:
# Import data

In [None]:
# Load the input dataset from its .h5ad file into `ad`.
ad = sc.read_h5ad("../Data/Caron_batch_corrected.500.h5ad")

In [None]:
# explore the object

In [None]:
# Display a summary of the loaded object.
ad

In [None]:
# List the observation-level (obs) annotation columns.
ad.obs.columns

In [None]:
# Preview the first rows of the variable-level (var) annotation table.
ad.var.head()

In [None]:
# Plot the 'X_pca' and 'X_corrected' embeddings, coloured by sample group.
# 'X_corrected' is presumably the batch-corrected PCA (confirm against the
# upstream notebook); its panel gets a distinct title so the two plots are
# not both labelled 'PCA'.
sc.pl.embedding(ad, basis='X_pca', color='SampleGroup', title='PCA')
sc.pl.embedding(ad, basis='X_corrected', color='SampleGroup', title='PCA_corrected')

In [None]:
# Plot the 'X_tsne' and 'X_tsne_corrected' embeddings, coloured by sample
# group. The corrected panel gets its own title so the two plots can be told
# apart (matching the 'UMAP' / 'UMAP_corrected' naming used for the UMAP plots).
sc.pl.embedding(ad, basis='X_tsne', color='SampleGroup', title='T-sne')
sc.pl.embedding(ad, basis='X_tsne_corrected', color='SampleGroup', title='T-sne_corrected')

In [None]:
# Plot the 'X_umap' and 'X_umap_corrected' embeddings, one figure each,
# coloured by sample group.
for umap_basis, panel_title in (
    ('X_umap', 'UMAP'),
    ('X_umap_corrected', 'UMAP_corrected'),
):
    sc.pl.embedding(ad, basis=umap_basis, color='SampleGroup', title=panel_title)

In [None]:
# Compute the neighbourhood graph on the 'X_corrected' representation,
# using 10 neighbours per observation and up to 40 components.
sc.pp.neighbors(ad, use_rep="X_corrected", n_neighbors=10, n_pcs=40)

In [None]:
# Inspect the object again after the neighbours step.
ad

In [None]:
# Run Leiden clustering at resolution 1; the labels are stored under the
# key 'leiden_res1'.
# NOTE(review): the loop further down also clusters at resolution 1.0 under
# the key 'leiden_res1.0', so this column looks redundant — confirm.
sc.tl.leiden(ad, resolution=1, key_added="leiden_res1")

In [None]:
# Inspect the object again after clustering.
ad

In [None]:
# Clustering resolutions to compare.
resolutions = [0.3, 0.5, 1.0, 1.5]

# Run Leiden once per resolution; each result is stored under its own key,
# named "leiden_res<resolution>".
for res in resolutions:
    sc.tl.leiden(ad, resolution=res, key_added=f"leiden_res{res}")

In [None]:
# Inspect the object again; the new clustering columns should be listed.
ad

In [None]:
# Compute a UMAP embedding for `ad`.
# NOTE(review): per the scanpy docs the result is written to
# ad.obsm['X_umap'], which would replace the 'X_umap' embedding plotted
# earlier in this notebook — confirm this is intended.
sc.tl.umap(ad)

In [None]:
# Visualize the clustering using UMAP, one panel per Leiden resolution

leiden_keys = ['leiden_res0.3', 'leiden_res0.5', 'leiden_res1.0', 'leiden_res1.5']
sc.pl.umap(ad, color=leiden_keys, wspace=0.2, frameon=False)

In [None]:
# UMAP coloured by the resolution-0.3 clusters, the sample group and the sample name.
sc.pl.umap(ad, color=['leiden_res0.3','SampleGroup',"SampleName"],wspace=0.2,frameon=False)

In [None]:
from sklearn.metrics import silhouette_score

# For every Leiden resolution computed above, print the silhouette score of
# its cluster labels measured on the 'X_corrected' representation.
for resolution in (0.3, 0.5, 1.0, 1.5):
    cluster_labels = ad.obs[f"leiden_res{resolution}"]
    sil = silhouette_score(ad.obsm['X_corrected'], cluster_labels)
    print(f"Silhouette Score for resolution {resolution}: {sil:.4f}")

In [None]:
# Show the resolution-0.3 clusters and the sample names on both the
# 'X_umap' and the 'X_umap_corrected' embeddings.
panel_colors = ['leiden_res0.3', 'SampleName']
for umap_basis, panel_title in (
    ('X_umap', 'UMAP'),
    ('X_umap_corrected', 'UMAP_corrected'),
):
    sc.pl.embedding(ad, basis=umap_basis, color=list(panel_colors), title=panel_title)

In [None]:
# Report how many distinct clusters each Leiden resolution produced.
for resolution in (0.3, 0.5, 1.0, 1.5):
    n_found = ad.obs[f"leiden_res{resolution}"].nunique()
    print(f"Number of clusters for resolution {resolution}: {n_found}")

In [None]:
# UMAP coloured by the resolution-0.3 clusters, the CD79A and LYZ genes,
# and the sample name.
sc.pl.umap(ad, color=['leiden_res0.3','CD79A',"LYZ","SampleName"],wspace=0.2,frameon=False)

In [None]:
# Normalise the total counts per observation, then apply log1p to `ad`.
# target_sum=None leaves the target total to scanpy's default.
# NOTE(review): the CD79A/LYZ plot in the previous cell was drawn before this
# step, and this assumes ad.X is not already normalised — confirm.
sc.pp.normalize_total(ad, target_sum=None)
sc.pp.log1p(ad)

In [None]:
# Same four panels as above, drawn on the 'X_umap_corrected' embedding.
sc.pl.embedding(ad, basis='X_umap_corrected', color=['leiden_res0.3','CD79A',"LYZ","SampleName"], title='UMAP_corrected')

In [None]:
# Violin plot of LYZ per resolution-0.3 cluster (use_raw=False, so not read from .raw).
sc.pl.violin(ad, 'LYZ', groupby='leiden_res0.3', color='leiden_res0.3', use_raw=False)

In [None]:
# Violin plot of CD79A per resolution-0.3 cluster (use_raw=False, so not read from .raw).
sc.pl.violin(ad, 'CD79A', groupby='leiden_res0.3', color='leiden_res0.3', use_raw=False)

# Compute marker genes and annotate clusters

In [None]:
# Load the clustered dataset from its .h5ad file into `ad1`.
ad1 = sc.read_h5ad("../Data/Caron_clustered.500.h5ad")

In [None]:
# Show the data matrix in dense form.
# Assumes ad1.X is a sparse matrix (a dense array has no .todense()) — TODO confirm.
ad1.X.todense()

In [None]:
# Inspect the .raw attribute of `ad1`.
ad1.raw

In [None]:
# Show the 'label' column, which is used as the cluster grouping below.
ad1.obs["label"]

In [None]:
# Normalise the total counts per observation, then apply log1p to `ad1`.
# NOTE(review): assumes ad1.X is not already normalised — confirm.
sc.pp.normalize_total(ad1, target_sum=None)
sc.pp.log1p(ad1)

In [None]:
# Corrected UMAP coloured by cluster label (drawn on the data) and by the
# CD79A and LYZ genes.
sc.pl.embedding(ad1, basis='X_umap_corrected', color=['label','CD79A',"LYZ"],legend_loc="on data")

In [None]:
# Visualise CST3, a monocyte-specific marker, on the corrected UMAP.
sc.pl.embedding(ad1, basis='X_umap_corrected', color=['CST3'])

In [None]:
# Rank genes for each 'label' group with the Wilcoxon method; the results are
# stored under the key 'rank_genes_groups_wilcoxon', which the cells below read.
sc.tl.rank_genes_groups(ad1, groupby="label", method="wilcoxon",key_added="rank_genes_groups_wilcoxon",)

In [None]:
# Visualize the top 5 ranked genes of each 'label' group on a dotplot,
# reading the results stored under 'rank_genes_groups_wilcoxon'.

sc.pl.rank_genes_groups_dotplot(
    ad1, groupby="label", standard_scale="var", n_genes=5,key="rank_genes_groups_wilcoxon"
)

In [None]:
# First 5 rows of the ranking table for group "11".
sc.get.rank_genes_groups_df(ad1, group="11",key="rank_genes_groups_wilcoxon").head(5)

In [None]:
# Corrected UMAP coloured by cluster label (drawn on the data) and by LYZ.
sc.pl.embedding(ad1, basis='X_umap_corrected', color=['label',"LYZ"],legend_loc="on data")

In [None]:
# Violin plot of LYZ per 'label' cluster (use_raw=False, so not read from .raw).
sc.pl.violin(ad1, 'LYZ', groupby='label', color='label', use_raw=False)

In [None]:
# Exercise 1 ----

# CD3D suggests clusters 6 and 7 are T cells
sc.pl.embedding(ad1, basis='X_umap_corrected', color=['label',"CD3D"],legend_loc="on data")
sc.pl.violin(ad1, 'CD3D', groupby='label', color='label', use_raw=False)

# Confirm this by identifying other genes that differentiate
# these two clusters from the rest of the cells.

# 1. Extract the results for cluster 6 and convert them to a pandas DataFrame

# 2. Visualise the expression of genes that seem interesting from your filters.




In [None]:
# First 5 rows of the ranking table for group "6".
sc.get.rank_genes_groups_df(ad1, group="6",key="rank_genes_groups_wilcoxon").head(5)

In [None]:
# First 5 rows of the ranking table for group "7".
sc.get.rank_genes_groups_df(ad1, group="7",key="rank_genes_groups_wilcoxon").head(5)

In [None]:
# Take the first 10 gene names from the ranking table of group "6" and draw
# each of them, followed by the cluster labels, on the corrected UMAP.
cluster6_ranking = sc.get.rank_genes_groups_df(
    ad1, group="6", key="rank_genes_groups_wilcoxon"
)
dc_cluster_genes = cluster6_ranking.head(10)["names"]
sc.pl.embedding(
    ad1,
    basis='X_umap_corrected',
    color=list(dc_cluster_genes) + ["label"],
    legend_loc="on data",
    frameon=False,
    ncols=3,
)

In [None]:
# Take the first 5 gene names from the ranking table of group "7" and draw
# each of them, followed by the cluster labels, on the corrected UMAP.
cluster7_ranking = sc.get.rank_genes_groups_df(
    ad1, group="7", key="rank_genes_groups_wilcoxon"
)
dc_cluster_genes = cluster7_ranking.head(5)["names"]
sc.pl.embedding(
    ad1,
    basis='X_umap_corrected',
    color=list(dc_cluster_genes) + ["label"],
    legend_loc="on data",
    frameon=False,
    ncols=3,
)

In [None]:
# Known marker genes, each mapped to the cell type it identifies.
marker_cell_types = {
    "HBA1": "erythrocytes",
    "CST3": "monocytes",
    "CD3E": "T cells",
    "NKG7": "NK T cells",
    "CD79A": "B cells",
    "MS4A1": "CD20 B cells",
}

# Keep only the markers that are present in ad1.var_names, in the order above.
known_genes = [marker for marker in marker_cell_types if marker in ad1.var_names]

# One violin plot per remaining marker, grouped by cluster label.
sc.pl.violin(ad1, known_genes, groupby='label')

In [None]:
# Cell type assigned to each cluster, listed in cluster order (1 to 12).
cluster_cell_types = [
    "B",
    "B",
    "B",
    "B",
    "CD20+ B",
    "T",
    "NK T",
    "Erythrocytes",
    "Erythrocytes",
    "Erythrocytes",
    "Monocytes",
    "B",
]

# Map each cluster id (as a string) to "<cell type> (c<id>)".
cell_annotation = {
    str(cluster_id): f"{cell_type} (c{cluster_id})"
    for cluster_id, cell_type in enumerate(cluster_cell_types, start=1)
}

# Translate the 'label' column through the mapping and store the result in a
# new 'cellType' column; 'label' itself is left unchanged.
annotated_labels = ad1.obs['label'].map(cell_annotation)
ad1.obs['cellType'] = annotated_labels

In [None]:
# Corrected UMAP coloured by the annotated cell type, the cluster label and
# the sample name, with legends drawn on the data.
sc.pl.embedding(ad1, basis='X_umap_corrected', color=['cellType',"label","SampleName"],legend_loc="on data")