In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
from muon import prot as pt

from matplotlib import colors
%matplotlib inline

import muon as mu
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score

In [None]:
mdata = mu.read("./citeseq_mdata_allsamples_filtered_crude_clustering.h5mu")

In [None]:
mdata["rna"].uns['log1p']["base"] = None

In [None]:
# Investigate each subcluster to identify and remove doublets

## subcluster Myelo

In [None]:
# Select the cells annotated as "Myelo". Slicing returns a view of `mdata`; the later cells
# write obs columns, neighbor graphs and a UMAP into the subset, so take an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_wnn'].isin(["Myelo"]), :].copy()

In [None]:
# remove unused categories
mdata_subset.obs['celltype_wnn'] = mdata_subset.obs['celltype_wnn'].cat.remove_unused_categories()

In [None]:
# recalculate nearest neighbors
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")


mu.pp.neighbors(mdata_subset, key_added='wnn')
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.5'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_wnn',"leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=2, size=10)

In [None]:
marker_genes = ["CD14_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC","CD45_TotalSeqC","CD11c_TotalSeqC",
                "CD3_TotalSeqC","CD4_TotalSeqC","CD8a_TotalSeqC"]
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=True, use_raw=False,
             standard_scale="var");

In [None]:
marker_genes = ["JCHAIN","THY1","DCN","NCAM1","LTB","APOD","CD1C","CD14","XCR1","CLEC9A","MKI67","NCAM1","CD3E",
               "SELL","S100A8","S100A9","CCR7","CXCL9","HLA-G","MRC1"]
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False,
             standard_scale="var");

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD14_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC",
                                                                   "CD45_TotalSeqC","CD11c_TotalSeqC","IL1B",
                                                                   "CD3_TotalSeqC","CD4_TotalSeqC","CD8a_TotalSeqC",
                                                                   "NCAM1","THY1","DCN","LTB","JCHAIN"],
                ncols=2, color_map="YlOrRd")

In [None]:
new_cluster_names = {
    "0": "MAC", "1": "MAC","2": "DC2","3":"MAC","4":"DC1","5":"MAC_TC_doublet","6":"MAC","7":"MAC_FIB_doublet",
    "8":"MAC_NK_doublet","9":"MAC_FIB_doublet","10":"DC2","11":"MAC_TROPHO_doublet","12":"PLASMA","13":"MAC_EPI_doublet"
}

mdata_subset.obs['celltype_subset'] = mdata_subset.obs.leiden_wnn_subcluster.astype("str").values
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.replace(new_cluster_names)
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.astype("category")

In [None]:
sc.pl.umap(mdata_subset, color=['tissue',"celltype_subset"], ncols=1, size=10)

In [None]:
# create new clustering variable
mdata.obs["celltype_hires"]=mdata.obs["celltype_wnn"]

In [None]:
# add new categories
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(
    ["DC1", "DC2", "MAC", "MAC_FIB_doublet", "MAC_NK_doublet",
     "MAC_TC_doublet", "MAC_EPI_doublet", "MAC_TROPHO_doublet", "PLASMA"]
)

# assign new labels from the subset
mdata.obs.loc[mdata_subset.obs_names, "celltype_hires"] = (
    mdata_subset.obs["celltype_subset"].values
)

# cast to categorical
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].astype("category")

In [None]:
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster DC2

In [None]:
# Select the cells labelled "DC2". Slicing returns a view of `mdata`; the later cells write
# obs columns, neighbor graphs and a UMAP into the subset, so take an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["DC2"]), :].copy()

In [None]:
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")

mu.pp.neighbors(mdata_subset, key_added='wnn')

mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.3'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1)

In [None]:
marker_genes = ["CD4_TotalSeqC","CD8a_TotalSeqC","CD3_TotalSeqC","CD194_TotalSeqC","CD20_TotalSeqC",
                "CD45RA_TotalSeqC","CD45RO_TotalSeqC","CD16_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC","CD14_TotalSeqC",
               "CD27_TotalSeqC","TIGIT_TotalSeqC","CD25_TotalSeqC","CD103_TotalSeqC","CD62L_TotalSeqC",
               "CD28_TotalSeqC","CD127_TotalSeqC","CD366_TotalSeqC","CD11b_TotalSeqC","CD314_TotalSeqC","CD1c_TotalSeqC",
               "CD40_TotalSeqC","CD14_TotalSeqC"]
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False
              ,standard_scale="var"
             );

In [None]:
marker_genes = ["MKI67","CD14","PTPRC","CD74","THY1","HLA-G","NOTUM","XCR1","SELL","CCR7","NCAM1","S100A8",
               "CD163","CLEC10A","CLEC4C","LILRB4","BTLA","CLEC9A","CXCL9"]
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var");

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD1c_TotalSeqC","CD14_TotalSeqC","CD45_TotalSeqC",
                                                                   "CD68","CD1C","MKI67","ITGAM","CD207","CD103_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
# Sub-cluster Leiden cluster 2 of the DC2 subset

In [None]:
# Select Leiden sub-cluster "2" of the current subset. Slicing returns a view; the later cells
# write obs columns, neighbor graphs and a UMAP into it, so take an explicit copy.
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'].isin(["2"]), :].copy()

In [None]:
# remove unused categories
mdata_subset2.obs['leiden_wnn_subcluster'] = mdata_subset2.obs['leiden_wnn_subcluster'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset2, color=['leiden_wnn_subcluster'], ncols=1, size=10)

In [None]:
# recalculate nn
sc.pp.neighbors(mdata_subset2['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset2['prot'], use_rep="X_harmony")


mu.pp.neighbors(mdata_subset2, key_added='wnn')

mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset2.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset2.obs['leiden_wnn_subcluster2'] = mdata_subset2.obs['subset2_leiden_r0.3'].copy()
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"]=mdata_subset2.obs["leiden_wnn_subcluster2"]


In [None]:
sc.pl.umap(mdata_subset2, color=["leiden_wnn_subcluster2","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1, size=10)

In [None]:
mu.pl.embedding(mdata_subset2, basis="X_umap", frameon=False, color=["leiden_wnn_subcluster2","CD14_TotalSeqC","CD1c_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
new_cluster_names = {
    "0": "MAC", "1": "DC2_prol"}

mdata_subset2.obs['celltype_subset2'] = mdata_subset2.obs.leiden_wnn_subcluster2.astype("str").values
mdata_subset2.obs.celltype_subset2 = mdata_subset2.obs.celltype_subset2.replace(new_cluster_names)
mdata_subset2.obs.celltype_subset2 = mdata_subset2.obs.celltype_subset2.astype("category")

In [None]:
# Start from the Leiden sub-cluster ids and register the labels that will be written into the column.
mdata_subset.obs["celltype_subset"] = (
    mdata_subset.obs["leiden_wnn_subcluster"].cat.add_categories(["MAC", "DC2", "DC2_prol"])
)

# Copy the second-level labels onto the matching cells in one vectorised assignment
# (instead of one scalar .loc write per cell). Cells missing from mdata_subset are skipped.
# NOTE(review): assumes obs names are unique — confirm.
cell_ids = mdata_subset2.obs_names
labels = mdata_subset2.obs["celltype_subset2"].astype(str).to_numpy()
in_parent = cell_ids.isin(mdata_subset.obs.index)
mdata_subset.obs.loc[cell_ids[in_parent], "celltype_subset"] = labels[in_parent]

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset"],
                ncols=2, color_map="YlOrRd")

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset", "leiden_wnn_subcluster"],
                ncols=2, color_map="YlOrRd")

In [None]:
new_cluster_names = {
    "0": "MAC", "1": "DC2", "3": "DC2"
}


mdata_subset.obs['celltype_subset'] = mdata_subset.obs.celltype_subset.astype("str").values
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.replace(new_cluster_names)
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.astype("category")

In [None]:
sc.pl.umap(mdata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("DC2_prol")
for cell_id, celltype_subset in zip(mdata_subset.obs_names, mdata_subset.obs["celltype_subset"]):
    if cell_id in mdata.obs.index:
        mdata.obs.loc[cell_id, "celltype_hires"] = celltype_subset

In [None]:
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster MAC

In [None]:
# Select the cells labelled "MAC". Slicing returns a view of `mdata`; the later cells write
# obs columns, neighbor graphs and a UMAP into the subset, so take an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["MAC"]), :].copy()

In [None]:
# remove unused categories 
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")


mu.pp.neighbors(mdata_subset, key_added='wnn')

mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.7'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1, size=20)

In [None]:
marker_genes = ["CD4_TotalSeqC","CD8a_TotalSeqC","CD3_TotalSeqC","CD194_TotalSeqC","CD20_TotalSeqC",
                "CD45RA_TotalSeqC","CD45RO_TotalSeqC","CD16_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC","CD14_TotalSeqC",
               "CD27_TotalSeqC","TIGIT_TotalSeqC","CD25_TotalSeqC","CD103_TotalSeqC","CD62L_TotalSeqC",
               "CD28_TotalSeqC","CD127_TotalSeqC","CD366_TotalSeqC","CD11b_TotalSeqC","CD314_TotalSeqC","CD1c_TotalSeqC",
               "CD40_TotalSeqC","CD14_TotalSeqC"]
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False
              ,standard_scale="var"
             );

In [None]:
marker_genes = ["MKI67","CD14","PTPRC","CD74","THY1","HLA-G","NOTUM","XCR1","SELL","CCR7","NCAM1","S100A8",
               "CD163","CLEC10A","CLEC4C","LILRB4","BTLA","CLEC9A","CXCL9","CD3E","NCAM1","LYZ",
               "FN1","FTL","ITGAX","SPP1","IDO1","THY1","HLA-G","SLAMF7","CXCL10","IL1B","TNF","MARCO","SPP1"]
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var"
             );

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD1c_TotalSeqC","CD14_TotalSeqC","CD45_TotalSeqC",
                                                                   "CD68","CD1C","MKI67","ITGAM","CD207","CD103_TotalSeqC",
                                                                   "CCR7","SELL","CD11c_TotalSeqC","CXCL9","WARS","SLAMF7",
                                                                   "CXCL10","SPP1","MARCO"],
                ncols=2, color_map="YlOrRd")

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["leiden_wnn_subcluster","FOLR2","IL1B","PTGS2","FN1",
                                                                    "VCAN","S100A8","LYZ","CD28_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
marker_genes = {"SPP1":["SPP1","MARCO","FBP1","APOC1","LIPA"],
               "FOLR2":["FOLR2","LYVE1","SELENOP","SLC40A1","MRC1"],
               "IL1B":["IL1B","IL1A","NLRP3","PTGS2","CCL3"],
               "mono":["LYZ","VCAN","S100A8","S100A9"],
               "MT":["MT1H","MT1G","MT1X","MT1E","MT2A"],
               "HSP":["HSPA6","SERPINH1","BAG3","HSPB1","HSPD1"],
               "prol":["MKI67","TOP2A","PCLAF","UBE2C","TK1"]}
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var"
             );

In [None]:
new_cluster_names = {
    "0":"decBAM2", "1": "monoMAC", "2": "decBAM1","3": "decPAM1", "4": "DC2", "5": "decPAM2", "6": "MAC_prol", "7": "decBAM1",
    "8": "MAC_Tcell_doublet"
}


mdata_subset.obs['celltype_subset'] = mdata_subset.obs.leiden_wnn_subcluster.astype("str").values
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.replace(new_cluster_names)
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.astype("category")

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset"],
                ncols=2, color_map="YlOrRd")

In [None]:
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("MAC_Tcell_doublet")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("decPAM2")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("decBAM2")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("decBAM1")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("decPAM1")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("monoMAC")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("MAC_prol")
for cell_id, celltype_subset in zip(mdata_subset.obs_names, mdata_subset.obs["celltype_subset"]):
    if cell_id in mdata.obs.index:
        mdata.obs.loc[cell_id, "celltype_hires"] = celltype_subset

In [None]:
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster DC1

In [None]:
# Select the cells labelled "DC1". Slicing returns a view of `mdata`; the later cells write
# obs columns, neighbor graphs and a UMAP into the subset, so take an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["DC1"]), :].copy()

In [None]:
# remove unused categories
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")


mu.pp.neighbors(mdata_subset, key_added='wnn')

mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.5'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1)

In [None]:
marker_genes = ["CD4_TotalSeqC","CD8a_TotalSeqC","CD3_TotalSeqC","CD194_TotalSeqC","CD20_TotalSeqC",
                "CD45RA_TotalSeqC","CD45RO_TotalSeqC","CD16_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC","CD14_TotalSeqC",
               "CD27_TotalSeqC","TIGIT_TotalSeqC","CD25_TotalSeqC","CD103_TotalSeqC","CD62L_TotalSeqC",
               "CD28_TotalSeqC","CD127_TotalSeqC","CD366_TotalSeqC","CD11b_TotalSeqC","CD314_TotalSeqC","CD1c_TotalSeqC",
               "CD40_TotalSeqC","CD14_TotalSeqC"]
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False
              ,standard_scale="var"
             );

In [None]:
marker_genes = ["MKI67","CD14","PTPRC","CD74","THY1","HLA-G","NOTUM","XCR1","SELL","CCR7","NCAM1","S100A8",
               "CD163","CLEC10A","CLEC4C","LILRB4","BTLA","CLEC9A","CXCL9","CD3E","NCAM1","LYZ",
               "FN1","FTL","ITGAX","SPP1","IDO1","THY1","HLA-G","SLAMF7","CXCL10"]
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var"
             );

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD1c_TotalSeqC","CD14_TotalSeqC","CD45_TotalSeqC",
                                                                   "CD68","CD1C","MKI67","ITGAM","CD207","CD103_TotalSeqC",
                                                                   "CCR7","SELL","CD11c_TotalSeqC","CXCL9","WARS","SLAMF7",
                                                                   "CXCL10","CD3_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
new_cluster_names = {
    "0": "DC1", "1": "DC1", "2": "DC1","3": "DC1_prol", "4": "DC_Tcell_doublet", "5": "DC_MAC_doublet"
}


mdata_subset.obs['celltype_subset'] = mdata_subset.obs.leiden_wnn_subcluster.astype("str").values
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.replace(new_cluster_names)
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.astype("category")

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset"],
                ncols=2, color_map="YlOrRd")

In [None]:
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("DC_MAC_doublet")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("DC1_prol")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("DC_Tcell_doublet")

for cell_id, celltype_subset in zip(mdata_subset.obs_names, mdata_subset.obs["celltype_subset"]):
    if cell_id in mdata.obs.index:
        mdata.obs.loc[cell_id, "celltype_hires"] = celltype_subset

In [None]:
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster Granulo

In [None]:
# Select the cells labelled "Granulo". Slicing returns a view of `mdata`; the later cells write
# obs columns, neighbor graphs and a UMAP into the subset, so take an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["Granulo"]), :].copy()

In [None]:
# remove unused categories
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")


mu.pp.neighbors(mdata_subset, key_added='wnn')

mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature space for the silhouette score: X_harmony of RNA and protein stacked column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Compare three Leiden resolutions on the 'wnn' graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.5'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1)

In [None]:
marker_genes = ["CD4_TotalSeqC","CD8a_TotalSeqC","CD3_TotalSeqC","CD194_TotalSeqC","CD20_TotalSeqC",
                "CD45RA_TotalSeqC","CD45RO_TotalSeqC","CD16_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC","CD14_TotalSeqC",
               "CD27_TotalSeqC","TIGIT_TotalSeqC","CD25_TotalSeqC","CD103_TotalSeqC","CD62L_TotalSeqC",
               "CD28_TotalSeqC","CD127_TotalSeqC","CD366_TotalSeqC","CD11b_TotalSeqC","CD314_TotalSeqC","CD1c_TotalSeqC",
               "CD40_TotalSeqC","CD14_TotalSeqC"]
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False
              ,standard_scale="var"
             );

In [None]:
marker_genes = ["MKI67","THY1","CD14","PTPRC","IDO1","FCGR3A","MPO","S100A8","TPSAB1"]
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var"
             );

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD1c_TotalSeqC","CD14_TotalSeqC","CD45_TotalSeqC",
                                                                   "CD68","CD1C","MKI67","ITGAM","CD207","CD103_TotalSeqC",
                                                                   "CCR7","SELL","CD11c_TotalSeqC","CXCL9","WARS","SLAMF7",
                                                                   "CXCL10"],
                ncols=2, color_map="YlOrRd")

In [None]:
new_cluster_names = {
    "0": "MAST", "1": "MAST", "2": "MAST","3": "MAST_myelo_doublet", "4": "MAST_FIB_doublet"
}


mdata_subset.obs['celltype_subset'] = mdata_subset.obs.leiden_wnn_subcluster.astype("str").values
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.replace(new_cluster_names)
mdata_subset.obs.celltype_subset = mdata_subset.obs.celltype_subset.astype("category")

In [None]:
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset"],
                ncols=2, color_map="YlOrRd")

In [None]:
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("MAST")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("MAST_FIB_doublet")
mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("MAST_myelo_doublet")
for cell_id, celltype_subset in zip(mdata_subset.obs_names, mdata_subset.obs["celltype_subset"]):
    if cell_id in mdata.obs.index:
        mdata.obs.loc[cell_id, "celltype_hires"] = celltype_subset

In [None]:
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster fibroblasts

In [None]:
# Select the cells annotated as "FIB" or "FIB_Doublet" in celltype_wnn. Slicing returns a view of
# `mdata`; the later cells write obs columns, a neighbor graph and a UMAP into the subset, so take
# an explicit copy.
mdata_subset = mdata[mdata.obs['celltype_wnn'].isin(["FIB","FIB_Doublet"]), :].copy()

In [None]:
# remove unused categories
mdata_subset.obs['celltype_wnn'] = mdata_subset.obs['celltype_wnn'].cat.remove_unused_categories()

In [None]:
sc.pl.umap(mdata_subset, color=['celltype_wnn'], ncols=1, size=10)

In [None]:
mdata_subset["rna"].obs["celltype_wnn"] = mdata_subset.obs["celltype_wnn"]

In [None]:
adata_subset=mdata_subset["rna"]

In [None]:
# recalculate nn
sc.pp.neighbors(adata_subset, use_rep="X_harmony")

sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Feature space for the silhouette score: the RNA X_harmony representation.
X = mdata_subset['rna'].obsm['X_harmony']

# Compare three Leiden resolutions on the RNA neighbor graph.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # Cluster on the default graph that sc.pp.neighbors just wrote for this subset. No graph is
    # stored on this AnnData under the key 'wnn' in this section, so neighbors_key is not passed.
    sc.tl.leiden(adata_subset, resolution=r, key_added=key, random_state=0)

    y = adata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises ValueError otherwise;
    # report NaN for such a degenerate partition instead of aborting the sweep.
    if 1 < n_clusters < y.size:
        s = silhouette_score(X, y, metric='cosine')
    else:
        s = float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    if n_clusters < 2:
        # one-vs-rest marker ranking needs at least two clusters
        continue
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.5'].copy()

In [None]:
sc.pl.umap(adata_subset, color=['celltype_wnn',"leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","PRL","CD82","IGFBP1","NREP","UTY"], ncols=2, size=10)

In [None]:
sc.tl.rank_genes_groups(adata_subset, 'leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
pd.DataFrame(
    {group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names']}).head(20)

In [None]:
marker_genes = ["PTPRC","CD14","CD74","NCAM1","CD3E","TIMP3","THY1","UTY","PRL","IGFBP1","PDGFRB","NOTCH3","ACTA2","NREP",
               "CCL21","NOTUM","HLA-G","CCL8","UTY","SLPI","VWF","HLA-DRA","LTB","FCGR3A","XCR1","LYZ","S100A8","S100A9",
               "APOD","CFD","COL18A1"]
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True, standard_scale="var");

In [None]:
new_cluster_names = {
    "0": "hpFib", "1": "hpFib", "2": "hpFib", "3": "hpFib", "4": "hpFib", "5": "hpFib", 
    "6": "decFIB", "7": "periFIB", "8": "FIB_Myelo_doublet", "9": "FIB_Lymph_doublet",
    "10": "PLASMA", "11": "EpiCell", "12": "FIB_Myelo_doublet", "13": "Myelo_Lymphatic_doublet", 
    "14": "FIB_Lymph_doublet", "15": "FIB_Myelo_doublet", "16": "FIB_Myelo_doublet", "17": "FIB_Myelo_doublet",
    "18": "FIB_Myelo_doublet", "19": "FIB_Myelo_doublet"
}

adata_subset.obs['celltype_subset'] = adata_subset.obs.leiden_wnn_subcluster.astype("str").values
adata_subset.obs.celltype_subset = adata_subset.obs.celltype_subset.replace(new_cluster_names)
adata_subset.obs.celltype_subset = adata_subset.obs.celltype_subset.astype("category")

In [None]:
sc.pl.umap(adata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires` column.
# New labels must be registered as categories before they can be assigned. Only
# labels that are not present yet are added, so re-running this cell no longer
# fails with "new categories must not include old categories".
new_categories = ["FIB_EVT_doublet", "FIB_Lymph_doublet", "FIB_Myelo_doublet",
                  "Myelo_Lymphatic_doublet", "decFIB", "hpFib", "periFIB"]
missing_categories = [c for c in new_categories
                      if c not in mdata.obs["celltype_hires"].cat.categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# One vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster decFIB

In [None]:
# Keep only the cells currently labelled "decFIB" in celltype_hires (all features retained).
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["decFIB"]),:]

In [None]:
# Drop celltype_hires category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# Work on the RNA modality of the subset from here on.
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding (stored
# under scanpy's default neighbors key, since no key_added is given) ...
sc.pp.neighbors(adata_subset, use_rep="X_harmony")

# ... and a UMAP from that graph, with a fixed seed.
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Embedding on which cluster separation is scored: `X_harmony` of the RNA modality.
X = mdata_subset['rna'].obsm['X_harmony']

# Try three Leiden resolutions; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # NOTE(review): sc.pp.neighbors was called on this subset without key_added,
    # i.e. it wrote the default neighbors slot, yet clustering reads
    # neighbors_key='wnn'. Confirm that a 'wnn' graph exists on this AnnData and
    # that clustering on it (not on the recomputed graph used for the UMAP) is intended.
    sc.tl.leiden(adata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = adata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.3 is the clustering used for annotation below.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.3'].copy()

In [None]:
# Colour the subset UMAP by the chosen clustering, metadata columns and a few genes.
sc.pl.umap(adata_subset, color=["leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","PRL","CD82","IGFBP1","TIMP3","PTPRC","NCAM1",
                               "TAGLN","SERPINF1","PDGFRB","NOTCH3"], ncols=2, size=35)

In [None]:
# Wilcoxon marker test per subcluster; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Marker panel used to annotate the subclusters.
marker_genes = ["PRL","CD82","IGFBP1","TIMP3","PTPRC","NCAM1",
                               "TAGLN","SERPINF1","LUM","FTH1","VEGFA","ACTA2"]
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw and scaled to [0, 1] per gene.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True, standard_scale="var");

In [None]:
# Manual annotation: Leiden subcluster id -> cell-type label.
new_cluster_names = {
    "0": "decFIB",
    "1": "hpFib",
    "2": "decFIB",
    "3": "decFIB",
    "4": "decFIB_immune_doublet",
    "5": "hpFib",
    "6": "decFIB_immune_doublet",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
adata_subset.obs["celltype_subset"] = (
    adata_subset.obs["leiden_wnn_subcluster"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the manual labels on the subset UMAP.
sc.pl.umap(adata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires` column.
# New labels must be registered as categories before they can be assigned. Only
# labels that are not present yet are added, so re-running this cell no longer
# fails with "new categories must not include old categories".
new_categories = ["decFIB_immune_doublet"]
missing_categories = [c for c in new_categories
                      if c not in mdata.obs["celltype_hires"].cat.categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# One vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster mural cells

In [None]:
# Keep only the cells currently labelled "periFIB" in celltype_hires (all features retained).
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["periFIB"]),:]

In [None]:
# Drop celltype_hires category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# Work on the RNA modality of the subset from here on.
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding (stored under
# scanpy's default neighbors key, since no key_added is given), then a seeded UMAP from it.
sc.pp.neighbors(adata_subset, use_rep="X_harmony")
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Embedding on which cluster separation is scored: `X_harmony` of the RNA modality.
X = mdata_subset['rna'].obsm['X_harmony']

# Try three Leiden resolutions; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # NOTE(review): sc.pp.neighbors was called on this subset without key_added,
    # i.e. it wrote the default neighbors slot, yet clustering reads
    # neighbors_key='wnn'. Confirm that a 'wnn' graph exists on this AnnData and
    # that clustering on it (not on the recomputed graph used for the UMAP) is intended.
    sc.tl.leiden(adata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = adata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.5 is the clustering used for annotation below.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.5'].copy()

In [None]:
# Colour the subset UMAP by the chosen clustering, metadata columns and a few genes.
sc.pl.umap(adata_subset, color=["leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","PRL","CD82","IGFBP1","TIMP3","PTPRC","NCAM1",
                               "TAGLN","SERPINF1","PDGFRB","NOTCH3","ACTA2","VEGFA",
                               "MCAM","MYH11"], ncols=2, size=55)

In [None]:
# Wilcoxon marker test per subcluster; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# NOTE(review): this cell is an exact repeat of the preceding cell (same test,
# same table); it re-runs the Wilcoxon test without changing anything and can
# probably be deleted.
sc.tl.rank_genes_groups(adata_subset, 'leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster, named "<cluster>_n", listing its ranked gene names.
pd.DataFrame(
    {group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names']}).head(20)

In [None]:
# Marker panel used to annotate the subclusters. The list named TAGLN twice;
# dict.fromkeys drops repeats while keeping first-occurrence order, so every
# gene is requested exactly once in the dot plot.
marker_genes = list(dict.fromkeys(
    ["PTPRC","SOD2","FTH1","CD68","CTSL","DCN","RND3","PLIN2","VEGFA","NFKBIA",
     "ACTA2","TAGLN","THY1","CALD1","TPM2","MYLK","MYL9","TPM1","MCAM","NOTCH3",
     "CSPG4","KCNJ8","CD14","TAGLN","SERPING1","JUN"]))
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw and scaled to [0, 1] per gene.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True
              , standard_scale = "var"
             );

In [None]:
# Second, wider marker panel for the same subclusters (overwrites marker_genes).
marker_genes = ["PTPRC","SOD2","FTH1","CD68","CTSL","DCN","RND3","PLIN2","VEGFA","NFKBIA",
               "ACTA2","TAGLN","THY1","CALD1","TPM2","MYLK","MYL9","TPM1","MCAM","NOTCH3",
               "CSPG4","KCNJ8","COL18A1","MMP14","IGF1","PDGFRA","PDGFRB","IGFBP1","PRL","CD82",
               "UTY","HLA-G","CD14","CFD","APOD","RGS5","LGALS3","FTL","GPNMB"]
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw and scaled to [0, 1] per gene.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True
              , standard_scale = "var"
             );

In [None]:
# Manual annotation: Leiden subcluster id -> cell-type label.
new_cluster_names = {
    "0": "MURAL",
    "1": "MURAL",
    "2": "MURAL",
    "3": "fetFIB_doublets",
    "4": "decFIB",
    "5": "murFIB_doublets",
    "6": "murFIB_doublets",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
adata_subset.obs["celltype_subset"] = (
    adata_subset.obs["leiden_wnn_subcluster"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the manual labels on the subset UMAP.
sc.pl.umap(adata_subset, color=["celltype_subset"], ncols=1)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires` column.
# New labels must be registered as categories before they can be assigned. Only
# labels that are not present yet are added, so re-running this cell no longer
# fails with "new categories must not include old categories".
# "PERI" and "SMC" are registered as in the original list even though the
# mapping used for this subset does not assign them.
new_categories = ["murFIB_doublets", "fetFIB_doublets", "PERI", "SMC", "MURAL"]
missing_categories = [c for c in new_categories
                      if c not in mdata.obs["celltype_hires"].cat.categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# One vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster hpFib

In [None]:
# Keep only the cells currently labelled "hpFib" in celltype_hires (all features retained).
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["hpFib"]),:]

In [None]:
# Drop celltype_hires category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# Work on the RNA modality of the subset from here on.
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding (stored
# under scanpy's default neighbors key, since no key_added is given) ...
sc.pp.neighbors(adata_subset, use_rep="X_harmony")

# ... and a UMAP from that graph, with a fixed seed.
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Embedding on which cluster separation is scored: `X_harmony` of the RNA modality.
X = mdata_subset['rna'].obsm['X_harmony']

# Try three Leiden resolutions; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # NOTE(review): sc.pp.neighbors was called on this subset without key_added,
    # i.e. it wrote the default neighbors slot, yet clustering reads
    # neighbors_key='wnn'. Confirm that a 'wnn' graph exists on this AnnData and
    # that clustering on it (not on the recomputed graph used for the UMAP) is intended.
    sc.tl.leiden(adata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = adata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.3 is the clustering used for annotation below.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.3'].copy()

In [None]:
# Colour the subset UMAP by the chosen clustering, metadata columns and a few genes.
sc.pl.umap(adata_subset, color=["leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","PRL","CD82","IGFBP1","TIMP3","PTPRC","NCAM1",
                               "TAGLN","SERPINF1","PDGFRB","NOTCH3","ACTA2","VEGFA",
                               "MCAM","MYH11","PDGFRA","PTGDS","APOD","PLA2G2A"], ncols=2, size=25)

In [None]:
# Wilcoxon marker test per subcluster; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Marker panel used to annotate the subclusters. ACTA2, MCAM, LUM and DCN were
# each listed twice; dict.fromkeys drops repeats while keeping first-occurrence
# order, so every gene is requested exactly once in the dot plot.
marker_genes = list(dict.fromkeys(
    ["PTPRC","SOD2","FTH1","CD68","CTSL","DCN","RND3","PLIN2","VEGFA","NFKBIA",
     "ACTA2","TAGLN","THY1","CALD1","TPM2","MYLK","MYL9","TPM1","MCAM","NOTCH3",
     "CSPG4","KCNJ8",
     "CFD", "FAP", "PDGFRA", "LGR5", "CD55", "SFRP2", "LUM", "VCAN", "FBLN1", "SERPING1",
     "MDK",
     "COL18A1","ACTA2","MYH11","POSTN","CD9","MCAM","PDGFRB","RGS5","APOD","PTGDS","PLA2G2A",
     "LUM","TIMP3","DCN","IGFBP1","ARG1","CD82","PRL","CD14","LYVE1"]))
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw; per-gene scaling is switched off in this cell.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True
              #, standard_scale = "var"
             );

In [None]:
# Manual annotation: every subcluster of this subset stays "hpFib".
new_cluster_names = {
    "0": "hpFib",
    "1": "hpFib",
    "2": "hpFib",
    "3": "hpFib",
    "4": "hpFib",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
adata_subset.obs["celltype_subset"] = (
    adata_subset.obs["leiden_wnn_subcluster"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the manual labels on the subset UMAP.
sc.pl.umap(adata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires`
# column. No categories are registered here, so every label in
# `celltype_subset` must already be a category of `celltype_hires`.
# One vectorised assignment replaces the per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## investigate new FIB clusters

In [None]:
# Pool the three refined labels (hpFib, decFIB, MURAL) for a side-by-side comparison.
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["hpFib","decFIB","MURAL"]),:]

In [None]:
# Drop celltype_hires category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# Copy the labels onto the RNA modality so they can be used as a groupby/colour
# key on the AnnData, then work on that modality.
mdata_subset["rna"].obs["celltype_hires"] = mdata_subset.obs["celltype_hires"]
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding
# (stored under scanpy's default neighbors key, since no key_added is given).
sc.pp.neighbors(adata_subset, use_rep="X_harmony")

In [None]:
# UMAP of the subset, with a fixed seed.
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Colour the subset UMAP by the refined labels, metadata columns and a few genes.
sc.pl.umap(adata_subset, color=["celltype_hires","tissue","pct_counts_mt",
                               "donor_id","PRL","CD82","IL15","IGFBP5","IGFBP1","TIMP3","PTPRC","NCAM1",
                               "TAGLN","SERPINF1","PDGFRB","NOTCH3","ACTA2","VEGFA",
                               "MCAM","MYH11","PDGFRA","PTGDS","APOD","PLA2G2A"], ncols=2, size=25)

In [None]:
# Wilcoxon marker test per refined label; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='celltype_hires', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per label ("<label>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Marker genes grouped under a heading per group (dict form: heading -> genes).
# TPM2 and COL18A1 each appear under two headings.
marker_genes = {"hpFib":["CFD", "FAP", "PDGFRA", "LGR5", "CD55", "SFRP2", "LUM", "VCAN", "FBLN1",
                          "SERPING1", "MDK"],
               "pericytes":["CSPG4", "PDGFRB", "THY1", "MCAM", "RGS5", "TPM2", "NDUFA4L2",
                            "TGFB1", "KCNJ8", "FRZB", "CD9", "GJA4", "PTP4A3", "NOTCH3", "COL18A1"],
               "myofib":["ACTA2", "POSTN", "CTSK", "DCN"],
               "smooth-muscle":["MYL9", "MYH11", "SMTN", "SMTNL1", "SMTNL2", "CNN1", "TAGLN",
                                "TPM2","COL4A1"],
               "other":["IL1RL1", "MMP2", "PMEL", "VEGFA", "MMP14"],
               "decid":["TIMP3","PRL","ARG1","CD82","IGFBP1","PRLR"],
               "undecid":["IL15","IGFBP5","IGF1","PGR",],
               "PERI":["SOD2","FTH1","CD68","CTSL","RND3","PLIN2","NFKBIA","COL18A1"]}
# Unlike the other dot plots in this notebook, the dendrogram is drawn here (dendrogram=True).
sc.tl.dendrogram(adata_subset, 'celltype_hires')
# Expression is read from .raw and scaled to [0, 1] per gene; axes are swapped
# and the figure is also written to disk through scanpy's `save=` option.
sc.pl.dotplot(adata_subset, marker_genes, groupby='celltype_hires', dendrogram=True,
              use_raw=True, swap_axes = True,
               standard_scale = "var",
              save="FIB_subtypes-2024-01-24.pdf"
             );

## subcluster epithelial

In [None]:
# Copy the coarse labels onto the RNA modality so RNA-level plots can colour by them.
mdata["rna"].obs["celltype_wnn"] = mdata.obs["celltype_wnn"]

In [None]:
# Keep only the cells labelled "EpiCell" in the coarse celltype_wnn annotation.
mdata_subset = mdata[mdata.obs['celltype_wnn'].isin(["EpiCell"]),:]

In [None]:
# Drop celltype_wnn category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_wnn'] = mdata_subset.obs['celltype_wnn'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_wnn'], ncols=1, size=10)

In [None]:
# Work on the RNA modality of the subset from here on.
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding (stored under
# scanpy's default neighbors key, since no key_added is given), then a seeded UMAP from it.
sc.pp.neighbors(adata_subset, use_rep="X_harmony")
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Display the AnnData summary of the subset.
adata_subset

In [None]:
# Embedding on which cluster separation is scored: `X_harmony` of the RNA modality.
X = mdata_subset['rna'].obsm['X_harmony']

# Try three Leiden resolutions; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # NOTE(review): sc.pp.neighbors was called on this subset without key_added,
    # i.e. it wrote the default neighbors slot, yet clustering reads
    # neighbors_key='wnn'. Confirm that a 'wnn' graph exists on this AnnData and
    # that clustering on it (not on the recomputed graph used for the UMAP) is intended.
    sc.tl.leiden(adata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = adata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.7 is the clustering used for annotation below.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.7'].copy()

In [None]:
# Colour the subset UMAP by annotation/metadata columns and a few genes, two panels per row.
sc.pl.umap(adata_subset, color=['celltype_wnn',"leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","UTY","NOTUM","HLA-G","CGA","SLPI","AOC1","PRG2","MKI67",
                               "ENDOU","NOTCH1","VIM"], ncols=2, size=15)

In [None]:
# Make sure uns['log1p'] has a "base" entry (None) before the marker test below.
# NOTE(review): presumably the workaround for rank_genes_groups raising
# KeyError: 'base' on objects reloaded from disk — confirm it is still needed here.
adata_subset.uns['log1p']["base"] = None

In [None]:
# Wilcoxon marker test per subcluster; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Marker panel used to annotate the subclusters. HTRA1 and ERVW-1 were each
# listed twice; dict.fromkeys drops repeats while keeping first-occurrence
# order, so every gene is requested exactly once in the dot plot.
marker_genes = list(dict.fromkeys(
    ["PTPRC","VIM","HLA-G","PRG2","CGA","ENDOU","AOC1","CD74","CD14","MMP2","NOTUM","NOTCH1","EGFR",
     "IL1B","LAIR2","HTRA1","GDF15","ERVW-1","CGB1","CGB5","CGB7","CGB8","HSD3B1","CD24","MKI67",
     "SPP1","HTRA1","SLPI","ERVW-1"]))
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw and scaled to [0, 1] per gene.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True,standard_scale="var");

In [None]:
# Manual annotation: Leiden subcluster id -> cell-type label.
new_cluster_names = {
    "0": "EVT", "1": "vCTB", "2": "EVT", "3": "EVT", "4": "EVT",
    "5": "STB", "6": "EVT", "7": "EVT", "8": "STB",
    "9": "TROPHO_doublet", "10": "TROPHO_doublet", "11": "TROPHO_doublet",
    "12": "TROPHO_doublet", "13": "TROPHO_doublet", "14": "TROPHO_doublet",
    "15": "TROPHO_doublet",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
adata_subset.obs["celltype_subset"] = (
    adata_subset.obs["leiden_wnn_subcluster"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Manual labels and the underlying cluster ids, one panel each, for a visual check.
sc.pl.umap(adata_subset, color=["celltype_subset","leiden_wnn_subcluster"], ncols=1, size=10)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires` column.
# New labels must be registered as categories before they can be assigned.
# The original cell called add_categories("EVT") twice; the second call raises
# "new categories must not include old categories". Each label is now listed
# once, and only labels not present yet are added, so the cell is also safe to re-run.
# "CCT" is registered as in the original list even though the mapping used for
# this subset does not assign it.
new_categories = ["CCT", "vCTB", "EVT", "STB", "TROPHO_doublet"]
missing_categories = [c for c in new_categories
                      if c not in mdata.obs["celltype_hires"].cat.categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# One vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

In [None]:
# Copy the coarse labels onto the RNA modality again (same assignment as at the start of the epithelial section).
mdata["rna"].obs["celltype_wnn"] = mdata.obs["celltype_wnn"]

## subcluster STB

In [None]:
# Keep only the cells currently labelled "STB" in celltype_hires (all features retained).
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["STB"]),:]

In [None]:
# Drop celltype_wnn category levels that have no cells left after subsetting.
# NOTE(review): the subset was selected on celltype_hires, but the column
# cleaned (and plotted in the next cell) is celltype_wnn — confirm this is intended.
mdata_subset.obs['celltype_wnn'] = mdata_subset.obs['celltype_wnn'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates, coloured by the coarse label.
sc.pl.umap(mdata_subset, color=['celltype_wnn'], ncols=1, size=10)

In [None]:
# Work on the RNA modality of the subset from here on.
adata_subset=mdata_subset["rna"]

In [None]:
# Recompute the kNN graph on the subset from the `X_harmony` embedding (stored under
# scanpy's default neighbors key, since no key_added is given), then a seeded UMAP from it.
sc.pp.neighbors(adata_subset, use_rep="X_harmony")
sc.tl.umap(adata_subset, random_state=10)

In [None]:
# Embedding on which cluster separation is scored: `X_harmony` of the RNA modality.
X = mdata_subset['rna'].obsm['X_harmony']

# Try three Leiden resolutions; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # NOTE(review): sc.pp.neighbors was called on this subset without key_added,
    # i.e. it wrote the default neighbors slot, yet clustering reads
    # neighbors_key='wnn'. Confirm that a 'wnn' graph exists on this AnnData and
    # that clustering on it (not on the recomputed graph used for the UMAP) is intended.
    sc.tl.leiden(adata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = adata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    # RNA markers
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.5 is the clustering used for annotation below.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.5'].copy()

In [None]:
# Colour the subset UMAP by annotation/metadata columns and a few genes, two panels per row.
sc.pl.umap(adata_subset, color=['celltype_wnn',"leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","UTY","NOTUM","HLA-G","CGA","SLPI","AOC1","PRG2","MKI67",
                               "ENDOU","NOTCH1","VIM"], ncols=2, size=25)

In [None]:
# Wilcoxon marker test per subcluster; the result is stored in .uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked gene names; show the top 20.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Marker panel used to annotate the subclusters. HTRA1, ERVW-1 and MKI67 were
# each listed twice; dict.fromkeys drops repeats while keeping first-occurrence
# order, so every gene is requested exactly once in the dot plot.
marker_genes = list(dict.fromkeys(
    ["PTPRC","VIM","HLA-G","PRG2","CGA","ENDOU","AOC1","CD74","CD14","MMP2","NOTUM","NOTCH1","EGFR",
     "IL1B","LAIR2","HTRA1","GDF15","ERVW-1","CGB1","CGB5","CGB7","CGB8","HSD3B1","CD24","MKI67",
     "SPP1","HTRA1","SLPI","ERVW-1","MKI67","LYVE1"]))
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
# Expression is read from .raw; per-gene scaling is switched off in this cell.
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True
              #,standard_scale="var"
             );

In [None]:
# Manual annotation: Leiden subcluster id -> cell-type label.
new_cluster_names = {
    "0": "CCT",
    "1": "CCT",
    "2": "STB",
    "3": "STB",
    "4": "STB",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
adata_subset.obs["celltype_subset"] = (
    adata_subset.obs["leiden_wnn_subcluster"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Manual labels and the underlying cluster ids, one panel each, for a visual check.
sc.pl.umap(adata_subset, color=["celltype_subset","leiden_wnn_subcluster"], ncols=1, size=10)

In [None]:
# NOTE(review): exact repeat of the preceding plotting cell; can probably be deleted.
sc.pl.umap(adata_subset, color=["celltype_subset","leiden_wnn_subcluster"], ncols=1, size=10)

In [None]:
# Write the sub-annotation of this subset back into the global `celltype_hires` column.
# "CCT" must be a category before it can be assigned. It is added only when
# missing, so re-running this cell no longer fails with
# "new categories must not include old categories".
new_categories = ["CCT"]
missing_categories = [c for c in new_categories
                      if c not in mdata.obs["celltype_hires"].cat.categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# One vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# subset that are absent from mdata are skipped, as before.
in_mdata = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_mdata], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].to_numpy()[in_mdata]
)

In [None]:
# Drop category levels that no cell carries any more, then plot the
# updated annotation next to tissue on the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster NK cells

In [None]:
# Keep only the cells labelled "NK" in the coarse celltype_wnn annotation.
mdata_subset = mdata[mdata.obs['celltype_wnn'].isin(["NK"]),:]

In [None]:
# Drop celltype_hires category levels that have no cells left after subsetting.
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# Plot the subset on its current UMAP coordinates (before any recomputation below).
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# Recompute per-modality kNN graphs on the subset from each modality's `X_harmony` embedding ...
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")

# ... combine them into a multimodal neighbors graph stored under the key 'wnn' ...
mu.pp.neighbors(mdata_subset, key_added='wnn')

# ... and compute a seeded UMAP from that 'wnn' graph.
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Feature matrix for scoring cluster separation: RNA and protein `X_harmony`
# embeddings concatenated column-wise.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Try three Leiden resolutions on the 'wnn' graph; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = mdata_subset.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    # RNA markers
    # The labels are copied onto the RNA modality so it can be grouped by them.
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.5 is the clustering used for annotation below; mirror it onto the RNA modality.
mdata_subset.obs['leiden_wnn_subcluster'] = mdata_subset.obs['subset_leiden_r0.5'].copy()
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Colour the subset UMAP by annotation and metadata columns, one panel per row.
sc.pl.umap(mdata_subset, color=['celltype_hires',"leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1, size=10)

In [None]:
# RNA marker panel for the NK subclusters. SPINK2 and CXCR4 were each listed
# twice; dict.fromkeys drops repeats while keeping first-occurrence order, so
# every gene is requested exactly once in the dot plot.
marker_genes = list(dict.fromkeys(
    ["XCL1","XCL2","GNLY","CXCR4","MKI67","FCGR3A","SPINK2","NCAM1","GZMA","IGFBP7","ENTPD1","ANXA1","ITGB2",
     "KLRB1","CD160","ITGB1","SH3BGRL3","SPINK2","SELL","CXCR4","CD69"]))
# Mirror the cluster labels onto the RNA modality so it can be grouped by them.
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
# Expression is read from .raw and scaled to [0, 1] per gene.
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
             standard_scale="var");

In [None]:
# Mirror the cluster labels onto the protein modality, then run a Wilcoxon
# marker test per subcluster on the protein features.
mdata_subset["prot"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.rank_genes_groups(mdata_subset["prot"], groupby='leiden_wnn_subcluster', method='wilcoxon')
result = mdata_subset["prot"].uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
# One column per subcluster ("<cluster>_n") holding its ranked feature names; show the top 10.
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(10)

In [None]:
# Protein-modality feature panel for the NK subclusters (overwrites marker_genes).
marker_genes = ["CD11c_TotalSeqC","CD39_TotalSeqC","CD38_TotalSeqC","CD103_TotalSeqC","CD16_TotalSeqC",
                "CD45RA_TotalSeqC","CD45RO_TotalSeqC","CD31_TotalSeqC","CD56_TotalSeqC","CD314_TotalSeqC"]
# Mirror the cluster labels onto the protein modality so it can be grouped by them.
mdata_subset["prot"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
# The dendrogram is computed and stored, but not drawn below (dendrogram=False).
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')
# Values are taken from .X here (use_raw=False); per-feature scaling is switched off.
sc.pl.dotplot(mdata_subset["prot"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=False
              #,standard_scale="var"
             );

In [None]:
# Plot the subset UMAP coloured by the cluster labels and a mix of
# `*_TotalSeqC` features and gene names, two panels per row.
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["leiden_wnn_subcluster","CD11c_TotalSeqC",
                                                                    "CD39_TotalSeqC","CD38_TotalSeqC","CD103_TotalSeqC",
                                                                    "CD16_TotalSeqC","CD45RA_TotalSeqC","CD45RO_TotalSeqC",
                                                                    "CD31_TotalSeqC","CD160","KLRB1","ENTPD1","CYP26A1",
                                                                   "ITGB1","CD27_TotalSeqC","GZMA","CD56_TotalSeqC","NCAM1",
                                                                   "CD44_TotalSeqC","MAML3","RUNX2","CD40_TotalSeqC",
                                                                   "CD69_TotalSeqC","CD11b_TotalSeqC","CD314_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
# split clusters 2,3,6 into CD39 positive and negative.


In [None]:
# Second-level subset: only the cells of NK subcluster "2".
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'].isin(["2"]),:]

In [None]:
# Drop leiden_wnn_subcluster category levels that have no cells left after subsetting.
mdata_subset2.obs['leiden_wnn_subcluster'] = mdata_subset2.obs['leiden_wnn_subcluster'].cat.remove_unused_categories()

In [None]:
# Plot the second-level subset on its current UMAP coordinates (before recomputation below).
sc.pl.umap(mdata_subset2, color=['leiden_wnn_subcluster'], ncols=1, size=10)

In [None]:
# Recompute per-modality kNN graphs on the second-level subset from each modality's `X_harmony` embedding ...
sc.pp.neighbors(mdata_subset2['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset2['prot'], use_rep="X_harmony")


# ... combine them into a multimodal neighbors graph stored under the key 'wnn' ...
mu.pp.neighbors(mdata_subset2, key_added='wnn')

# ... and compute a seeded UMAP from that 'wnn' graph.
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Feature matrix for scoring cluster separation: RNA and protein `X_harmony`
# embeddings concatenated column-wise.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Try three Leiden resolutions on the 'wnn' graph; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = mdata_subset2.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    # RNA markers
    # The labels are copied onto the RNA modality so it can be grouped by them.
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Resolution 0.3 is the second-level clustering used below; mirror it onto the RNA modality.
mdata_subset2.obs['leiden_wnn_subcluster2'] = mdata_subset2.obs['subset2_leiden_r0.3'].copy()
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"]=mdata_subset2.obs["leiden_wnn_subcluster2"]


In [None]:
# Colour the second-level UMAP by the new clustering and metadata columns.
sc.pl.umap(mdata_subset2, color=["leiden_wnn_subcluster2","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=1, size=10)

In [None]:
# Plot the second-level UMAP coloured by the new cluster labels and a mix of
# `*_TotalSeqC` features and gene names, two panels per row.
mu.pl.embedding(mdata_subset2, basis="X_umap", frameon=False, color=["leiden_wnn_subcluster2","CD11c_TotalSeqC",
                                                                    "CD39_TotalSeqC","CD38_TotalSeqC","CD103_TotalSeqC",
                                                                    "CD16_TotalSeqC","CD45RA_TotalSeqC","CD45RO_TotalSeqC",
                                                                    "CD31_TotalSeqC","CD160","KLRB1","ENTPD1","CYP26A1",
                                                                   "ITGB1","CD27_TotalSeqC","GZMA","CD56_TotalSeqC","NCAM1",
                                                                   "CD44_TotalSeqC","MAML3","RUNX2","CD40_TotalSeqC",
                                                                   "CD69_TotalSeqC","CD11b_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
# Manual annotation of the second-level clusters: id -> NK label.
new_cluster_names = {
    "0": "NK_CD39-",
    "1": "NK_CD39+",
}

# Cluster ids as strings -> labels -> categorical column.
# Ids missing from the dict keep their numeric string.
mdata_subset2.obs["celltype_subset2"] = (
    mdata_subset2.obs["leiden_wnn_subcluster2"]
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Start the NK-level annotation from the first-level cluster ids. The column is
# rebuilt from `leiden_wnn_subcluster` each time this cell runs.
mdata_subset.obs["celltype_subset"]=mdata_subset.obs["leiden_wnn_subcluster"]

# update categories
# The two new labels must be categories before they can be assigned.
mdata_subset.obs["celltype_subset"] = mdata_subset.obs["celltype_subset"].cat.add_categories("NK_CD39-")
mdata_subset.obs["celltype_subset"] = mdata_subset.obs["celltype_subset"].cat.add_categories("NK_CD39+")

# Overwrite the split cluster's cells with their second-level labels in one
# vectorised assignment instead of a per-cell `.loc` loop. Cells of the
# second-level subset that are absent from mdata_subset are skipped, as before.
in_parent = mdata_subset2.obs_names.isin(mdata_subset.obs.index)
mdata_subset.obs.loc[mdata_subset2.obs_names[in_parent], "celltype_subset"] = (
    mdata_subset2.obs["celltype_subset2"].to_numpy()[in_parent]
)

In [None]:
# Visual check of the updated NK-level labels on the NK subset UMAP.
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["celltype_subset"],
                ncols=2, color_map="YlOrRd")

In [None]:
# split cluster 6

In [None]:
# Second-level subset: only the cells of NK subcluster "6" (replaces the previous mdata_subset2).
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'].isin(["6"]),:]

In [None]:
# Drop leiden_wnn_subcluster category levels that have no cells left after subsetting.
mdata_subset2.obs['leiden_wnn_subcluster'] = mdata_subset2.obs['leiden_wnn_subcluster'].cat.remove_unused_categories()

In [None]:
# Plot the second-level subset on its current UMAP coordinates (before recomputation below).
sc.pl.umap(mdata_subset2, color=['leiden_wnn_subcluster'], ncols=1, size=10)

In [None]:
# Recompute per-modality kNN graphs on the second-level subset from each modality's `X_harmony` embedding ...
sc.pp.neighbors(mdata_subset2['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset2['prot'], use_rep="X_harmony")


# ... combine them into a multimodal neighbors graph stored under the key 'wnn' ...
mu.pp.neighbors(mdata_subset2, key_added='wnn')

# ... and compute a seeded UMAP from that 'wnn' graph.
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Feature matrix for scoring cluster separation: RNA and protein `X_harmony`
# embeddings concatenated column-wise.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Try three Leiden resolutions on the 'wnn' graph; each run is stored in .obs under its own key.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    # Cosine silhouette of the labels on X, printed together with the cluster count.
    y = mdata_subset2.obs[key].to_numpy()
    s = silhouette_score(X, y, metric='cosine')
    print(f"{key}: n_clusters={np.unique(y).size}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    # RNA markers
    # The labels are copied onto the RNA modality so it can be grouped by them.
    # No key_added is passed to rank_genes_groups, so each iteration overwrites
    # the previous resolution's result in .uns.
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.5 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset2.obs['subset2_leiden_r0.5'].copy()
mdata_subset2.obs['leiden_wnn_subcluster2'] = chosen_labels
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]


In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset2,
    color=["leiden_wnn_subcluster2", "tissue", "rna:pct_counts_mt", "donor_id"],
    ncols=1,
)

In [None]:
# Sub-clusters together with the protein (*_TotalSeqC) and RNA features inspected for this split.
mu.pl.embedding(
    mdata_subset2,
    basis="X_umap",
    color=[
        "leiden_wnn_subcluster2",
        "CD11c_TotalSeqC", "CD39_TotalSeqC", "CD38_TotalSeqC", "CD103_TotalSeqC",
        "CD16_TotalSeqC", "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD31_TotalSeqC",
        "CD160", "KLRB1", "ENTPD1", "CYP26A1", "ITGB1",
        "CD27_TotalSeqC", "GZMA", "CD56_TotalSeqC", "NCAM1",
        "CD44_TotalSeqC", "MAML3", "RUNX2", "CD40_TotalSeqC",
        "CD69_TotalSeqC", "CD11b_TotalSeqC",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Only sub-cluster "2" is labelled CD39+; every other sub-cluster is labelled CD39-.
new_cluster_names = {
    "0": "NK_CD39-",
    "1": "NK_CD39-",
    "2": "NK_CD39+",
    "3": "NK_CD39-",
    "4": "NK_CD39-",
}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical.
mdata_subset2.obs['celltype_subset2'] = (
    mdata_subset2.obs['leiden_wnn_subcluster2']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Copy the second-level labels back onto the parent subset in one label-aligned
# assignment instead of one `.loc` write per cell. Only cells present in both
# objects are touched, exactly like the former per-cell membership check.
shared_cells = mdata_subset2.obs_names.intersection(mdata_subset.obs_names)
mdata_subset.obs.loc[shared_cells, "celltype_subset"] = (
    mdata_subset2.obs.loc[shared_cells, "celltype_subset2"].to_numpy()
)

In [None]:
# UMAP of the parent subset coloured by the updated annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Names for the first-level clusters that were not split further.
cluster_name_mapping = {
    '0': 'NK_CD39-',
    '1': 'NK_CD39+',
    "3": "NK_prol",
    "4": "NK_CD39-CD103+",
    "5": "NK_CD16+"
}

# rename clusters
# The column is converted to strings for the renaming and then stored as a categorical
# again, matching the other annotation columns that are built with astype("category").
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['celltype_subset']
    .astype(str)
    .replace(cluster_name_mapping)
    .astype("category")
)

In [None]:
# UMAP of the parent subset with the final cluster names.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Surface-protein panel shown per annotated cell type.
marker_genes = [
    "CD11c_TotalSeqC", "CD39_TotalSeqC", "CD38_TotalSeqC",
    "CD103_TotalSeqC", "CD16_TotalSeqC", "CD45RA_TotalSeqC",
    "CD45RO_TotalSeqC", "CD31_TotalSeqC", "CD56_TotalSeqC",
]

# The protein modality needs its own copy of the grouping column.
mdata_subset["prot"].obs["celltype_subset"] = mdata_subset.obs["celltype_subset"]

# Values are plotted without per-marker scaling (standard_scale is not set).
sc.pl.dotplot(
    mdata_subset["prot"],
    marker_genes,
    groupby='celltype_subset',
    use_raw=False,
    dendrogram=False,
);

In [None]:
# Mirror the annotation onto the RNA modality and rank genes per cell type (Wilcoxon).
mdata_subset["rna"].obs["celltype_subset"] = mdata_subset.obs["celltype_subset"]
sc.tl.rank_genes_groups(mdata_subset["rna"], 'celltype_subset', method='wilcoxon')

result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names

# One "<group>_n" column per group holding its ranked gene names; display the top 20 rows.
pd.set_option('display.max_columns', 100)
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# RNA marker panel. dict.fromkeys drops repeated entries (SPINK2 and MKI67 were listed
# twice) while keeping first-occurrence order, so no gene is drawn as a duplicate column.
marker_genes = list(dict.fromkeys([
    "XCL1","XCL2","GNLY","CXCR4","MKI67","FCGR3A","SPINK2","NCAM1","GZMA","IGFBP7","ENTPD1","ANXA1","ITGB2",
    "KLRB1","CD160","ITGB1","SH3BGRL3","SPINK2","SELL","GNG2","CCL5","CD7","MKI67",
]))
# The RNA modality needs its own copy of the grouping column.
mdata_subset["rna"].obs["celltype_subset"]=mdata_subset.obs["celltype_subset"]
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='celltype_subset', dendrogram=False, use_raw=True,standard_scale="var");

In [None]:
# check prostaglandin markers:
# dict.fromkeys drops repeated entries (PTGFR was listed three times) while keeping
# first-occurrence order, so no gene is drawn as a duplicate column.
marker_genes = list(dict.fromkeys([
    "XCL1","XCL2","PTGFR", "PTGDS", "PTGS2", "PTGS1", "PTGES2", "PTGES", "PTGER1", "PTGER2", "PTGER3",
    "PTGER4", "PTGDR","PTGDR2", "PTGFR", "PTGIR", "TBXA2R", "PTGIS", "PTGFR", "HPGDS", "PTGES3",
    "HPGD", "SLCO2A1", "ITGB1", "CD160", "CYP26A1", "B4GALNT1","ANXA1", "ITGB2", "KLRB1", "IL7R",
]))
# The RNA modality needs its own copy of the grouping column.
mdata_subset["rna"].obs["celltype_subset"]=mdata_subset.obs["celltype_subset"]
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='celltype_subset', dendrogram=False,
              use_raw=True, standard_scale="var");

In [None]:
# update categories
# add_categories raises ValueError when the label is already registered, so "NK_prol"
# is only added if it is missing; this keeps the cell safe to re-run.
if "NK_prol" not in mdata.obs["celltype_hires"].cat.categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories("NK_prol")

# Write the subset annotation into the full object in one label-aligned assignment
# instead of one `.loc` write per cell. Only cells present in both objects are touched.
shared_cells = mdata_subset.obs_names.intersection(mdata.obs_names)
mdata.obs.loc[shared_cells, "celltype_hires"] = (
    mdata_subset.obs.loc[shared_cells, "celltype_subset"].to_numpy()
)

In [None]:
# remove unused categories
mdata.obs['celltype_hires'] = (
    mdata.obs['celltype_hires']
    .cat.remove_unused_categories()
)

In [None]:
# Full data set: high-resolution annotation and tissue, one panel per row.
sc.pl.umap(
    mdata,
    color=['celltype_hires', "tissue"],
    size=10,
    ncols=1,
)

## subcluster T cells

In [None]:
# Keep only the cells whose coarse 'celltype_wnn' label is "TCell".
mdata_subset = mdata[mdata.obs['celltype_wnn'] == "TCell", :]

In [None]:
# remove unused categories
mdata_subset.obs['celltype_wnn'] = (
    mdata_subset.obs['celltype_wnn']
    .cat.remove_unused_categories()
)

In [None]:
# Show the selected cells on the UMAP.
sc.pl.umap(
    mdata_subset,
    color=['celltype_wnn'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset, key_added='wnn')
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.5 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset.obs['subset_leiden_r0.5'].copy()
mdata_subset.obs['leiden_wnn_subcluster'] = chosen_labels
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Drop 'celltype_hires' categories that no longer occur in this subset before plotting.
mdata_subset.obs['celltype_hires'] = (
    mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()
)

# Sub-clusters next to the existing annotation, tissue, mitochondrial fraction and donor.
sc.pl.umap(
    mdata_subset,
    color=["leiden_wnn_subcluster", "celltype_hires", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=15,
    ncols=2,
)

In [None]:
# Mirror the sub-cluster labels onto the RNA modality and rank genes per sub-cluster (Wilcoxon).
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.rank_genes_groups(mdata_subset["rna"], 'leiden_wnn_subcluster', method='wilcoxon')

result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names

# One "<group>_n" column per group holding its ranked gene names; display the top 20 rows.
pd.set_option('display.max_columns', 100)
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Surface-protein panel shown per sub-cluster.
marker_genes = [
    "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD194_TotalSeqC", "CD20_TotalSeqC",
    "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD16_TotalSeqC", "CD1c_TotalSeqC",
    "CD56_TotalSeqC", "CD14_TotalSeqC", "CD27_TotalSeqC", "TIGIT_TotalSeqC",
    "CD25_TotalSeqC", "CD103_TotalSeqC", "CD62L_TotalSeqC", "CD28_TotalSeqC",
    "CD127_TotalSeqC",
]

# The protein modality needs its own copy of the grouping column.
mdata_subset["prot"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

# Each marker is scaled across groups (standard_scale="var"); no dendrogram is drawn.
sc.pl.dotplot(
    mdata_subset["prot"],
    marker_genes,
    groupby='leiden_wnn_subcluster',
    standard_scale="var",
    use_raw=False,
    dendrogram=False,
);

In [None]:
# split clusters 2,3,4,6
# cluster 0 and 5 are CD4+
# cluster 1 and 7 are CD8+

In [None]:
# Keep only the cells of Leiden sub-cluster "2" for a second round of clustering.
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'] == "2", :]

In [None]:
# remove unused categories
mdata_subset2.obs['leiden_wnn_subcluster'] = (
    mdata_subset2.obs['leiden_wnn_subcluster']
    .cat.remove_unused_categories()
)

In [None]:
# Show where the selected sub-cluster sits on the UMAP.
sc.pl.umap(
    mdata_subset2,
    color=['leiden_wnn_subcluster'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset2[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset2, key_added='wnn')
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset2.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.3 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset2.obs['subset2_leiden_r0.3'].copy()
mdata_subset2.obs['leiden_wnn_subcluster2'] = chosen_labels
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]


In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset2,
    color=["leiden_wnn_subcluster2", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=1,
)

In [None]:
# Sub-clusters together with the protein (*_TotalSeqC) and RNA features inspected for this split.
mu.pl.embedding(
    mdata_subset2,
    basis="X_umap",
    color=[
        "leiden_wnn_subcluster2",
        "CD4_TotalSeqC", "CD4", "CD8a_TotalSeqC", "CD3_TotalSeqC",
        "CD194_TotalSeqC", "CD20_TotalSeqC", "CD8A",
        "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD16_TotalSeqC", "CD1c_TotalSeqC",
        "CD56_TotalSeqC", "CD14_TotalSeqC", "CD27_TotalSeqC", "TIGIT_TotalSeqC",
        "CD25_TotalSeqC", "CD103_TotalSeqC", "CD62L_TotalSeqC",
        "CD28_TotalSeqC", "CD127_TotalSeqC",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Sub-clusters "0" and "2" are labelled CD8+, sub-cluster "1" CD4+.
new_cluster_names = {
    "0": "Tcell_CD8+",
    "1": "Tcell_CD4+",
    "2": "Tcell_CD8+",
}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical.
mdata_subset2.obs['celltype_subset2'] = (
    mdata_subset2.obs['leiden_wnn_subcluster2']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Start the refined annotation from the Leiden sub-cluster ids.
mdata_subset.obs["celltype_subset"] = mdata_subset.obs["leiden_wnn_subcluster"]

# update categories: both labels must be registered before they can be written
# into the categorical column.
mdata_subset.obs["celltype_subset"] = mdata_subset.obs["celltype_subset"].cat.add_categories(
    ["Tcell_CD4+", "Tcell_CD8+"]
)

# Copy the second-level labels back onto the parent subset in one label-aligned
# assignment instead of one `.loc` write per cell. Only cells present in both
# objects are touched, exactly like the former per-cell membership check.
shared_cells = mdata_subset2.obs_names.intersection(mdata_subset.obs_names)
mdata_subset.obs.loc[shared_cells, "celltype_subset"] = (
    mdata_subset2.obs.loc[shared_cells, "celltype_subset2"].to_numpy()
)

In [None]:
# UMAP of the parent subset coloured by the refined annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# split cluster 3

In [None]:
# Keep only the cells of Leiden sub-cluster "3" for a second round of clustering.
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'] == "3", :]

In [None]:
# remove unused categories
mdata_subset2.obs['leiden_wnn_subcluster'] = (
    mdata_subset2.obs['leiden_wnn_subcluster']
    .cat.remove_unused_categories()
)

In [None]:
# Show where the selected sub-cluster sits on the UMAP.
sc.pl.umap(
    mdata_subset2,
    color=['leiden_wnn_subcluster'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset2[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset2, key_added='wnn')
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset2.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.3 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset2.obs['subset2_leiden_r0.3'].copy()
mdata_subset2.obs['leiden_wnn_subcluster2'] = chosen_labels
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]


In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset2,
    color=["leiden_wnn_subcluster2", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=1,
)

In [None]:
# Sub-clusters together with the protein (*_TotalSeqC) and RNA features inspected for this split.
mu.pl.embedding(
    mdata_subset2,
    basis="X_umap",
    color=[
        "leiden_wnn_subcluster2",
        "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD103_TotalSeqC",
        "CD8A", "CD4", "CD366_TotalSeqC", "THY1", "HLA-G",
        "CD45_TotalSeqC", "CD14", "TRDC",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Mirror the sub-cluster labels onto the RNA modality and rank genes per sub-cluster (Wilcoxon).
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]
sc.tl.rank_genes_groups(mdata_subset2["rna"], 'leiden_wnn_subcluster2', method='wilcoxon')

result = mdata_subset2["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names

# One "<group>_n" column per group holding its ranked gene names; display the top 20 rows.
pd.set_option('display.max_columns', 100)
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Surface-protein panel shown per second-level sub-cluster.
marker_genes = [
    "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD194_TotalSeqC", "CD20_TotalSeqC",
    "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD16_TotalSeqC", "CD1c_TotalSeqC",
    "CD56_TotalSeqC", "CD14_TotalSeqC", "CD27_TotalSeqC", "TIGIT_TotalSeqC",
    "CD25_TotalSeqC", "CD103_TotalSeqC", "CD62L_TotalSeqC", "CD28_TotalSeqC",
    "CD127_TotalSeqC", "CD366_TotalSeqC",
]

# The protein modality needs its own copy of the grouping column.
mdata_subset2["prot"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]

# Values are plotted without per-marker scaling (standard_scale is not set); no dendrogram is drawn.
sc.pl.dotplot(
    mdata_subset2["prot"],
    marker_genes,
    groupby='leiden_wnn_subcluster2',
    use_raw=False,
    dendrogram=False,
);

In [None]:
# move gamma delta T cells to the CD4 cells at first
new_cluster_names = {
    "0": "Tcell_CD4+",
    "1": "Tcell_CD8+",
    "2": "Tcell_CD4+",
}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical.
mdata_subset2.obs['celltype_subset2'] = (
    mdata_subset2.obs['leiden_wnn_subcluster2']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Copy the second-level labels back onto the parent subset in one label-aligned
# assignment instead of one `.loc` write per cell. Only cells present in both
# objects are touched, exactly like the former per-cell membership check.
shared_cells = mdata_subset2.obs_names.intersection(mdata_subset.obs_names)
mdata_subset.obs.loc[shared_cells, "celltype_subset"] = (
    mdata_subset2.obs.loc[shared_cells, "celltype_subset2"].to_numpy()
)

In [None]:
# UMAP of the parent subset coloured by the updated annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# split cluster 4

In [None]:
# Keep only the cells of Leiden sub-cluster "4" for a second round of clustering.
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'] == "4", :]

In [None]:
# remove unused categories
mdata_subset2.obs['leiden_wnn_subcluster'] = (
    mdata_subset2.obs['leiden_wnn_subcluster']
    .cat.remove_unused_categories()
)

In [None]:
# Show where the selected sub-cluster sits on the UMAP.
sc.pl.umap(
    mdata_subset2,
    color=['leiden_wnn_subcluster'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset2[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset2, key_added='wnn')
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset2.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.5 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset2.obs['subset2_leiden_r0.5'].copy()
mdata_subset2.obs['leiden_wnn_subcluster2'] = chosen_labels
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]

In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset2,
    color=["leiden_wnn_subcluster2", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=1,
)

In [None]:
# Sub-clusters together with the protein (*_TotalSeqC) and RNA features inspected for this split.
mu.pl.embedding(
    mdata_subset2,
    basis="X_umap",
    color=[
        "leiden_wnn_subcluster2",
        "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD103_TotalSeqC",
        "CD8A", "CD4",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Only sub-cluster "2" is labelled CD4+; every other sub-cluster is labelled CD8+.
new_cluster_names = {
    "0": "Tcell_CD8+",
    "1": "Tcell_CD8+",
    "2": "Tcell_CD4+",
    "3": "Tcell_CD8+",
}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical.
mdata_subset2.obs['celltype_subset2'] = (
    mdata_subset2.obs['leiden_wnn_subcluster2']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Copy the second-level labels back onto the parent subset in one label-aligned
# assignment instead of one `.loc` write per cell. Only cells present in both
# objects are touched, exactly like the former per-cell membership check.
shared_cells = mdata_subset2.obs_names.intersection(mdata_subset.obs_names)
mdata_subset.obs.loc[shared_cells, "celltype_subset"] = (
    mdata_subset2.obs.loc[shared_cells, "celltype_subset2"].to_numpy()
)

In [None]:
# UMAP of the parent subset coloured by the updated annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# split cluster 6

In [None]:
# Keep only the cells of Leiden sub-cluster "6" for a second round of clustering.
mdata_subset2 = mdata_subset[mdata_subset.obs['leiden_wnn_subcluster'] == "6", :]

In [None]:
# remove unused categories
mdata_subset2.obs['leiden_wnn_subcluster'] = (
    mdata_subset2.obs['leiden_wnn_subcluster']
    .cat.remove_unused_categories()
)

In [None]:
# Show where the selected sub-cluster sits on the UMAP.
sc.pl.umap(
    mdata_subset2,
    color=['leiden_wnn_subcluster'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset2[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset2, key_added='wnn')
mu.tl.umap(mdata_subset2, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset2['rna'].obsm['X_harmony'], mdata_subset2['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7):
    key = f'subset2_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset2, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset2.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset2, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset2['rna'].obs[key] = mdata_subset2.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset2['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset2['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.3 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset2.obs['subset2_leiden_r0.3'].copy()
mdata_subset2.obs['leiden_wnn_subcluster2'] = chosen_labels
mdata_subset2["rna"].obs["leiden_wnn_subcluster2"] = mdata_subset2.obs["leiden_wnn_subcluster2"]

In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset2,
    color=["leiden_wnn_subcluster2", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=1,
)

In [None]:
# Sub-clusters together with the protein (*_TotalSeqC) and RNA features inspected for this split.
mu.pl.embedding(
    mdata_subset2,
    basis="X_umap",
    color=[
        "leiden_wnn_subcluster2",
        "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD103_TotalSeqC",
        "CD8A", "CD4",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Sub-cluster "0" is labelled CD4+, sub-cluster "1" CD8+.
new_cluster_names = {"0": "Tcell_CD4+", "1": "Tcell_CD8+"}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical.
mdata_subset2.obs['celltype_subset2'] = (
    mdata_subset2.obs['leiden_wnn_subcluster2']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Copy the second-level labels back onto the parent subset in one label-aligned
# assignment instead of one `.loc` write per cell. Only cells present in both
# objects are touched, exactly like the former per-cell membership check.
shared_cells = mdata_subset2.obs_names.intersection(mdata_subset.obs_names)
mdata_subset.obs.loc[shared_cells, "celltype_subset"] = (
    mdata_subset2.obs.loc[shared_cells, "celltype_subset2"].to_numpy()
)

In [None]:
# UMAP of the parent subset coloured by the updated annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Names for the first-level clusters that were not split further.
cluster_name_mapping = {
    '0': 'Tcell_CD4+',
    '1': 'Tcell_CD8+',
    "5": "Tcell_CD4+",
    "7": "Tcell_CD8+"
}

# rename clusters
# The column is converted to strings for the renaming and then stored as a categorical
# again, matching the other annotation columns that are built with astype("category").
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['celltype_subset']
    .astype(str)
    .replace(cluster_name_mapping)
    .astype("category")
)

In [None]:
# UMAP of the parent subset with the final cluster names.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Annotation next to the CD4 and CD8a protein signals and IL7R.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=["celltype_subset", "CD4_TotalSeqC", "CD8a_TotalSeqC", "IL7R"],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Write the subset annotation into the full object in one label-aligned assignment
# instead of one `.loc` write per cell. Only cells present in both objects are touched,
# exactly like the former per-cell membership check.
shared_cells = mdata_subset.obs_names.intersection(mdata.obs_names)
mdata.obs.loc[shared_cells, "celltype_hires"] = (
    mdata_subset.obs.loc[shared_cells, "celltype_subset"].to_numpy()
)

In [None]:
# Drop categories that are no longer used, then show the updated annotation.
mdata.obs['celltype_hires'] = (
    mdata.obs['celltype_hires'].cat.remove_unused_categories()
)
sc.pl.umap(
    mdata,
    color=['celltype_hires'],
    size=10,
    ncols=1,
)

## subcluster Tcell_CD4+

In [None]:
# Keep only the cells annotated as "Tcell_CD4+".
mdata_subset = mdata[mdata.obs['celltype_hires'] == "Tcell_CD4+", :]

In [None]:
# remove unused categories
mdata_subset.obs['celltype_hires'] = (
    mdata_subset.obs['celltype_hires']
    .cat.remove_unused_categories()
)

In [None]:
# Show the selected cells on the UMAP.
sc.pl.umap(
    mdata_subset,
    color=['celltype_hires'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset, key_added='wnn')
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7, 1.2):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-1.2 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset.obs['subset_leiden_r1.2'].copy()
mdata_subset.obs['leiden_wnn_subcluster'] = chosen_labels
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, one panel per row.
sc.pl.umap(
    mdata_subset,
    color=["leiden_wnn_subcluster", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=1,
)

In [None]:
# Mirror the sub-cluster labels onto the RNA modality and rank genes per sub-cluster (Wilcoxon).
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.rank_genes_groups(mdata_subset["rna"], 'leiden_wnn_subcluster', method='wilcoxon')

result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names

# One "<group>_n" column per group holding its ranked gene names; display the top 20 rows.
pd.set_option('display.max_columns', 100)
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Surface-protein panel shown per sub-cluster.
marker_genes = [
    "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD194_TotalSeqC", "CD20_TotalSeqC",
    "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD16_TotalSeqC", "CD1c_TotalSeqC",
    "CD56_TotalSeqC", "CD14_TotalSeqC", "CD27_TotalSeqC", "TIGIT_TotalSeqC",
    "CD25_TotalSeqC", "CD103_TotalSeqC", "CD62L_TotalSeqC", "CD28_TotalSeqC",
    "CD127_TotalSeqC", "CD366_TotalSeqC", "CD11b_TotalSeqC", "CD314_TotalSeqC",
]

# The protein modality needs its own copy of the grouping column.
mdata_subset["prot"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

# NOTE(review): the dendrogram is computed here, but the dotplot below is drawn with
# dendrogram=False - confirm whether this call is still needed.
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')

# Each marker is scaled across groups (standard_scale="var").
sc.pl.dotplot(
    mdata_subset["prot"],
    marker_genes,
    groupby='leiden_wnn_subcluster',
    standard_scale="var",
    use_raw=False,
    dendrogram=False,
);

In [None]:
# RNA marker panel. dict.fromkeys drops repeated entries (PTPRC was listed twice) while
# keeping first-occurrence order, so no gene is drawn as a duplicate column.
marker_genes = list(dict.fromkeys([
    "IL7R","FOXP3","TIGIT","CTLA4","IL2RA","GNLY","NCAM1","CD3E","PTPRC","DCN","MKI67","SELL","PDCD1",
    "GZMK","CST7","KLRC1","TRGV2","CD4","CD8A","PTPRC","KLRK1","CCR7","KIT","LTB","KLRB1","TRBC1",
]))
# The RNA modality needs its own copy of the grouping column.
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
# NOTE(review): the dendrogram is computed here, but the dotplot below is drawn with
# dendrogram=False - confirm whether this call is still needed.
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
              standard_scale="var");

In [None]:
# RNA and protein (*_TotalSeqC) features inspected for the sub-cluster annotation.
mu.pl.embedding(
    mdata_subset,
    basis="X_umap",
    color=[
        "IL7R", "FOXP3", "TIGIT", "CTLA4", "IL2RA",
        "GNLY", "NCAM1", "CD3E", "PTPRC", "DCN", "MKI67", "SELL",
        "PDCD1", "KLRC1", "CD3_TotalSeqC", "TRGV2", "TRAV1-2",
        "FCGR3A", "CD16_TotalSeqC", "CD8A", "KIT",
        "LTB", "KLRB1", "TRBC1",
    ],
    color_map="YlOrRd",
    ncols=2,
    frameon=False,
)

In [None]:
# Label assigned to each Leiden sub-cluster of the current subset.
new_cluster_names = {
    "0": "Tcell_CD4+tr",
    "1": "Tcell_CD4+blood",
    "2": "Tcell_CD4+tr",
    "3": "Tcell_CD4+tr",
    "4": "Tcell_CD4+tr",
    "5": "Tcell_CD4+cyto",
    "6": "Tcell_CD4+tr",
    "7": "Tcell_CD4+cyto",
    "8": "Tcell_CD4+prol",
    "9": "Tcell_CD4+exh",
    "10": "Tcell_reg",
    "11": "Tcell_CD4+cyto",
    "12": "Tcell_NK_doublet",
    "13": "Tcell_gd",
    "14": "Tcell_CD4+tr",
}

# Stringify the sub-cluster ids, rename them, and store the result as a categorical;
# ids without an entry in the mapping keep their numeric label.
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['leiden_wnn_subcluster']
    .astype(str)
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# UMAP of the subset coloured by the new labels.
sc.pl.umap(
    mdata_subset,
    color=["celltype_subset"],
    size=10,
    ncols=1,
)

In [None]:
# update categories
# Labels that must be registered on 'celltype_hires' before they can be written.
# add_categories raises ValueError for labels that already exist, so only the
# missing ones are added; this keeps the cell safe to re-run.
new_labels = [
    "Tcell_CD4+cyto",
    "Tcell_CD4+exh",
    "Tcell_CD4+prol",
    "Tcell_CD4+tr",
    "Tcell_CD4+blood",
    "Tcell_NK_doublet",
    "Tcell_gd",
    "Tcell_reg",
]
known_labels = mdata.obs["celltype_hires"].cat.categories
missing_labels = [label for label in new_labels if label not in known_labels]
if missing_labels:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_labels)

# Write the subset annotation into the full object in one label-aligned assignment
# instead of one `.loc` write per cell. Only cells present in both objects are touched.
shared_cells = mdata_subset.obs_names.intersection(mdata.obs_names)
mdata.obs.loc[shared_cells, "celltype_hires"] = (
    mdata_subset.obs.loc[shared_cells, "celltype_subset"].to_numpy()
)

In [None]:
# Full data set: high-resolution annotation and tissue, one panel per row.
sc.pl.umap(
    mdata,
    color=['celltype_hires', "tissue"],
    size=10,
    ncols=1,
)

## subcluster Tcell_CD8+

In [None]:
# Keep only the cells annotated as "Tcell_CD8+".
mdata_subset = mdata[mdata.obs['celltype_hires'] == "Tcell_CD8+", :]

In [None]:
# remove unused categories
mdata_subset.obs['celltype_hires'] = (
    mdata_subset.obs['celltype_hires']
    .cat.remove_unused_categories()
)

In [None]:
# Show the selected cells on the UMAP.
sc.pl.umap(
    mdata_subset,
    color=['celltype_hires'],
    size=10,
    ncols=1,
)

In [None]:
# recalculate nn
# Per-modality neighbour graphs on the Harmony representation (RNA first, then protein).
for modality in ('rna', 'prot'):
    sc.pp.neighbors(mdata_subset[modality], use_rep="X_harmony")

# Joint graph stored under the 'wnn' key, followed by a UMAP computed on that graph.
mu.pp.neighbors(mdata_subset, key_added='wnn')
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Joint matrix of the Harmony embeddings of both modalities; only used for the silhouette score.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' graph; for each one report the cluster count,
# the cosine silhouette, a UMAP and the top RNA markers.
for r in (0.3, 0.5, 0.7, 1.2):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError with fewer than two labels; report NaN for such a
    # resolution so the remaining resolutions are still evaluated.
    s = silhouette_score(X, y, metric='cosine') if n_clusters > 1 else float('nan')
    print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    if n_clusters < 2:
        # A single group has no other group to be ranked against.
        continue

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Adopt the resolution-0.7 labels as the working sub-clustering and mirror them onto the RNA modality.
chosen_labels = mdata_subset.obs['subset_leiden_r0.7'].copy()
mdata_subset.obs['leiden_wnn_subcluster'] = chosen_labels
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Sub-clusters next to tissue, mitochondrial fraction and donor, two panels per row.
sc.pl.umap(
    mdata_subset,
    color=["leiden_wnn_subcluster", "tissue", "rna:pct_counts_mt", "donor_id"],
    size=10,
    ncols=2,
)

In [None]:
# Rank genes per sub-cluster on the RNA modality (Wilcoxon).
sc.tl.rank_genes_groups(mdata_subset["rna"], 'leiden_wnn_subcluster', method='wilcoxon')

result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names

# One "<group>_n" column per group holding its ranked gene names; display the top 20 rows.
pd.set_option('display.max_columns', 100)
pd.DataFrame({f"{group}_n": result['names'][group] for group in groups}).head(20)

In [None]:
# Surface-protein panel shown per sub-cluster.
marker_genes = [
    "CD4_TotalSeqC", "CD8a_TotalSeqC", "CD3_TotalSeqC", "CD194_TotalSeqC", "CD20_TotalSeqC",
    "CD45RA_TotalSeqC", "CD45RO_TotalSeqC", "CD16_TotalSeqC", "CD1c_TotalSeqC",
    "CD56_TotalSeqC", "CD14_TotalSeqC", "CD27_TotalSeqC", "TIGIT_TotalSeqC",
    "CD25_TotalSeqC", "CD103_TotalSeqC", "CD62L_TotalSeqC", "CD28_TotalSeqC",
    "CD127_TotalSeqC", "CD366_TotalSeqC", "CD11b_TotalSeqC", "CD314_TotalSeqC",
]

# The protein modality needs its own copy of the grouping column.
mdata_subset["prot"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

# NOTE(review): the dendrogram is computed here, but the dotplot below is drawn with
# dendrogram=False - confirm whether this call is still needed.
sc.tl.dendrogram(mdata_subset["prot"], 'leiden_wnn_subcluster')

# Each marker is scaled across groups (standard_scale="var").
sc.pl.dotplot(
    mdata_subset["prot"],
    marker_genes,
    groupby='leiden_wnn_subcluster',
    standard_scale="var",
    use_raw=False,
    dendrogram=False,
);

In [None]:
# RNA marker panel shown per sub-cluster on the `rna` modality (values read from `.raw`).
marker_genes = ["IL7R","FOXP3","TIGIT","CTLA4","IL2RA","GNLY","NCAM1","CD3E","PTPRC","DCN","MKI67","SELL","PDCD1",
               "GZMK","CST7","KLRC1","TRGV2","CD4","CD8A","PTPRC","TRAV1-2","KLRB1","ITGAE","FGFBP2","NKG7","GZMB","GZMH"
               ,"CD7","KLRC2","TYROBP","KLRC3","NCR1","NCR3","LEF1","PABPC1","LTB","IFIT1","IFIT3","IFNG","CD28",
               "MCTP2","PRF1","NKG7","XCL1","ENTPD1","ITGAX","CDHR1","ITGAE","ITGA4"]
# PTPRC, NKG7 and ITGAE are listed twice above; keep the first occurrence of each
# gene (order preserved) so every marker is plotted exactly once.
marker_genes = list(dict.fromkeys(marker_genes))
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False, use_raw=True,
             standard_scale="var"
             );

# due to expression of ITGAX and CDHR1 cluster 7 is most likely a NK doublet instead of an NKTcell
# NOTE(review): the relabelling dictionary in this section assigns "Tcell_NK_doublet" to
# cluster "4" and "Tcell_CD8+tr" to cluster "7" - confirm which cluster id this remark refers to.

In [None]:
# Feature plots on the subset UMAP: one panel per listed RNA gene or TotalSeqC protein.
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["IL7R","FOXP3","TIGIT","CTLA4","IL2RA",
                                                                    "GNLY","NCAM1","CD3E","PTPRC","DCN","MKI67","SELL",
                                                                   "PDCD1","KLRC1","CD3_TotalSeqC","TRGV2","TRAV1-2",
                                                                   "KLRB1","CD103_TotalSeqC","CD39_TotalSeqC","CD56_TotalSeqC"],
                ncols=2, color_map="YlOrRd")

In [None]:
# Leiden cluster id -> cell-type label for this subset; several ids share one label.
new_cluster_names = {
    "0": "Tcell_CD8+tr",
    "1": "Tcell_CD8+tr",
    "2": "Tcell_CD8+eff",
    "3": "Tcell_CD8+tr",
    "4": "Tcell_NK_doublet",
    "5": "MAITcell",
    "6": "Tcell_CD8+prol",
    "7": "Tcell_CD8+tr",
    "8": "Tcell_CD8+blood",
    "9": "Tcell_CD8+tr",
}

# Ids without an entry in the mapping are left unchanged by `replace`.
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['leiden_wnn_subcluster']
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the assigned labels on the subset UMAP.
sc.pl.umap(mdata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# update categories
# Only labels that are not categories yet are added: `cat.add_categories` raises
# ValueError for an existing category, which made this cell fail when it was re-run.
new_categories = ["MAITcell", "Tcell_CD8+tr", "Tcell_CD8+eff", "Tcell_CD8+prol", "Tcell_CD8+blood"]
known_categories = mdata.obs["celltype_hires"].cat.categories
missing_categories = [label for label in new_categories if label not in known_categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# Copy the subset labels back with one vectorised assignment instead of one `.loc`
# write per cell. Barcodes that are absent from `mdata` are skipped, as before.
# Assumes obs names are unique - TODO confirm.
in_full = mdata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[mdata_subset.obs_names[in_full], "celltype_hires"] = (
    mdata_subset.obs["celltype_subset"].astype(str).to_numpy()[in_full]
)

In [None]:
# Full-dataset UMAP with the updated high-resolution labels, next to tissue.
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## Endothelial cells

In [None]:
# Restrict to the cells whose `celltype_wnn` label is "EndoCell".
mdata_subset = mdata[mdata.obs['celltype_wnn'].isin(["EndoCell"]),:]

In [None]:
# remove unused categories
# (after subsetting, drop the `celltype_wnn` levels that no longer occur in the subset)
mdata_subset.obs['celltype_wnn'] = mdata_subset.obs['celltype_wnn'].cat.remove_unused_categories()

In [None]:
# UMAP of the selected cells before re-embedding, coloured by `celltype_wnn`.
sc.pl.umap(mdata_subset, color=['celltype_wnn'], ncols=1, size=10)

In [None]:
# This section works on the RNA modality of the subset only.
adata_subset=mdata_subset["rna"]

In [None]:
# recalculate nn
# Neighbour graph from the RNA `X_harmony` embedding (stored under scanpy's default key),
# followed by a UMAP of the subset with a fixed seed.
sc.pp.neighbors(adata_subset, use_rep="X_harmony")

sc.tl.umap(adata_subset, random_state=10)

In [None]:
# `X_harmony` embedding of the RNA modality; used only for the silhouette score.
X = mdata_subset['rna'].obsm['X_harmony']

for r in (0.3, 0.5, 0.7):
    key = f'subset_leiden_r{r:.1f}'
    # Cluster on the graph built by `sc.pp.neighbors(adata_subset, use_rep="X_harmony")`,
    # which is stored under scanpy's default key. The previous `neighbors_key='wnn'`
    # requested a graph that this section never computes for the RNA-only subset.
    sc.tl.leiden(adata_subset, resolution=r, key_added=key, random_state=0)

    y = adata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1;
    # guard it so a degenerate partition does not abort the whole sweep.
    scorable = 2 <= n_clusters < y.size
    if scorable:
        s = silhouette_score(X, y, metric='cosine')
        print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")
    else:
        print(f"{key}: n_clusters={n_clusters}  silhouette=n/a")

    # UMAP
    sc.pl.umap(adata_subset, color=key, legend_loc='on data')

    if not scorable:
        # A single cluster leaves nothing to contrast in a marker test.
        continue

    # RNA markers
    adata_subset.obs[key] = adata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(adata_subset, groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(adata_subset, n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Keep the resolution-0.7 partition as the working sub-clustering; the column keeps the
# `leiden_wnn_subcluster` name used by the other sections of this notebook.
adata_subset.obs['leiden_wnn_subcluster'] = adata_subset.obs['subset_leiden_r0.7'].copy()

In [None]:
# Subset UMAP coloured by labels, sample covariates and the listed marker genes.
sc.pl.umap(adata_subset, color=['celltype_wnn',"leiden_wnn_subcluster","tissue","pct_counts_mt",
                               "donor_id","VWF","DCN","IL33","PTPRC","CD3E","NCAM1",
                               "SERPINF1","CST7","ZFP36", "EFNB2", "ADGRF5", "SOX17", "ICAM2", "SLC9A3R2",
                               "SELE", "ICAM1", "PECAM1", "SPARC", "ACKR1", "SELP"], ncols=2, size=20)

In [None]:
# Wilcoxon marker ranking per sub-cluster, then show the first 20 rows of the
# ranked gene names with one `<cluster>_n` column per cluster.
sc.tl.rank_genes_groups(adata_subset, groupby='leiden_wnn_subcluster', method='wilcoxon')
result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
ranked_names = pd.DataFrame({f"{group}_n": result['names'][group] for group in groups})
ranked_names.head(20)

In [None]:
# RNA marker panel shown per sub-cluster (values read from `.raw`).
marker_genes = ["PECAM1","ACKR1","VWF","DCN","IL33","PTPRC","CD3E","NCAM1",
                               "SERPINF1","CST7","ZFP36", "EFNB2", "ADGRF5", "SOX17", "ICAM2", "SLC9A3R2",
                               "SELE", "ICAM1", "PECAM1", "SPARC", "ACKR1", "SELP","IGFBP7","PDE3A","NR2F2","CCL21"]
# PECAM1 and ACKR1 are listed twice above; keep the first occurrence of each
# gene (order preserved) so every marker is plotted exactly once.
marker_genes = list(dict.fromkeys(marker_genes))
sc.tl.dendrogram(adata_subset, 'leiden_wnn_subcluster')
sc.pl.dotplot(adata_subset, marker_genes, groupby='leiden_wnn_subcluster', dendrogram=False,
              use_raw=True, standard_scale="var");

In [None]:
# Leiden cluster id -> cell-type label for this subset; several ids share one label.
new_cluster_names = {
    "0": "ENDO",
    "1": "ENDO",
    "2": "ENDO",
    "3": "ENDO_Immune_doublet",
    "4": "ENDO",
    "5": "ENDO",
    "6": "ENDO_FIB_doublet",
    "7": "ENDO",
}

# Ids without an entry in the mapping are left unchanged by `replace`.
adata_subset.obs['celltype_subset'] = (
    adata_subset.obs['leiden_wnn_subcluster']
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the assigned labels on the subset UMAP.
sc.pl.umap(adata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# update categories
# Only labels that are not categories yet are added: `cat.add_categories` raises
# ValueError for an existing category, which made this cell fail when it was re-run.
new_categories = ["ENDO", "ENDO_FIB_doublet", "ENDO_Immune_doublet"]
known_categories = mdata.obs["celltype_hires"].cat.categories
missing_categories = [label for label in new_categories if label not in known_categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# Copy the subset labels back with one vectorised assignment instead of one `.loc`
# write per cell. Barcodes that are absent from `mdata` are skipped, as before.
# Assumes obs names are unique - TODO confirm.
in_full = adata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[adata_subset.obs_names[in_full], "celltype_hires"] = (
    adata_subset.obs["celltype_subset"].astype(str).to_numpy()[in_full]
)

In [None]:
# Drop labels that no cell carries any more, then redraw the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster plasma cells

In [None]:
# Restrict to the cells whose `celltype_hires` label is "PLASMA".
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["PLASMA"]),:]

In [None]:
# remove unused categories
# (after subsetting, drop the `celltype_hires` levels that no longer occur in the subset)
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# UMAP of the selected cells before re-embedding, coloured by `celltype_hires`.
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
# Per-modality neighbour graphs from each modality's `X_harmony` embedding ...
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")

# ... then the multimodal neighbour graph, stored under the key 'wnn' ...
mu.pp.neighbors(mdata_subset, key_added='wnn')

# ... and a UMAP of the subset computed from that graph with a fixed seed.
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Column-wise stack of the RNA and protein `X_harmony` embeddings; used only for the silhouette score.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' neighbour graph. Each partition is kept in
# obs under its own `subset_leiden_r<resolution>` column so one can be picked afterwards.
for r in (0.3, 0.5, 0.7, 1.2):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1;
    # guard it so a degenerate partition does not abort the whole sweep.
    scorable = 2 <= n_clusters < y.size
    if scorable:
        s = silhouette_score(X, y, metric='cosine')
        print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")
    else:
        print(f"{key}: n_clusters={n_clusters}  silhouette=n/a")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    if not scorable:
        # A single cluster leaves nothing to contrast in a marker test.
        continue

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Keep the resolution-0.3 partition as the working sub-clustering and mirror it onto the RNA modality.
chosen_partition = mdata_subset.obs['subset_leiden_r0.3'].copy()
mdata_subset.obs['leiden_wnn_subcluster'] = chosen_partition
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Subset UMAP coloured by the chosen sub-clusters and by tissue, mitochondrial
# read percentage and donor, to check whether clusters follow those covariates.
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=2, size=10)

In [None]:
# Make sure the RNA modality carries the cluster labels, rank markers per sub-cluster
# (Wilcoxon) and show the first 20 rows of the ranked gene names, one `<cluster>_n`
# column per cluster.
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.rank_genes_groups(mdata_subset["rna"], groupby='leiden_wnn_subcluster', method='wilcoxon')
result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
ranked_names = pd.DataFrame({f"{group}_n": result['names'][group] for group in groups})
ranked_names.head(20)

In [None]:
# RNA marker panel shown per sub-cluster on the `rna` modality (values read from `.raw`).
marker_genes = ["JCHAIN","THY1","DCN","NCAM1","LTB","APOD","COTL1","CTSH","GSN","CD14","NCAM1","CD74","CD19","ITGAX",
                "GZMB","IL3RA","XCR1","TCF4","CD27","SDC1","SLAMF7","IL12A","CSF2","TNF","FLT3","CD80","CD86","CD83",
               "CCR7","CD4","HLA-DRA","IL3RA","CLEC4C","TLR7","TLR9","GAS5","MS4A1"]
# NCAM1 and IL3RA are listed twice above; keep the first occurrence of each
# gene (order preserved) so every marker is plotted exactly once.
marker_genes = list(dict.fromkeys(marker_genes))
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
# The dendrogram computed here is the one drawn by the dot plot (dendrogram=True).
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=True, use_raw=True,
             standard_scale="var");

In [None]:
# Feature plots on the subset UMAP: one panel per listed RNA gene or TotalSeqC protein.
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD14_TotalSeqC","CD1c_TotalSeqC","CD56_TotalSeqC",
                                                                   "CD45_TotalSeqC","CD11c_TotalSeqC","IL1B",
                                                                   "CD3_TotalSeqC","CD4_TotalSeqC","CD8a_TotalSeqC",
                                                                   "NCAM1","THY1","DCN","LTB","JCHAIN","CD20_TotalSeqC",
                                                                   "CD14","CD74","CD27_TotalSeqC","CD38_TotalSeqC",
                                                                   "CD39_TotalSeqC","CD19","MS4A1","CD27","HLA-DR_TotalSeqC",
                                                                   "PECAM1","VWF"],
                ncols=2, color_map="YlOrRd")

In [None]:
# Leiden cluster id -> cell-type label for this subset.
new_cluster_names = {
    "0": "PLASMA",
    "1": "PLASMA",
    "2": "BCell",
    "3": "pDC_FIB_doublet",
    "4": "pDC_MAC_doublet",
}

# Ids without an entry in the mapping are left unchanged by `replace`.
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['leiden_wnn_subcluster']
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the assigned labels on the subset UMAP.
sc.pl.umap(mdata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# update categories
# Only labels that are not categories yet are added: `cat.add_categories` raises
# ValueError for an existing category, which made this cell fail when it was re-run.
new_categories = ["pDC_MAC_doublet", "pDC_FIB_doublet"]
known_categories = mdata.obs["celltype_hires"].cat.categories
missing_categories = [label for label in new_categories if label not in known_categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# Copy the subset labels back with one vectorised assignment instead of one `.loc`
# write per cell. Barcodes that are absent from `mdata` are skipped, as before.
# Assumes obs names are unique - TODO confirm.
in_full = mdata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[mdata_subset.obs_names[in_full], "celltype_hires"] = (
    mdata_subset.obs["celltype_subset"].astype(str).to_numpy()[in_full]
)

In [None]:
# Drop labels that no cell carries any more, then redraw the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

## subcluster decPAM2

In [None]:
# Restrict to the cells whose `celltype_hires` label is "decPAM2".
mdata_subset = mdata[mdata.obs['celltype_hires'].isin(["decPAM2"]),:]

In [None]:
# remove unused categories
# (after subsetting, drop the `celltype_hires` levels that no longer occur in the subset)
mdata_subset.obs['celltype_hires'] = mdata_subset.obs['celltype_hires'].cat.remove_unused_categories()

In [None]:
# UMAP of the selected cells before re-embedding, coloured by `celltype_hires`.
sc.pl.umap(mdata_subset, color=['celltype_hires'], ncols=1, size=10)

In [None]:
# recalculate nn
# Per-modality neighbour graphs from each modality's `X_harmony` embedding ...
sc.pp.neighbors(mdata_subset['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata_subset['prot'], use_rep="X_harmony")


# ... then the multimodal neighbour graph, stored under the key 'wnn' ...
mu.pp.neighbors(mdata_subset, key_added='wnn')

# ... and a UMAP of the subset computed from that graph with a fixed seed.
mu.tl.umap(mdata_subset, neighbors_key='wnn', random_state=10)

In [None]:
# Column-wise stack of the RNA and protein `X_harmony` embeddings; used only for the silhouette score.
X = np.hstack([mdata_subset['rna'].obsm['X_harmony'], mdata_subset['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the 'wnn' neighbour graph. Each partition is kept in
# obs under its own `subset_leiden_r<resolution>` column so one can be picked afterwards.
for r in (0.3, 0.5, 0.7, 1.2):
    key = f'subset_leiden_r{r:.1f}'
    sc.tl.leiden(mdata_subset, neighbors_key='wnn', resolution=r, key_added=key, random_state=0)

    y = mdata_subset.obs[key].to_numpy()
    n_clusters = np.unique(y).size
    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1;
    # guard it so a degenerate partition does not abort the whole sweep.
    scorable = 2 <= n_clusters < y.size
    if scorable:
        s = silhouette_score(X, y, metric='cosine')
        print(f"{key}: n_clusters={n_clusters}  silhouette={s:.4f}")
    else:
        print(f"{key}: n_clusters={n_clusters}  silhouette=n/a")

    # UMAP
    sc.pl.umap(mdata_subset, color=key, legend_loc='on data')

    if not scorable:
        # A single cluster leaves nothing to contrast in a marker test.
        continue

    # RNA markers
    mdata_subset['rna'].obs[key] = mdata_subset.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata_subset['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(mdata_subset['rna'], n_genes=5, standard_scale='var',
                                    swap_axes=True, dendrogram=False)

In [None]:
# Keep the resolution-0.3 partition as the working sub-clustering and mirror it onto the RNA modality.
chosen_partition = mdata_subset.obs['subset_leiden_r0.3'].copy()
mdata_subset.obs['leiden_wnn_subcluster'] = chosen_partition
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]

In [None]:
# Subset UMAP coloured by the chosen sub-clusters and by tissue, mitochondrial
# read percentage and donor, to check whether clusters follow those covariates.
sc.pl.umap(mdata_subset, color=["leiden_wnn_subcluster","tissue","rna:pct_counts_mt",
                               "donor_id"], ncols=2, size=10)

In [None]:
# Make sure the RNA modality carries the cluster labels, rank markers per sub-cluster
# (Wilcoxon) and show the first 20 rows of the ranked gene names, one `<cluster>_n`
# column per cluster.
mdata_subset["rna"].obs["leiden_wnn_subcluster"] = mdata_subset.obs["leiden_wnn_subcluster"]
sc.tl.rank_genes_groups(mdata_subset["rna"], groupby='leiden_wnn_subcluster', method='wilcoxon')
result = mdata_subset["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
ranked_names = pd.DataFrame({f"{group}_n": result['names'][group] for group in groups})
ranked_names.head(20)

In [None]:
# RNA marker panel shown per sub-cluster on the `rna` modality (values read from `.raw`).
marker_genes = ["JCHAIN","THY1","DCN","NCAM1","LTB","APOD","COTL1","CTSH","GSN","CD14","NCAM1","CD74","CD19","ITGAX",
                "GZMB","IL3RA","XCR1","TCF4","CD27","SDC1","SLAMF7","IL12A","CSF2","TNF","FLT3","CD80","CD86","CD83",
               "CCR7","CD4","HLA-DRA","IL3RA","CLEC4C","TLR7","TLR9","GAS5","MS4A1"]
# NCAM1 and IL3RA are listed twice above; keep the first occurrence of each
# gene (order preserved) so every marker is plotted exactly once.
marker_genes = list(dict.fromkeys(marker_genes))
mdata_subset["rna"].obs["leiden_wnn_subcluster"]=mdata_subset.obs["leiden_wnn_subcluster"]
# The dendrogram computed here is the one drawn by the dot plot (dendrogram=True).
sc.tl.dendrogram(mdata_subset["rna"], 'leiden_wnn_subcluster')
sc.pl.dotplot(mdata_subset["rna"], marker_genes, groupby='leiden_wnn_subcluster', dendrogram=True, use_raw=True,
             standard_scale="var");

In [None]:
# Feature plots on the subset UMAP: TotalSeqC proteins, RNA genes and the donor covariate.
mu.pl.embedding(mdata_subset, basis="X_umap", frameon=False, color=["CD14_TotalSeqC","CD1c_TotalSeqC","CD16_TotalSeqC",
                                                                   "WARS","SELL","CD69_TotalSeqC", "donor_id","CD14"],
                ncols=2, color_map="YlOrRd")

In [None]:
# Leiden cluster id -> cell-type label for this subset.
new_cluster_names = {
    "0": "decPAM2",
    "1": "Mono_CD16+",
}

# Ids without an entry in the mapping are left unchanged by `replace`.
mdata_subset.obs['celltype_subset'] = (
    mdata_subset.obs['leiden_wnn_subcluster']
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Visual check of the assigned labels on the subset UMAP.
sc.pl.umap(mdata_subset, color=["celltype_subset"], ncols=1, size=10)

In [None]:
# update categories
# Only labels that are not categories yet are added: `cat.add_categories` raises
# ValueError for an existing category, which made this cell fail when it was re-run.
new_categories = ["Mono_CD16+"]
known_categories = mdata.obs["celltype_hires"].cat.categories
missing_categories = [label for label in new_categories if label not in known_categories]
if missing_categories:
    mdata.obs["celltype_hires"] = mdata.obs["celltype_hires"].cat.add_categories(missing_categories)

# Copy the subset labels back with one vectorised assignment instead of one `.loc`
# write per cell. Barcodes that are absent from `mdata` are skipped, as before.
# Assumes obs names are unique - TODO confirm.
in_full = mdata_subset.obs_names.isin(mdata.obs.index)
mdata.obs.loc[mdata_subset.obs_names[in_full], "celltype_hires"] = (
    mdata_subset.obs["celltype_subset"].astype(str).to_numpy()[in_full]
)

In [None]:
# Drop labels that no cell carries any more, then redraw the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue"], ncols=1, size=10)

In [None]:
# Dot plot of lineage markers per `celltype_hires` label, used to check the doublet annotations

In [None]:
# RNA marker panel shown per `celltype_hires` label on the full dataset (values read from `.raw`).
marker_genes = ["CD19","PECAM1","VWF","TPSAB1","KIT","CCL21","CD1C","CD14","PTPRC","DCN","THY1","NCAM1","CD3E","LTB","CD74","XCR1","NOTUM",
               "PRL","UTY","NOTCH3","CD82","KRT8","HLA-G","CGA","FCGR3A","ENTPD1","ITGAX","CD160","ITGAE","CD3E","CD4","CD8A",
               "ITGAX","JCHAIN"]
# CD3E and ITGAX are listed twice above; keep the first occurrence of each
# gene (order preserved) so every marker is plotted exactly once.
marker_genes = list(dict.fromkeys(marker_genes))
mdata["rna"].obs["celltype_hires"]=mdata.obs["celltype_hires"]
# Recompute the dendrogram for the labels as they are now, like the other
# dendrogram=True cells in this notebook do, instead of relying on whatever
# dendrogram may already be stored for this key.
sc.tl.dendrogram(mdata["rna"], 'celltype_hires')
sc.pl.dotplot(mdata["rna"], marker_genes, groupby='celltype_hires', dendrogram=True, use_raw=True, standard_scale="var");

In [None]:
# Full-dataset UMAP by label, tissue and the RNA modality's `phase` column, before doublets are removed.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue", "rna:phase"], ncols=1, size=3)

In [None]:
# remove doublets
# Drop every cell whose label contains the literal text "doublet". Casting to str first
# keeps the mask strictly boolean: with a missing label, `.str.contains` would return
# NaN and the `~` negation would fail. Cells without a label are kept.
is_doublet = mdata.obs['celltype_hires'].astype(str).str.contains("doublet", regex=False)
mdata = mdata[~is_doublet].copy()

In [None]:
# recalculate nn
# The cell set changed, so rebuild the per-modality graphs from `X_harmony` ...
sc.pp.neighbors(mdata['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata['prot'], use_rep="X_harmony")


# ... and the multimodal neighbour graph, stored under the key 'wnn'.
mu.pp.neighbors(mdata, key_added='wnn')

In [None]:
# Three embeddings with fixed seeds: one from the 'wnn' graph on the MuData object,
# and one per modality from that modality's own neighbour graph.
mu.tl.umap(mdata, neighbors_key='wnn', random_state=12)
mu.tl.umap(mdata['rna'], random_state=10)
mu.tl.umap(mdata['prot'], random_state=10)

In [None]:
# Drop the now-empty doublet labels and redraw the full-dataset UMAP.
mdata.obs['celltype_hires'] = mdata.obs['celltype_hires'].cat.remove_unused_categories()
sc.pl.umap(mdata, color=['celltype_hires', "tissue", "rna:phase"], ncols=1, size=3)

In [None]:
# Same labels on the single-modality UMAPs. The RNA modality already has a
# `celltype_hires` column from the earlier marker dot-plot cell; the protein
# modality gets its copy here.
sc.pl.umap(mdata["rna"], color=['celltype_hires'], ncols=1, size=3)
mdata["prot"].obs["celltype_hires"] = mdata.obs["celltype_hires"]
sc.pl.umap(mdata["prot"], color=['celltype_hires'], ncols=1, size=3)

In [None]:
# UMAP on the MuData object coloured by the RNA modality's `phase` column.
sc.pl.umap(mdata, color=[ "rna:phase"])

In [None]:
# Set the stored log1p base to None (as done right after loading the data), rank RNA
# markers per `celltype_hires` label (Wilcoxon) and show the first 20 rows of the
# ranked gene names, one `<label>_n` column per label.
mdata["rna"].uns['log1p']["base"] = None
sc.tl.rank_genes_groups(mdata["rna"], groupby='celltype_hires', method='wilcoxon')
result = mdata["rna"].uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
ranked_names = pd.DataFrame({f"{group}_n": result['names'][group] for group in groups})
ranked_names.head(20)

In [None]:
# Dot plot of the 10 first-ranked RNA markers per label, ordered by a freshly computed dendrogram.
sc.tl.dendrogram(mdata["rna"], 'celltype_hires')
sc.pl.rank_genes_groups_dotplot(mdata["rna"], n_genes=10, dendrogram=True,standard_scale="var",
                                swap_axes=True)

In [None]:
# Copy the labels to the protein modality, rank protein markers per label (Wilcoxon)
# and show the first 10 rows of the ranked names, one `<label>_n` column per label.
mdata["prot"].obs["celltype_hires"] = mdata.obs["celltype_hires"]
sc.tl.rank_genes_groups(mdata["prot"], groupby='celltype_hires', method='wilcoxon')
result = mdata["prot"].uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.set_option('display.max_columns', 100)
ranked_names = pd.DataFrame({f"{group}_n": result['names'][group] for group in groups})
ranked_names.head(10)

In [None]:
# Dot plot of the 10 first-ranked protein markers per label, ordered by a freshly computed dendrogram.
sc.tl.dendrogram(mdata["prot"], 'celltype_hires')
sc.pl.rank_genes_groups_dotplot(mdata["prot"], n_genes=10, dendrogram=True,standard_scale="var",
                                swap_axes=True)

In [None]:
# Distribution of `rna:mod_weight` per label (violins only, size=0 hides the points).
# NOTE(review): presumably the RNA modality weight written by `mu.pp.neighbors` - confirm.
sc.pl.violin(mdata, groupby='celltype_hires', keys='rna:mod_weight', size=0, rotation=90)

In [None]:
# Low-resolution lineage -> high-resolution labels that collapse into it.
# Labels that are not listed keep their high-resolution name in `celltype_lores`.
lineage_members = {
    "MYELO": ["DC1", "DC2", "DC1_prol", "DC2_prol", "decBAM2", "decBAM1", "decPAM1",
              "monoMAC", "Mono_CD16+", "decPAM2", "MAC_prol"],
    "MUR": ["SMC"],
    # NOTE(review): the original dict literal listed "PERI" twice, first as "MUR" and
    # later as "FIB". Python keeps the last value, so PERI has effectively been mapped
    # to "FIB"; that behaviour is preserved here. Confirm whether "MUR" was intended.
    "FIB": ["decFIB", "MURAL", "hpFib", "PERI"],
    "TROPHO": ["vCTB", "EVT", "CCT", "STB"],
    "NK": ["NK_CD16+", "NK_CD39+", "NK_CD39-", "NK_CD39-CD103+", "NK_prol"],
    "Tcell": ["Tcell_CD4+cyto", "Tcell_CD4+exh", "Tcell_CD4+prol", "Tcell_CD4+tr",
              "Tcell_CD4+blood", "Tcell_gd", "Tcell_reg", "MAITcell", "Tcell_CD8+tr",
              "Tcell_CD8+eff", "Tcell_CD8+kir", "Tcell_CD8+prol", "Tcell_CD8+blood"],
    "PLASMA": ["PLASMA"],
}

# Invert to the flat {hires label: lores label} form that `replace` expects.
new_cluster_names = {
    hires: lores
    for lores, members in lineage_members.items()
    for hires in members
}

mdata.obs['celltype_lores'] = (
    mdata.obs['celltype_hires']
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Final UMAP with low-resolution labels, high-resolution labels and tissue.
sc.pl.umap(mdata, color=["celltype_lores","celltype_hires", "tissue"], size=10, ncols=1)

In [None]:
# Save the annotated, doublet-filtered MuData object (path is relative to the working directory).
mdata.write("./citeseq_mdata_allsamples_filtered_fine_clustering.h5mu")

In [None]:
# Independent copy of the RNA modality for export as a standalone AnnData file.
adata = mdata['rna'].copy()

In [None]:
# Save the RNA-only object next to the .h5mu file.
adata.write('./citeseq_mdata_allsamples_filtered_fine_clustering.h5ad')