### Notebook for inspecting denoised protein expression from `totalVI` with `Scanpy`

#### Environment: Scanpy

- **Developed by:** Alexandra Cirnu
- **Modified by:** Alexandra Cirnu
- **Würzburg Institute for Systems Immunology & Julius-Maximilians-Universität Würzburg**
- **Date of creation:** 240601
- **Date of modification:** 240601

### Import required modules

In [None]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import muon as mu
from muon import atac as ac
from muon import prot as pt
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib import rcParams
from scipy.sparse import csr_matrix

### Set up working environment

In [None]:
%matplotlib inline
# Scanpy logging level 3, then report the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults used by every plot in this notebook.
sc.settings.set_figure_params(
    dpi=160,
    dpi_save=180,
    color_map='RdPu',
    format='svg',
    vector_friendly=True,
)

In [None]:
def X_is_raw(adata):
    """Return whether the column sums of ``adata.X`` survive an integer cast unchanged.

    The check is done on the per-column sums (``axis=0``), not on individual
    entries, so it is a heuristic for "X holds integer counts".
    """
    column_sums = adata.X.sum(axis=0)
    return np.array_equal(column_sums.astype(int), column_sums)

In [None]:
def clr_normalize_each_cell(adata, inplace=True):
    """Apply Seurat-style centered log-ratio (CLR) normalization to each row of ``.X``.

    For every cell (row) ``x`` the result is
    ``log1p(x / exp(sum(log1p(x[x > 0])) / n_features))``.

    Parameters
    ----------
    adata
        Object exposing a 2-D ``.X`` (dense array or SciPy sparse matrix) and,
        when ``inplace`` is False, a ``.copy()`` method.
    inplace
        If True (default) ``adata.X`` is overwritten; otherwise a copy of
        ``adata`` is normalized and the input is left untouched.

    Returns
    -------
    The normalized object (the input itself when ``inplace`` is True).
    ``.X`` is always a dense ``numpy.ndarray`` afterwards.
    """
    import numpy as np
    import scipy.sparse  # import the submodule explicitly; bare `import scipy` may not load it

    if not inplace:
        adata = adata.copy()

    X = adata.X
    # `.toarray()` instead of `.A`: the `.A` shortcut was removed from SciPy sparse matrices.
    dense = X.toarray() if scipy.sparse.issparse(X) else np.asarray(X)

    # Only strictly positive entries contribute to the log-sum; everything else
    # is replaced by 0 so that log1p(0) == 0 leaves the sum unchanged.
    positive_only = np.where(dense > 0, dense, 0)
    log_sums = np.log1p(positive_only).sum(axis=1, keepdims=True)

    # The denominator is the total number of features, not the number of positive ones.
    scale = np.exp(log_sums / dense.shape[1])

    adata.X = np.log1p(dense / scale)
    return adata

### Read in MuData set

In [None]:
# Load the MuData object from disk and display its summary.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
mdata = mu.read_h5mu('/Users/alex/data/ACM_cardiac_leuco/CITE-Seq/ACM_myeloids+lymphoids_integrated_CITE_surface_markers_ac240510.raw.h5mu')
mdata

In [None]:
# Handles to the two modalities stored in the MuData object.
rna = mdata.mod['rna']
protein = mdata.mod['prot']

In [None]:
# Copy the UMAP coordinates from the RNA modality onto the protein modality.
protein.obsm['X_umap'] = rna.obsm['X_umap']

# Per-cell metadata columns taken from the RNA modality.
for obs_key in ('model', 'condition'):
    protein.obs[obs_key] = rna.obs[obs_key]

# Unstructured entries (colour palettes and neighbors/leiden/umap entries).
for uns_key in ('C_scANVI_colors', 'classification_colors', 'neighbors', 'leiden', 'umap'):
    protein.uns[uns_key] = rna.uns[uns_key]

# Cell-cell connectivity graph; the Leiden clustering further down runs on `protein`.
protein.obsp['connectivities'] = rna.obsp['connectivities']

In [None]:
# Snapshot of the protein matrix before `.X` is reassigned in later cells.
# NOTE(review): `protein_raw` is not referenced again in this notebook.
protein_raw = protein.X.copy()

In [None]:
# Features to drop: Hashtag1 ... Hashtag10
# (presumably sample-hashing antibodies rather than surface markers — confirm).
markers_to_remove = [f'prot:Hashtag{i}_TotalA' for i in range(1, 11)]

# Keep every feature that is not in the removal list. `.copy()` turns the
# subset view into a standalone AnnData so the `.X` reassignments in the
# following cells do not trigger an implicit copy-on-write of a view.
protein = protein[:, ~protein.var.index.isin(markers_to_remove)].copy()
protein

In [None]:
# Point `.X` at the 'counts' layer for inspection.
protein.X = protein.layers['counts']

In [None]:
# Tabulate the current `.X` (cells x proteins) as a sparse-backed DataFrame for inspection.
X_data = protein.X.copy()
X_data_sparse = csr_matrix(X_data)
X_data_df = pd.DataFrame.sparse.from_spmatrix(
    X_data_sparse,
    index=protein.obs_names,
    columns=protein.var_names,
)
print("Shape of counts DataFrame:", X_data_df.shape)
X_data_df

In [None]:
# Point `.X` at the 'denoised_protein' layer (the totalVI output, per the notebook title).
protein.X = protein.layers['denoised_protein']

In [None]:
# Tabulate the current `.X` — now the 'denoised_protein' layer — as a DataFrame.
# The printed label names the denoised data; it previously still said "counts".
X_data = protein.X.copy()
X_data_sparse = csr_matrix(X_data)
X_data_df = pd.DataFrame.sparse.from_spmatrix(X_data_sparse, index=protein.obs.index, columns=protein.var.index)
print("Shape of denoised protein DataFrame:", X_data_df.shape)
X_data_df

### Normalize denoised protein expression

In [None]:
# CLR-normalize each cell in place (inplace defaults to True); the returned
# AnnData is shown as the cell output.
clr_normalize_each_cell(protein)

In [None]:
# Tabulate the current `.X` — now the CLR-normalized values — as a DataFrame.
# The printed label names the normalized data; it previously still said "counts".
X_data = protein.X.copy()
X_data_sparse = csr_matrix(X_data)
X_data_df = pd.DataFrame.sparse.from_spmatrix(X_data_sparse, index=protein.obs.index, columns=protein.var.index)
print("Shape of CLR-normalized protein DataFrame:", X_data_df.shape)
X_data_df

In [None]:
# Draw the UMAP embedding (coordinates copied from the RNA modality) without
# colouring by any variable.
sc.pl.umap(protein, title='UMAP of Protein Data', frameon= False)

### Cluster the cells

In [None]:
# Leiden clustering with scanpy's default settings; the neighbors entry and
# connectivities used here were copied from the RNA modality in an earlier cell.
sc.tl.leiden(protein)

In [None]:
# UMAP coloured by the new 'leiden' clusters and by the 'C_scANVI' and
# 'classification' annotations, one panel per row (ncols=1).
sc.pl.umap(protein, color=['leiden', 'C_scANVI', 'classification'], frameon = False, ncols=1)

### Differential expression analysis

In [None]:
# Rank proteins for each 'leiden' cluster with a Wilcoxon test (n_genes=100),
# then plot the top 25 per cluster; sharey=False gives every panel its own y-axis.
sc.tl.rank_genes_groups(protein, "leiden", method="wilcoxon", n_genes= 100)
sc.pl.rank_genes_groups(protein, n_genes=25, sharey=False)

In [None]:
# Preview the first three ranked names per cluster (one column per cluster).
pd.DataFrame(protein.uns["rank_genes_groups"]["names"]).head(3)

In [None]:
# Collect the ranking results of every cluster into one long table.
result = protein.uns["rank_genes_groups"]
groups = result["names"].dtype.names  # one record field per cluster

# Build one typed DataFrame per cluster and concatenate them. Stacking the
# name and float arrays with np.vstack would coerce every column to a single
# object/string dtype; building the frame column-wise keeps Score, Adj_pvalue
# and Logfoldchange numeric so the filters below are real numeric comparisons.
markers = pd.concat(
    [
        pd.DataFrame({
            'Gene': result['names'][group],
            'Score': result['scores'][group],
            'Adj_pvalue': result['pvals_adj'][group],
            'Logfoldchange': result['logfoldchanges'][group],
            'Cluster': group,
        })
        for group in groups
    ],
    ignore_index=True,
)

# Keep entries with adjusted p-value < 0.05 and |log fold change| > 1.
markers_filtered = markers[(markers.Adj_pvalue < 0.05) & (markers.Logfoldchange.abs() > 1)]
markers_filtered.shape

In [None]:
# Dot plot of the top 2 ranked proteins per cluster, cells grouped by 'leiden'.
sc.pl.rank_genes_groups_dotplot(protein, groupby= 'leiden', n_genes=2, cmap='RdPu')

In [None]:
# Dot plot of the top 4 ranked proteins, cells grouped by 'classification'.
# NOTE(review): the ranking stored on `protein` was computed per 'leiden'
# cluster — confirm that showing it against 'classification' is intended.
sc.pl.rank_genes_groups_dotplot(protein, groupby= 'classification', n_genes=4, cmap='RdPu')

In [None]:
# Restrict the data to the four monocyte classifications (explicit copy, not a view).
groups_to_keep = ['Monocytes_6', 'Monocytes_11', 'Monocytes_13', 'Monocytes_17']
filtered_protein = protein[protein.obs['classification'].isin(groups_to_keep)].copy()

# Now create the dotplot using the filtered data
# NOTE(review): this reuses the ranking results carried over from `protein`
# (computed per 'leiden' cluster on all cells) — confirm this is intended.
sc.pl.rank_genes_groups_dotplot(filtered_protein, groupby='classification', n_genes=4, cmap='RdPu', dendrogram= False)


In [None]:
# Dot plot of the top 3 ranked proteins, cells grouped by 'C_scANVI'.
# NOTE(review): the ranking was computed per 'leiden' cluster — confirm the pairing.
sc.pl.rank_genes_groups_dotplot(protein, groupby= 'C_scANVI', n_genes=3, cmap='RdPu')

In [None]:
# Independent copies of the data for each value of obs['model'] ...
protein_Pkp2 = protein[protein.obs['model'] == 'Pkp2'].copy()
protein_Ttn = protein[protein.obs['model'] == 'Ttn'].copy()
# ... and for the cells whose obs['condition'] is 'Pkp2_noninf'.
# NOTE(review): `protein_Pkp2_ctrnoninf` is not used again in this notebook.
protein_Pkp2_ctrnoninf = protein[protein.obs['condition'] == 'Pkp2_noninf'].copy()

In [None]:
# Grid geometry: eight panels per row, rows obtained by ceiling division.
ncols = 8
num_genes = len(protein_Pkp2.var_names)
nrows = -(-num_genes // ncols)

# One 5x5 inch panel per grid position; flatten to a 1-D sequence of axes.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))
axes = axes.ravel()

# One UMAP per feature of the Pkp2 subset, each drawn on its own panel.
for idx, gene in enumerate(protein_Pkp2.var_names):
    sc.pl.umap(
        protein_Pkp2,
        color=gene,
        title=f'UMAP colored by {gene}',
        cmap='RdPu',
        frameon=False,
        show=False,
        ax=axes[idx],
    )

# Panels after the last plotted feature stay empty, so hide them.
for ax in axes[idx + 1:]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Grid geometry: eight panels per row, rows obtained by ceiling division.
ncols = 8
num_genes = len(protein_Ttn.var_names)
nrows = -(-num_genes // ncols)

# One 5x5 inch panel per grid position; flatten to a 1-D sequence of axes.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))
axes = axes.ravel()

# One UMAP per feature of the Ttn subset, each drawn on its own panel.
for idx, gene in enumerate(protein_Ttn.var_names):
    sc.pl.umap(
        protein_Ttn,
        color=gene,
        title=f'UMAP colored by {gene}',
        cmap='Blues',
        frameon=False,
        show=False,
        ax=axes[idx],
    )

# Panels after the last plotted feature stay empty, so hide them.
for ax in axes[idx + 1:]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        Annotated data object passed (subset per category) to ``sc.pl.umap``.
    split_by
        Name of the ``.obs`` column whose categories define the panels.
        Non-categorical columns are converted to categorical on the fly.
    ncol
        Number of panel columns.
    nrow
        Number of panel rows; derived from the number of categories when None.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``cmap``, ``size``).

    Raises
    ------
    ValueError
        If an explicit ``nrow`` leaves fewer panels than categories.
    """
    # astype('category') is a no-op for columns that are already categorical.
    categories = adata.obs[split_by].astype('category').cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    if nrow * ncol < len(categories):
        raise ValueError(
            f"A {nrow}x{ncol} grid cannot hold {len(categories)} categories of '{split_by}'"
        )

    # squeeze=False always returns a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axs = plt.subplots(nrow, ncol, figsize=(6.5 * ncol, 4 * nrow), squeeze=False)
    axs = axs.flatten()

    for ax, cat in zip(axs, categories):
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=str(cat), **kwargs)

    # Hide grid positions that did not receive a category.
    for ax in axs[len(categories):]:
        ax.set_visible(False)

    plt.tight_layout()

In [None]:
# Show a single protein on the Pkp2 subset, one UMAP panel per 'condition'.
protein_subset = ['prot:CD26_TotalA']
protein_Pkp2_subset = protein_Pkp2[:, protein_subset]

# split_umap creates its own figure and applies tight_layout itself. No further
# clean-up is done here: the globals `axes` / `idx` belong to the figure of an
# earlier grid cell, not to the figure drawn by this call.
split_umap(protein_Pkp2_subset, color=protein_subset, frameon=False, cmap='Blues', split_by= 'condition', size=10)


In [None]:
# Labels of obs['classification'] to highlight, grouped by cell type.
highlight_DC = ['DC_12', 'DC_14', 'DC_16']
highlight_DOCK4MØ = ['DOCK4+MØ_3', 'DOCK4+MØ_9']
highlight_LYVE1MØ = [
    'LYVE1+MØ_1',
    'LYVE1+MØ_2',
    'LYVE1+MØ_4',
    'LYVE1+MØ_8',
]
highlight_Mast = ['Mast_15']
highlight_Monocytes = [
    'Monocytes_6',
    'Monocytes_11',
    'Monocytes_13',
    'Monocytes_17',
]
highlight_MØ_general = ['MØ_general_0', 'MØ_general_7', 'MØ_general_10']
highlight_Neutrophils = ['Neutrophils_5']

# All macrophage labels: DOCK4+, then LYVE1+, then general (a new list, same order as before).
highlight_Macrophages = highlight_DOCK4MØ + highlight_LYVE1MØ + highlight_MØ_general

In [None]:
# Panels to draw: (obs column to colour by, labels to highlight or None for all).
groups = [
    ('classification', highlight_DC),
    ('classification', highlight_DOCK4MØ),
    ('classification', highlight_LYVE1MØ),
    ('classification', highlight_Mast),
    ('classification', highlight_Monocytes),
    ('classification', highlight_Macrophages),
    ('classification', highlight_MØ_general),
    ('classification', highlight_Neutrophils),
    ('C_scANVI', None)  # This one does not use 'groups' so we pass None
]

# Size the grid from the number of panels. A fixed 2x4 grid has only eight
# axes, so zip() would silently drop the ninth entry ('C_scANVI').
ncols = 3
nrows = int(np.ceil(len(groups) / ncols))
fig, axs = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows), squeeze=False)

for ax, (color, group) in zip(axs.flat, groups):
    # groups=None is sc.pl.umap's default, so one call covers both cases.
    sc.pl.umap(protein, color=color, groups=group,
               legend_loc='right margin', legend_fontsize=5, frameon=False, show=False, ax=ax)

    # NOTE(review): every 'classification' panel gets the same title; only the
    # legend tells the highlighted groups apart.
    ax.set_title(color)

# Hide grid positions that did not receive a panel.
for ax in axs.flat[len(groups):]:
    ax.set_visible(False)

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()