### Notebook for the manifold generation of annotated PBMC data (from Yoshida 2022) using `bbknn+scanpy`

Raw H5AD file of COVID-19 airway and matched PBMC samples, downloaded from the [COVID-19 Cell Atlas](https://www.covid19cellatlas.org/index.patient.html).

- **Developed by**: Carlos Talavera-López PhD
- **Modified by**: Alexandra Cirnu
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- **Date of creation**: 230828
- **Date of modification**: 240215

### Import required modules

In [None]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import bbknn
import seaborn as sns
from pywaffle import Waffle
import matplotlib.pyplot as plt

### Set up working environment

In [None]:
# Scanpy logging at verbosity 3, plus a printout of the package versions in use
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every plot produced below
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma',
    format='svg',
    vector_friendly=True,
)

### Read in all data

In [None]:
# Load the raw PBMC AnnData object from disk
adata = sc.read_h5ad(
    '/home/guest1/data/ACM_cardiac_leuco/Annotated_PBMC/meyer_nikolic_covid_pbmc_raw.h5ad'
)

# Tag every cell with one constant sample label
sample_id = 'A'
adata.obs["sample"] = sample_id

# Notebook display: per-cell metadata table, then the object summary
adata.obs

adata

### Use `sqrt_norm` counts for analysis

In [None]:
# Freeze the object as loaded in .raw and keep a copy of its X matrix as the 'counts' layer
adata.raw = adata.copy()
adata.layers['counts'] = adata.X.copy()

# normalize_total is called with inplace = False, so adata.X itself is left untouched;
# the square root of the returned "X" is stored as its own layer
adata.layers["sqrt_norm"] = np.sqrt(sc.pp.normalize_total(adata, inplace=False)["X"])

# Notebook display of the object summary
adata

In [None]:
# Keep a full snapshot of the object (named adata_raw) before X is replaced
adata_raw = adata.copy()

# From here on X holds a copy of the square-root-normalised layer
adata.X = adata.layers['sqrt_norm'].copy()

### Calculate Highly Variable Genes

In [None]:
# Rebuild the 'counts' layer as a copy of the X matrix kept in the adata_raw snapshot
adata.layers['counts'] = adata_raw.X.copy()

In [None]:
# Store the sample labels as strings
adata.obs['sample'] = adata.obs['sample'].astype('str')

In [None]:
# Select highly variable genes from the 'counts' layer, batched by 'sample';
# subset = True means adata itself is reduced to the selected genes.
# n_top_genes guide: 7000 for heterogeneous cell types,
# 2000 - 4000 for homogeneous cell types like PBMCs.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    batch_key="sample",
    flavor="seurat_v3",
    n_top_genes=7000,
    subset=True,
)

# Notebook display of the object summary
adata

### Calculate graph

In [None]:
# 50-component PCA restricted to the highly variable genes
sc.pp.pca(
    adata,
    n_comps=50,
    svd_solver='arpack',
    use_highly_variable=True,
    random_state=1712,
)

# Neighbourhood graph built on the PCA representation of adata
sc.pp.neighbors(
    adata,
    use_rep="X_pca",
    n_neighbors=50,
    metric='minkowski',
    random_state=1786,
)

# BBKNN graph batched by 'sample'; with copy = True the result is captured
# as the separate object bbknn_sample
bbknn_sample = bbknn.bbknn(
    adata,
    batch_key='sample',
    neighbors_within_batch=4,
    approx=True,
    copy=True,
)

In [None]:
# UMAP embedding computed on the BBKNN object
sc.tl.umap(
    bbknn_sample,
    min_dist=0.3,
    spread=4,
    random_state=1789,
)


In [None]:
# QC metrics and sample metadata on the UMAP
# (also add 'genotype' to the list if that key exists in obs)
sc.pl.umap(
    bbknn_sample,
    color=['nFeature_RNA', 'nCount_RNA', 'patient_id', 'sample_id'],
    ncols=4,
    size=1.5,
    legend_fontsize=8,
    frameon=False,
    cmap='RdPu',
)

In [None]:
# Sample label plus tissue marker genes on the UMAP; every gene is annotated
# with its product and the cell types it marks.
# (also add 'genotype' to the list if that key exists in obs)
sc.pl.umap(
    bbknn_sample,
    color=[
        'sample',
        'TTN',     # titin, expressed in heart and skeletal muscle // cardiomyocytes and skeletal myocytes
        'NPPA',    # natriuretic peptide A, expressed in heart, can be secreted to blood // cardiomyocytes
        'DCN',     # decorin // fibroblasts
        'VWF',     # von Willebrand factor // endothelial cells and adipocytes
        'MYH11',   # myosin heavy chain 11 // myoepithelial and smooth muscle cells
        'RGS4',    # regulator of G protein signaling 4 // neuronal cells, endocrine cells
        'KCNJ8',   # potassium inwardly rectifying channel subfamily J member 8 // specialized epithelial cells, muscle cells
        'C1QA',    # complement C1q A chain, expressed in immune cell subsets // monocytes, macrophages
        'CD3E',    # CD3 epsilon subunit of T-cell receptor complex // T cells
        'TREM2',   # triggering receptor expressed on myeloid cells 2 // macrophages
        'ADIPOQ',  # adiponectin, C1Q and collagen domain containing // adipocytes and endothelial cells
        'NRXN1',   # neurexin 1 // neurons
        'MSLN',    # mesothelin // secretory cells
    ],
    ncols=4,
    size=0.6,
    legend_fontsize=8,
    frameon=False,
    cmap='magma',
)

In [None]:
# Blood cell-type marker genes on the UMAP; every gene is annotated with
# the population it marks.
sc.pl.umap(
    bbknn_sample,
    color=[
        'CD34',    # hematopoietic stem cells
        'HBA1',    # erythrocytes
        'GP1BA',   # platelets
        'CD3E',    # T cells
        'CD4',     # CD4 T cells
        'CD8A',    # CD8 T cells
        'CD19',    # B cells
        'NCAM1',   # CD56/NCAM1 // NK cells (CD3-)
        'CD14',    # monocytes
        'CCR2',    # monocytes
        'FUT4',    # CD15/FUT4 // granulocytes
        'CD1C',    # cDC2
        'CLEC9A',  # cDC1
    ],
    ncols=4,
    size=0.6,
    legend_fontsize=8,
    frameon=False,
    cmap='RdPu',
)



### Visualise covariate distribution per sample

In [None]:
# Histogram of one obs covariate, with one hue level per sample.
sns.set(style="whitegrid")
covariate_to_visualize = 'doublet_scores'

plt.figure(figsize=(10, 6))
ax = sns.histplot(data=adata.obs, x=covariate_to_visualize, hue='sample', stat='count', common_norm=False)
ax.set_xlabel(covariate_to_visualize)
ax.set_ylabel('Abundance')
ax.set_title(f'Abundance Plot of {covariate_to_visualize} by Sample')

# Reposition and retitle the legend that histplot already built for the hue.
# A bare plt.legend(...) would rebuild the legend from labelled artists only
# and drop the hue entries, so seaborn's own helper is used instead.
sns.move_legend(ax, 'upper right', title='Sample')

plt.show()

In [None]:
# Per-sample histograms of one obs covariate, one facet per sample.
sns.set(style="whitegrid")
covariate_to_visualize = 'doublet_scores'

# One "Set1" colour per distinct sample label
sample_names = adata.obs['sample'].unique()
num_samples = len(sample_names)
color_palette = sns.color_palette("Set1", n_colors=num_samples)

# Facets wrap after three columns.
# NOTE(review): the grid is created without `hue`, so `palette` and the legend
# added below presumably have no levels to map — confirm this is intended.
g = sns.FacetGrid(
    adata.obs,
    col="sample",
    col_wrap=3,
    height=5,
    palette=color_palette,
)
g.map_dataframe(
    sns.histplot,
    x=covariate_to_visualize,
    stat='count',
    common_norm=False,
)

# Axis labels, per-facet titles (the sample name) and legend
g.set_axis_labels(covariate_to_visualize, 'Abundance')
g.set_titles(col_template="{col_name}")
g.add_legend(title='Sample', loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# Histogram of one obs covariate, with one hue level per sample.
sns.set(style="whitegrid")
covariate_to_visualize = 'nCount_RNA'

plt.figure(figsize=(10, 6))
ax = sns.histplot(data=adata.obs, x=covariate_to_visualize, hue='sample', stat='count', common_norm=False)
ax.set_xlabel(covariate_to_visualize)
ax.set_ylabel('Abundance')
ax.set_title(f'Abundance Plot of {covariate_to_visualize} by Sample')

# Reposition and retitle the legend that histplot already built for the hue.
# A bare plt.legend(...) would rebuild the legend from labelled artists only
# and drop the hue entries, so seaborn's own helper is used instead.
sns.move_legend(ax, 'upper right', title='Sample')

plt.show()

### UMI Count per cell

This plot shows all barcodes detected in an experiment, ranked from highest to lowest UMI count. It helps to distinguish GEMs that contained an intact cell from GEMs that were empty droplets containing only ambient RNA: real cells lie to the left of the point where the curve drops sharply (the inflection point, or "knee").

In [None]:
# Rank plot of per-cell totals: one panel per sample, cells ordered from the
# highest to the lowest row sum of adata.X. The figure is saved as a PNG and shown.
# NOTE(review): if the cells above were run in order, adata.X holds the
# sqrt-normalised values of the highly variable genes only, so these sums are
# not raw UMI counts — confirm this is the intended input.
sample_names = adata.obs['sample'].unique()
num_samples = len(sample_names)

# Size the grid from the number of samples (two panels per row) so that
# more than two samples no longer index past the end of the axes array
num_cols = 2
num_rows = max(1, (num_samples + num_cols - 1) // num_cols)

# squeeze=False always yields a 2-D axes array, whatever the grid shape
fig, axs = plt.subplots(num_rows, num_cols, figsize=(8, 3 * num_rows), squeeze=False)

# Flatten the axes array so panels can be addressed by sample position
axs = axs.flatten()

# Colormap used to give each sample its own line colour
# (plt.get_cmap replaces the deprecated plt.cm.get_cmap)
color_palette = plt.get_cmap('tab10')

# Loop through each sample and draw its rank plot in a separate panel
for idx, sample_name in enumerate(sample_names):
    # Select cells belonging to the current sample
    mask = adata.obs['sample'] == sample_name
    sample_adata = adata[mask].copy()

    # Per-cell totals; flattened because the row sum of a sparse matrix is 2-D
    sample_adata.obs['umi_counts'] = np.asarray(sample_adata.X.sum(axis=1)).ravel()

    # Reorder the cells from the highest to the lowest total
    descending = np.argsort(sample_adata.obs['umi_counts'].to_numpy())[::-1]
    sample_adata = sample_adata[descending]

    # Draw the ranked totals in the current panel
    ax = axs[idx]
    color = color_palette(idx % color_palette.N)  # wrap around when samples outnumber colours
    ax.plot(
        range(1, len(sample_adata) + 1),
        sample_adata.obs['umi_counts'].to_numpy(),
        marker='o',
        linestyle='-',
        label=sample_name,
        color=color,
        linewidth=1,
    )
    ax.set_xlabel('Cell Rank')
    ax.set_ylabel('UMI Count')
    ax.set_title(f'UMI Count per Cell for Sample: {sample_name}')
    ax.set_xlim(-10000, 80000)  # Set x-axis limits
    ax.set_ylim(0, 1500)        # Set y-axis limits
    ax.legend()

# Hide grid cells that have no sample to show
for unused_ax in axs[num_samples:]:
    unused_ax.set_visible(False)

# Adjust layout and save the figure
plt.tight_layout()
output_filename = 'umi_count_subplot.png'
plt.savefig(output_filename)

# Show the figure
plt.show()
