## Stewart: preprocessing of Cell Ranger data

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import scvelo as scv
import re
import louvain
from skmisc.loess import loess

#### create colormap for UMAPs

In [None]:
# ListedColormap is not brought into scope by the notebook's import cell,
# so it is imported here where it is used.
from matplotlib.colors import ListedColormap

# list of HEX colors (one per entry of `clusters` below, in the same order)
hex_colors = ["#66BA7B", "#C08C1C", "#CB67EF", "#DC726C", "#75C1C3", "#7DB93F", "#DF60B5", "#9390F3", "#5FACF2", "#9EA022"]

# convert HEX colors to RGB: each two-digit hex pair becomes a float in [0, 1]
rgb_colors = [
    tuple(int(color[start:start + 2], 16) / 255 for start in (1, 3, 5))
    for color in hex_colors
]

# create colormap
cmap = ListedColormap(rgb_colors)

# create dictionary to assign colors to clusters (paired by position)
clusters = ["Trans", "Naive", "M-mem1", "M-mem2", "C-mem1", "C-mem2", "DN1", "DN2", "DN3", "DN4"]
umap_colors = dict(zip(clusters, cmap.colors))

#### load data

In [None]:
# Read the three unprocessed .h5ad files, in order, into adata / salmon / ref.
adata, salmon, ref = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        "../data/stewart_cellranger_unprocessed.h5ad",
        "../data/stewart_salmon_unprocessed.h5ad",
        "../data/stewart_reference.h5ad",
    )
)

#### subset to cells & genes present in all three datasets (Cell Ranger, Salmon and the Stewart reference)

In [None]:
# get subset present in all 3
# Membership is tested against sets built from salmon and ref, but the
# result lists are built by walking adata's own index: this keeps the
# cell/gene order reproducible between runs (iterating a set of strings
# does not guarantee a stable order).
shared_cells = set(salmon.obs_names) & set(ref.obs_names)
shared_genes = set(salmon.var_names) & set(ref.var_names)
cell_subset = [bc for bc in adata.obs_names if bc in shared_cells]
gene_subset = [gene for gene in adata.var_names if gene in shared_genes]

# reduce to subset
adata = adata[cell_subset, gene_subset]
# bare expression: displays the subsetted object when run as a notebook cell
adata

#### preprocessing

In [None]:
# save unprocessed counts
# Store an independent copy: the normalization / log1p steps that follow
# update adata.X, and a layer sharing memory with X would then no longer
# hold the raw counts.
adata.layers["counts"] = adata.X.copy()

In [None]:
# normalization: rescale every cell to the same total count
target_total = 10_000
sc.pp.normalize_total(adata, target_sum=target_total)

# log transform (log1p) of the normalized values
sc.pp.log1p(adata)

In [None]:
# get highly variable features from reference:
# look up the reference's 'vst.variable' flag for every gene kept in adata,
# in adata's gene order, and store it as adata's 'highly_variable' column
hvg_flags = ref.var["vst.variable"]
variable_mask = [hvg_flags[gene] for gene in adata.var_names]
adata.var["highly_variable"] = variable_mask

In [None]:
# sum over the 'highly_variable' column (the number of flagged genes when
# the column is boolean); shown as the cell output
adata.var["highly_variable"].sum()

In [None]:
# scale data, passing 10 as the max_value bound
max_scaled_value = 10
sc.pp.scale(adata, max_value=max_scaled_value)

In [None]:
# get PCA from reference:
# subset the reference to adata's cells and genes (in adata's order), then
# reuse its stored PCA embedding instead of recomputing one
ref_matched = ref[adata.obs_names, adata.var_names]
adata.obsm["X_pca"] = ref_matched.obsm["X_pca"]

In [None]:
# neighbors: build the neighborhood graph with 20 neighbors and n_pcs=14
sc.pp.neighbors(
    adata,
    n_neighbors=20,
    n_pcs=14,
)

In [None]:
# get clusters from reference
# One label-based lookup for all barcodes (in adata's cell order) instead of
# building a separate AnnData view per cell and indexing its Series by
# position. Labels are converted to plain strings; .to_numpy() makes the
# assignment positional rather than index-aligned.
adata.obs["cluster"] = ref.obs.loc[adata.obs_names, "cluster"].astype(str).to_numpy()

In [None]:
# check gene expression of the marker genes, grouped by cluster
markers = [
    "HOPX", "PDE4D", "IGHE", "SELL", "EMP3", "CIB1", "PSAP", "CD72",
    "DAPP1", "LTB", "HCK", "ZEB2", "RHOB", "TNFRSF1B", "FCRL3", "FCRL5",
    "FGR", "MPP6", "TAGLN2", "IGHA2", "AHNAK", "S100A4", "CRIP2", "ITGB1",
    "JCHAIN", "VIM", "PLPP5", "FCER2", "IL4R", "CRIP1", "LGALS1", "IGHA1",
    "CTSH", "IGHG2", "VPREB3", "PPP1R14A", "PCDH9", "PLD4", "IGHM",
    "MT-ATP8", "IGHD", "SOX4", "AL139020.1", "TCL1A",
]

# missing: "S100A1", "IGLL5"

# order in which the cluster groups are laid out in the plot
cluster_order = [
    "DN4", "DN3", "DN2", "DN1",
    "C-mem2", "C-mem1",
    "M-mem2", "M-mem1",
    "Naive", "Trans",
]

# color scale centered on 0 and bounded to [-2, 2]
sc.pl.dotplot(
    adata,
    markers,
    groupby="cluster",
    categories_order=cluster_order,
    cmap="coolwarm",
    vmin=-2,
    vcenter=0,
    vmax=2,
)

In [None]:
# get UMAP coordinates from reference
# Subset the reference once, in adata's cell order, rather than growing an
# array with np.vstack one cell at a time (quadratic copying plus one
# AnnData view per cell). The number of UMAP components is taken from the
# reference instead of being fixed to 3. np.array(..., dtype=float64) yields
# an independent float64 array, matching what the stacked version produced.
umap_xyz = np.array(ref[adata.obs_names].obsm["X_umap"], dtype=np.float64)
adata.obsm["X_umap"] = umap_xyz

In [None]:
# plot UMAP: one panel per pair of embedding components, colored by cluster
for dims in [(0, 1), (0, 2), (1, 2)]:
    sc.pl.umap(adata, color="cluster", dimensions=dims, palette=umap_colors)

In [None]:
# save the processed object to disk
adata.write_h5ad("../data/stewart_cellranger.h5ad")