## Stewart: filtering of Cell Ranger data

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
from matplotlib_venn import venn2
import scvelo as scv

#### load expression matrices of all samples

In [None]:
# Read the Cell Ranger filtered feature-barcode matrix of every sample.
# The paths are consumed in order, so the i-th path fills the i-th name.
naive, trans, igmmem, classical, dn = map(
    sc.read_10x_mtx,
    (
        "../../cellranger/stewart/naive/outs/filtered_feature_bc_matrix",
        "../../cellranger/stewart/trans/outs/filtered_feature_bc_matrix",
        "../../cellranger/stewart/igmmem/outs/filtered_feature_bc_matrix",
        "../../cellranger/stewart/classical/outs/filtered_feature_bc_matrix",
        "../../cellranger/stewart/dn/outs/filtered_feature_bc_matrix",
    ),
)

In [None]:
# Tag every cell of a sample with the name of the population it came from
# (stored in the 'population' column of .obs).
for sample, label in (
    (naive, 'Naive'),
    (trans, 'Transitional'),
    (igmmem, 'IgM-Memory'),
    (classical, 'Classical-Memory'),
    (dn, 'DN'),
):
    sample.obs['population'] = label

In [None]:
# Collect all samples in one list.
# The order matters: the position in this list (1-based) becomes the barcode
# suffix of the sample when the barcodes are renamed further below.
adata_list = [
    classical,
    naive,
    trans,
    igmmem,
    dn,
]

#### compare barcodes to reference paper

In [None]:
# Load the processed AnnData object published with the reference paper;
# it is used below to compare cell barcodes.
ref = sc.read_h5ad(filename="../../data/Stewart/stewart_reference.h5ad")

In [None]:
# Rename barcode suffixes to match the naming of the reference object:
# the sample at (1-based) position k in adata_list gets the suffix "-k".
# Assumes every barcode ends in "-1" as read from the Cell Ranger output
# -- TODO confirm for these runs.
# The pattern is anchored to the trailing "-1" and regex matching is requested
# explicitly, so only the suffix can ever be rewritten; a bare "1" pattern
# would replace every "1" occurring anywhere in the name and would depend on
# the pandas version's default for `regex`.
for sample_number, sample in enumerate(adata_list, start=1):
    sample.obs_names = sample.obs_names.str.replace(
        r"-1$", "-" + str(sample_number), regex=True
    )

# merge all samples into one object (cells of all samples stacked along obs)
adata_unfiltered = ad.concat(adata_list)

In [None]:
# Venn diagram of the barcode overlap between the merged, still unfiltered
# Cell Ranger object and the reference object.
venn2(
    (set(adata_unfiltered.obs_names), set(ref.obs_names)),
    set_labels=("Cellranger unfiltered", "Stewart processed"),
)

#### manual filtering

In [None]:
# select only B-cells: keep the cells that have zero counts for every marker
# gene in the list below
# NOTE(review): the original note read "GCGR3A not in dataset". "GCGR3A" looks
# like a typo for FCGR3A -- TODO check whether FCGR3A is present in
# adata_unfiltered.var_names and, if so, add it to the marker list.
non_b_cell_markers = ["CD3E", "GNLY", "CD14", "FCER1A", "LYZ", "PPBP", "CD8A"]

# summed marker counts per cell, flattened to 1-D so the boolean mask has an
# unambiguous shape regardless of whether X is sparse or dense
marker_counts = np.asarray(
    adata_unfiltered[:, non_b_cell_markers].X.sum(axis=1)
).ravel()

# .copy() turns the subset into a standalone AnnData instead of a view, so that
# columns can later be added to .obs without an implicit copy-on-write
# (consistent with the .copy() used for adata_filtered_2)
adata_filtered_1 = adata_unfiltered[marker_counts == 0, :].copy()

In [None]:
# keep only the cells in which at least 200 distinct genes were detected

# genes per cell = number of stored entries per row of the sparse matrix X;
# kept in .obs as 'nFeature_RNA'
genes_per_cell = adata_filtered_1.X.getnnz(axis=1)
adata_filtered_1.obs['nFeature_RNA'] = genes_per_cell

# boolean mask over cells, then subset into an independent copy
has_enough_genes = adata_filtered_1.obs['nFeature_RNA'] >= 200
adata_filtered_2 = adata_filtered_1[has_enough_genes].copy()

In [None]:
# drop the cells whose total transcript count lies in the top 1%

# total transcript count of every cell (row sums of X) ...
total_counts = adata_filtered_2.X.sum(axis=1)
# ... and the same values as a flat 1-D array
flat_totals = total_counts.A1

# cut-off = 99th percentile of the per-cell totals
threshold = np.percentile(flat_totals, 99)

# keep the cells strictly below the cut-off; cells exactly at the cut-off are
# removed as well
below_threshold = flat_totals < threshold
adata_filtered = adata_filtered_2[below_threshold, :]

In [None]:
# Venn diagram of the barcode overlap again, this time between the manually
# filtered Cell Ranger object and the reference object.
venn2(
    (set(adata_filtered.obs_names), set(ref.obs_names)),
    set_labels=("Cellranger processed", "Stewart processed"),
)

In [None]:
# write the filtered object to disk
# NOTE(review): the file name says "unprocessed" although the object written
# here is the filtered one -- presumably it means "not yet normalised"; confirm.
adata_filtered.write_h5ad("../../data/Stewart/stewart_cellranger_unprocessed.h5ad")