## Stewart: filtering of salmon data

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
from matplotlib_venn import venn2
import scvelo as scv
from pyroe import load_fry

#### load expression matrices of all samples

In [None]:
# Read the alevin-fry quantification result of every sorted population.
# All five directories are loaded with output_format="all"; the objects are
# bound in the same order as the paths are listed.
naive, trans, igmmem, classical, dn = (
    load_fry(quant_dir, output_format="all")
    for quant_dir in (
        "/mnt/volume/students/doreen/salmon/naive/naive_quant_res",
        "/mnt/volume/students/doreen/salmon/trans/trans_quant_res",
        "/mnt/volume/students/doreen/salmon/igmmem/igmmem_quant_res",
        "/mnt/volume/students/doreen/salmon/classical/classical_quant_res",
        "/mnt/volume/students/doreen/salmon/dn/dn_quant_res",
    )
)

In [None]:
# Tag every cell of a sample with the name of the population it was sorted from.
for sample, population in (
    (naive, 'Naive'),
    (trans, 'Transitional'),
    (igmmem, 'IgM-Memory'),
    (classical, 'Classical-Memory'),
    (dn, 'DN'),
):
    sample.obs['population'] = population

In [None]:
# Collect all samples in one list.
# The position in this list determines the numeric barcode suffix a sample
# receives further down (first entry -> "-1", second -> "-2", ...).
adata_list = [
    classical,
    naive,
    trans,
    igmmem,
    dn,
]

#### compare barcodes to reference paper

In [None]:
# Load the already processed AnnData object published with the reference paper;
# its barcodes are compared against ours below.
ref = sc.read_h5ad(filename="../../data/Stewart/stewart_reference.h5ad")

In [None]:
# Append a per-sample numeric suffix ("-1", "-2", ... in the order of
# adata_list) to every barcode so that obs_names follow the naming used in the
# reference object.
# NOTE: obs_names are modified in place, so re-running this cell appends the
# suffix a second time.
for sample_number, sample in enumerate(adata_list, start=1):
    sample.obs_names = sample.obs_names + "-" + str(sample_number)

# merge all samples into one object
adata_unfiltered = ad.concat(adata_list)

In [None]:
# Draw the overlap between our (still unfiltered) barcodes and the barcodes
# of the reference object.
salmon_barcodes = set(adata_unfiltered.obs_names.tolist())
stewart_barcodes = set(ref.obs_names.tolist())
venn2(
    [salmon_barcodes, stewart_barcodes],
    set_labels=["Salmon unfiltered", "Stewart"],
)

#### rename genes

In [None]:
# convert gene IDs to gene names
e2n_path = "../../splici_references/GRCh38-2020-A_geneid_to_name.txt"

# Build the gene ID -> gene name lookup. Each non-empty line of the mapping
# file is expected to hold exactly two whitespace-separated fields
# (ID, then name); a line with any other number of fields raises ValueError.
# The `with` block makes sure the file handle is closed again.
e2n = {}
with open(e2n_path) as e2n_file:
    for line in e2n_file:
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        gene_id, gene_name = fields
        e2n[gene_id] = gene_name

# Replace the gene IDs in var_names by their names; an ID missing from the
# mapping raises KeyError.
adata_unfiltered.var_names = [e2n[gene_id] for gene_id in adata_unfiltered.var_names]

In [None]:
# The ID -> name conversion above may leave several genes with the same name;
# make var_names unique so that genes can be addressed by name.
adata_unfiltered.var_names_make_unique()

#### add spliced, unspliced and ambiguous counts

In [None]:
# Load the same alevin-fry results a second time, now with
# output_format="raw"; the layers "spliced", "unspliced" and "ambiguous" of
# these objects are used further down.
# NOTE: this rebinds naive/trans/igmmem/classical/dn to the new objects.
naive, trans, igmmem, classical, dn = (
    load_fry(quant_dir, output_format="raw")
    for quant_dir in (
        "../../salmon/naive/naive_quant_res",
        "../../salmon/trans/trans_quant_res",
        "../../salmon/igmmem/igmmem_quant_res",
        "../../salmon/classical/classical_quant_res",
        "../../salmon/dn/dn_quant_res",
    )
)

In [None]:
# Append the per-sample numeric suffix ("-1", "-2", ... in list order) to the
# barcodes so that they follow the naming of the reference object. The sample
# order is the same as the one used for adata_list.
# NOTE: obs_names are modified in place, so re-running this cell appends the
# suffix a second time.
usa_list = [classical, naive, trans, igmmem, dn]
for sample_number, sample in enumerate(usa_list, start=1):
    sample.obs_names = sample.obs_names + "-" + str(sample_number)

# merge all samples into one object
usa = ad.concat(usa_list)

In [None]:
# Translate the gene IDs of usa to gene names with the e2n lookup (an ID that
# is missing from e2n raises KeyError), then make the names unique.
usa_gene_names = [e2n[gene_id] for gene_id in usa.var_names]
usa.var_names = usa_gene_names
usa.var_names_make_unique()

In [None]:
# Bring usa into the same cell and gene order as the merged count object so
# that its layers line up with adata_unfiltered entry by entry.
# (The merged object is called adata_unfiltered in this notebook.)
usa = usa[adata_unfiltered.obs_names, adata_unfiltered.var_names]

# add spliced, unspliced, ambiguous layers to the merged object
for layer_name in ("spliced", "unspliced", "ambiguous"):
    adata_unfiltered.layers[layer_name] = usa.layers[layer_name]

#### manual filtering

In [None]:
# Select only B-cells: keep the cells whose summed count over the marker genes
# listed below is zero.
# NOTE: the original author remarked that "GCGR3A" (presumably FCGR3A) is not
# in the dataset, which is why it is not part of the list.
non_b_cell_markers = ["CD3E", "GNLY", "CD14", "FCER1A", "LYZ", "PPBP", "CD8A"]
marker_counts = adata_unfiltered[:, non_b_cell_markers].X.sum(axis=1)

# Flatten the per-cell sums to a 1-D boolean mask before indexing.
is_marker_free = np.asarray(marker_counts).ravel() == 0

# Take a real copy instead of a view: a new obs column is written into this
# object in the next step, which is not possible on a view without anndata
# silently copying it (and warning about it).
adata_filtered_1 = adata_unfiltered[is_marker_free, :].copy()

In [None]:
# Keep only the cells in which at least 200 distinct genes were detected.

# Number of stored entries per row of X, i.e. per cell; kept in obs as
# nFeature_RNA.
genes_per_cell = adata_filtered_1.X.getnnz(axis=1)
adata_filtered_1.obs['nFeature_RNA'] = genes_per_cell

# Apply the threshold and materialise the result as an independent object.
has_enough_genes = adata_filtered_1.obs['nFeature_RNA'] >= 200
adata_filtered_2 = adata_filtered_1[has_enough_genes].copy()

In [None]:
# Drop the cells whose total transcript count lies in the top 1 %.

# Total count per cell (row sums of X).
total_counts = adata_filtered_2.X.sum(axis=1)

# The cut-off is the 99th percentile of the per-cell totals.
flat_totals = np.asarray(total_counts).ravel()
threshold = np.percentile(flat_totals, 99)

# Keep cells strictly below the cut-off; a cell whose total equals the
# threshold is removed as well.
below_threshold = total_counts < threshold
adata_filtered = adata_filtered_2[below_threshold, :]

In [None]:
# Draw the barcode overlap with the reference once more, this time for the
# filtered object.
salmon_barcodes = set(adata_filtered.obs_names.tolist())
stewart_barcodes = set(ref.obs_names.tolist())
venn2(
    [salmon_barcodes, stewart_barcodes],
    set_labels=["Salmon processed", "Stewart processed"],
)

In [None]:
# Write the filtered object to disk.
# NOTE(review): the file name says "unprocessed" although the object written
# here has passed the manual filtering above - confirm the name is intended.
adata_filtered.write_h5ad("../../data/Stewart/stewart_salmon_unprocessed.h5ad")