## preprocessing for CellRanger (Mathew data)

In [91]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib_venn import venn2
import scvelo as scv
import re
import louvain
from skmisc.loess import loess
from skbio.diversity.alpha import simpson, gini_index
import warnings
import sys
import ensembl_rest
import time

#### create colormap for UMAPs

In [7]:
# list of HEX colors
hex_colors = ["#CD665F", "#BF781C", "#AB871D", "#92941D", "#77A119", "#57AA17", "#64B15A", "#5CB488",
              "#61AFB4", "#5AA6DC", "#5A91F8", "#7D7AFA", "#A760FA", "#C54FE4", "#CB4EC0", "#D3558E"]

# convert HEX colors to RGB
rgb_colors = [(int(color[1:3], 16)/255, int(color[3:5], 16)/255, int(color[5:7], 16)/255) for color in hex_colors]

# create colormap
cmap = ListedColormap(rgb_colors)

# create dictionary to assign colors to clusters
clusters = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16"]
umap_colors = {}
i = 0
for cluster in clusters:
    umap_colors[cluster] = cmap.colors[i]
    i+=1

### load expression matrices

In [8]:
# load metadata
meta = pd.read_csv("../data/metadata_2.csv", index_col=0)
meta

Unnamed: 0_level_0,assay,chemistry,mouse_nr,infection,day_post_infection,organ,organ_day,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SampleID_1_27feb19,rna,v2,M7,naive,D0,spleen,spleen0,spleen0_1
SampleID_2_29apr19,rna,v2,M8,naive,D0,spleen,spleen0,spleen0_2
SampleID_51_24feb20,rna,v2,M9,infected,D7,spleen,spleen7,spleen7_1
SampleID_54_24feb20,rna,v2,M10,infected,D7,spleen,spleen7,spleen7_2
SampleID_2_27feb19,rna,v2,M1,infected,D14,spleen,spleen14,spleen14_1
SampleID_4_2apr19,rna,v2,M2,infected,D14,spleen,spleen14,spleen14_2
SampleID_4_26apr19,rna,v2,M3,infected,D14,spleen,spleen14,spleen14_2
SampleID_3_27feb19,rna,v2,M4,infected,D28,spleen,spleen28,spleen28_1
SampleID_4_1apr19,rna,v2,M5,infected,D28,spleen,spleen28,spleen28_2
SampleID_52_24feb20,rna,v2,M9,infected,D7,mln,mln7,mln7_1


In [9]:
adatas = []
path = "/mnt/volume/students/doreen/cellranger/mathew"

for _, folders, _ in os.walk(path):
    for folder in folders:
        
        # get adata
        adata = sc.read_10x_mtx(path + "/" + folder)
        
        # add metadata
        for col_name, col_data in meta.loc[folder].items():
            adata.obs[col_name] = col_data

        # add ID
        adata.obs["id"] = folder

        # append to list
        adatas.append(adata)

In [10]:
# merge all samples
adata = ad.concat(adatas)
adata

AnnData object with n_obs × n_vars = 58486 × 32285
    obs: 'assay', 'chemistry', 'mouse_nr', 'infection', 'day_post_infection', 'organ', 'organ_day', 'name', 'id'

### quality control and filtering

In [11]:
# calculate quality control metrics
sc.pp.calculate_qc_metrics(adata, inplace = True)

#### filter cells

In [12]:
# number of unique features or total counts in top/bottom 0.5%
sc.pp.filter_cells(adata, min_genes = np.percentile(adata.obs["n_genes_by_counts"], 0.5))
sc.pp.filter_cells(adata, max_genes = np.percentile(adata.obs["n_genes_by_counts"], 99.5))
sc.pp.filter_cells(adata, min_counts = np.percentile(adata.obs["total_counts"], 0.5))
sc.pp.filter_cells(adata, max_counts = np.percentile(adata.obs["total_counts"], 99.5))

In [13]:
# number of unique features < 200
sc.pp.filter_cells(adata, min_genes = 200)

In [14]:
# percentage of mitochondrial counts / ribosomal > 25%

# get mitochondrial, ribosomal genes
adata.var["mt"] = adata.var_names.str.startswith("mt-")
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))

# get percentages
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo"], inplace=True, percent_top=[25, 50, 75], log1p=True)

# filter
adata = adata[adata.obs.pct_counts_mt <= 25, :]
adata = adata[adata.obs.pct_counts_ribo <= 25, :]

In [33]:
# Gini or Simpson diversity index < 0.8

# calculate Simpson index per cell
adata.obs["simpson"] = [simpson(adata.X[i,:].toarray().flatten()) for i in range(adata.n_obs)]

# calculate Gini index per cell
adata.obs["gini"] = [gini_index(adata.X[i,:].toarray().flatten()) for i in range(adata.n_obs)]

# filter
adata = adata[adata.obs.simpson >= 0.8, :]
adata = adata[adata.obs.gini >= 0.8, :]

#### filter genes

In [36]:
# remove mitochondrial genes
adata = adata[:,adata.var.mt == False]

In [103]:
# define genes to keep
keep = ["Ighd" "Ighm", "Ighg1", "Ighg2c", "Ighg2b", "Ighg3", "Igha", "Ighe"]
adata.var["keep"] = [gene in keep for gene in adata.var_names]

In [106]:
# remove genes expressed in < 5 cells, but keep Ig genes
adata = adata[:,((adata.var.total_counts >= 5) | (adata.var.keep == True))]

In [107]:
# function to look up biotype in ensembl database
def get_biotype(species, gene):

    # add waiting time to prevent server error
    time.sleep(0.1)
    
    try:
        return ensembl_rest.symbol_lookup(species, gene)["biotype"]
    # return None, if gene is not in ensembl database
    except ensembl_rest.HTTPError as e:
        return None

In [108]:
adata.var["biotype"] = [get_biotype("mouse", gene) for gene in adata.var_names]

In [113]:
# remove non protein-coding genes, but keep Ig genes
adata = adata[:,((adata.var.biotype == "protein_coding") | (adata.var.keep == True))]

### result

In [114]:
adata.shape

(39390, 15233)

### normalization

In [116]:
# save unprocessed counts
adata.layers["counts"] = adata.X

In [117]:
# normalization
sc.pp.normalize_total(adata, target_sum=1000)

In [118]:
# log transform
sc.pp.log1p(adata)

In [120]:
# scale
sc.pp.scale(adata)

In [124]:
# regress out effects of number of features & percentage of mitochondrial counts
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"])

In [125]:
# save
adata.write_h5ad("/mnt/volume/students/doreen/data/Mathew/adata.h5ad")

### data integration