# Introduction

Generate filtered AnnData matrices for our kallisto & alevin runs.

I want to try applying the same cell filter that was used in the paper to this dataset.

In [1]:
import numpy
import anndata
import os
import pandas
import scipy
import scanpy
import time
import upsetplot
from matplotlib import pyplot
from pathlib import Path

from common import (
    scanpy_load_solo_mtx, 
    scanpy_load_alevin_mtx, 
    scanpy_load_kallisto_gene_mtx, 
    build_anndata
)

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Directory holding the mm10-M21-male genome annotation files.
genome_dir = Path('~/proj/genome').expanduser() / 'mm10-M21-male'

# Read the annotation table (the first key stored in the HDF5 file).
# The store is opened read-only so a wrong path fails loudly instead of
# creating an empty file, and the context manager guarantees the handle is
# closed even when the read raises.
with pandas.HDFStore(genome_dir / 'mm10-M21-male.h5', mode='r') as store:
    gtf = store[store.keys()[0]]

def add_gene_symbols_to_anndata(adata, gtf, key="gene_id"):
    """Annotate the features of ``adata`` with gene names and gene types.

    Parameters
    ----------
    adata
        Object whose ``var_names`` hold the feature ids; it is modified in
        place (``var["gene_symbols"]``, ``var["gene_types"]`` and
        ``uns["feature_type"]`` are set).
    gtf : pandas.DataFrame
        Annotation table with at least the columns ``type``, ``source``,
        ``gene_name``, ``gene_type`` and the column named by ``key``.
    key : str
        ``"gene_id"`` for a gene-level matrix or ``"transcript_id"`` for a
        transcript-level matrix.

    Returns
    -------
    The same ``adata`` object that was passed in.

    Raises
    ------
    ValueError
        If ``key`` is neither ``"gene_id"`` nor ``"transcript_id"``.
    KeyError
        If any feature in ``adata.var_names`` has no annotation row.
    """
    if key == "gene_id":
        # We have a gene matrix
        feature_rows = gtf["type"].isin(["gene", "tRNA"])
        feature_type = "gene"
    elif key == "transcript_id":
        feature_rows = gtf["type"].isin(["transcript", "tRNA"])
        feature_type = "transcript"
    else:
        raise ValueError(
            "Unrecognized key expected gene_id or transcript_id got {}".format(key)
            )

    # Spike-in records are kept regardless of their record type.
    info = gtf[feature_rows | (gtf["source"] == "spikein")].set_index(key)
    # The spike-in clause can admit several rows with the same id; keep one
    # row per id so every lookup below yields a scalar rather than a Series.
    info = info[~info.index.duplicated(keep="first")]

    var_names = pandas.Index(adata.var_names)
    missing = var_names[~var_names.isin(info.index)]
    if len(missing) > 0:
        raise KeyError(
            "{} feature(s) missing from the annotation, e.g. {}".format(
                len(missing), list(missing[:5])
            )
        )

    # One vectorised lookup instead of two .loc calls per feature.
    annotations = info.loc[var_names, ["gene_name", "gene_type"]]
    adata.var["gene_symbols"] = annotations["gene_name"].to_numpy()
    adata.var["gene_types"] = annotations["gene_type"].to_numpy()
    adata.uns["feature_type"] = feature_type
    return adata


In [4]:
# Root directory (relative to the working directory) under which the
# per-algorithm analysis directories are looked up.
analysis_root = Path('10x_paper')

In [5]:
# Map each quantification variant to its directory under analysis_root;
# the directory name is the same as the variant name.
analysis_dirs = {
    variant: analysis_root / variant
    for variant in (
        'kallisto_em',
        'kallisto_em_minimal',
        'alevin',
        'alevin_minimal',
    )
}

In [6]:
# Run id -> timepoint label for the time-course runs.
# Two runs (10x-5 and 10x-12) carry the same 'e13.0' label.
timecourse = {
    '10x-7': 'e10.5',
    '10x-3': 'e11.0',
    '10x-4': 'e12.0',
    '10x-5': 'e13.0',
    '10x-12': 'e13.0',
    '10x-1': 'e13.5',
    '10x-13': 'e14.0',
    '10x-6': 'e15.0',
}
# Run id -> label for the e15.0 whole / mid / distal runs.
# The 'proximal' run (10x-9) is commented out and therefore excluded;
# NOTE(review): the reason for excluding it is not recorded here.
sausage = {
    '10x-8': 'e15.0 whole',
    #'10x-9': 'e15.0 proximal',
    '10x-10': 'e15.0 mid',
    '10x-11': 'e15.0 distal',    
}

In [7]:
# Quantification variant to load; must be a key of ``analysis_dirs``.
algorithm = 'kallisto_em_minimal'
#algorithm = 'alevin'
run_matrices = []
t0 = time.monotonic()
tprev = t0
tnow = t0
for run in timecourse:
    run_dir = analysis_dirs[algorithm] / run
    if algorithm.startswith('kallisto'):
        # The kallisto loader is given the per-run filtered barcode list.
        filtered_list = run_dir / 'filtered-barcodes.txt'
        run_adata = scanpy_load_kallisto_gene_mtx(run_dir / 'genecount', filtered_list)
    elif algorithm.startswith('alevin'):
        run_adata = scanpy_load_alevin_mtx(run_dir)
    else:
        # Without this guard the annotations below would be written onto
        # the previous run's matrix (or fail with an IndexError).
        raise ValueError("No loader for algorithm {!r}".format(algorithm))

    # Record where each cell came from so it survives concatenation.
    run_adata.obs['run'] = run
    run_adata.obs['timepoint'] = timecourse[run]
    add_gene_symbols_to_anndata(run_adata, gtf)
    # Flag mitochondrial genes (symbols starting with 'mt-') for the QC metrics.
    run_adata.var['mt'] = run_adata.var['gene_symbols'].fillna('').str.startswith('mt-')
    run_matrices.append(run_adata)

    tnow = time.monotonic()
    # Fixed-point seconds; the general float format can fall into
    # scientific notation (e.g. 7.6e+01s).
    print("{} {} {} {:.1f}s".format(algorithm, run, run_adata.shape, tnow - tprev))
    tprev = tnow

# Stack all runs into one matrix, then compute per-cell and per-gene QC
# metrics, including the mitochondrial totals / percentages for 'mt'.
run_matrix = anndata.AnnData.concatenate(*run_matrices)
scanpy.pp.calculate_qc_metrics(run_matrix, qc_vars=['mt'], percent_top=None, inplace=True)
run_matrix

kallisto_em_minimal 10x-7 (7835, 31635) 7.6e+01s
kallisto_em_minimal 10x-3 (18818, 31635) 9.8e+01s
kallisto_em_minimal 10x-4 (7178, 31635) 4.3e+01s
kallisto_em_minimal 10x-5 (11230, 31635) 5.3e+01s
kallisto_em_minimal 10x-12 (12140, 31635) 8.4e+01s
kallisto_em_minimal 10x-1 (8899, 31635) 6.1e+01s
kallisto_em_minimal 10x-13 (10629, 31635) 7.3e+01s
kallisto_em_minimal 10x-6 (14989, 31635) 6.6e+01s


AnnData object with n_obs × n_vars = 91718 × 31635 
    obs: 'batch', 'counts', 'ngenes', 'run', 'timepoint', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt'
    var: 'gene_symbols', 'gene_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [11]:
# NOTE(review): the recorded output of this cell (47909 cells) does not match
# the 91718 cells reported by the load cell, and the execution counts of the
# trailing cells run backwards (11, 10, 9). A filtering step that is not part
# of this notebook was apparently applied; restore it so Restart & Run All
# reproduces this number.
run_matrix.shape

(47909, 31635)

In [10]:
# Shape of the subset of cells in which fewer than 1000 genes were detected.
few_genes_mask = run_matrix.obs.n_genes_by_counts < 1000
run_matrix[few_genes_mask, :].shape

(7337, 31635)

In [9]:
# NOTE(review): interactive help lookup left over from development; it only
# opens the docstring pager and can be removed from the finished notebook.
scanpy.pp.calculate_qc_metrics?