# Introduction

I've found it helpful to have one notebook build up a cache of the data I want to analyze. I'm still not sure whether anndata or loom is the better storage format.

This notebook collects the 10x e10.5 forelimb runs.

In [1]:
import pandas
import scanpy
from pathlib import Path
import sys
from urllib import parse
import loompy
import numpy
import anndata
import scanpy

from common import (
    scanpy_load_solo_mtx, 
    scanpy_load_alevin_mtx, 
    scanpy_load_kallisto_gene_mtx, 
    build_anndata
)

In [2]:
# Add the local long-rna-seq-condor checkout to sys.path (only once, so
# re-running the cell does not append duplicates). The woldrnaseq imports
# below presumably resolve from this checkout, so they must come after it.
LRSC = str(Path('~/proj/long-rna-seq-condor').expanduser())
if LRSC not in sys.path:
    sys.path.append(LRSC)
from woldrnaseq.madqc import load_rsem_quantifications, load_genomic_quantifications, load_transcriptome_quantifications, replicate_scores
from woldrnaseq.models import load_library_tables, load_experiments, load_all_star_final, load_all_star_counts

# Load GTF

In [3]:
# Read the first table stored in the annotation HDF5 file. Using the store as
# a context manager guarantees it is closed even if reading the table raises.
with pandas.HDFStore(Path('~/proj/genome/mm10-M21-male/mm10-M21-male.h5').expanduser()) as store:
    gtf = store[store.keys()[0]]

In [4]:
# Display the annotation columns available in the GTF table.
gtf.columns

Index(['chromosome', 'source', 'type', 'start', 'stop', 'score', 'strand',
       'frame', 'gene_id', 'transcript_id', 'gene_type', 'gene_name',
       'transcript_type', 'transcript_name', 'level', 'havana_gene',
       'transcript_support_level', 'tag', 'havana_transcript', 'exon_number',
       'exon_id', 'protein_id', 'ccdsid', 'ont'],
      dtype='object')

In [5]:
# Gene-level annotation: rows typed 'gene' or 'tRNA' plus every row whose
# source is 'spikein', indexed by gene_id.
gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')].set_index('gene_id')

# Load Transcript Map

In [6]:
def load_transcript_map(triplet, root='genome'):
    """Load a transcript-id to gene-id mapping from ``<root>/<triplet>/txp2gene.tsv``.

    Parameters
    ----------
    triplet : str
        Name of the sub-directory of ``root`` that holds ``txp2gene.tsv``.
    root : str or pathlib.Path, optional
        Directory containing the per-genome sub-directories. Defaults to the
        relative path ``genome``, which was previously hard-coded.

    Returns
    -------
    dict
        Maps each transcript id (first column) to its gene id (second column).

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two tab-separated fields.
    """
    txp_map = {}
    with open(Path(root) / triplet / 'txp2gene.tsv', 'rt') as instream:
        for line in instream:
            line = line.rstrip()
            # Blank lines (e.g. a trailing empty line) carry no mapping; skip
            # them instead of failing on the two-field unpack below.
            if not line:
                continue
            txp, gene = line.split('\t')
            txp_map[txp] = gene
    return txp_map


In [7]:
# Transcript-to-gene lookup for the mm10-M21-male annotation; the number of
# mapped transcripts is displayed as a sanity check.
mm10_M21_map = load_transcript_map('mm10-M21-male')
len(mm10_M21_map)

168207

In [8]:
# Project root plus the two run directories used below: tenx_sc_root holds the
# single-cell h5ad files, tenx_pop_root holds the population quantifications
# and receives the h5ad files written at the end of the notebook.
root = Path('~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/').expanduser()
tenx_sc_root = root / 'ENCSR874BOF_e10_5_limb'
tenx_pop_root =  root / '10x_e10.5'

# Load single-cell tables and sum them per feature

In [9]:
def load_sc_anndata(filename, label):
    """Collapse an h5ad file into a single summed column per feature.

    The file is read with scanpy, its ``X`` matrix is transposed and summed
    along axis 1, and the totals are returned as a one-column DataFrame that
    is indexed by ``var_names`` and whose column is named ``label``.
    """
    cells = scanpy.read_h5ad(filename)
    per_feature_total = cells.X.T.sum(axis=1)
    summed = pandas.DataFrame(
        per_feature_total,
        index=cells.var_names,
        columns=[label],
    )
    return summed

In [10]:
# One summed single-column table per single-cell quantifier; the second
# argument becomes the column name used when the gene matrix is assembled.
cellr_sc = load_sc_anndata(tenx_sc_root / "cellranger_filtered.sparse.h5ad", 'sc_cellr')
solo_sc = load_sc_anndata(tenx_sc_root / "solo_filtered.sparse.h5ad", 'sc_solo')
alevin_sc = load_sc_anndata(tenx_sc_root / 'alevin_filtered.h5ad', 'sc_alevin')
kallisto_sc = load_sc_anndata(tenx_sc_root / 'kallisto_filtered.h5ad', 'sc_kallisto')
kallisto_em_sc = load_sc_anndata(tenx_sc_root / 'kallisto_em_filtered.h5ad', 'sc_kallisto_em')


# Load Population RSEM values

In [11]:
# Experiment and library tables describing the merged population run.
# NOTE(review): the library file name uses '.merged' where the experiment file
# uses '_merged' — confirm this difference is intentional.
pop_rsem_experiment = load_experiments([root / 'experiments_10x_e10.5_merged.tsv'])
pop_rsem_libraries = load_library_tables([root / 'libraries_10x_e10.5.merged.tsv'])

In [12]:
# STAR summary statistics for the population libraries (loaded by woldrnaseq;
# presumably parsed from STAR's final log — TODO confirm).
pop_star_stats = load_all_star_final(pop_rsem_libraries)

In [13]:
# Display the number of input reads STAR reported for the 10x_e10.5 library.
pop_star_stats.loc['10x_e10.5', ('', 'Number of input reads')]

528888565.0

In [14]:
# Gene-level RSEM tables for the merged experiment: expected counts and TPM.
pop_rsem_gene_count = load_genomic_quantifications(pop_rsem_experiment.loc['10x_e10.5_merged'], pop_rsem_libraries, column='expected_count')
pop_rsem_gene_tpm = load_genomic_quantifications(pop_rsem_experiment.loc['10x_e10.5_merged'], pop_rsem_libraries, column='TPM')

quantifications 10x_e10.5_merged (81881, 1)
quantifications 10x_e10.5_merged (81881, 1)


In [15]:
# Transcript-level RSEM tables for the merged experiment: expected counts and TPM.
pop_rsem_transcript_count = load_transcriptome_quantifications(pop_rsem_experiment.loc['10x_e10.5_merged'], pop_rsem_libraries, column='expected_count')
pop_rsem_transcript_tpm = load_transcriptome_quantifications(pop_rsem_experiment.loc['10x_e10.5_merged'], pop_rsem_libraries, column='TPM')

quantifications 10x_e10.5_merged (168207, 1)
quantifications 10x_e10.5_merged (168207, 1)


# Read rsem paired

In [16]:
# Same RSEM loading as above, but for the 'paired_all' experiment and library
# tables: gene- and transcript-level expected counts and TPM.
pop_rsem_paired_experiment = load_experiments([root / 'experiments_10x_e10.5_paired_all.tsv'])
pop_rsem_paired_libraries = load_library_tables([root / 'libraries_10x_e10.5_paired_all.tsv'])
pop_rsem_gene_paired_count = load_genomic_quantifications(pop_rsem_paired_experiment.loc['10x_e10.5_paired_all'], pop_rsem_paired_libraries, column='expected_count')
pop_rsem_gene_paired_tpm = load_genomic_quantifications(pop_rsem_paired_experiment.loc['10x_e10.5_paired_all'], pop_rsem_paired_libraries, column='TPM')
pop_rsem_transcript_paired_count = load_transcriptome_quantifications(pop_rsem_paired_experiment.loc['10x_e10.5_paired_all'], pop_rsem_paired_libraries, column='expected_count')
pop_rsem_transcript_paired_tpm = load_transcriptome_quantifications(pop_rsem_paired_experiment.loc['10x_e10.5_paired_all'], pop_rsem_paired_libraries, column='TPM')

quantifications 10x_e10.5_paired_all (81881, 1)
quantifications 10x_e10.5_paired_all (81881, 1)
quantifications 10x_e10.5_paired_all (168207, 1)
quantifications 10x_e10.5_paired_all (168207, 1)


# Read STAR ReadsPerGene.tab

In [17]:
# STAR per-gene counts, reordered to follow the RSEM gene index.
# column='U' presumably selects the unstranded count column — TODO confirm.
pop_star_gene_count = load_all_star_counts(pop_rsem_libraries, column='U').reindex(pop_rsem_gene_count.index)

# Read STAR Paired ReadsPerGene.tab

In [18]:
# STAR per-gene counts for the paired libraries, reordered to follow the
# paired RSEM gene index.
pop_star_gene_paired_count = load_all_star_counts(pop_rsem_paired_libraries, column='U').reindex(pop_rsem_gene_paired_count.index)

# Read kallisto results

In [19]:
# Transcript-level kallisto abundances (estimated counts and TPM), indexed by target_id.
pop_kallisto_transcript = pandas.read_csv(tenx_pop_root / 'kallisto_e10.5' / 'abundance.tsv', sep='\t', index_col=0, usecols=['target_id', 'est_counts', 'tpm'])
#pop_kallisto_transcript.columns = ['kallisto_gene_counts']
# Attach the owning gene to every transcript (raises KeyError for a transcript
# missing from the map), then sum transcripts per gene.
pop_kallisto_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_kallisto_transcript.index]
pop_kallisto_gene_count = pop_kallisto_transcript.groupby('gene_id')['est_counts'].sum()
pop_kallisto_gene_tpm = pop_kallisto_transcript.groupby('gene_id')['tpm'].sum()
pop_kallisto_gene_count.shape


(81881,)

In [20]:
# Transcript-level kallisto abundances for the paired bulk run.
# NOTE(review): this reads from tenx_sc_root while the unpaired run reads from
# tenx_pop_root — confirm the directory is intentional.
pop_kallisto_paired_transcript = pandas.read_csv(tenx_sc_root / 'kallisto_paired_bulk_e10.5' / 'abundance.tsv', sep='\t', index_col=0, usecols=['target_id', 'est_counts', 'tpm'])
# Map and aggregate the *paired* table itself. The previous version used
# pop_kallisto_transcript here, so the "paired" gene tables were just copies
# of the unpaired results.
pop_kallisto_paired_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_kallisto_paired_transcript.index]
pop_kallisto_paired_gene_count = pop_kallisto_paired_transcript.groupby('gene_id')['est_counts'].sum()
pop_kallisto_paired_gene_tpm = pop_kallisto_paired_transcript.groupby('gene_id')['tpm'].sum()
pop_kallisto_paired_gene_count.shape


(81881,)

# Read Salmon Decoy results

In [21]:
# Salmon (decoy-aware run) transcript quantifications, aligned to the RSEM
# transcript index; transcripts absent from quant.sf are filled with 0.0.
pop_salmon_decoy_transcript = pandas.read_csv(tenx_pop_root / 'salmon_decoy_e10.5' / 'quant.sf', sep='\t', index_col=0, usecols=['Name', 'NumReads', 'TPM']).reindex(pop_rsem_transcript_count.index, fill_value=0.0)
pop_salmon_decoy_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_salmon_decoy_transcript.index]

# Sum transcript values per gene.
pop_salmon_decoy_gene_count = pop_salmon_decoy_transcript.groupby('gene_id')['NumReads'].sum()
pop_salmon_decoy_gene_tpm = pop_salmon_decoy_transcript.groupby('gene_id')['TPM'].sum()

pop_salmon_decoy_gene_count.shape

(81881,)

# Read Salmon results

In [22]:
# Salmon transcript quantifications, aligned to the RSEM transcript index;
# transcripts absent from quant.sf are filled with 0.0.
pop_salmon_transcript = pandas.read_csv(tenx_pop_root / 'salmon_e10.5' / 'quant.sf', sep='\t', index_col=0, usecols=['Name', 'NumReads', 'TPM']).reindex(pop_rsem_transcript_count.index, fill_value=0.0)
pop_salmon_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_salmon_transcript.index]

# Sum transcript values per gene.
pop_salmon_gene_tpm = pop_salmon_transcript.groupby('gene_id')['TPM'].sum()

pop_salmon_gene_count = pop_salmon_transcript.groupby('gene_id')['NumReads'].sum()
# reindex returns a new Series rather than modifying in place, so the result
# must be assigned back; previously it was computed and thrown away.
pop_salmon_gene_count = pop_salmon_gene_count.reindex(pop_star_gene_count.index, fill_value=0.0)

pop_salmon_gene_count.shape

(81881,)

# Save results

In [23]:
def build_loom(filename, matrix, quantification_name, gtf):
    """Write a feature-by-experiment matrix to a loom file with GTF annotations.

    Whether ``matrix`` holds genes or transcripts is inferred from its row
    count: it must equal the number of gene-level rows or the number of
    transcript-level rows selected from ``gtf``.

    Parameters
    ----------
    filename : str or path-like
        Destination loom file; converted with ``str()`` before being passed
        to ``loompy.create``.
    matrix : pandas.DataFrame
        Rows are feature ids (gene_id or transcript_id), columns are experiments.
    quantification_name : str
        Stored as the ``quantification_name`` file attribute.
    gtf : pandas.DataFrame
        Annotation table providing the ``type``, ``source``, ``gene_id``,
        ``transcript_id``, ``gene_name`` and ``gene_type`` columns.

    Raises
    ------
    ValueError
        If the row count of ``matrix`` matches neither the gene nor the
        transcript annotation.
    """
    gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')]
    transcript_info = gtf[(gtf['type'].isin(['transcript', 'tRNA'])) | (gtf['source'] == 'spikein')]

    if matrix.shape[0] == gene_info.shape[0]:
        # We have a gene matrix
        info = gene_info.set_index('gene_id')
        feature_type = 'gene'
    elif matrix.shape[0] == transcript_info.shape[0]:
        # We have a transcript matrix
        info = transcript_info.set_index('transcript_id')
        feature_type = 'transcript'
    else:
        # Report the size of the matrix that was actually passed in; the old
        # code referenced an undefined name here and raised NameError instead.
        raise ValueError('Unrecognized shape expected {} or {} got {}'.format(
            gene_info.shape[0],
            transcript_info.shape[0],
            matrix.shape[0],
        ))

    # Look up the annotations in matrix row order so they line up with the data.
    gene_names = [info.loc[feature, 'gene_name'] for feature in matrix.index]
    gene_types = [info.loc[feature, 'gene_type'] for feature in matrix.index]

    row_attrs = {
        'id': numpy.asarray(matrix.index),
        'gene_name': numpy.asarray(gene_names),
        'gene_type': numpy.asarray(gene_types),
    }
    column_attrs = {
        'experiment': numpy.asarray(matrix.columns),
    }
    file_attrs = {
        'quantification_name': quantification_name,
        'feature_type': feature_type,
    }
    loompy.create(str(filename), matrix.values, row_attrs=row_attrs, col_attrs=column_attrs, file_attrs=file_attrs)


In [24]:
def build_anndata(filename, matrix, quantification_name, gtf):
    """Write a feature-by-experiment matrix to an h5ad file with GTF annotations.

    Note that this definition replaces the ``build_anndata`` imported from
    ``common`` at the top of the notebook for every cell that runs after it.

    Whether ``matrix`` holds genes or transcripts is inferred from its row
    count: it must equal the number of gene-level rows or the number of
    transcript-level rows selected from ``gtf``.

    Parameters
    ----------
    filename : str or path-like
        Destination passed to ``AnnData.write_h5ad``.
    matrix : pandas.DataFrame
        Rows are feature ids (gene_id or transcript_id), columns are
        experiments. It is transposed before being wrapped in ``AnnData``.
    quantification_name : str
        Stored in ``adata.uns['quantification_name']``.
    gtf : pandas.DataFrame
        Annotation table providing the ``type``, ``source``, ``gene_id``,
        ``transcript_id``, ``gene_name`` and ``gene_type`` columns.

    Raises
    ------
    ValueError
        If the row count of ``matrix`` matches neither the gene nor the
        transcript annotation.
    """
    gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')]
    transcript_info = gtf[(gtf['type'].isin(['transcript', 'tRNA'])) | (gtf['source'] == 'spikein')]

    if matrix.shape[0] == gene_info.shape[0]:
        # We have a gene matrix
        info = gene_info.set_index('gene_id')
        feature_type = 'gene'
    elif matrix.shape[0] == transcript_info.shape[0]:
        # We have a transcript matrix
        info = transcript_info.set_index('transcript_id')
        feature_type = 'transcript'
    else:
        # Report the size of the matrix that was actually passed in; the old
        # code referenced an undefined name here and raised NameError instead.
        raise ValueError('Unrecognized shape expected {} or {} got {}'.format(
            gene_info.shape[0],
            transcript_info.shape[0],
            matrix.shape[0],
        ))

    # Look up the annotations in matrix row order so they line up with adata.var.
    gene_names = [info.loc[feature, 'gene_name'] for feature in matrix.index]
    gene_types = [info.loc[feature, 'gene_type'] for feature in matrix.index]

    adata = anndata.AnnData(matrix.T)
    adata.var['gene_symbol'] = gene_names
    adata.var['gene_type'] = gene_types
    adata.uns['quantification_name'] = quantification_name
    adata.uns['feature_type'] = feature_type

    adata.write_h5ad(filename)

# Build Gene Matrix

In [25]:
# Align every single-cell table to the RSEM gene index so all columns of the
# gene matrix share one row order; genes missing from a table become NaN.
cellr_sc = cellr_sc.reindex(pop_rsem_gene_count.index)
solo_sc = solo_sc.reindex(pop_rsem_gene_count.index)
kallisto_sc = kallisto_sc.reindex(pop_rsem_gene_count.index)
kallisto_em_sc = kallisto_em_sc.reindex(pop_rsem_gene_count.index)
alevin_sc = alevin_sc.reindex(pop_rsem_gene_count.index)

In [26]:
# Chain of pairwise checks: every single-cell table shares the RSEM gene index.
assert numpy.all(cellr_sc.index == solo_sc.index)
assert numpy.all(solo_sc.index == alevin_sc.index)
assert numpy.all(alevin_sc.index == kallisto_sc.index)
assert numpy.all(kallisto_sc.index == kallisto_em_sc.index)
assert numpy.all(kallisto_em_sc.index == pop_rsem_gene_count.index)

# Chain of pairwise checks: every population gene table (paired and unpaired)
# has the same index in the same order.
assert numpy.all(pop_rsem_gene_paired_count.index == pop_star_gene_paired_count.index)
assert numpy.all(pop_star_gene_paired_count.index == pop_kallisto_paired_gene_count.index)
assert numpy.all(pop_kallisto_paired_gene_count.index == pop_rsem_gene_count.index)
assert numpy.all(pop_rsem_gene_count.index == pop_star_gene_count.index)
assert numpy.all(pop_star_gene_count.index == pop_kallisto_gene_count.index)
assert numpy.all(pop_kallisto_gene_count.index == pop_salmon_decoy_gene_count.index)
assert numpy.all(pop_salmon_decoy_gene_count.index == pop_salmon_gene_count.index)


In [27]:
# Gene-by-method count matrix: one column per quantifier, single-cell sums
# first, then the population methods. The paired columns are currently disabled.
gene_counts = pandas.DataFrame({
    'sc_cellr': cellr_sc['sc_cellr'],
    'sc_solo': solo_sc['sc_solo'],
    'sc_alevin': alevin_sc['sc_alevin'],
    'sc_kallisto': kallisto_sc['sc_kallisto'],
    'sc_kallisto_em': kallisto_em_sc['sc_kallisto_em'], 
    #'pop_rsem_paired': pop_rsem_gene_paired_count['10x_e10.5_paired_all'],
    #'pop_star_paired': pop_star_gene_paired_count['10x_e10.5_paired_all'],
    'pop_rsem': pop_rsem_gene_count['10x_e10.5'],
    'pop_star': pop_star_gene_count['10x_e10.5'],
    'pop_kallisto': pop_kallisto_gene_count,
    #'pop_kallisto_paired_all': pop_kallisto_paired_gene_count,
    'pop_salmon_decoy': pop_salmon_decoy_gene_count,
    'pop_salmon': pop_salmon_gene_count,
})
gene_counts

Unnamed: 0_level_0,sc_cellr,sc_solo,sc_alevin,sc_kallisto,sc_kallisto_em,pop_rsem,pop_star,pop_kallisto,pop_salmon_decoy,pop_salmon
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10000,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0
10001,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
gSpikein_ERCC-00165,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
gSpikein_ERCC-00168,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
gSpikein_ERCC-00170,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
gSpikein_ERCC-00171,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [28]:
# The paired TPM tables must share the unpaired RSEM gene index.
assert numpy.all(pop_rsem_gene_paired_tpm.index == pop_kallisto_paired_gene_tpm.index)
assert numpy.all(pop_kallisto_paired_gene_tpm.index == pop_rsem_gene_tpm.index)

# The unpaired TPM tables must all share one gene index.
assert numpy.all(pop_rsem_gene_tpm.index == pop_kallisto_gene_tpm.index)
assert numpy.all(pop_kallisto_gene_tpm.index == pop_salmon_decoy_gene_tpm.index)
assert numpy.all(pop_salmon_decoy_gene_tpm.index == pop_salmon_gene_tpm.index)

In [29]:
# Gene-by-method TPM matrix for the population quantifiers; the paired
# columns are currently disabled.
gene_tpm = pandas.DataFrame({
    'pop_rsem': pop_rsem_gene_tpm['10x_e10.5'],
    #'pop_rsem_paired': pop_rsem_gene_paired_tpm['10x_e10.5_paired_all'],    
    'pop_kallisto': pop_kallisto_gene_tpm,
    #'pop_kallisto_paired_all': pop_kallisto_paired_gene_tpm,
    'pop_salmon_decoy': pop_salmon_decoy_gene_tpm,
    'pop_salmon': pop_salmon_gene_tpm,
})
gene_tpm

Unnamed: 0_level_0,pop_rsem,pop_kallisto,pop_salmon_decoy,pop_salmon
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000,0.00,0.000000,0.00000,0.000000
10001,0.00,0.000000,0.00000,0.000000
10002,0.00,0.000000,0.00000,0.000000
10003,0.00,0.000000,0.00000,0.000000
10004,0.00,0.000000,0.00000,0.000000
...,...,...,...,...
gSpikein_ERCC-00165,0.00,0.000000,0.00000,0.000000
gSpikein_ERCC-00168,0.00,0.000000,0.00000,0.000000
gSpikein_ERCC-00170,0.00,0.000000,0.00000,0.000000
gSpikein_ERCC-00171,0.00,0.000000,0.00000,0.000000


In [30]:
#build_loom(tenx_pop_root / '10x_e10.5_gene_counts.loom', gene_counts, 'counts', gtf)

In [31]:
# Cache the gene-level count matrix as an h5ad file.
build_anndata(tenx_pop_root / '10x_e10.5_gene_counts.h5ad', gene_counts, 'counts', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [32]:
#build_loom(tenx_pop_root / '10x_e10.5_gene_tpms.loom', gene_tpm, 'TPM', gtf)

In [33]:
# Cache the gene-level TPM matrix as an h5ad file.
build_anndata(tenx_pop_root / '10x_e10.5_gene_tpms.h5ad', gene_tpm, 'TPM', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Build Transcript Matrix

In [34]:
# Transcript count tables must share one index in the same order.
assert numpy.all(pop_rsem_transcript_count.index == pop_kallisto_transcript.index)
assert numpy.all(pop_rsem_transcript_count.index == pop_rsem_transcript_paired_count.index)
assert numpy.all(pop_kallisto_transcript.index == pop_salmon_decoy_transcript.index)
assert numpy.all(pop_salmon_decoy_transcript.index == pop_salmon_transcript.index)

# Same check for the tables that supply the TPM columns, including the paired
# kallisto table. NOTE(review): the last assertion repeats the salmon check
# from the first group.
assert numpy.all(pop_rsem_transcript_tpm.index == pop_kallisto_transcript.index)
assert numpy.all(pop_kallisto_transcript.index == pop_kallisto_paired_transcript.index)
assert numpy.all(pop_kallisto_paired_transcript.index == pop_salmon_decoy_transcript.index)
assert numpy.all(pop_salmon_decoy_transcript.index == pop_salmon_transcript.index)


In [35]:
# Transcript-by-method count matrix for the population quantifiers; the
# paired columns are currently disabled.
transcript_counts = pandas.DataFrame({
    'pop_rsem': pop_rsem_transcript_count['10x_e10.5'],
    #'pop_rsem_paired': pop_rsem_transcript_paired_count['10x_e10.5_paired_all'],
    'pop_kallisto': pop_kallisto_transcript['est_counts'],
    #'pop_kallisto_paired_all': pop_kallisto_paired_transcript['est_counts'],
    'pop_salmon_decoy': pop_salmon_decoy_transcript['NumReads'],
    'pop_salmon': pop_salmon_transcript['NumReads'],
})
transcript_counts

Unnamed: 0,pop_rsem,pop_kallisto,pop_salmon_decoy,pop_salmon
10000,0.0,0.0,0.0,0.0
10001,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0
10004,0.0,0.0,0.0,0.0
...,...,...,...,...
tSpikein_ERCC-00165,0.0,0.0,0.0,0.0
tSpikein_ERCC-00168,0.0,0.0,0.0,0.0
tSpikein_ERCC-00170,0.0,0.0,0.0,0.0
tSpikein_ERCC-00171,0.0,0.0,0.0,0.0


In [36]:
# Transcript-by-method TPM matrix for the population quantifiers; the paired
# columns are currently disabled.
transcript_tpm = pandas.DataFrame({
    'pop_rsem': pop_rsem_transcript_tpm['10x_e10.5'],
    #'pop_rsem_paired': pop_rsem_transcript_paired_tpm['10x_e10.5_paired_all'],
    'pop_kallisto': pop_kallisto_transcript['tpm'],
    #'pop_kallisto_paired_all': pop_kallisto_paired_transcript['tpm'],
    'pop_salmon_decoy': pop_salmon_decoy_transcript['TPM'],
    'pop_salmon': pop_salmon_transcript['TPM'],
})
transcript_tpm

Unnamed: 0,pop_rsem,pop_kallisto,pop_salmon_decoy,pop_salmon
10000,0.00,0.000000,0.00000,0.000000
10001,0.00,0.000000,0.00000,0.000000
10002,0.00,0.000000,0.00000,0.000000
10003,0.00,0.000000,0.00000,0.000000
10004,0.00,0.000000,0.00000,0.000000
...,...,...,...,...
tSpikein_ERCC-00165,0.00,0.000000,0.00000,0.000000
tSpikein_ERCC-00168,0.00,0.000000,0.00000,0.000000
tSpikein_ERCC-00170,0.00,0.000000,0.00000,0.000000
tSpikein_ERCC-00171,0.00,0.000000,0.00000,0.000000


In [37]:
#build_loom(tenx_pop_root / '10x_e10.5_transcript_counts.loom', transcript_counts, 'counts', gtf)

In [38]:
# Cache the transcript-level count matrix as an h5ad file.
build_anndata(tenx_pop_root / '10x_e10.5_transcript_counts.h5ad', transcript_counts, 'counts', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [39]:
#build_loom(tenx_pop_root / '10x_e10.5_transcript_tpm.loom', transcript_tpm, 'TPM', gtf)

In [40]:
# Cache the transcript-level TPM matrix as an h5ad file.
build_anndata(tenx_pop_root / '10x_e10.5_transcript_tpm.h5ad', transcript_tpm, 'TPM', gtf) 

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
