# Introduction

We need to load the per-cell C1 quantifications, though only some of the cells are usable.

We're moving away from the e10.5 data because those runs were done first and their spike-ins are confusing.

In [1]:
import pandas
import scanpy
from pathlib import Path
import sys
from urllib import parse
import loompy
import re
import numpy
import anndata

In [2]:
# Add the local long-rna-seq-condor checkout to sys.path (once) before
# importing the woldrnaseq loaders used throughout this notebook.
# Presumably woldrnaseq lives inside that checkout -- confirm.
LRSC = str(Path('~/proj/long-rna-seq-condor').expanduser())
if LRSC not in sys.path:
    sys.path.append(LRSC)
from woldrnaseq.models import load_experiments, load_library_tables, load_all_star_counts
from woldrnaseq.madqc import load_genomic_quantifications, load_transcriptome_quantifications

In [3]:
# Load the annotation table stored under the first key of the genome HDF5
# file. The context manager closes the store even if the read raises, and
# mode='r' prevents a mistyped path from silently creating an empty file
# (HDFStore opens in append mode by default).
with pandas.HDFStore(
    Path('~/proj/genome/mm10-M21-male/mm10-M21-male.h5').expanduser(),
    mode='r',
) as store:
    gtf = store[store.keys()[0]]


In [4]:
# Previously written e10.5 RSEM AnnData files. Their var_names are used below
# as the reference feature order when reindexing the kallisto and STAR tables.
# NOTE: both names are rebound in the "Write out RSEM results" section to the
# tables returned by the woldrnaseq loaders; cells after that point use
# .index instead of .var_names, so this notebook must be run top to bottom.
rsem_gene_counts = scanpy.read_h5ad('c1_e10.5/c1_cell_e10.5_rsem_gene_counts.h5ad')
rsem_transcript_counts = scanpy.read_h5ad('c1_e10.5/c1_cell_e10.5_rsem_transcript_counts.h5ad')

In [5]:
def load_transcript_map(triplet, root=Path('genome')):
    """Read the transcript-to-gene lookup table for one genome build.

    Parameters
    ----------
    triplet : str
        Name of the genome directory, e.g. ``'mm10-M21-male'``.
    root : path-like, optional
        Directory holding the per-genome directories. Defaults to the
        relative ``genome`` directory.

    Returns
    -------
    dict
        Maps transcript id to gene id, one entry per non-blank line of
        ``<root>/<triplet>/txp2gene.tsv``. Later lines win on duplicates.

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly two tab-separated fields.
    """
    txp_map = {}
    with open(Path(root) / triplet / 'txp2gene.tsv', 'rt') as instream:
        for line in instream:
            line = line.rstrip()
            if not line:
                # Skip blank lines (e.g. a stray empty line at end of file)
                # rather than failing on the two-field unpack below.
                continue
            txp, gene = line.split('\t')
            txp_map[txp] = gene
    return txp_map


In [6]:
# Transcript -> gene lookup for the mm10-M21-male annotation; the bare len()
# displays how many transcripts it contains.
mm10_M21_map = load_transcript_map('mm10-M21-male')
len(mm10_M21_map)

168207

In [7]:
def build_anndata(filename, matrix, quantification_name, gtf, feature_type):
    """Annotate a feature-by-cell matrix and write it out as an h5ad file.

    Parameters
    ----------
    filename : path-like
        Destination handed to ``AnnData.write_h5ad``.
    matrix : pandas.DataFrame
        Features in rows, cells in columns. A ``gene_id`` column, if present,
        is dropped before the matrix is transposed into the AnnData object.
    quantification_name : str
        Stored as ``adata.uns['quantification_name']``.
    gtf : pandas.DataFrame
        Annotation table. Must provide ``type``, ``source``, ``gene_name``
        and ``gene_type`` columns, plus ``gene_id`` or ``transcript_id``
        depending on ``feature_type``.
    feature_type : {'gene', 'transcript'}
        Which annotation records describe the rows of ``matrix``; stored as
        ``adata.uns['feature_type']``.

    Raises
    ------
    ValueError
        If ``feature_type`` is neither ``'gene'`` nor ``'transcript'``.
    KeyError
        If a row label of ``matrix`` is missing from the annotation.
    """
    # Validate before touching the (large) annotation table.
    if feature_type == 'gene':
        record_types = ['gene', 'tRNA']
        id_column = 'gene_id'
    elif feature_type == 'transcript':
        record_types = ['transcript', 'tRNA']
        id_column = 'transcript_id'
    else:
        raise ValueError('Unrecognized feature_type {}'.format(
            feature_type
        ))

    # Spike-in records are kept whatever their record type.
    keep = gtf['type'].isin(record_types) | (gtf['source'] == 'spikein')
    info = gtf[keep].set_index(id_column)

    # Look annotations up in matrix row order so they line up with adata.var.
    gene_names = [info.loc[feature, 'gene_name'] for feature in matrix.index]
    gene_types = [info.loc[feature, 'gene_type'] for feature in matrix.index]

    columns = matrix.columns
    if 'gene_id' in columns:
        # gene_id is a grouping helper column, not a cell.
        columns = columns.drop('gene_id')
    quant_matrix = matrix[columns]

    # AnnData wants observations (cells) in rows, hence the transpose.
    adata = anndata.AnnData(quant_matrix.T)
    adata.var['gene_symbol'] = gene_names
    adata.var['gene_type'] = gene_types
    adata.uns['quantification_name'] = quantification_name
    adata.uns['feature_type'] = feature_type

    adata.write_h5ad(filename)

In [8]:
# Root directory for the pseudo-alignment inputs and for every h5ad file
# written by this notebook.
c1_pseudo = Path('c1_pseudo')

In [9]:
# One input directory per quantifier configuration, all under c1_pseudo.
c1_kallisto = c1_pseudo / 'kallisto'
c1_salmon = c1_pseudo / 'salmon'
c1_salmon_decoy = c1_pseudo / 'salmon_decoy'
c1_kallisto_minimal = c1_pseudo / 'kallisto_minimal'
c1_salmon_minimal = c1_pseudo / 'salmon_minimal'

In [10]:
# Directory holding the library and experiment tables loaded below.
# NOTE(review): unlike the other '~' paths in this notebook, this one is not
# passed through expanduser(); presumably the woldrnaseq loaders expand it
# themselves -- confirm.
c1_remap_root = Path('~/proj/C1_mouse_limb_combined/all_analysis_M21')

In [11]:
# Library table read from libraries-passing.tsv; handed to the STAR and RSEM
# loaders further down.
libraries = load_library_tables([c1_remap_root / 'libraries-passing.tsv'])

In [12]:
experiments = load_experiments([c1_remap_root / 'experiments-by-run-passing.tsv'])

# Experiment names carry a "run<N>" token; extract N into its own integer
# column so individual runs can be filtered on later.
runs = []
for experiment in experiments.index:
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'run(?P<run>\d+)', experiment)
    if match is None:
        # Fail with the offending name instead of an opaque AttributeError.
        raise ValueError(
            'No run number found in experiment name {!r}'.format(experiment)
        )
    runs.append(int(match.group('run')))

experiments['run'] = runs
experiments.head()

Unnamed: 0_level_0,replicates,analysis_dir,run
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,"[18251_A1, 18251_A10, 18251_A11, 18251_A12, 18...",~/proj/C1_mouse_limb_combined/all_analysis_M21,4
C1_mouse_e11.0_limb_mesenchyme_mm10_clean_run5,"[18258_A1, 18258_A10, 18258_A11, 18258_A12, 18...",~/proj/C1_mouse_limb_combined/all_analysis_M21,5
C1_mouse_e11.5_limb_mesenchyme_mm10_clean_run6,"[18263_A1, 18263_A10, 18263_A11, 18263_A12, 18...",~/proj/C1_mouse_limb_combined/all_analysis_M21,6
C1_mouse_e12.5_limb_mesenchyme_mm10_clean_run7,"[18270_A1, 18270_A10, 18270_A11, 18270_A12, 18...",~/proj/C1_mouse_limb_combined/all_analysis_M21,7
C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run8,"[18311_A1, 18311_A10, 18311_A11, 18311_A12, 18...",~/proj/C1_mouse_limb_combined/all_analysis_M21,8


# Write out kallisto results

In [13]:
def load_kallisto(filename, column):
    """Read one quantification column from a kallisto abundance table.

    Parameters
    ----------
    filename : path-like
        Tab-separated file with a ``target_id`` column and the requested
        quantification column.
    column : {'est_counts', 'tpm'}
        Which column to return.

    Returns
    -------
    pandas.Series
        Values of ``column`` indexed by ``target_id``. Target ids are
        returned exactly as they appear in the file.

    Raises
    ------
    AssertionError
        If ``column`` is not one of the supported names.
    """
    assert column in ['est_counts', 'tpm']
    # Only parse the two columns that are actually returned.
    df = pandas.read_csv(
        filename,
        sep='\t',
        usecols=['target_id', column],
    )
    return df.set_index('target_id')[column]

In [14]:
def load_all_kallisto(kallisto_dir, column):
    """Collect one kallisto column for every cell into a single table.

    Iterates the notebook-level ``experiments`` table, skipping experiments
    whose ``run`` is 1 or 2, and reads
    ``<kallisto_dir>/<replicate>/abundance.tsv`` for each replicate.

    Returns a DataFrame with one column per replicate, indexed by target id.
    """
    skipped_runs = (1, 2)
    per_cell = {}
    for _, experiment in experiments.iterrows():
        if experiment.run in skipped_runs:
            continue
        for library_id in experiment.replicates:
            abundance_path = kallisto_dir / library_id / 'abundance.tsv'
            per_cell[library_id] = load_kallisto(abundance_path, column)
    return pandas.DataFrame(per_cell)

In [15]:
# Load kallisto transcript tables and put the rows in the reference RSEM
# transcript order. No fillna here, so any transcript absent from the
# kallisto output stays NaN in the transcript-level tables.
kallisto_transcript_counts = load_all_kallisto(c1_kallisto, 'est_counts').reindex(rsem_transcript_counts.var_names)
kallisto_transcript_tpms = load_all_kallisto(c1_kallisto, 'tpm').reindex(rsem_transcript_counts.var_names)

# Attach the owning gene to each transcript for the per-gene sums below.
# The tpm table reuses the counts index; both were reindexed to the same
# var_names, so the row order is identical.
kallisto_transcript_counts['gene_id'] = [mm10_M21_map[x] for x in kallisto_transcript_counts.index]
kallisto_transcript_tpms['gene_id'] = [mm10_M21_map[x] for x in kallisto_transcript_counts.index]

In [16]:
# Sum transcripts per gene, align the rows to the reference RSEM gene order,
# and give genes that have no row after the grouping a value of 0.
kallisto_gene_counts = kallisto_transcript_counts.groupby('gene_id').sum().reindex(rsem_gene_counts.var_names).fillna(0)
kallisto_gene_tpms = kallisto_transcript_tpms.groupby('gene_id').sum().reindex(rsem_gene_counts.var_names).fillna(0)

In [17]:
# Write the four kallisto tables (transcript/gene x counts/tpm) as h5ad,
# labelling each with the kallisto column it came from.
build_anndata(c1_pseudo / 'kallisto_transcript_counts.h5ad', kallisto_transcript_counts, 'est_counts', gtf, 'transcript')
build_anndata(c1_pseudo / 'kallisto_transcript_tpms.h5ad', kallisto_transcript_tpms, 'tpm', gtf, 'transcript')
build_anndata(c1_pseudo / 'kallisto_gene_counts.h5ad', kallisto_gene_counts, 'est_counts', gtf, 'gene')
build_anndata(c1_pseudo / 'kallisto_gene_tpms.h5ad', kallisto_gene_tpms, 'tpm', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [18]:
# Display the gene-level kallisto TPM table for a visual sanity check.
kallisto_gene_tpms

Unnamed: 0_level_0,18251_A1,18251_A10,18251_A11,18251_A12,18251_A2,18251_A3,18251_A4,18251_A5,18251_A6,18251_A7,...,20048_E8,20048_E9,20049_F1,20049_F2,20049_F3,20049_F4,20049_F5,20049_F6,20049_F7,20049_F8
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gSpikein_ERCC-00165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gSpikein_ERCC-00168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gSpikein_ERCC-00170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gSpikein_ERCC-00171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Display the reference gene index that the tables above were aligned to.
rsem_gene_counts.var_names

Index(['10000', '10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10008', '10009',
       ...
       'gSpikein_ERCC-00158', 'gSpikein_ERCC-00160', 'gSpikein_ERCC-00162',
       'gSpikein_ERCC-00163', 'gSpikein_ERCC-00164', 'gSpikein_ERCC-00165',
       'gSpikein_ERCC-00168', 'gSpikein_ERCC-00170', 'gSpikein_ERCC-00171',
       'gSpikein_phiX174'],
      dtype='object', name='gene_id', length=81881)

# Load Star Counts

In [20]:
# Load STAR per-gene counts for every library in the table.
# column='U' presumably selects the unstranded count column -- confirm
# against woldrnaseq.models.load_all_star_counts.
star_gene_counts = load_all_star_counts(libraries, column='U')

In [21]:
# Keep only the cells present in the kallisto table (same column order) and
# align the rows to the reference RSEM gene order; then display the shape.
star_gene_counts = star_gene_counts[kallisto_gene_counts.columns].reindex(rsem_gene_counts.var_names)
star_gene_counts.shape
                                                                          

(81881, 845)

In [22]:
# Write the STAR gene counts, labelled with the STAR column name used above.
build_anndata(c1_pseudo / 'star_gene_counts.h5ad', star_gene_counts, 'U', gtf, 'gene')

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Write out RSEM results

In [23]:
# Build a single pseudo-experiment named 'combined' whose replicates are all
# cells present in the kallisto table, so RSEM is loaded for the same cells.
combined_experiment = pandas.Series({'replicates': list(kallisto_gene_counts.columns)})
combined_experiment.name = 'combined'

# NOTE: this rebinds rsem_gene_counts / rsem_transcript_counts, which until
# here referred to the e10.5 AnnData objects used as the reference order.
rsem_gene_counts = load_genomic_quantifications(combined_experiment, libraries, 'expected_count')
rsem_gene_tpms = load_genomic_quantifications(combined_experiment, libraries, 'TPM')

rsem_transcript_counts = load_transcriptome_quantifications(combined_experiment, libraries, 'expected_count')
rsem_transcript_tpms = load_transcriptome_quantifications(combined_experiment, libraries, 'TPM')

quantifications combined (81881, 845)
quantifications combined (81881, 845)
quantifications combined (168207, 845)
quantifications combined (168207, 845)


In [24]:
# Write the four RSEM tables (transcript/gene x expected_count/TPM) as h5ad,
# labelling each with the RSEM column it was loaded from.
build_anndata(c1_pseudo / 'rsem_transcript_counts.h5ad', rsem_transcript_counts, 'expected_count', gtf, 'transcript')
build_anndata(c1_pseudo / 'rsem_transcript_tpms.h5ad', rsem_transcript_tpms, 'TPM', gtf, 'transcript')
build_anndata(c1_pseudo / 'rsem_gene_counts.h5ad', rsem_gene_counts, 'expected_count', gtf, 'gene')
build_anndata(c1_pseudo / 'rsem_gene_tpms.h5ad', rsem_gene_tpms, 'TPM', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Write out salmon results

In [25]:
def load_salmon(filename, column):
    """Return one column of a salmon quant table as a Series keyed by ``Name``.

    ``column`` must be ``'TPM'`` or ``'NumReads'``; anything else trips the
    assertion. ``filename`` is read as a tab-separated table.
    """
    assert column in ['TPM', 'NumReads']
    quant = pandas.read_csv(filename, sep='\t')
    by_name = quant.set_index('Name')
    return by_name[column]
    
def load_all_salmon(root, column):
    """Collect one salmon column for every cell into a single table.

    Iterates the notebook-level ``experiments`` table, skipping experiments
    whose ``run`` is 1 or 2, and reads ``<root>/<replicate>/quant.sf`` for
    each replicate.

    Returns a DataFrame with one column per replicate, indexed by transcript
    name.
    """
    skipped_runs = (1, 2)
    per_cell = {}
    for _, experiment in experiments.iterrows():
        if experiment.run in skipped_runs:
            continue
        for library_id in experiment.replicates:
            quant_path = root / library_id / 'quant.sf'
            per_cell[library_id] = load_salmon(quant_path, column)
    return pandas.DataFrame(per_cell)

In [26]:
# Load salmon transcript tables, align rows to the RSEM transcript table
# loaded above (now indexed via .index rather than .var_names), and set
# transcripts absent from the salmon output to 0.
salmon_transcript_counts = load_all_salmon(c1_salmon, 'NumReads').reindex(rsem_transcript_counts.index).fillna(0)
salmon_transcript_tpms = load_all_salmon(c1_salmon, 'TPM').reindex(rsem_transcript_counts.index).fillna(0)

# Attach the owning gene to each transcript. The tpm table reuses the counts
# index; both were reindexed identically, so the row order matches.
salmon_transcript_counts['gene_id'] = [mm10_M21_map[x] for x in salmon_transcript_counts.index]
salmon_transcript_tpms['gene_id'] = [mm10_M21_map[x] for x in salmon_transcript_counts.index]

In [27]:
# Sum transcripts per gene. Unlike the kallisto gene tables there is no
# reindex to the RSEM gene order here; the QC cells at the end assert that
# the resulting gene order still matches.
salmon_gene_counts = salmon_transcript_counts.groupby('gene_id').sum()
salmon_gene_tpms = salmon_transcript_tpms.groupby('gene_id').sum()

In [28]:
# Write the four salmon tables as h5ad. quantification_name records the
# salmon column each table was loaded from ('NumReads' / 'TPM'); the previous
# 'est_counts' / 'tpm' labels were kallisto column names.
build_anndata(c1_pseudo / 'salmon_transcript_counts.h5ad', salmon_transcript_counts, 'NumReads', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_transcript_tpms.h5ad', salmon_transcript_tpms, 'TPM', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_gene_counts.h5ad', salmon_gene_counts, 'NumReads', gtf, 'gene')
build_anndata(c1_pseudo / 'salmon_gene_tpms.h5ad', salmon_gene_tpms, 'TPM', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Write out the salmon decoy results

In [29]:
# Same processing as the plain salmon run, reading from the salmon_decoy
# directory: align to the RSEM transcript index and fill missing rows with 0.
salmon_decoy_transcript_counts = load_all_salmon(c1_salmon_decoy, 'NumReads').reindex(rsem_transcript_counts.index).fillna(0)
salmon_decoy_transcript_tpms = load_all_salmon(c1_salmon_decoy, 'TPM').reindex(rsem_transcript_counts.index).fillna(0)

# Attach the owning gene; the tpm table reuses the (identical) counts index.
salmon_decoy_transcript_counts['gene_id'] = [mm10_M21_map[x] for x in salmon_decoy_transcript_counts.index]
salmon_decoy_transcript_tpms['gene_id'] = [mm10_M21_map[x] for x in salmon_decoy_transcript_counts.index]

In [30]:
# Sum transcripts per gene for the decoy-aware salmon run.
salmon_decoy_gene_counts = salmon_decoy_transcript_counts.groupby('gene_id').sum()
salmon_decoy_gene_tpms = salmon_decoy_transcript_tpms.groupby('gene_id').sum()

In [31]:
# Write the four salmon-decoy tables as h5ad. quantification_name records
# the salmon column each table was loaded from ('NumReads' / 'TPM'); the
# previous 'est_counts' / 'tpm' labels were kallisto column names.
build_anndata(c1_pseudo / 'salmon_decoy_transcript_counts.h5ad', salmon_decoy_transcript_counts, 'NumReads', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_decoy_transcript_tpms.h5ad', salmon_decoy_transcript_tpms, 'TPM', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_decoy_gene_counts.h5ad', salmon_decoy_gene_counts, 'NumReads', gtf, 'gene')
build_anndata(c1_pseudo / 'salmon_decoy_gene_tpms.h5ad', salmon_decoy_gene_tpms, 'TPM', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Load Kallisto Minimal

In [32]:
# Load the kallisto results from the kallisto_minimal directory. No reindex:
# the table keeps whatever transcripts kallisto reported, and that index is
# later used as the reference for the salmon minimal tables.
kallisto_minimal_transcript_counts = load_all_kallisto(c1_kallisto_minimal, 'est_counts')
kallisto_minimal_transcript_tpms = load_all_kallisto(c1_kallisto_minimal, 'tpm')

# Attach the owning gene; the tpm table reuses the counts index, which
# assumes both loads return the same transcripts in the same order.
kallisto_minimal_transcript_counts['gene_id'] = [mm10_M21_map[x] for x in kallisto_minimal_transcript_counts.index]
kallisto_minimal_transcript_tpms['gene_id'] = [mm10_M21_map[x] for x in kallisto_minimal_transcript_counts.index]

In [33]:
# Sum transcripts per gene; fillna(0) replaces any NaN left after the sum.
kallisto_minimal_gene_counts = kallisto_minimal_transcript_counts.groupby('gene_id').sum().fillna(0)
kallisto_minimal_gene_tpms = kallisto_minimal_transcript_tpms.groupby('gene_id').sum().fillna(0)

In [34]:
# Write the four kallisto-minimal tables as h5ad, labelled with the kallisto
# column each was loaded from.
build_anndata(c1_pseudo / 'kallisto_minimal_transcript_counts.h5ad', kallisto_minimal_transcript_counts, 'est_counts', gtf, 'transcript')
build_anndata(c1_pseudo / 'kallisto_minimal_transcript_tpms.h5ad', kallisto_minimal_transcript_tpms, 'tpm', gtf, 'transcript')
build_anndata(c1_pseudo / 'kallisto_minimal_gene_counts.h5ad', kallisto_minimal_gene_counts, 'est_counts', gtf, 'gene')
build_anndata(c1_pseudo / 'kallisto_minimal_gene_tpms.h5ad', kallisto_minimal_gene_tpms, 'tpm', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Write Salmon Minimal

In [35]:
# Load the salmon_minimal results, align rows to the kallisto-minimal
# transcript index, and set transcripts absent from salmon to 0.
salmon_minimal_transcript_counts = load_all_salmon(c1_salmon_minimal, 'NumReads').reindex(kallisto_minimal_transcript_counts.index).fillna(0)
salmon_minimal_transcript_tpms = load_all_salmon(c1_salmon_minimal, 'TPM').reindex(kallisto_minimal_transcript_counts.index).fillna(0)

# Attach the owning gene; the tpm table reuses the (identical) counts index.
salmon_minimal_transcript_counts['gene_id'] = [mm10_M21_map[x] for x in salmon_minimal_transcript_counts.index]
salmon_minimal_transcript_tpms['gene_id'] = [mm10_M21_map[x] for x in salmon_minimal_transcript_counts.index]

In [36]:
# Sum transcripts per gene for the salmon-minimal run.
salmon_minimal_gene_counts = salmon_minimal_transcript_counts.groupby('gene_id').sum()
salmon_minimal_gene_tpms = salmon_minimal_transcript_tpms.groupby('gene_id').sum()

In [37]:
# Write the four salmon-minimal tables as h5ad. quantification_name records
# the salmon column each table was loaded from ('NumReads' / 'TPM'); the
# previous 'est_counts' / 'tpm' labels were kallisto column names.
build_anndata(c1_pseudo / 'salmon_minimal_transcript_counts.h5ad', salmon_minimal_transcript_counts, 'NumReads', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_minimal_transcript_tpms.h5ad', salmon_minimal_transcript_tpms, 'TPM', gtf, 'transcript')
build_anndata(c1_pseudo / 'salmon_minimal_gene_counts.h5ad', salmon_minimal_gene_counts, 'NumReads', gtf, 'gene')
build_anndata(c1_pseudo / 'salmon_minimal_gene_tpms.h5ad', salmon_minimal_gene_tpms, 'TPM', gtf, 'gene')


... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Load STAR Minimal Counts

In [38]:
# Load the library table kept under rsem_minimal, then the STAR 'U' counts
# for those libraries. Columns are restricted to the cells in the kallisto
# table and rows are aligned to the kallisto-minimal gene index.
minimal_libraries = load_library_tables([c1_pseudo / 'rsem_minimal' / 'libraries.tsv'])
star_minimal_counts = load_all_star_counts(minimal_libraries, 'U')[kallisto_gene_counts.columns].reindex(kallisto_minimal_gene_counts.index)
star_minimal_counts.shape

(31635, 845)

In [39]:
# Write the STAR minimal gene counts.
# NOTE(review): the full STAR table is labelled 'U' (the column it was loaded
# from) while this one is labelled 'counts', although both are loaded with
# 'U' -- confirm whether the difference is intentional.
build_anndata(c1_pseudo / 'star_minimal_gene_counts.h5ad', star_minimal_counts, 'counts', gtf, 'gene')

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Some QC Checks

In [40]:
# Read back the count files written above so the checks below run against
# what is on disk rather than the in-memory tables.
# Naming: r/k/s/star = RSEM/kallisto/salmon/STAR, m = minimal,
# tc/gc = transcript/gene counts.
rgc = scanpy.read_h5ad(c1_pseudo / 'rsem_gene_counts.h5ad')
rtc = scanpy.read_h5ad(c1_pseudo / 'rsem_transcript_counts.h5ad')
ktc = scanpy.read_h5ad(c1_pseudo / 'kallisto_transcript_counts.h5ad')
stc = scanpy.read_h5ad(c1_pseudo / 'salmon_transcript_counts.h5ad')
kgc = scanpy.read_h5ad(c1_pseudo / 'kallisto_gene_counts.h5ad')
sgc = scanpy.read_h5ad(c1_pseudo / 'salmon_gene_counts.h5ad')

kmtc = scanpy.read_h5ad(c1_pseudo / 'kallisto_minimal_transcript_counts.h5ad')
smtc = scanpy.read_h5ad(c1_pseudo / 'salmon_minimal_transcript_counts.h5ad')
kmgc = scanpy.read_h5ad(c1_pseudo / 'kallisto_minimal_gene_counts.h5ad')
smgc = scanpy.read_h5ad(c1_pseudo / 'salmon_minimal_gene_counts.h5ad')

stargc = scanpy.read_h5ad(c1_pseudo / 'star_gene_counts.h5ad')
starmgc = scanpy.read_h5ad(c1_pseudo / 'star_minimal_gene_counts.h5ad')

In [41]:
# Every file must contain the same cells in the same order. The first group
# chains pairwise comparisons, so together they imply all are equal.
assert numpy.all(rgc.obs_names == rtc.obs_names)
assert numpy.all(rtc.obs_names == ktc.obs_names)
assert numpy.all(ktc.obs_names == stc.obs_names)
assert numpy.all(stc.obs_names == kgc.obs_names)
assert numpy.all(kgc.obs_names == sgc.obs_names)
assert numpy.all(sgc.obs_names == starmgc.obs_names)

# The minimal tables are compared directly against the RSEM cell order.
assert numpy.all(kmgc.obs_names == rgc.obs_names)
assert numpy.all(smgc.obs_names == rgc.obs_names)
assert numpy.all(kmtc.obs_names == rtc.obs_names)
assert numpy.all(smtc.obs_names == rtc.obs_names)

In [42]:
# Full-annotation tables must share one transcript order and one gene order.
assert numpy.all(rtc.var_names == ktc.var_names)
assert numpy.all(ktc.var_names == stc.var_names)

assert numpy.all(rgc.var_names == kgc.var_names)
assert numpy.all(kgc.var_names == sgc.var_names)

# The minimal tables must have strictly fewer features than the full ones.
assert len(kmgc.var_names) < len(rgc.var_names)
assert len(smgc.var_names) < len(rgc.var_names)
assert len(kmtc.var_names) < len(rtc.var_names)
assert len(smtc.var_names) < len(rtc.var_names)


In [43]:
# The kallisto and salmon minimal tables must list the same transcripts in
# the same order.
assert numpy.all(kmtc.var_names == smtc.var_names)

In [44]:
# All three minimal gene tables (kallisto, salmon, STAR) must share one
# gene order.
assert numpy.all(kmgc.var_names == smgc.var_names)
assert numpy.all(kmgc.var_names == starmgc.var_names)

In [45]:
# The full STAR gene table must match the kallisto gene table in both gene
# order and cell order.
assert numpy.all(stargc.var_names == kgc.var_names)
assert numpy.all(stargc.obs_names == kgc.obs_names)