# Introduction

converting the directory of results files into a single annotated result matrix is a bit slow, so I break it out into it's own notebook so I don't have to rerun it when doing analysis notebook reruns.

In [1]:
import pandas
import scanpy
from pathlib import Path
import sys
from urllib import parse
import loompy
import numpy
import anndata

In [2]:
LRSC = str(Path('~/proj/long-rna-seq-condor').expanduser())
if LRSC not in sys.path:
    sys.path.append(LRSC)
from woldrnaseq.madqc import load_rsem_quantifications, load_genomic_quantifications, load_transcriptome_quantifications, replicate_scores
from woldrnaseq.models import load_library_tables, load_experiments, load_all_star_final, load_all_star_counts

In [3]:
HTSW = str(Path('~/proj/htsworkflow').expanduser())
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission.encoded import ENCODED

# Load GTF

In [4]:
store = pandas.HDFStore(Path('~/proj/genome/mm10-M21-male/mm10-M21-male.h5').expanduser())
gtf = store[store.keys()[0]]
store.close()


In [5]:
gtf.columns

Index(['chromosome', 'source', 'type', 'start', 'stop', 'score', 'strand',
       'frame', 'gene_id', 'transcript_id', 'gene_type', 'gene_name',
       'transcript_type', 'transcript_name', 'level', 'havana_gene',
       'transcript_support_level', 'tag', 'havana_transcript', 'exon_number',
       'exon_id', 'protein_id', 'ccdsid', 'ont'],
      dtype='object')

In [6]:
gtf.iloc[1879552]

chromosome                           ERCC-00171
source                                  spikein
type                                       exon
start                                         1
stop                                        505
score                                       NaN
strand                                        1
frame                                       NaN
gene_id                     gSpikein_ERCC-00171
transcript_id               tSpikein_ERCC-00171
gene_type                                   NaN
gene_name                                   NaN
transcript_type                             NaN
transcript_name                             NaN
level                                       NaN
havana_gene                                 NaN
transcript_support_level                    NaN
tag                                         NaN
havana_transcript                           NaN
exon_number                                 NaN
exon_id                                 

In [7]:
gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')].set_index('gene_id')

In [8]:
gene_info.loc['ENSMUSG00000026787.3', 'gene_name']

'Gad2'

In [9]:
gene_info

Unnamed: 0_level_0,chromosome,source,type,start,stop,score,strand,frame,transcript_id,gene_type,...,level,havana_gene,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
896,chr1,ENSEMBL,tRNA,112349389,112349461,,1,,896,Pseudo_tRNA,...,,,,,,,,,,
897,chr1,ENSEMBL,tRNA,112576185,112576260,,-1,,897,Pseudo_tRNA,...,,,,,,,,,,
1275,chr1,ENSEMBL,tRNA,32624825,32624895,,1,,1275,Pseudo_tRNA,...,,,,,,,,,,
1914,chr1,ENSEMBL,tRNA,167276215,167276287,,1,,1914,Pseudo_tRNA,...,,,,,,,,,,
1915,chr1,ENSEMBL,tRNA,167323285,167323359,,1,,1915,Pseudo_tRNA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gSpikein_ERCC-00165,ERCC-00165,spikein,exon,1,872,,1,,tSpikein_ERCC-00165,,...,,,,,,,,,,
gSpikein_ERCC-00168,ERCC-00168,spikein,exon,1,1024,,1,,tSpikein_ERCC-00168,,...,,,,,,,,,,
gSpikein_ERCC-00170,ERCC-00170,spikein,exon,1,1023,,1,,tSpikein_ERCC-00170,,...,,,,,,,,,,
gSpikein_ERCC-00171,ERCC-00171,spikein,exon,1,505,,1,,tSpikein_ERCC-00171,,...,,,,,,,,,,


# Load Transcript Map

In [10]:
def load_transcript_map(triplet):
    root = Path('genome')
    txp_map = {}
    with open(root /triplet/'txp2gene.tsv', 'rt') as instream:
        for line in instream:
            txp, gene = line.rstrip().split('\t')
            txp_map[txp] = gene
    return txp_map


In [11]:
mm10_M21_map = load_transcript_map('mm10-M21-male')
len(mm10_M21_map)

168207

In [12]:
server = ENCODED('www.encodeproject.org')
server.load_netrc()

Read in list of fastqs used to generate the fastqs for the population STAR run, and extract our cell IDs

It was copied from <a href="build-e10.5-population-fastqs.html">build-e10.5-population-fastqs.ipynb</a>

In [13]:
libraries_used = set()
prefix_len = len('barbara-wold:')
with open('fastqs-used-in-c1-e10.5.txt') as instream:
    for i, line in enumerate(instream):
        url = parse.urlparse(line.strip())
        parts = url.path.split('/')
        fastq_accession = parts[2]
        fastq = server.get_json(fastq_accession)
        library = fastq['library']
        library_id = library['@id']
        aliases = library['aliases']
        jumpgate = aliases[0][prefix_len:]
        #print(fastq_accession, library_id, jumpgate)
        libraries_used.add(jumpgate)
    

In [14]:
len(libraries_used)

251

In [15]:
replicates = []
for row in libraries_used:
    replicates.append(row)

one_encode_experiment = pandas.Series({
    'replicates': replicates,
})
len(one_encode_experiment['replicates'])

251

In [16]:
with open('c1_e10.5_cells.txt', 'wt') as outstream:
    for r in sorted(replicates):
        outstream.write('{}\n'.format(r))

In [17]:
print("\n".join(sorted(one_encode_experiment['replicates'])))

17327_A1
17327_A10
17327_A11
17327_A12
17327_A2
17327_A3
17327_A4
17327_A5
17327_A6
17327_A7
17327_A8
17327_A9
17328_B1
17328_B10
17328_B11
17328_B12
17328_B2
17328_B3
17328_B4
17328_B5
17328_B6
17328_B7
17328_B8
17328_B9
17329_C1
17329_C10
17329_C11
17329_C12
17329_C2
17329_C3
17329_C4
17329_C5
17329_C6
17329_C7
17329_C8
17329_C9
17330_D1
17330_D10
17330_D11
17330_D12
17330_D2
17330_D3
17330_D4
17330_D5
17330_D6
17330_D7
17330_D8
17330_D9
17331_E1
17331_E10
17331_E11
17331_E12
17331_E2
17331_E3
17331_E4
17331_E5
17331_E6
17331_E7
17331_E8
17331_E9
17332_F1
17332_F10
17332_F11
17332_F12
17332_F2
17332_F3
17332_F4
17332_F5
17332_F6
17332_F7
17332_F8
17332_F9
17333_G1
17333_G10
17333_G11
17333_G12
17333_G2
17333_G3
17333_G4
17333_G5
17333_G6
17333_G7
17333_G8
17333_G9
17334_H1
17334_H10
17334_H11
17334_H12
17334_H2
17334_H3
17334_H4
17334_H5
17334_H6
17334_H7
17334_H8
17334_H9
18042_A1
18042_A10
18042_A11
18042_A12
18042_A2
18042_A3
18042_A4
18042_A5
18042_A6
18042_A7
18042_A8
18042_A9
1

In [18]:
c1_root = Path('./c1_e10.5')

# Load Single Cell RSEM values

In [19]:
c1_M21_vdir = Path('~/proj/C1_mouse_limb_combined/all_analysis_M21').expanduser()

In [20]:
c1_M21_libraries_filename = c1_M21_vdir / 'libraries-passing.tsv'

In [21]:
c1_M21_libraries = load_library_tables([c1_M21_libraries_filename], sep='\t')

In [22]:
cells_rsem_gene_count = load_genomic_quantifications(one_encode_experiment, c1_M21_libraries, column='expected_count')
cells_rsem_gene_tpm = load_genomic_quantifications(one_encode_experiment, c1_M21_libraries, column='TPM')

quantifications None (81881, 251)
quantifications None (81881, 251)


In [23]:
sc_rsem_gene_count = cells_rsem_gene_count.sum(axis=1)
sc_rsem_gene_tpm = cells_rsem_gene_count.sum(axis=1)

In [24]:
sc_rsem_gene_count.shape

(81881,)

In [25]:
cells_rsem_transcript_count = load_transcriptome_quantifications(one_encode_experiment, c1_M21_libraries, column='expected_count')
cells_rsem_transcript_tpm = load_transcriptome_quantifications(one_encode_experiment, c1_M21_libraries, column='TPM')

quantifications None (168207, 251)
quantifications None (168207, 251)


In [26]:
sc_rsem_transcript_count = cells_rsem_transcript_count.sum(axis=1)
sc_rsem_transcript_tpm = cells_rsem_transcript_tpm.sum(axis=1)

In [27]:
sc_rsem_transcript_count.shape

(168207,)

In [28]:
sc_star_stats = load_all_star_final(c1_M21_libraries).loc[one_encode_experiment['replicates']]

In [29]:
sc_star_stats[('', 'Number of input reads')].sum()

492826080.0

# Load Population RSEM values

In [30]:
pop_dir = Path('/woldlab/loxcyc/home/diane/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/')
pop_experiment = load_experiments([pop_dir / 'experiments_c1_e10.5.tsv'])
pop_libraries = load_library_tables([pop_dir / 'libraries_c1_e10.5.tsv'])

In [31]:
pop_star_stats = load_all_star_final(pop_libraries)

In [32]:
pop_star_stats.loc['c1_e10.5', ('', 'Number of input reads')]

492826080.0

In [33]:
pop_star_stats.loc['c1_e10.5', ('', 'Number of input reads')] == sc_star_stats[('', 'Number of input reads')].sum()

True

In [34]:
pop_rsem_gene_count = load_genomic_quantifications(pop_experiment.loc['c1_e10.5_pool'], pop_libraries, column='expected_count')
pop_rsem_gene_tpm = load_genomic_quantifications(pop_experiment.loc['c1_e10.5_pool'], pop_libraries, column='TPM')

quantifications c1_e10.5_pool (81881, 1)
quantifications c1_e10.5_pool (81881, 1)


In [35]:
pop_rsem_transcript_count = load_transcriptome_quantifications(pop_experiment.loc['c1_e10.5_pool'], pop_libraries, column='expected_count')
pop_rsem_transcript_tpm = load_transcriptome_quantifications(pop_experiment.loc['c1_e10.5_pool'], pop_libraries, column='TPM')

quantifications c1_e10.5_pool (168207, 1)
quantifications c1_e10.5_pool (168207, 1)


# Read STAR ReadsPerGene.tab

In [36]:
cells_star_gene_count = load_all_star_counts(c1_M21_libraries, column='U').reindex(pop_rsem_gene_count.index)
cells_star_gene_count.loc[cells_star_gene_count.index, one_encode_experiment['replicates']]
sc_star_gene_count = cells_star_gene_count.sum(axis=1)

In [37]:
pop_star_gene_count = load_all_star_counts(pop_libraries, column='U').reindex(pop_rsem_gene_count.index)

# Read kallisto results

In [38]:
pop_kallisto_transcript = pandas.read_csv(c1_root / 'kallisto_e10.5' / 'abundance.tsv', sep='\t', index_col=0, usecols=['target_id', 'est_counts', 'tpm'])
#pop_kallisto_transcript.columns = ['kallisto_gene_counts']
pop_kallisto_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_kallisto_transcript.index]


In [39]:
pop_kallisto_gene_count = pop_kallisto_transcript.groupby('gene_id')['est_counts'].sum()
pop_kallisto_gene_tpm = pop_kallisto_transcript.groupby('gene_id')['tpm'].sum()

In [40]:
pop_kallisto_gene_count.shape

(81881,)

# Read kallisto 2x fragment size

To see how mapping is influenced by fragment size, I doubled it.

In [41]:
pop_kallisto_2x_transcript = pandas.read_csv(c1_root / 'kallisto_e10.5_f594' / 'abundance.tsv', sep='\t', index_col=0, usecols=['target_id', 'est_counts', 'tpm'])
#pop_kallisto_transcript.columns = ['kallisto_gene_counts']
pop_kallisto_2x_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_kallisto_transcript.index]


In [42]:
pop_kallisto_2x_gene_count = pop_kallisto_transcript.groupby('gene_id')['est_counts'].sum()
pop_kallisto_2x_gene_tpm = pop_kallisto_transcript.groupby('gene_id')['tpm'].sum()

In [43]:
pop_kallisto_2x_gene_count.shape

(81881,)

# Read Salmon Decoy results

In [44]:
pop_salmon_decoy_transcript = pandas.read_csv(c1_root / 'salmon_decoy_e10.5' / 'quant.sf', sep='\t', index_col=0, usecols=['Name', 'NumReads', 'TPM']).reindex(pop_rsem_transcript_count.index, fill_value=0.0)
pop_salmon_decoy_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_salmon_decoy_transcript.index]
pop_salmon_decoy_transcript.shape

(168207, 3)

In [45]:
pop_salmon_decoy_transcript

Unnamed: 0_level_0,TPM,NumReads,gene_id
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000,0.000000,0.000,10000
10001,0.000000,0.000,10001
10002,5.778059,12.553,10002
10003,0.000000,0.000,10003
10004,0.000000,0.000,10004
...,...,...,...
tSpikein_ERCC-00165,8.609722,2847.000,gSpikein_ERCC-00165
tSpikein_ERCC-00168,0.210129,87.000,gSpikein_ERCC-00168
tSpikein_ERCC-00170,12.450761,5155.000,gSpikein_ERCC-00170
tSpikein_ERCC-00171,1087.570259,139289.000,gSpikein_ERCC-00171


In [46]:
pop_salmon_decoy_gene_count = pop_salmon_decoy_transcript.groupby('gene_id')['NumReads'].sum()
pop_salmon_decoy_gene_tpm = pop_salmon_decoy_transcript.groupby('gene_id')['TPM'].sum()

In [47]:
pop_salmon_decoy_gene_count.shape

(81881,)

# Read Salmon results

In [48]:
pop_salmon_transcript = pandas.read_csv(c1_root / 'salmon_e10.5' / 'quant.sf', sep='\t', index_col=0, usecols=['Name', 'NumReads', 'TPM']).reindex(pop_rsem_transcript_count.index, fill_value=0.0)
pop_salmon_transcript['gene_id'] = [mm10_M21_map[x] for x in pop_salmon_transcript.index]
pop_salmon_transcript.shape

(168207, 3)

In [49]:
pop_salmon_gene_count = pop_salmon_transcript.groupby('gene_id')['NumReads'].sum()
pop_salmon_gene_tpm = pop_salmon_transcript.groupby('gene_id')['TPM'].sum()

In [50]:
pop_salmon_gene_count.shape

(81881,)

In [51]:
pop_salmon_gene_count.reindex(pop_star_gene_count.index, fill_value=0.0)

gene_id
10000                       0.000
10001                       0.000
10002                      10.374
10003                       0.000
10004                       0.000
                          ...    
gSpikein_ERCC-00165      2847.000
gSpikein_ERCC-00168        87.000
gSpikein_ERCC-00170      5155.000
gSpikein_ERCC-00171    139289.000
gSpikein_phiX174           22.000
Name: NumReads, Length: 81881, dtype: float64

In [52]:
def build_loom(filename, matrix, quantification_name, gtf):
    gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')]
    transcript_info = gtf[(gtf['type'].isin(['transcript', 'tRNA'])) | (gtf['source'] == 'spikein')]
    
    if matrix.shape[0] == gene_info.shape[0]:
        # We have a gene matrix
        info = gene_info
        info = info.set_index('gene_id')
        feature_type = 'gene'
    elif matrix.shape[0] == transcript_info.shape[0]:
        info = transcript_info
        info = info.set_index('transcript_id')
        feature_type = 'transcript'
    else:
        raise ValueError('Unrecognized shape expected {} or {} got {}'.format(
            gene_info.shape[0], 
            transcript_info.shape[0],
            count.shape[0],
        ))
    gene_names = []
    gene_types = []
    for feature in matrix.index:
        gene_names.append(info.loc[feature, 'gene_name'])
        gene_types.append(info.loc[feature, 'gene_type'])

    row_attrs = {
        'id': numpy.asarray(matrix.index),
        'gene_name': numpy.asarray(gene_names),
        'gene_type': numpy.asarray(gene_types),
    }
    column_attrs = {
        'experiment': numpy.asarray(matrix.columns), 
    }
    file_attrs = {
        'quantification_name': quantification_name,
        'feature_type': feature_type,
    }
    loompy.create(str(filename), matrix.values, row_attrs=row_attrs, col_attrs=column_attrs, file_attrs=file_attrs)


In [53]:
def build_anndata(filename, matrix, quantification_name, gtf):
    gene_info = gtf[gtf['type'].isin(['gene', 'tRNA']) | (gtf['source'] == 'spikein')]
    transcript_info = gtf[(gtf['type'].isin(['transcript', 'tRNA'])) | (gtf['source'] == 'spikein')]
    
    if matrix.shape[0] == gene_info.shape[0]:
        # We have a gene matrix
        info = gene_info
        info = info.set_index('gene_id')
        feature_type = 'gene'
    elif matrix.shape[0] == transcript_info.shape[0]:
        info = transcript_info
        info = info.set_index('transcript_id')
        feature_type = 'transcript'
    else:
        raise ValueError('Unrecognized shape expected {} or {} got {}'.format(
            gene_info.shape[0], 
            transcript_info.shape[0],
            count.shape[0],
        ))
    gene_names = []
    gene_types = []
    for feature in matrix.index:
        gene_names.append(info.loc[feature, 'gene_name'])
        gene_types.append(info.loc[feature, 'gene_type'])

    adata = anndata.AnnData(matrix.T)
    adata.var['gene_symbol'] = gene_names
    adata.var['gene_type'] = gene_types
    adata.uns['quantification_name'] = quantification_name
    adata.uns['feature_type'] = feature_type
    
    adata.write_h5ad(filename)

# Build Gene Matrix

In [54]:
pop_star_gene_count.index.shape

(81881,)

In [55]:
sc_rsem_gene_count.shape, pop_rsem_gene_count.shape

((81881,), (81881, 1))

In [56]:
assert numpy.all(sc_rsem_gene_count.index == pop_rsem_gene_count.index)
assert numpy.all(pop_rsem_gene_count.index == sc_star_gene_count.index)
assert numpy.all(sc_star_gene_count.index == pop_star_gene_count.index)
assert numpy.all(pop_star_gene_count.index == pop_kallisto_gene_count.index)
assert numpy.all(pop_kallisto_gene_count.index == pop_salmon_decoy_gene_count.index)
assert numpy.all(pop_salmon_decoy_gene_count.index == pop_salmon_gene_count.index)
assert numpy.all(pop_kallisto_gene_count.index == pop_kallisto_2x_gene_count.index)


In [57]:
pop_star_gene_count.columns

Index(['c1_e10.5'], dtype='object')

In [58]:
gene_counts = pandas.DataFrame({
    'sc_rsem': sc_rsem_gene_count,
    'pop_rsem': pop_rsem_gene_count['c1_e10.5'],
    'sc_star': sc_star_gene_count,
    'pop_star': pop_star_gene_count['c1_e10.5'],
    'pop_kallisto': pop_kallisto_gene_count,
    'pop_kallisto_fragment2x': pop_kallisto_2x_gene_count,
    'pop_salmon_decoy': pop_salmon_decoy_gene_count,
    'pop_salmon': pop_salmon_gene_count,
})
gene_counts

Unnamed: 0_level_0,sc_rsem,pop_rsem,sc_star,pop_star,pop_kallisto,pop_kallisto_fragment2x,pop_salmon_decoy,pop_salmon
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10000,0.0,0.0,0,0,0.0,0.0,0.000,0.000
10001,0.0,0.0,0,0,0.0,0.0,0.000,0.000
10002,0.0,0.0,0,0,0.0,0.0,12.553,10.374
10003,0.0,0.0,0,0,0.0,0.0,0.000,0.000
10004,0.0,0.0,0,0,0.0,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...
gSpikein_ERCC-00165,2851.0,2851.0,67304,2967,1800.0,1800.0,2847.000,2847.000
gSpikein_ERCC-00168,87.0,87.0,2600,87,63.0,63.0,87.000,87.000
gSpikein_ERCC-00170,5189.0,5189.0,74813,5345,2670.0,2670.0,5155.000,5155.000
gSpikein_ERCC-00171,137815.0,137815.0,2700617,145624,8847.0,8847.0,139289.000,139289.000


In [59]:
assert numpy.all(sc_rsem_gene_tpm.index == pop_rsem_gene_tpm.index)
assert numpy.all(pop_rsem_gene_tpm.index == pop_kallisto_gene_tpm.index)
assert numpy.all(pop_kallisto_gene_tpm.index == pop_salmon_decoy_gene_tpm.index)
assert numpy.all(pop_salmon_decoy_gene_tpm.index == pop_salmon_gene_tpm.index)
assert numpy.all(pop_kallisto_gene_tpm.index == pop_kallisto_2x_gene_tpm.index)


In [60]:
gene_tpm = pandas.DataFrame({
    'sc_rsem': sc_rsem_gene_tpm,
    'pop_rsem': pop_rsem_gene_tpm['c1_e10.5'],
    'pop_kallisto': pop_kallisto_gene_tpm,
    'pop_kallisto_fragment2x': pop_kallisto_2x_gene_tpm,
    'pop_salmon_decoy': pop_salmon_decoy_gene_tpm,
    'pop_salmon': pop_salmon_gene_tpm,
})
gene_tpm

Unnamed: 0_level_0,sc_rsem,pop_rsem,pop_kallisto,pop_kallisto_fragment2x,pop_salmon_decoy,pop_salmon
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,0.0,0.00,0.000000,0.000000,0.000000,0.000000
10001,0.0,0.00,0.000000,0.000000,0.000000,0.000000
10002,0.0,0.00,0.000000,0.000000,5.778059,4.507016
10003,0.0,0.00,0.000000,0.000000,0.000000,0.000000
10004,0.0,0.00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
gSpikein_ERCC-00165,2851.0,10.81,13.580100,13.580100,8.609722,8.126028
gSpikein_ERCC-00168,87.0,0.28,0.343633,0.343633,0.210129,0.198324
gSpikein_ERCC-00170,5189.0,16.62,14.590100,14.590100,12.450761,11.751278
gSpikein_ERCC-00171,137815.0,942.74,244.729000,244.729000,1087.570259,1026.470610


In [61]:
build_loom(c1_root / 'c1_e10.5_gene_counts.loom', gene_counts, 'counts', gtf)

In [62]:
build_anndata(c1_root / 'c1_e10.5_gene_counts.h5ad', gene_counts, 'counts', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [63]:
build_loom(c1_root / 'c1_e10.5_gene_tpms.loom', gene_tpm, 'TPM', gtf)

In [64]:
build_anndata(c1_root / 'c1_e10.5_gene_tpms.h5ad', gene_tpm, 'TPM', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


# Build Transcript Matrix

In [65]:
assert numpy.all(sc_rsem_transcript_count.index == pop_rsem_transcript_count.index)
assert numpy.all(pop_rsem_transcript_count.index == pop_kallisto_transcript.index)
assert numpy.all(pop_kallisto_transcript.index == pop_salmon_decoy_transcript.index)
assert numpy.all(pop_salmon_decoy_transcript.index == pop_salmon_transcript.index)
assert numpy.all(pop_kallisto_transcript.index == pop_kallisto_2x_transcript.index)

assert numpy.all(sc_rsem_transcript_tpm.index == pop_rsem_transcript_tpm.index)
assert numpy.all(pop_rsem_transcript_tpm.index == pop_kallisto_transcript.index)



In [66]:
transcript_counts = pandas.DataFrame({
    'sc_rsem': sc_rsem_transcript_count,
    'pop_rsem': pop_rsem_transcript_count['c1_e10.5'],
    'pop_kallisto': pop_kallisto_transcript['est_counts'],
    'pop_kallisto_fragment2x': pop_kallisto_2x_transcript['est_counts'],
    'pop_salmon_decoy': pop_salmon_decoy_transcript['NumReads'],
    'pop_salmon': pop_salmon_transcript['NumReads'],
})
transcript_counts

Unnamed: 0,sc_rsem,pop_rsem,pop_kallisto,pop_kallisto_fragment2x,pop_salmon_decoy,pop_salmon
10000,0.0,0.0,0.0,0.0,0.000,0.000
10001,0.0,0.0,0.0,0.0,0.000,0.000
10002,0.0,0.0,0.0,0.0,12.553,10.374
10003,0.0,0.0,0.0,0.0,0.000,0.000
10004,0.0,0.0,0.0,0.0,0.000,0.000
...,...,...,...,...,...,...
tSpikein_ERCC-00165,2851.0,2851.0,1800.0,0.0,2847.000,2847.000
tSpikein_ERCC-00168,87.0,87.0,63.0,2.0,87.000,87.000
tSpikein_ERCC-00170,5189.0,5189.0,2670.0,1207.0,5155.000,5155.000
tSpikein_ERCC-00171,137815.0,137815.0,8847.0,0.0,139289.000,139289.000


In [67]:
transcript_tpm = pandas.DataFrame({
    'sc_rsem': sc_rsem_transcript_tpm,
    'pop_rsem': pop_rsem_transcript_tpm['c1_e10.5'],
    'pop_kallisto': pop_kallisto_transcript['tpm'],
    'pop_kallisto_fragment2x': pop_kallisto_2x_transcript['tpm'],
    'pop_salmon_decoy': pop_salmon_decoy_transcript['TPM'],
    'pop_salmon': pop_salmon_transcript['TPM'],
})
transcript_tpm

Unnamed: 0,sc_rsem,pop_rsem,pop_kallisto,pop_kallisto_fragment2x,pop_salmon_decoy,pop_salmon
10000,0.00,0.00,0.000000,0.000000,0.000000,0.000000
10001,0.00,0.00,0.000000,0.000000,0.000000,0.000000
10002,0.00,0.00,0.000000,0.000000,5.778059,4.507016
10003,0.00,0.00,0.000000,0.000000,0.000000,0.000000
10004,0.00,0.00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
tSpikein_ERCC-00165,3322.71,10.81,13.580100,0.000000,8.609722,8.126028
tSpikein_ERCC-00168,69.43,0.28,0.343633,0.050603,0.210129,0.198324
tSpikein_ERCC-00170,4479.03,16.62,14.590100,30.668800,12.450761,11.751278
tSpikein_ERCC-00171,271399.15,942.74,244.729000,0.000000,1087.570259,1026.470610


In [68]:
build_loom(c1_root / 'c1_e10.5_transcript_counts.loom', transcript_counts, 'counts', gtf)

In [69]:
build_anndata(c1_root / 'c1_e10.5_transcript_counts.h5ad', transcript_counts, 'counts', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [70]:
build_loom(c1_root / 'c1_e10.5_transcript_tpm.loom', transcript_tpm, 'TPM', gtf)

In [71]:
build_anndata(c1_root / 'c1_e10.5_transcript_tpm.h5ad', transcript_tpm, 'TPM', gtf) 

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical


In [72]:
cells_star_gene_count

Unnamed: 0_level_0,19906_A5,20038_F9,18268_F12,18270_A12,18316_F6,20047_D10,19914_A11,20090_C6,19909_D1,18048_G6,...,20090_C5,20026_A4,20028_C2,20039_A9,18259_B10,18275_F5,18251_A8,20036_D7,20033_A3,20028_C3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gSpikein_ERCC-00165,64,122,161,82,19,166,48,234,131,0,...,69,50,78,204,17,70,34,78,52,141
gSpikein_ERCC-00168,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gSpikein_ERCC-00170,198,274,63,34,23,845,213,182,38,0,...,79,483,31,55,0,67,10,16,90,60
gSpikein_ERCC-00171,1196,2614,11768,3918,560,7056,4573,7521,2487,0,...,2773,3166,1066,3976,2963,3991,1978,2161,1159,2872


In [73]:
build_anndata(c1_root / 'c1_cell_e10.5_rsem_gene_counts.h5ad', cells_rsem_gene_count, 'counts', gtf)
build_anndata(c1_root / 'c1_cell_e10.5_rsem_gene_tpms.h5ad', cells_rsem_gene_tpm, 'TPM', gtf)
build_anndata(c1_root / 'c1_cell_e10.5_star_gene_counts.h5ad', cells_star_gene_count, 'TPM', gtf)

build_anndata(c1_root / 'c1_cell_e10.5_rsem_transcript_counts.h5ad', cells_rsem_transcript_count, 'counts', gtf)
build_anndata(c1_root / 'c1_cell_e10.5_rsem_transcript_tpms.h5ad', cells_rsem_transcript_tpm, 'TPM', gtf)

... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
... storing 'gene_symbol' as categorical
... storing 'gene_type' as categorical
