# Introduction

In [1]:
import sys
import os
from pathlib import Path
from xopen import xopen
import requests
from anndata import AnnData

from pipeline_common import get_gene_id_to_name

In [2]:
HTSW = str(Path('~/proj/htsworkflow').expanduser())
if HTSW not in sys.path:
    sys.path.append(str(HTSW))
from htsworkflow.submission.encoded import ENCODED

In [3]:
LRSC = str(Path('~/proj/long-rna-seq-condor').expanduser())
if LRSC not in sys.path:
    sys.path.append(str(LRSC))
from woldrnaseq.madqc import load_rsem_quantifications

In [4]:
server = ENCODED('www.encodeproject.org')

In [5]:
e10_5 = "https://www.encodeproject.org/search/?searchTerm=e10.5&type=Experiment&assay_title=scRNA-seq&files.file_type=fastq&limit=all"

In [6]:
graph = server.search_jsonld(searchTerm="e10.5", **{'type': "Experiment", 'assay_title': "scRNA-seq", "files.file_type": "fastq", "limit": "all"})

In [7]:
len(graph['@graph'])

257

In [8]:
e = server.get_json('/experiments/ENCSR278TAI/')

In [9]:
set([(x['file_format'], x['output_type']) for x in e['files']])

{('bam', 'alignments'),
 ('bam', 'transcriptome alignments'),
 ('bigWig', 'signal of all reads'),
 ('bigWig', 'signal of unique reads'),
 ('fastq', 'reads'),
 ('tsv', 'gene quantifications'),
 ('tsv', 'transcript quantifications')}

In [10]:
[(x['submitted_file_name'], x['@id']) for x in e['files'] if x['output_type'] == 'gene quantifications']

[('C1_mouse_limb_combined_Mar_2017/18049_H6_mm10_clean/18049_H6_mm10_clean-mm10-M4-male_anno_rsem.genes.results',
  '/files/ENCFF078YTM/')]

In [11]:
experiments_all = [x['accession'] for x in graph['@graph']]
len(experiments_all)

257

In [12]:
empty_experiments = {
    'ENCSR134JVO',
     'ENCSR220RKA',
     'ENCSR430OIC',
     'ENCSR530WGW',
     'ENCSR619BUX',
     'ENCSR991VTV'
}

In [13]:
total = len(graph['@graph'])
rsem = []
experiments = []
count = 0
for g in graph['@graph']:
    if g['accession'] not in empty_experiments:
        e = server.get_json(g['@id'])
        for f in e['files']:
            if f.get('output_type', None) == 'gene quantifications':
                experiments.append(g['accession'])
                url = 'https://www.encodeproject.org' + f['href']
                rsem.append(url)


In [14]:
counts = load_rsem_quantifications(rsem, index=experiments, column='expected_count')

In [21]:
gene_id_map = get_gene_id_to_name('mm10-M21-male')

In [22]:
gene_symbols = [gene_id_map[x] for x in counts.index]

KeyError: 'ENSMUSG00000000003.11'

In [15]:
counts.shape

(69690, 251)

In [16]:
counts.sum(axis=1)

gene_id
10000                       0.0
10001                       0.0
10002                       1.0
10003                       0.0
10004                       0.0
                         ...   
gSpikein_ERCC-00164        78.0
gSpikein_ERCC-00165      2848.0
gSpikein_ERCC-00168        87.0
gSpikein_ERCC-00170      5185.0
gSpikein_ERCC-00171    137503.0
Length: 69690, dtype: float64

In [18]:
ls c1_e10.5/

align-star-0.log
align-star-0.out
bedgraph2bigwig-0.log
bedgraph2bigwig-0.out
bedgraph2bigwig-1.log
bedgraph2bigwig-1.out
bedgraph-star-0.log
bedgraph-star-0.out
c1_e10.5.dagman
c1_e10.5.dagman~
c1_e10.5.fastq.gz
c1_e10.5-mm10-M21-male_all.bw
c1_e10.5-mm10-M21-male_anno.bam
c1_e10.5-mm10-M21-male_anno_rsem.genes.results
c1_e10.5-mm10-M21-male_anno_rsem.isoforms.results
c1_e10.5-mm10-M21-male.coverage
c1_e10.5-mm10-M21-male.coverage.geneList
c1_e10.5-mm10-M21-male_genome.bam
c1_e10.5-mm10-M21-male_genome.bam.bai
c1_e10.5-mm10-M21-male_genome.samstats
c1_e10.5-mm10-M21-male.sam_reads_genes
c1_e10.5-mm10-M21-male_uniq.bw
COfile.txt
coverage-0.log
coverage-0.out
distribution-0.log
distribution-0.out
Log.final.out
Log.out
Log.progress.out
ReadsPerGene.out.tab
rsem-0.log
rsem-0.out
samstats-0.log
samstats-0.out
samtools-index-0.log
samtools-index-0.out
Signal.UniqueMultiple.str1.out.bg
Signal.Unique.str1.out.bg
SJ.out.tab
sort-samtools-0.log
sort-samt

In [20]:
#population = load_rsem_quantifications(['c1_e10.5/c1_e10.5-mm10-M21-male-star2.7.5a_anno_rsem.genes.results'], 'population', column='expected_count')
population = load_rsem_quantifications(['c1_e10.5/c1_e10.5-mm10-M21-male_anno_rsem.genes.results'], column='expected_count')