# Introduction

I'd like to test my theory of how well the sum of the population counts matches the population-level RSEM estimates.

There were too many different spike-in concentrations used for the full set of cells, so I should try a more targeted set.

In [31]:
import sys
import os
from pathlib import Path
from xopen import xopen
import requests
import pandas

In [2]:
# Put the local htsworkflow checkout (~/proj/htsworkflow) on sys.path so the
# ENCODED helper can be imported. The membership test keeps a re-run of this
# cell from appending the same entry twice.
HTSW = str(Path.home() / 'proj' / 'htsworkflow')
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission.encoded import ENCODED

In [3]:
# ENCODED object for the host www.encodeproject.org; later cells use it to
# search by alias (search_jsonld) and to fetch experiment records (get_json).
server = ENCODED('www.encodeproject.org')

In [6]:
# Read the list of cell identifiers (one per line) and build the matching
# 'barbara-wold:'-prefixed aliases that are used as search terms further down.
cells = []
aliases = []
with open('c1-cells-using-ENCSR535LMC-spike.txt', 'rt') as instream:
    for line in instream:
        line = line.strip()
        if not line:
            # A blank (or whitespace-only) line would otherwise become the
            # bare alias 'barbara-wold:' and be sent as a search term.
            continue
        cells.append(line)
        aliases.append('barbara-wold:' + line)

In [39]:
# For every alias, search the server, keep the released Experiment hits, and
# record the download href of each fastq file attached to those experiments.
# The result is one row per fastq file in the `hrefs` DataFrame.
records = []
for alias in aliases:
    graph = server.search_jsonld(searchTerm=alias)
    for stub in graph['@graph']:
        # Only released experiments are of interest; skip every other hit.
        if 'Experiment' not in stub['@type'] or stub['status'] != 'released':
            continue
        experiment_id = stub['@id']
        # The search hit is only a stub; fetch the full experiment record to
        # get at its file list.
        exp = server.get_json(experiment_id)
        for f in exp['files']:
            if f['file_format'] != 'fastq':
                continue
            # Sanity checks: every fastq collected here is expected to be a
            # released, single-ended file. Stop with a pointer to the
            # offending file if that assumption does not hold.
            assert f['run_type'] == 'single-ended', \
                f"{f['href']}: unexpected run_type {f['run_type']!r}"
            assert f['status'] == 'released', \
                f"{f['href']}: unexpected status {f['status']!r}"
            records.append({
                'experiment_id': experiment_id,
                'alias': alias,
                'href': f['href']
            })

hrefs = pandas.DataFrame(records)
hrefs

Unnamed: 0,experiment_id,alias,href
0,/experiments/ENCSR163PNT/,barbara-wold:18087_F1,/files/ENCFF657NUR/@@download/ENCFF657NUR.fast...
1,/experiments/ENCSR163PNT/,barbara-wold:18087_F1,/files/ENCFF166HZA/@@download/ENCFF166HZA.fast...
2,/experiments/ENCSR824QUM/,barbara-wold:18087_F10,/files/ENCFF674IYA/@@download/ENCFF674IYA.fast...
3,/experiments/ENCSR824QUM/,barbara-wold:18087_F10,/files/ENCFF799SVZ/@@download/ENCFF799SVZ.fast...
4,/experiments/ENCSR346MHI/,barbara-wold:18087_F11,/files/ENCFF355ICJ/@@download/ENCFF355ICJ.fast...
...,...,...,...
113,/experiments/ENCSR602YRY/,barbara-wold:19913_H8,/files/ENCFF836WOH/@@download/ENCFF836WOH.fast...
114,/experiments/ENCSR590VUT/,barbara-wold:19913_H9,/files/ENCFF119NUC/@@download/ENCFF119NUC.fast...
115,/experiments/ENCSR590VUT/,barbara-wold:19913_H9,/files/ENCFF480WEC/@@download/ENCFF480WEC.fast...
116,/experiments/ENCSR466TNK/,barbara-wold:20048_E10,/files/ENCFF883PYF/@@download/ENCFF883PYF.fast...


In [37]:
# Ad-hoc check: `f` is not defined in this cell; it is whatever file record the
# loop variable `f` of the fastq-collection cell was last bound to.
# NOTE(review): depends on leftover kernel state -- consider removing this cell.
f['status']

'released'

In [44]:
# Download every fastq listed in `hrefs` and append the raw bytes to a single
# combined .fastq.gz file. The payloads are written back to back without
# recompression; this relies on the gzip format allowing concatenated members.
if 1:  # set to 0 to skip the download when re-running the notebook
    target = Path('c1_e10.5_ENCSR535LMC_spikes/c1_e10.5_ENCSR535LMC_spikes.fastq.gz')
    block_size = 8192
    # Make sure the output directory exists before opening the target.
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        os.unlink(target)
    # URLs whose content has been completely written to `target`.
    processed = []
    total = hrefs.shape[0]

    with open(target, 'wb') as outstream:
        count = 0
        for i, row in hrefs.iterrows():
            fastq_url = 'https://www.encodeproject.org' + row['href']
            # The context manager releases the connection even when the
            # transfer fails part-way through.
            with requests.get(fastq_url, stream=True) as instream:
                # Without this check the body of an HTTP error response would
                # be appended to the combined file and silently corrupt it.
                instream.raise_for_status()
                for block in instream.iter_content(block_size):
                    outstream.write(block)
            processed.append(fastq_url)
            count += 1
            print(f'Processed {fastq_url} {count}/{total}')


Processed https://www.encodeproject.org/files/ENCFF657NUR/@@download/ENCFF657NUR.fastq.gz 1/118
Processed https://www.encodeproject.org/files/ENCFF166HZA/@@download/ENCFF166HZA.fastq.gz 2/118
Processed https://www.encodeproject.org/files/ENCFF674IYA/@@download/ENCFF674IYA.fastq.gz 3/118
Processed https://www.encodeproject.org/files/ENCFF799SVZ/@@download/ENCFF799SVZ.fastq.gz 4/118
Processed https://www.encodeproject.org/files/ENCFF355ICJ/@@download/ENCFF355ICJ.fastq.gz 5/118
Processed https://www.encodeproject.org/files/ENCFF272MGL/@@download/ENCFF272MGL.fastq.gz 6/118
Processed https://www.encodeproject.org/files/ENCFF726WGA/@@download/ENCFF726WGA.fastq.gz 7/118
Processed https://www.encodeproject.org/files/ENCFF650TAX/@@download/ENCFF650TAX.fastq.gz 8/118
Processed https://www.encodeproject.org/files/ENCFF531JZY/@@download/ENCFF531JZY.fastq.gz 9/118
Processed https://www.encodeproject.org/files/ENCFF436VJX/@@download/ENCFF436VJX.fastq.gz 10/118
Processed https://www.encodeproject.org

Processed https://www.encodeproject.org/files/ENCFF495SEV/@@download/ENCFF495SEV.fastq.gz 86/118
Processed https://www.encodeproject.org/files/ENCFF007EXF/@@download/ENCFF007EXF.fastq.gz 87/118
Processed https://www.encodeproject.org/files/ENCFF740MJQ/@@download/ENCFF740MJQ.fastq.gz 88/118
Processed https://www.encodeproject.org/files/ENCFF374BQV/@@download/ENCFF374BQV.fastq.gz 89/118
Processed https://www.encodeproject.org/files/ENCFF580UDX/@@download/ENCFF580UDX.fastq.gz 90/118
Processed https://www.encodeproject.org/files/ENCFF041WHB/@@download/ENCFF041WHB.fastq.gz 91/118
Processed https://www.encodeproject.org/files/ENCFF772JEG/@@download/ENCFF772JEG.fastq.gz 92/118
Processed https://www.encodeproject.org/files/ENCFF261XFY/@@download/ENCFF261XFY.fastq.gz 93/118
Processed https://www.encodeproject.org/files/ENCFF181EOM/@@download/ENCFF181EOM.fastq.gz 94/118
Processed https://www.encodeproject.org/files/ENCFF795VKK/@@download/ENCFF795VKK.fastq.gz 95/118
Processed https://www.encodepr