# Introduction

I'd been focused on just the e10.5 data, but that's the set where we'd been working out the spike concentration, so it is difficult to analyze. However, when I looked I found more spike sets than we remembered, so we need to figure out which cells in which runs had which spikes.

In [24]:
import pandas
from pathlib import Path
import sys
from urllib import parse
import re

In [4]:
# Make the local long-rna-seq-condor checkout importable so that
# woldrnaseq can be loaded without being installed.
lrsc_checkout = Path('~/proj/long-rna-seq-condor').expanduser()
LRSC = str(lrsc_checkout)
if sys.path.count(LRSC) == 0:
    sys.path.append(LRSC)
from woldrnaseq.models import load_experiments

In [2]:
# Make the local htsworkflow checkout importable, same approach as for
# long-rna-seq-condor: append to sys.path only if it is not already there.
htsw_checkout = Path('~/proj/htsworkflow').expanduser()
HTSW = str(htsw_checkout)
if sys.path.count(HTSW) == 0:
    sys.path.append(HTSW)

from htsworkflow.submission.encoded import ENCODED

# ENCODED object pointed at www.encodeproject.org; later cells call
# server.get_json() on it to fetch library and reference records.
server = ENCODED('www.encodeproject.org')

In [3]:
# Directory holding the combined C1 mouse limb analysis tables.
# '~diane' expands to that user's home directory regardless of who runs the
# notebook, so this only resolves on a machine where that account exists.
c1_m21 = Path('~diane/proj/C1_mouse_limb_combined/all_analysis_M21/').expanduser()

In [9]:
# Show which .tsv tables are available in the analysis directory.
list(c1_m21.glob('*.tsv'))

[PosixPath('/woldlab/loxcyc/home/diane/proj/C1_mouse_limb_combined/all_analysis_M21/libraries-passing.tsv'),
 PosixPath('/woldlab/loxcyc/home/diane/proj/C1_mouse_limb_combined/all_analysis_M21/library-passing.tsv'),
 PosixPath('/woldlab/loxcyc/home/diane/proj/C1_mouse_limb_combined/all_analysis_M21/experiments-by-run-passing.tsv')]

In [11]:
# Load the per-run experiment table. Later cells iterate over its rows and
# read each row's `replicates` as the list of library ids in that experiment.
c1_m21_experiments = load_experiments([c1_m21 / 'experiments-by-run-passing.tsv'])

In [13]:
# Build one record per (library, spike-in reference) pair by fetching every
# library of every experiment from the server.
# The spike-in entries are sliced as '/references/<accession>/' paths: the
# leading prefix and the trailing slash are dropped to keep the accession.
records = []
reference_prefix = '/references/'
for name, row in c1_m21_experiments.iterrows():
    for library_id in row.replicates:
        library = server.get_json('barbara-wold:{}'.format(library_id))
        records.extend(
            {
                'experiment_name': name,
                'library_id': library_id,
                'spikes_used': spike_path[len(reference_prefix):-1],
            }
            for spike_path in library['spikeins_used']
        )

In [16]:
# Distinct spike-in accessions seen across all libraries.
spikes_used = set(record['spikes_used'] for record in records)
spikes_used

{'ENCSR013YHQ', 'ENCSR156CIL', 'ENCSR535LMC', 'ENCSR722NWD', 'ENCSR881CTY'}

In [20]:
# Look up each spike-in reference on the server and keep its description plus
# the single 'barbara-wold:' alias, if there is exactly one.
spike_details = {}
for spike in spikes_used:
    obj = server.get_json(f'/references/{spike}/')
    lab_aliases = [x for x in obj['aliases'] if x.startswith('barbara-wold:')]
    if len(lab_aliases) == 1:
        alias = lab_aliases[0]
    else:
        # Zero or several lab aliases: there is no unambiguous name to record.
        alias = None
        if lab_aliases:
            # Report the conflicting aliases themselves; the previous version
            # reset the variable to None first and therefore printed "None".
            print("Confused", spike, lab_aliases)

    spike_details[spike] = {
        'description': obj['description'],
        'alias': alias
    }

In [21]:
# Display the collected spike-in descriptions and aliases.
# NOTE(review): in the recorded output ENCSR535LMC has the description
# 'profile C1_1 ...' but the alias 'barbara-wold:profile_C1_3'; confirm which
# one is correct on the portal, since the description becomes `spike_name`.
spike_details

{'ENCSR535LMC': {'description': 'profile C1_1 ERCC spike-in concentrations used for C1 fluidigm',
  'alias': 'barbara-wold:profile_C1_3'},
 'ENCSR881CTY': {'description': 'profile C1_2 ERCC spike-in concentrations used for C1 fluidigm',
  'alias': 'barbara-wold:profile_C1_2'},
 'ENCSR722NWD': {'description': 'profile C1_1 ERCC spike-in concentrations used for C1 fluidigm',
  'alias': 'barbara-wold:profile_C1_1'},
 'ENCSR156CIL': {'description': 'Ambion mix 1 spike-ins',
  'alias': 'barbara-wold:ERCC'},
 'ENCSR013YHQ': {'description': 'Caltech profile 4 spike-ins',
  'alias': 'barbara-wold:SpikeProfile4'}}

In [40]:
# Annotate every record with the spike-in description/alias and with the run
# number parsed from the experiment name (e.g. '..._run4' -> 4).
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
run_pattern = re.compile(r'run(?P<run>\d+)')
for row in records:
    details = spike_details[row['spikes_used']]
    row['spike_name'] = details['description']
    row['spike_alias'] = details['alias']
    match = run_pattern.search(row['experiment_name'])
    # Records whose experiment name has no 'run<N>' part get no 'run' key.
    if match:
        row['run'] = int(match.group('run'))

In [41]:
# Turn the annotated records into a table, one row per library/spike-in pair,
# and preview the first rows.
experiment_spikes = pandas.DataFrame(data=records)
experiment_spikes.head(n=5)

Unnamed: 0,experiment_name,library_id,spikes_used,spike_name,spike_alias,run
0,C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,18251_A1,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,4
1,C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,18251_A10,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,4
2,C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,18251_A11,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,4
3,C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,18251_A12,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,4
4,C1_mouse_e13.5_limb_mesenchyme_mm10_clean_run4,18251_A2,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,4


In [49]:
# Number of rows with a non-null run for each spike-in accession.
experiment_spikes[['spikes_used', 'run']].groupby('spikes_used').count()

Unnamed: 0_level_0,run
spikes_used,Unnamed: 1_level_1
ENCSR013YHQ,18
ENCSR156CIL,55
ENCSR535LMC,845
ENCSR722NWD,78
ENCSR881CTY,41


In [50]:
# Order the table by run, then by library id. Library ids are strings, so
# within a run they sort lexically (A1, A10, A11, A12, A2, ...).
experiment_spikes = experiment_spikes.sort_values(['run', 'library_id'])

In [51]:
# Display the sorted table to check the ordering by run and library id.
experiment_spikes

Unnamed: 0,experiment_name,library_id,spikes_used,spike_name,spike_alias,run
389,C1_mouse_e10.5_limb_mm10_clean_run1,18042_A1,ENCSR722NWD,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_1,1
390,C1_mouse_e10.5_limb_mm10_clean_run1,18042_A10,ENCSR013YHQ,Caltech profile 4 spike-ins,barbara-wold:SpikeProfile4,1
391,C1_mouse_e10.5_limb_mm10_clean_run1,18042_A11,ENCSR722NWD,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_1,1
392,C1_mouse_e10.5_limb_mm10_clean_run1,18042_A12,ENCSR013YHQ,Caltech profile 4 spike-ins,barbara-wold:SpikeProfile4,1
393,C1_mouse_e10.5_limb_mm10_clean_run1,18042_A2,ENCSR722NWD,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_1,1
...,...,...,...,...,...,...
815,C1_mouse_e14.5_forelimb_run17_January16_2018,19917_D4,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,17
816,C1_mouse_e14.5_forelimb_run17_January16_2018,19917_D5,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,17
817,C1_mouse_e14.5_forelimb_run17_January16_2018,19917_D6,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,17
818,C1_mouse_e14.5_forelimb_run17_January16_2018,19917_D7,ENCSR535LMC,profile C1_1 ERCC spike-in concentrations used...,barbara-wold:profile_C1_3,17


In [52]:
# Save the annotated spike-in table next to the notebook, without the
# DataFrame index column.
spike_table_csv = 'c1-experiments-to-annotated-spike-ins-used.csv'
experiment_spikes.to_csv(spike_table_csv, index=False)