# Introduction

The 10x e10.5 runs were kind of boring, and it was hard to recognize what they were doing. How about we try replicating the paper's analysis with alevin and kallisto?

In [1]:
import pandas
from pathlib import Path
import os
import sys
from glob import glob
import re

In [2]:
# Root of the input FASTQ tree, and the directory that receives the generated
# condor submit files; both are given relative to the user's home directory.
fastq_root, target_root = (
    Path(location).expanduser()
    for location in (
        '~/proj/brian-2018-01-10x',
        '~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/10x_paper',
    )
)

In [3]:
# Run name -> directories holding that run's FASTQ files, written relative to
# fastq_root.  A run may be spread over several directories (see '10x-1').
_run_subdirs = {
    '10x-1': [
        'Chromium_20170819/SI-GA-D5_1/',
        'Chromium_20170819/SI-GA-D5_2/',
        'Chromium_20170819/SI-GA-D5_3/',
        'Chromium_20170819/SI-GA-D5_4/',
    ],
    '10x-3': ['SE731_new/outs/fastq_path/'],
    '10x-4': ['SE732_new/outs/fastq_path/'],
    '10x-5': ['SE733/outs/fastq_path/'],
    '10x-6': ['SE734/outs/fastq_path'],
    '10x-7': ['FT-BB01669/FT-SA17497_FT-TS92801'],
    '10x-8': ['FT-BB01670/FT-SA17501_FT-TS92805'],
    # 10x-9 is disabled (the reason is not recorded in this notebook):
    #'10x-9': ['FT-BB01671/FT-SA17505_FT-TS92809'],
    '10x-10': ['FT-BB01672/FT-SA17509_FT-TS92813'],
    '10x-11': ['FT-BB01673/FT-SA17513_FT-TS92817'],
    '10x-12': ['FT-BB01674/FT-SA17517_FT-TS92821'],
    '10x-13': ['FT-BB01675/FT-SA17521_FT-TS92825'],
}

# Anchor every relative sub-directory under fastq_root.
runs = {
    run_name: [fastq_root / subdir for subdir in subdirs]
    for run_name, subdirs in _run_subdirs.items()
}

In [4]:
# Echo the run -> directory mapping so the expanded paths can be checked by eye.
runs

{'10x-1': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/Chromium_20170819/SI-GA-D5_1'),
  PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/Chromium_20170819/SI-GA-D5_2'),
  PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/Chromium_20170819/SI-GA-D5_3'),
  PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/Chromium_20170819/SI-GA-D5_4')],
 '10x-3': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/SE731_new/outs/fastq_path')],
 '10x-4': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/SE732_new/outs/fastq_path')],
 '10x-5': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/SE733/outs/fastq_path')],
 '10x-6': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/SE734/outs/fastq_path')],
 '10x-7': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/FT-BB01669/FT-SA17497_FT-TS92801')],
 '10x-8': [PosixPath('/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/FT-BB01670/FT-SA17501_FT-TS92

In [5]:
# Collect the read FASTQ files of every run into run_fastqs: run name -> list
# of paths.  A run whose directories contain no matching file gets no entry.
run_fastqs = {}
for run_name, run_directories in runs.items():
    for run_directory in run_directories:
        # Directory listings come back in filesystem order; sort them so the
        # generated submit files are identical from one execution to the next.
        for filename in sorted(run_directory.glob('*_R*.fastq.gz')):
            # Skip "Undetermined" files.  Only the file name is tested, so a
            # parent directory that happens to contain the word cannot hide
            # every file of a run.
            if 'Undetermined' in filename.name:
                continue
            run_fastqs.setdefault(run_name, []).append(filename)

In [6]:
def pair_reads(fastqs):
    """Group FASTQ paths into (read 1, read 2) mate pairs.

    A file takes part in pairing when its name is exactly
    ``<prefix>_R1_001.fastq.gz`` or ``<prefix>_R2_001.fastq.gz``; two files
    are mates when they share the same ``<prefix>``.  Any other name (index
    reads, checksum side-cars, ...) is ignored.

    Parameters
    ----------
    fastqs : iterable of pathlib.Path
        Candidate files.  Only the ``name`` attribute is inspected.

    Yields
    ------
    tuple
        ``(read1_path, read2_path)`` for each prefix, in the order in which
        the prefixes first appear in ``fastqs``.

    Raises
    ------
    KeyError
        When a prefix has only one of its two mates.  Because this is a
        generator, the error surfaces while iterating, not at call time.
    """
    # The dots are escaped and the whole name has to match: otherwise names
    # such as "x_R1_001xfastq.gz" or "x_R1_001.fastq.gz.md5" would be accepted
    # and could silently replace the real mate stored under the same prefix.
    pattern = re.compile(r"(?P<prefix>.*)_(?P<read>R[12])_001\.fastq\.gz")
    mates = {}
    for pathname in fastqs:
        match = pattern.fullmatch(pathname.name)
        if match:
            read_id = match.group('read')[1:]
            mates.setdefault(match.group('prefix'), {})[read_id] = pathname

    for prefix, reads in mates.items():
        absent = ['R' + read_id for read_id in ('1', '2') if read_id not in reads]
        if absent:
            # Same exception type as a plain dictionary lookup would raise,
            # but the message says which file set is incomplete.
            raise KeyError(f"{prefix}: no {absent[0]} mate found")
        yield (reads['1'], reads['2'])


In [7]:
def kallisto_condor(runs, target_root, flavor):
    """Render the text of a condor submit file that queues one job per run.

    Parameters
    ----------
    runs : dict
        Run name -> list of FASTQ paths; the paths are grouped into mate
        pairs with ``pair_reads``.
    target_root : pathlib.Path
        Not used by this generator; it is part of the signature shared by
        the submit-file generators.
    flavor : str
        ``'kallisto_em'`` or ``'kallisto_em_minimal'``.  Selects the wrapper
        script used as the executable and names the log files.

    Returns
    -------
    str
        The submit header followed, for each run, by an ``arguments=`` line
        (run name, then the FASTQs as read 1, read 2, read 1, read 2, ...)
        and a ``queue`` line.

    Raises
    ------
    AssertionError
        When ``flavor`` is not one of the two known flavors.
    """
    wrapper_scripts = {
        'kallisto_em': Path('~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/10x_paper/kallisto-em.sh').expanduser(),
        'kallisto_em_minimal': Path('~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/10x_paper/kallisto-em-minimal.sh').expanduser(),
    }
    assert flavor in wrapper_scripts

    submit_template = """#!/usr/bin/condor_submit

universe=vanilla
log=logs/{flavor}.log
output=logs/{flavor}.$(process).out
error=logs/{flavor}.$(process).out

THREADS=8

request_cpus=$(THREADS)
request_memory=4G
executable={command}
"""
    submit_lines = [
        submit_template.format(flavor=flavor, command=wrapper_scripts[flavor])
    ]
    for run in runs:
        # Flatten the mate pairs into one "R1 R2 R1 R2 ..." argument string.
        fastqs = " ".join(
            str(mate)
            for pair in pair_reads(runs[run])
            for mate in (pair[0], pair[1])
        )
        submit_lines.extend([f'arguments="{run} {fastqs}"', 'queue'])
    return '\n'.join(submit_lines)

In [8]:
def alevin_condor(runs, target_root, flavor):
    """Render the text of a condor submit file that runs alevin once per run.

    Besides building the text, this creates the output directory
    ``target_root / flavor / <run>`` for every run, including any missing
    parent directories.

    Parameters
    ----------
    runs : dict
        Run name -> list of FASTQ paths; the paths are grouped into mate
        pairs with ``pair_reads``.
    target_root : pathlib.Path
        Directory under which the per-flavor, per-run output directories
        are created.
    flavor : str
        One of ``'alevin'``, ``'alevin_decoy'``, ``'alevin_minimal'`` or
        ``'alevin_decoy_minimal'``.  Selects the genome directory and the
        salmon index inside it, and names the log files.

    Returns
    -------
    str
        The submit header followed, for each run, by an ``arguments=`` line
        and a ``queue`` line.

    Raises
    ------
    KeyError
        When ``flavor`` is not one of the known flavors.
    """
    project_root = Path('~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline').expanduser()
    # flavor -> (genome directory name, salmon index directory name)
    flavor_settings = {
        'alevin': ('mm10-M21-male', 'salmon_index'),
        'alevin_decoy': ('mm10-M21-male', 'salmon_index_decoy'),
        'alevin_minimal': ('mm10-M21_minimal-male', 'salmon_index'),
        'alevin_decoy_minimal': ('mm10-M21_minimal-male', 'salmon_index_decoy'),
    }
    genome_name, index_name = flavor_settings[flavor]
    genome_dir = project_root / 'genome' / genome_name
    index_dir = genome_dir / index_name

    # NOTE(review): HOME is hard-coded below although project_root is derived
    # from the current user's home directory -- confirm the two are meant to agree.
    header = """#!/usr/bin/condor_submit

universe=vanilla
log=logs/{flavor}.log
output=logs/{flavor}.$(process).out
error=logs/{flavor}.$(process).out

THREADS=8
HOME=/woldlab/loxcyc/home/diane
TGMAP={genome_dir}/txp2gene.tsv
SALMON_SIMG=$(HOME)/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/salmon-container/salmon-unstable.simg

request_cpus=$(THREADS)
executable=/usr/bin/singularity
""".format(flavor=flavor, genome_dir=str(genome_dir))
    lines = [header]
    for run in runs:
        read1 = ['-1']
        read2 = ['-2']
        for pair in pair_reads(runs[run]):
            # The R2 file is passed as -1 and the R1 file as -2.  The original
            # author was unsure about this swap ("alevin wanted the reads
            # backwards?").  NOTE(review): confirm against the alevin docs.
            read1.append(str(pair[1]))
            read2.append(str(pair[0]))

        read1_arg = " ".join(read1)
        read2_arg = " ".join(read2)
        run_dir = target_root / flavor / run
        # One call creates the flavor directory and the run directory, works
        # when target_root itself does not exist yet, and is safe to repeat.
        run_dir.mkdir(parents=True, exist_ok=True)
        lines.append(f'arguments="run $(SALMON_SIMG) alevin -l ISR {read1_arg} {read2_arg} --chromium -i {index_dir} -p $(THREADS) -o {run_dir} --dumpMtx --tgMap $(TGMAP)"')
        lines.append('queue')
    return '\n'.join(lines)



In [9]:
# Flavor name -> function that renders the condor submit file for that flavor.
# Each flavor name also becomes the stem of the generated "<flavor>.condor" file.
algorithms = {
    **dict.fromkeys(
        ('kallisto_em', 'kallisto_em_minimal'),
        kallisto_condor,
    ),
    **dict.fromkeys(
        ('alevin', 'alevin_minimal', 'alevin_decoy', 'alevin_decoy_minimal'),
        alevin_condor,
    ),
}

In [10]:
# Write one "<flavor>.condor" submit file into target_root per configured flavor.
for name, template in algorithms.items():
    submit_path = target_root / (name + '.condor')
    with submit_path.open('wt') as outstream:
        # The file is opened first, then the text is rendered and written.
        submit_text = template(run_fastqs, target_root, name)
        outstream.write(submit_text)
    