Skip to content

Commit

Permalink
Merge pull request #187 from sunbeam-labs/dev
Browse files Browse the repository at this point in the history
Sunbeam version 2.0.0
  • Loading branch information
louiejtaylor committed Jan 22, 2019
2 parents 0399ae0 + 2217a60 commit ba0aa2c
Show file tree
Hide file tree
Showing 20 changed files with 971 additions and 124 deletions.
14 changes: 10 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ from pathlib import Path, PurePath
from snakemake.utils import update_config, listfiles
from snakemake.exceptions import WorkflowError

from sunbeamlib import build_sample_list, read_seq_ids
from sunbeamlib import load_sample_list, read_seq_ids
from sunbeamlib.config import *
from sunbeamlib.reports import *

Expand Down Expand Up @@ -55,9 +55,10 @@ for sbx in sbxs:
# Setting up config files and samples
Cfg = check_config(config)
Blastdbs = process_databases(Cfg['blastdbs'])
Samples = build_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'])
Samples = load_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'], Cfg['all']['download_reads'], Cfg["all"]['root']/Cfg['all']['output_fp'])
Pairs = ['1', '2'] if Cfg['all']['paired_end'] else ['1']


# Collect host (contaminant) genomes
sys.stderr.write("Collecting host/contaminant genomes... ")
if Cfg['qc']['host_fp'] == Cfg['all']['root']:
Expand Down Expand Up @@ -87,13 +88,19 @@ sys.stderr.write("done.\n")
workdir: str(Cfg['all']['output_fp'])

# ---- Set up output paths for the various steps
DOWNLOAD_FP = output_subdir(Cfg, 'download')
QC_FP = output_subdir(Cfg, 'qc')
ASSEMBLY_FP = output_subdir(Cfg, 'assembly')
ANNOTATION_FP = output_subdir(Cfg, 'annotation')
CLASSIFY_FP = output_subdir(Cfg, 'classify')
MAPPING_FP = output_subdir(Cfg, 'mapping')


# ---- Download rules
if Cfg['all']['download_reads']:
include: "rules/download/download.rules"


# ---- Targets rules
include: "rules/targets/targets.rules"

Expand All @@ -105,6 +112,7 @@ include: "rules/qc/decontaminate.rules"

# ---- Assembly rules
include: "rules/assembly/assembly.rules"
include: "rules/assembly/coverage.rules"


# ---- Contig annotation rules
Expand Down Expand Up @@ -136,5 +144,3 @@ rule samples:
message: "Samples to be processed:"
run:
[print(sample) for sample in sorted(list(Samples.keys()))]


13 changes: 13 additions & 0 deletions docs/_static/theme_overrides.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* override table width restrictions */
@media screen and (min-width: 767px) {

.wy-table-responsive table td {
/* !important prevents the common CSS stylesheets from overriding
this as on RTD they are loaded after this stylesheet */
white-space: normal !important;
}

.wy-table-responsive {
overflow: visible !important;
}
}
6 changes: 6 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

html_context = {
'css_files': [
'_static/theme_overrides.css', # override wide tables in RTD theme
],
}

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
Expand Down
356 changes: 356 additions & 0 deletions docs/extensions.rst

Large diffs are not rendered by default.

44 changes: 35 additions & 9 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ conda environment:
You should see '(sunbeam)' in your prompt when you're in the environment. To leave
the environment, run ``source deactivate`` or close the terminal.

Creating a new project
Creating a new project using local data
----------------------

We provide a utility, ``sunbeam init``, to create a new config file and sample
Expand Down Expand Up @@ -171,6 +171,33 @@ part of the init command.
Further usage information is available by typing ``sunbeam init --help``.


Creating a new project using data from SRA
----------------------

If you'd like to analyze public data from `NCBI SRA <https://www.ncbi.nlm.nih.gov/sra/>`_,
we provide a feature in ``sunbeam init`` to use SRA as a data source instead of
files already on local disk. Run ``sunbeam init`` with ``--data_acc`` instead
of ``--data_fp``, giving one or more SRA accession numbers.

.. code-block:: shell
sunbeam init /path/to/my_project --data_acc SRP#######
You can pass any number of SRA run identifiers (SRR/ERR - one sample), SRA project
identifiers (SRP/ERP - one or more samples), or BioProject identifiers (PRJNA - one
or more samples). For example, the below command will initialize a project for
analyzing the 34 samples from ``SRP159164`` plus the single sample ``ERR1759004``:

.. code-block:: shell
sunbeam init /path/to/my_project --data_acc SRP159164 ERR1759004
Sometimes, SRA projects contain both paired and unpaired reads. If this is the case,
two config files and sample lists will be output--one prepended with "paired\_"and
one prepended with "unpaired\_" (as Sunbeam runs on either paired or unpaired reads,
but not both). Sunbeam uses the SRA metadata to call reads as paired- or single-end so
it is only as accurate as the SRA metadata.

Configuration
=============

Expand All @@ -192,6 +219,8 @@ all
list_samples``.
* ``paired_end``: 'true' or 'false' depending on whether you are using paired-
or single-end reads.
* ``download_reads``: 'true' or 'false' depending on whether you are using reads
from NCBI SRA.
* ``version``: Automatically added for you by ``sunbeam init``. Ensures
compatibility with the right version of Sunbeam.

Expand Down Expand Up @@ -300,6 +329,11 @@ mapping
mapping. This is a good place to add '-F4' to keep only mapped reads and
decrease the space these files occupy.

download
++++++++
* ``suffix``: the name of the subfolder to create for download output (fastq.gz files)
* ``threads``: number of threads to use for downloading (too many at once may make NCBI unhappy)


.. _running:

Expand Down Expand Up @@ -492,11 +526,3 @@ Quality control
This folder contains the trimmed, low-complexity filtered reads in
``cleaned``. The ``decontam`` folder contains the cleaned reads that did not map
to any contaminant or host genomes. In general, most downstream steps should reference the ``decontam`` reads.


.. _troubleshooting:

Troubleshooting
===============

Coming soon. For now we suggest browsing the closed issues tab on Github.
4 changes: 3 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ channels:
- bioconda
- conda-forge
- eclarke
- louiejtaylor
dependencies:
- snakemake=4.8.1
- ruamel.yaml
Expand All @@ -23,4 +24,5 @@ dependencies:
- setuptools_scm
- komplexity=0.3.6
- rust-bio-tools

- grabseqs=0.3.4
- minimap2
22 changes: 19 additions & 3 deletions rules/annotation/blast.rules
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ rule run_blastn:
-outfmt 5 \
-num_threads {threads} \
-evalue 1e-10 \
-max_target_seqs 1 \
-max_target_seqs 5000 \
-out {output} \
"""

Expand All @@ -49,7 +49,7 @@ rule run_blastp:
-outfmt 5 \
-num_threads {threads} \
-evalue 1e-10 \
-max_target_seqs 1 \
-max_target_seqs 2475 \
-out {output} \
"""

Expand All @@ -70,7 +70,7 @@ rule run_blastx:
-outfmt 5 \
-num_threads {threads} \
-evalue 1e-10 \
-max_target_seqs 1 \
-max_target_seqs 2475 \
-out {output} \
"""

Expand Down Expand Up @@ -100,3 +100,19 @@ rule _test_blastpx_report:
input:
expand(str(ANNOTATION_FP/'{blastpx}'/'card'/'prodigal'/'report.tsv'),
blastpx=['blastx','blastp'])

rule clean_xml:
input:
expand(str(ANNOTATION_FP/'summary'/'{sample}.tsv'), sample=Samples.keys())
params:
blastn_fp = str(ANNOTATION_FP/'blastn'),
blastp_fp = str(ANNOTATION_FP/'blastp'),
blastx_fp = str(ANNOTATION_FP/'blastx')
output:
touch(".xml_cleaned")
shell:
"""
if [ -d {params.blastn_fp} ]; then rm -r {params.blastn_fp}; fi && \
if [ -d {params.blastp_fp} ]; then rm -r {params.blastp_fp}; fi && \
if [ -d {params.blastx_fp} ]; then rm -r {params.blastx_fp}; fi
"""
30 changes: 28 additions & 2 deletions rules/assembly/assembly.rules
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,20 @@ rule megahit_paired:
out_fp = str(ASSEMBLY_FP/'megahit'/'{sample}_asm')
shell:
"""
megahit -1 {input.r1} -2 {input.r2} -o {params.out_fp} --continue || \
## turn off bash strict mode
set +o pipefail
## sometimes the error is due to lack of memory
exitcode=0
megahit -1 {input.r1} -2 {input.r2} -o {params.out_fp} --continue || exitcode=$?
if [ $exitcode -eq 255 ]
then
echo "Empty contigs"
elif [ $exitcode -gt 1 ]
then
echo "Check your memory"
fi
if [ ! -a {output} ]; then touch {output}; fi
"""

Expand All @@ -34,7 +47,20 @@ rule megahit_unpaired:
out_fp = str(ASSEMBLY_FP/'megahit'/'{sample}_asm')
shell:
"""
megahit -r {input} -o {params.out_fp} --continue || \
## turn off bash strict mode
set +o pipefail
## sometimes the error is due to lack of memory
exitcode=0
megahit -r {input} -o {params.out_fp} --continue || exitcode=$?
if [ $exitcode -eq 255 ]
then
echo "Empty contigs"
elif [ $exitcode -gt 1 ]
then
echo "Check your memory"
fi
if [ ! -a {output} ]; then touch {output}; fi
"""

Expand Down
107 changes: 107 additions & 0 deletions rules/assembly/coverage.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# -*- mode: Snakemake -*-
#
# Map reads to contigs and calculate per base coverage
#
# Requires Minimap2 and samtools.


import csv
import numpy


rule all_coverage:
input:
str(ASSEMBLY_FP/'contigs_coverage.txt')

rule _contigs_mapping:
input:
expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth'),
sample=Samples.keys())

rule _all_coverage:
input:
expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv'),
sample=Samples.keys())

rule contigs_mapping:
input:
contig = str(ASSEMBLY_FP/'contigs'/'{sample}-contigs.fa'),
reads = expand(str(QC_FP/'decontam'/'{{sample}}_{rp}.fastq.gz'),rp = Pairs)
output:
bam = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam'),
bai = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam.bai')
params:
temp = temp(str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.tmp'))
threads:
Cfg['mapping']['threads']
shell:
"""
minimap2 -ax sr -t {threads} {input.contig} {input.reads} | \
samtools sort -@ {threads} -o {output.bam} -T {params.temp} -
samtools index {output.bam} {output.bai}
"""

rule mapping_depth:
input:
bam = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam'),
bai = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam.bai')
output:
depth = str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth')
shell:
"""
samtools depth -aa {input.bam} > {output.depth}
"""

def _get_coverage(input_fp, sample, output_fp):
"""
Summarize stats for coverage data for each sample
"""

with open(input_fp) as f:
reader = csv.reader(f, delimiter='\t')
data = {}
for row in reader:
if not data.get(row[0]):
data[row[0]] = []
data[row[0]].append(int(row[2]))

# summarize stats for all segments present and append to output
output_rows = []
for segment in data.keys():
sumval = numpy.sum(data[segment])
minval = numpy.min(data[segment])
maxval = numpy.max(data[segment])
mean = numpy.mean(data[segment])
median = numpy.median(data[segment])
stddev = numpy.std(data[segment])
gen_cov = len(list(filter(lambda x: x!=0, data[segment])))
gen_length = len(data[segment])
row = [sample, segment, sumval, minval, maxval, mean, median, stddev, gen_cov, gen_length]
output_rows.append(row)

# write out stats per segment per sample
fields = ['sample','contig', 'sum', 'min', 'max', 'mean', 'median', 'stddev', 'coverage', 'length']
with open(output_fp, 'w') as f:
writer = csv.writer(f)
writer.writerow(fields)
writer.writerows(output_rows)


rule get_coverage:
input:
str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth')
output:
str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv')
run:
_get_coverage(input[0], wildcards.sample, output[0])


rule summarize_coverage:
input:
expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv'),
sample = Samples.keys())
output:
str(ASSEMBLY_FP/'contigs_coverage.txt')
shell:
"(head -n 1 {input[0]}; tail -q -n +2 {input}) > {output}"

0 comments on commit ba0aa2c

Please sign in to comment.