Merge pull request #187 from sunbeam-labs/dev

Sunbeam version 2.0.0
sunbeam-labs · Jan 22, 2019 · ba0aa2c · ba0aa2c
2 parents 0399ae0 + 2217a60
commit ba0aa2c
Show file tree

Hide file tree

Showing 20 changed files with 971 additions and 124 deletions.
diff --git a/Snakefile b/Snakefile
@@ -16,7 +16,7 @@ from pathlib import Path, PurePath
 from snakemake.utils import update_config, listfiles
 from snakemake.exceptions import WorkflowError
 
-from sunbeamlib import build_sample_list, read_seq_ids
+from sunbeamlib import load_sample_list, read_seq_ids
 from sunbeamlib.config import *
 from sunbeamlib.reports import *
 
@@ -55,9 +55,10 @@ for sbx in sbxs:
 # Setting up config files and samples
 Cfg = check_config(config)
 Blastdbs = process_databases(Cfg['blastdbs'])
-Samples = build_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'])
+Samples = load_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'], Cfg['all']['download_reads'], Cfg["all"]['root']/Cfg['all']['output_fp'])
 Pairs = ['1', '2'] if Cfg['all']['paired_end'] else ['1']
 
+
 # Collect host (contaminant) genomes
 sys.stderr.write("Collecting host/contaminant genomes... ")
 if Cfg['qc']['host_fp'] == Cfg['all']['root']:
@@ -87,13 +88,19 @@ sys.stderr.write("done.\n")
 workdir: str(Cfg['all']['output_fp'])
 
 # ---- Set up output paths for the various steps
+DOWNLOAD_FP = output_subdir(Cfg, 'download')
 QC_FP = output_subdir(Cfg, 'qc')
 ASSEMBLY_FP = output_subdir(Cfg, 'assembly')
 ANNOTATION_FP = output_subdir(Cfg, 'annotation')
 CLASSIFY_FP = output_subdir(Cfg, 'classify')
 MAPPING_FP = output_subdir(Cfg, 'mapping')
 
 
+# ---- Download rules
+if Cfg['all']['download_reads']:
+	include: "rules/download/download.rules"
+
+
 # ---- Targets rules
 include: "rules/targets/targets.rules"
 
@@ -105,6 +112,7 @@ include: "rules/qc/decontaminate.rules"
 
 # ---- Assembly rules
 include: "rules/assembly/assembly.rules"
+include: "rules/assembly/coverage.rules"
 
 
 # ---- Contig annotation rules
@@ -136,5 +144,3 @@ rule samples:
     message: "Samples to be processed:"
     run:
         [print(sample) for sample in sorted(list(Samples.keys()))]
-
-
diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css
@@ -0,0 +1,13 @@
+/* override table width restrictions */
+@media screen and (min-width: 767px) {
+
+   .wy-table-responsive table td {
+      /* !important prevents the common CSS stylesheets from overriding
+         this as on RTD they are loaded after this stylesheet */
+      white-space: normal !important;
+   }
+
+   .wy-table-responsive {
+      overflow: visible !important;
+   }
+}
diff --git a/docs/conf.py b/docs/conf.py
@@ -96,6 +96,12 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
 
+html_context = {
+    'css_files': [
+        '_static/theme_overrides.css',  # override wide tables in RTD theme
+        ],
+     }
+
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
 #

diff --git a/docs/extensions.rst b/docs/extensions.rst
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -118,7 +118,7 @@ conda environment:
 You should see '(sunbeam)' in your prompt when you're in the environment. To leave
 the environment, run ``source deactivate`` or close the terminal.
 
-Creating a new project
+Creating a new project using local data
 ----------------------
 
 We provide a utility, ``sunbeam init``, to create a new config file and sample
@@ -171,6 +171,33 @@ part of the init command.
 Further usage information is available by typing ``sunbeam init --help``.
 
 
+Creating a new project using data from SRA
+----------------------
+
+If you'd like to analyze public data from `NCBI SRA <https://www.ncbi.nlm.nih.gov/sra/>`_,
+we provide a feature in ``sunbeam init`` to use SRA as a data source instead of
+files already on local disk. Run ``sunbeam init`` with ``--data_acc`` instead
+of ``--data_fp``, giving one or more SRA accession numbers.
+
+.. code-block:: shell
+
+   sunbeam init /path/to/my_project --data_acc SRP#######
+
+You can pass any number of SRA run identifiers (SRR/ERR - one sample), SRA project 
+identifiers (SRP/ERP - one or more samples), or BioProject identifiers (PRJNA - one 
+or more samples). For example, the below command will initialize a project for 
+analyzing the 34 samples from ``SRP159164`` plus the single sample ``ERR1759004``:
+
+.. code-block:: shell
+
+   sunbeam init /path/to/my_project --data_acc SRP159164 ERR1759004
+   
+Sometimes, SRA projects contain both paired and unpaired reads. If this is the case,
+two config files and sample lists will be output--one prepended with "paired\_"and
+one prepended with "unpaired\_" (as Sunbeam runs on either paired or unpaired reads, 
+but not both). Sunbeam uses the SRA metadata to call reads as paired- or single-end so 
+it is only as accurate as the SRA metadata.
+
 Configuration
 =============
 
@@ -192,6 +219,8 @@ all
   list_samples``.
 * ``paired_end``: 'true' or 'false' depending on whether you are using paired-
   or single-end reads.
+* ``download_reads``: 'true' or 'false' depending on whether you are using reads
+  from NCBI SRA.
 * ``version``: Automatically added for you by ``sunbeam init``. Ensures
   compatibility with the right version of Sunbeam.
 
@@ -300,6 +329,11 @@ mapping
   mapping. This is a good place to add '-F4' to keep only mapped reads and
   decrease the space these files occupy.
 
+download
+++++++++
+* ``suffix``: the name of the subfolder to create for download output (fastq.gz files)
+* ``threads``: number of threads to use for downloading (too many at once may make NCBI unhappy)
+
 
 .. _running:
 
@@ -492,11 +526,3 @@ Quality control
 This   folder   contains  the   trimmed,   low-complexity   filtered  reads   in
 ``cleaned``. The ``decontam`` folder contains the cleaned reads that did not map
 to any contaminant or host genomes. In general, most downstream steps should reference the ``decontam`` reads.
-
-
-.. _troubleshooting:
-
-Troubleshooting
-===============
-
-Coming soon. For now we suggest browsing the closed issues tab on Github.
diff --git a/environment.yml b/environment.yml
@@ -2,6 +2,7 @@ channels:
   - bioconda
   - conda-forge
   - eclarke
+  - louiejtaylor
 dependencies:
   - snakemake=4.8.1
   - ruamel.yaml
@@ -23,4 +24,5 @@ dependencies:
   - setuptools_scm
   - komplexity=0.3.6
   - rust-bio-tools
-
+  - grabseqs=0.3.4
+  - minimap2
diff --git a/rules/annotation/blast.rules b/rules/annotation/blast.rules
@@ -28,7 +28,7 @@ rule run_blastn:
         -outfmt 5 \
         -num_threads {threads} \
         -evalue 1e-10 \
-        -max_target_seqs 1 \
+        -max_target_seqs 5000 \
         -out {output} \
         """
 
@@ -49,7 +49,7 @@ rule run_blastp:
         -outfmt 5 \
         -num_threads {threads} \
         -evalue 1e-10 \
-        -max_target_seqs 1 \
+        -max_target_seqs 2475 \
         -out {output} \
         """
 
@@ -70,7 +70,7 @@ rule run_blastx:
         -outfmt 5 \
         -num_threads {threads} \
         -evalue 1e-10 \
-        -max_target_seqs 1 \
+        -max_target_seqs 2475 \
         -out {output} \
         """
 
@@ -100,3 +100,19 @@ rule _test_blastpx_report:
     input:
         expand(str(ANNOTATION_FP/'{blastpx}'/'card'/'prodigal'/'report.tsv'),
         blastpx=['blastx','blastp'])
+
+rule clean_xml:
+    input:
+        expand(str(ANNOTATION_FP/'summary'/'{sample}.tsv'), sample=Samples.keys())
+    params:
+        blastn_fp = str(ANNOTATION_FP/'blastn'),
+        blastp_fp = str(ANNOTATION_FP/'blastp'),
+        blastx_fp = str(ANNOTATION_FP/'blastx')
+    output:
+        touch(".xml_cleaned")
+    shell:
+        """
+        if [ -d {params.blastn_fp} ]; then rm -r {params.blastn_fp}; fi && \
+        if [ -d {params.blastp_fp} ]; then rm -r {params.blastp_fp}; fi && \
+        if [ -d {params.blastx_fp} ]; then rm -r {params.blastx_fp}; fi
+        """
diff --git a/rules/assembly/assembly.rules b/rules/assembly/assembly.rules
@@ -21,7 +21,20 @@ rule megahit_paired:
         out_fp = str(ASSEMBLY_FP/'megahit'/'{sample}_asm')
     shell:
         """
-        megahit -1 {input.r1} -2 {input.r2} -o {params.out_fp} --continue || \
+        ## turn off bash strict mode
+        set +o pipefail
+
+        ## sometimes the error is due to lack of memory
+        exitcode=0
+        megahit -1 {input.r1} -2 {input.r2} -o {params.out_fp} --continue || exitcode=$?
+        
+        if [ $exitcode -eq 255 ]
+        then
+            echo "Empty contigs"
+        elif [ $exitcode -gt 1 ]
+        then
+            echo "Check your memory"
+        fi
         if [ ! -a {output} ]; then touch {output}; fi
         """
 
@@ -34,7 +47,20 @@ rule megahit_unpaired:
         out_fp = str(ASSEMBLY_FP/'megahit'/'{sample}_asm')
     shell:
         """
-        megahit -r {input} -o {params.out_fp} --continue || \
+        ## turn off bash strict mode
+        set +o pipefail
+
+        ## sometimes the error is due to lack of memory
+        exitcode=0
+        megahit -r {input} -o {params.out_fp} --continue || exitcode=$?
+
+        if [ $exitcode -eq 255 ]
+        then
+            echo "Empty contigs"
+        elif [ $exitcode -gt 1 ]
+        then
+            echo "Check your memory"
+        fi
         if [ ! -a {output} ]; then touch {output}; fi
         """
 

diff --git a/rules/assembly/coverage.rules b/rules/assembly/coverage.rules
@@ -0,0 +1,107 @@
+# -*- mode: Snakemake -*-
+#
+# Map reads to contigs and calculate per base coverage
+#
+# Requires Minimap2 and samtools.
+
+
+import csv
+import numpy
+
+
+rule all_coverage:
+    input:
+        str(ASSEMBLY_FP/'contigs_coverage.txt')
+
+rule _contigs_mapping:
+    input:
+        expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth'),
+               sample=Samples.keys())        
+
+rule _all_coverage:
+    input:
+        expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv'),
+               sample=Samples.keys())
+
+rule contigs_mapping:
+    input:
+        contig = str(ASSEMBLY_FP/'contigs'/'{sample}-contigs.fa'),
+        reads = expand(str(QC_FP/'decontam'/'{{sample}}_{rp}.fastq.gz'),rp = Pairs)
+    output:
+        bam = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam'),
+        bai = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam.bai')
+    params:
+        temp = temp(str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.tmp'))
+    threads: 
+        Cfg['mapping']['threads']
+    shell:
+        """
+        minimap2 -ax sr -t {threads} {input.contig} {input.reads} | \
+            samtools sort -@ {threads} -o {output.bam} -T {params.temp} - 
+        samtools index {output.bam} {output.bai}
+        """
+
+rule mapping_depth:
+    input:
+        bam = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam'),
+        bai = str(ASSEMBLY_FP/'contigs'/'minimap2'/'{sample}.sorted.bam.bai')
+    output:
+        depth = str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth')
+    shell:
+        """
+        samtools depth -aa {input.bam} > {output.depth}
+        """
+
+def _get_coverage(input_fp, sample, output_fp):
+    """
+    Summarize stats for coverage data for each sample 
+    """
+
+    with open(input_fp) as f:
+        reader = csv.reader(f, delimiter='\t')    
+        data = {}
+        for row in reader:
+            if not data.get(row[0]):
+                data[row[0]] = []
+            data[row[0]].append(int(row[2]))
+
+    # summarize stats for all segments present and append to output
+    output_rows = []
+    for segment in data.keys():
+        sumval     = numpy.sum(data[segment])
+        minval     = numpy.min(data[segment])
+        maxval     = numpy.max(data[segment])
+        mean       = numpy.mean(data[segment])
+        median     = numpy.median(data[segment])
+        stddev     = numpy.std(data[segment])
+        gen_cov    = len(list(filter(lambda x: x!=0, data[segment])))
+        gen_length = len(data[segment])
+        row = [sample, segment, sumval, minval, maxval, mean, median, stddev, gen_cov, gen_length]
+        output_rows.append(row)
+
+    # write out stats per segment per sample
+    fields = ['sample','contig', 'sum', 'min', 'max', 'mean', 'median', 'stddev', 'coverage', 'length']
+    with open(output_fp, 'w') as f:
+        writer = csv.writer(f)
+        writer.writerow(fields)
+        writer.writerows(output_rows)
+
+
+rule get_coverage:
+    input:
+        str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.depth')
+    output:
+        str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv')
+    run:
+        _get_coverage(input[0], wildcards.sample, output[0])
+
+
+rule summarize_coverage:
+    input:
+        expand(str(ASSEMBLY_FP/'contigs'/'coverage'/'{sample}.csv'), 
+               sample = Samples.keys())
+    output:
+        str(ASSEMBLY_FP/'contigs_coverage.txt')
+    shell:
+        "(head -n 1 {input[0]}; tail -q -n +2 {input}) > {output}"
+