Skip to content

Commit

Permalink
Merge pull request #144 from sunbeam-labs/dev
Browse files Browse the repository at this point in the history
stable release
  • Loading branch information
eclarke committed May 24, 2018
2 parents 593754c + 97e6278 commit ce2f035
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 85 deletions.
3 changes: 2 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ version: 2
jobs:
build:
docker:
- image: circleci/python:3.6.1
- image: circleci/python:3.6
- image: circleci/python:3.5

steps:
- checkout
Expand Down
49 changes: 45 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,51 @@ Sunbeam currently automates the following tasks:
* Mapping of reads to target genomes; and
* ORF prediction using [Prodigal](https://github.com/hyattpd/Prodigal).

Sunbeam was designed to be modular and extensible. We have a few pre-built
extensions available that handle visualization tasks, including contig
assembly graphs, read alignments, and taxonomic classifications.
Sunbeam was designed to be modular and extensible. Some extensions have been built for:

To get started, see our [documentation!](https://sunbeam.readthedocs.io)
- [IGV](https://github.com/sunbeam-labs/sbx_igv) for viewing read alignments
- [KrakenHLL](https://github.com/zhaoc1/sbx_krakenhll), an alternate read classifier
- [Kaiju](https://github.com/sunbeam-labs/sbx_kaiju), a read classifier that uses BWA rather than kmers
- [Anvi'o](https://github.com/sunbeam-labs/sbx_anvio), a downstream analysis pipeline that does lots of stuff!

To get started, see our [documentation](http://sunbeam.readthedocs.io)!


------

### Changelog:

#### v1.2.0 (May 2, 2018)

- Low-complexity reads are now removed by default rather than masked
- Bug fixes related to single-end sequencing experiments
- Documentation updates

#### v1.1.0 (April 8, 2018)

- Reports include number of filtered reads per host, rather than in aggregate
- Static binary dependency for [komplexity](https://github.com/eclarke/komplexity) for easier deployment
- Remove max length filter for contigs

#### v1.0.0 (March 22, 2018)

- First stable release!
- Support for single-end sequencing experiments
- Low-complexity read masking via [komplexity](https://github.com/eclarke/komplexity)
- Support for extensions
- Documentation on [ReadTheDocs.io](http://sunbeam.readthedocs.io)
- Better assembler (megahit)
- Better ORF finder (prodigal)
- Can remove reads from any number of host/contaminant genomes
- Semantic versioning checks
- Integration tests and continuous deployment

-------

### Contributors

- Erik Clarke ([@eclarke](https://github.com/eclarke))
- Chunyu Zhao ([@zhaoc1](https://github.com/zhaoc1))
- Jesse Connell ([@ressy](https://github.com/ressy))
- Louis Taylor ([@louiejtaylor](https://github.com/louiejtaylor))

1 change: 0 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ Cfg = check_config(config)
Blastdbs = process_databases(Cfg['blastdbs'])
Samples = build_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'])
Pairs = ['1', '2'] if Cfg['all']['paired_end'] else ['1']
print(Samples)

# Collect host (contaminant) genomes
sys.stderr.write("Collecting host/contaminant genomes... ")
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
- eclarke
dependencies:
- snakemake
- snakemake=4.8.1
- ruamel.yaml
- biopython
- trimmomatic
Expand Down
1 change: 1 addition & 0 deletions rules/assembly/assembly.rules
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ rule final_filter:
vsearch --sortbylength {input} \
--minseqlength {params.len} \
--maxseqlength -1 \
--notrunclabels \
--output {input}.{params.len}f &> {log} && \
cp {input}.{params.len}f {output}
else
Expand Down
63 changes: 9 additions & 54 deletions rules/reports/reports.rules
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,14 @@
# ReportGeneration rules

import pandas
from io import StringIO

from collections import OrderedDict
from sunbeamlib import reports

rule all_reports:
input:
TARGET_REPORT

def parse_trim_summary_paired(f):
    """Parse Trimmomatic's paired-end summary line from an open log handle.

    Scans for the line starting with 'Input Read' and pulls out the five
    integer counts on it. Returns an OrderedDict keyed by
    ('input', 'both_kept', 'fwd_only', 'rev_only', 'dropped') with the
    counts as strings, or None (implicitly) if no such line is found.
    """
    for line in f.readlines():
        if line.startswith('Input Read'):
            # Each count is the digits following a run of non-digits and a colon
            vals = re.findall('\D+\: (\d+)', line)
            keys = ('input', 'both_kept','fwd_only','rev_only','dropped')
            return(OrderedDict(zip(keys, vals)))

def parse_trim_summary_single(f):
    """Parse Trimmomatic's single-end summary line from an open log handle.

    Like the paired-end variant but single-end logs report only three
    counts: ('input', 'kept', 'dropped'). Returns an OrderedDict of those
    counts as strings, or None (implicitly) if no 'Input Read' line exists.
    """
    for line in f:
        if line.startswith('Input Read'):
            vals = re.findall('\D+\: (\d+)', line)
            keys = ('input', 'kept', 'dropped')
            return(OrderedDict(zip(keys, vals)))

def parse_decontam_log(f):
    """Read a two-line tab-delimited decontamination log into an OrderedDict.

    Line 1 supplies the keys (column headers), line 2 the values.
    """
    keys = f.readline().rstrip().split('\t')
    vals = f.readline().rstrip().split('\t')
    return(OrderedDict(zip(keys,vals)))

def summarize_qual_decontam(tfile, dfile):
    """Return a dataframe for summary information for trimmomatic and decontam rule"""
    # Sample name: Trimmomatic log basename up to its '.out' suffix
    tname = os.path.basename(tfile).split('.out')[0]
    # NOTE(review): dname is computed but never used below
    dname = os.path.basename(dfile).split('.txt')[0]
    with open(tfile) as tf:
        with open(dfile) as jf:
            # Paired/single parsing is chosen from the global Snakemake config
            if Cfg['all']['paired_end']:
                trim_data = parse_trim_summary_paired(tf)
            else:
                trim_data = parse_trim_summary_single(tf)

            decontam_data = parse_decontam_log(jf)
    # Debug trace of both parsed summaries
    sys.stderr.write("trim data: {}\n".format(trim_data))
    sys.stderr.write("decontam data: {}\n".format(decontam_data))
    # One row, indexed by sample name, trim counts followed by decontam counts
    return(pandas.DataFrame(OrderedDict(trim_data, **(decontam_data)), index=[tname]))

rule preprocess_report:
"""Combines the information from multiple preprocessing steps"""
Expand All @@ -57,25 +24,13 @@ rule preprocess_report:
output:
str(QC_FP/'reports'/'preprocess_summary.tsv')
run:
summary_list = [summarize_qual_decontam(q, d) for q, d in
zip(input.trim_files, input.decontam_files)]
reports = pandas.concat(summary_list)
reports.to_csv(output[0], sep='\t', index_label='Samples')

def parse_fastqc_quality(filename):
with open(filename) as f:
report = f.read()
tableString = re.search(
'\>\>Per base sequence quality.*?\n(.*?)\n\>\>END_MODULE',
report, re.DOTALL).group(1)
paired_end = Cfg['all']['paired_end']
summary_list = [
reports.summarize_qual_decontam(q, d, paired_end) for q, d in
zip(input.trim_files, input.decontam_files)]
_reports = pandas.concat(summary_list)
_reports.to_csv(output[0], sep='\t', index_label='Samples')

f_s = StringIO(tableString)
df = pandas.read_csv(
f_s, sep='\t', usecols=['#Base', 'Mean'], index_col='#Base')
sample_name = os.path.basename(filename.split('_fastqc')[0])
df.columns=[sample_name]
f_s.close()
return(df)

rule fastqc_report:
""" make fastqc reports """
Expand All @@ -86,6 +41,6 @@ rule fastqc_report:
output:
str(QC_FP/'reports'/'fastqc_quality.tsv')
run:
quality_list = [parse_fastqc_quality(file) for file in input.files]
quality_list = [reports.parse_fastqc_quality(file) for file in input.files]
quality_table = pandas.concat(quality_list, axis=1).transpose()
quality_table.to_csv(output[0],sep="\t",index_label="Samples")
25 changes: 18 additions & 7 deletions sunbeamlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,34 +54,41 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):
"""

if isinstance(fnames, str):
raise ValueError("need a list of filenames, not a string")
raise SampleFormatError("need a list of filenames, not a string")
if len(fnames) == 1:
raise ValueError("need a list of filenames, not just one")
raise SampleFormatError("need a list of filenames, not just one")
if len(set(fnames)) == 1:
raise ValueError("all filenames are the same")
raise SampleFormatError("all filenames are the same")

splits = [re.split(split_pattern, fname) for fname in fnames]

if len(set([len(p) for p in splits])) > 1:
raise ValueError("files have inconsistent numbers of _ or . characters")
raise SampleFormatError("files have inconsistent numbers of _ or . characters")

elements = []
variant_idx = []

# A special case when paired-end and only two files:
# invariant regions may be sample names (since only one sample)
potential_single_sample = len(fnames) == 1 and paired_end

for i, parts in enumerate(zip(*splits)):
items = set(parts)
# If they're all the same, it's a common part; so add it to the element
# list unchanged
if len(items) == 1:
if items.issubset({"fastq", ".", "_", "gz", "fq"}):
elements.append(parts[0])
elif len(items) == 1 and not potential_single_sample:
elements.append(parts[0])
else:
if paired_end:
# If all the items in a split end with 1 or 2, and only have
# one char preceding that that's the same among all items,
# then it's like a read-pair identifier.
# OR all the items are 1 or 2 and the only things in a split,
# then it's likely a read-pair identifier.
if set(_[-1] for _ in items) == {'1', '2'}:
prefixes = set(_[:-1] for _ in items)
if len(prefixes) == 1 and all(len(p) == 1 for p in prefixes):
if prefixes == {''} or (len(prefixes) == 1 and all(len(p) == 1 for p in prefixes)):
prefix = parts[0][:-1]
elements.append(prefix)
elements.append("{rp}")
Expand All @@ -95,7 +102,11 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):
elements[_min:_max+1] = ["{sample}"]
return "".join(elements)

class MissingMatePairError(Exception):
    """Raised when paired-end reads are assumed but a mate file cannot be found."""
    pass

class SampleFormatError(Exception):
    """Raised when sample filenames cannot be parsed into a consistent format string."""
    pass

def _verify_path(fp):
if not fp:
Expand Down
58 changes: 57 additions & 1 deletion sunbeamlib/reports.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import warnings
warnings.filterwarnings('ignore', '.*experimental.*')
from pathlib import Path
from collections import Counter
from collections import Counter, OrderedDict
import re
import os
import sys

import pandas
from io import StringIO
from Bio import SeqIO
from Bio import SearchIO
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -30,6 +35,57 @@ def blast_contig_summary(xml_files):

def blast_hits(blast_xml_files):
return Counter(c['query'] for c in blast_summary(blast_xml_files))

def parse_trim_summary_paired(f):
    """Parse Trimmomatic's paired-end summary line from an open log handle.

    Scans for the line starting with 'Input Read' and extracts the five
    integer counts reported on it.

    :param f: open text handle over a paired-end Trimmomatic log
    :returns: OrderedDict keyed by ('input', 'both_kept', 'fwd_only',
        'rev_only', 'dropped') with counts as strings, or None if no
        'Input Read' line is present
    """
    # Iterate the handle lazily instead of materializing it with readlines()
    for line in f:
        if line.startswith('Input Read'):
            # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+;
            # each count is the digits after a run of non-digits and ': '
            vals = re.findall(r'\D+: (\d+)', line)
            keys = ('input', 'both_kept', 'fwd_only', 'rev_only', 'dropped')
            return OrderedDict(zip(keys, vals))

def parse_trim_summary_single(f):
    """Parse Trimmomatic's single-end summary line from an open log handle.

    Single-end logs report three counts on the 'Input Read' line.

    :param f: open text handle over a single-end Trimmomatic log
    :returns: OrderedDict keyed by ('input', 'kept', 'dropped') with counts
        as strings, or None if no 'Input Read' line is present
    """
    for line in f:
        if line.startswith('Input Read'):
            # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+
            vals = re.findall(r'\D+: (\d+)', line)
            keys = ('input', 'kept', 'dropped')
            return OrderedDict(zip(keys, vals))

def parse_decontam_log(f):
    """Read a two-line, tab-delimited decontamination log.

    The first line holds the column names and the second their values;
    the two rows are zipped into an OrderedDict.

    :param f: open text handle positioned at the start of the log
    :returns: OrderedDict mapping header fields to value fields
    """
    header_fields = f.readline().rstrip().split('\t')
    value_fields = f.readline().rstrip().split('\t')
    return OrderedDict(zip(header_fields, value_fields))

def summarize_qual_decontam(tfile, dfile, paired_end):
    """Return a dataframe for summary information for trimmomatic and decontam rule.

    :param tfile: path to the Trimmomatic log; its basename up to '.out'
        becomes the row index (sample name)
    :param dfile: path to the two-line tab-delimited decontamination log
    :param paired_end: True selects the paired-end Trimmomatic parser,
        False the single-end one
    :returns: one-row pandas.DataFrame indexed by sample name, with the
        trim counts followed by the decontam counts as columns
    """
    tname = os.path.basename(tfile).split('.out')[0]
    # Single combined context manager replaces the original nested withs;
    # the unused 'dname' local was dropped.
    with open(tfile) as tf, open(dfile) as jf:
        if paired_end:
            trim_data = parse_trim_summary_paired(tf)
        else:
            trim_data = parse_trim_summary_single(tf)
        decontam_data = parse_decontam_log(jf)
    # Debug trace of both parsed summaries (preserved from the original)
    sys.stderr.write("trim data: {}\n".format(trim_data))
    sys.stderr.write("decontam data: {}\n".format(decontam_data))
    return pandas.DataFrame(OrderedDict(trim_data, **decontam_data), index=[tname])

def parse_fastqc_quality(filename):
    """Extract per-base mean quality scores from a FastQC report file.

    Pulls the 'Per base sequence quality' module table out of the report
    and keeps only the '#Base' and 'Mean' columns.

    :param filename: path to a FastQC data file; the sample name is the
        basename of the path up to '_fastqc'
    :returns: single-column pandas.DataFrame (column = sample name) of
        mean quality scores, indexed by '#Base'
    :raises AttributeError: if the report lacks the quality module
    """
    with open(filename) as f:
        report = f.read()
    # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+
    table_string = re.search(
        r'\>\>Per base sequence quality.*?\n(.*?)\n\>\>END_MODULE',
        report, re.DOTALL).group(1)

    df = pandas.read_csv(
        StringIO(table_string), sep='\t', usecols=['#Base', 'Mean'],
        index_col='#Base')
    # Name the quality column after the sample so reports can be concatenated
    sample_name = os.path.basename(filename.split('_fastqc')[0])
    df.columns = [sample_name]
    return df




Expand Down
16 changes: 10 additions & 6 deletions sunbeamlib/scripts/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import ruamel.yaml
from pathlib import Path

from .list_samples import build_sample_list
from .list_samples import build_sample_list, MissingMatePairError, SampleFormatError
from sunbeamlib import config

def main(argv=sys.argv):
Expand Down Expand Up @@ -94,12 +94,16 @@ def main(argv=sys.argv):
is_single_end = args.single_end)
sys.stderr.write(
"New sample list written to {}\n".format(samplelist_file))
except ValueError as e:
except SampleFormatError as e:
raise SystemExit(
"Error: could not create sample list (reason: {}). Provide a "
"format string using --format, use `sunbeam list_samples` or "
"create manually (see user guide).".format(e))

"Error: could not create sample list. Specify correct sample filename"
" format using --format.\n Reason: {}".format(e))
except MissingMatePairError as e:
raise SystemExit(
"Error: assuming paired-end reads, but could not find mates. Specify "
"--single-end if not paired-end, or provide sample name format "
"using --format."
"\n Reason: {}".format(e))

def check_existing(path, force=False):
if path.is_dir():
Expand Down

0 comments on commit ce2f035

Please sign in to comment.