Skip to content

Commit

Permalink
Merge pull request #144 from sunbeam-labs/dev
Browse files Browse the repository at this point in the history
stable release
  • Loading branch information
eclarke committed May 24, 2018
2 parents 593754c + 97e6278 commit ce2f035
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 85 deletions.
3 changes: 2 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ version: 2
jobs:
build:
docker:
- image: circleci/python:3.6.1
- image: circleci/python:3.6
- image: circleci/python:3.5

steps:
- checkout
Expand Down
49 changes: 45 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,51 @@ Sunbeam currently automates the following tasks:
* Mapping of reads to target genomes; and
* ORF prediction using [Prodigal](https://github.com/hyattpd/Prodigal).

Sunbeam was designed to be modular and extensible. We have a few pre-built
extensions available that handle visualization tasks, including contig
assembly graphs, read alignments, and taxonomic classifications.
Sunbeam was designed to be modular and extensible. Some extensions have been built for:

To get started, see our [documentation!](https://sunbeam.readthedocs.io)
- [IGV](https://github.com/sunbeam-labs/sbx_igv) for viewing read alignments
- [KrakenHLL](https://github.com/zhaoc1/sbx_krakenhll), an alternate read classifier
- [Kaiju](https://github.com/sunbeam-labs/sbx_kaiju), a read classifier that uses BWA rather than kmers
- [Anvi'o](https://github.com/sunbeam-labs/sbx_anvio), a downstream analysis pipeline that does lots of stuff!

To get started, see our [documentation](http://sunbeam.readthedocs.io)!


------

### Changelog:

#### v1.2.0 (May 2, 2018)

- Low-complexity reads are now removed by default rather than masked
- Bug fixes related to single-end sequencing experiments
- Documentation updates

#### v1.1.0 (April 8, 2018)

- Reports include number of filtered reads per host, rather than in aggregate
- Static binary dependency for [komplexity](https://github.com/eclarke/komplexity) for easier deployment
- Remove max length filter for contigs

#### v1.0.0 (March 22, 2018)

- First stable release!
- Support for single-end sequencing experiments
- Low-complexity read masking via [komplexity](https://github.com/eclarke/komplexity)
- Support for extensions
- Documentation on [ReadTheDocs.io](http://sunbeam.readthedocs.io)
- Better assembler (megahit)
- Better ORF finder (prodigal)
- Can remove reads from any number of host/contaminant genomes
- Semantic versioning checks
- Integration tests and continuous deployment

-------

### Contributors

- Erik Clarke ([@eclarke](https://github.com/eclarke))
- Chunyu Zhao ([@zhaoc1](https://github.com/zhaoc1))
- Jesse Connell ([@ressy](https://github.com/ressy))
- Louis Taylor ([@louiejtaylor](https://github.com/louiejtaylor))

1 change: 0 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ Cfg = check_config(config)
Blastdbs = process_databases(Cfg['blastdbs'])
Samples = build_sample_list(Cfg['all']['samplelist_fp'], Cfg['all']['paired_end'])
Pairs = ['1', '2'] if Cfg['all']['paired_end'] else ['1']
print(Samples)

# Collect host (contaminant) genomes
sys.stderr.write("Collecting host/contaminant genomes... ")
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
- eclarke
dependencies:
- snakemake
- snakemake=4.8.1
- ruamel.yaml
- biopython
- trimmomatic
Expand Down
1 change: 1 addition & 0 deletions rules/assembly/assembly.rules
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ rule final_filter:
vsearch --sortbylength {input} \
--minseqlength {params.len} \
--maxseqlength -1 \
--notrunclabels \
--output {input}.{params.len}f &> {log} && \
cp {input}.{params.len}f {output}
else
Expand Down
63 changes: 9 additions & 54 deletions rules/reports/reports.rules
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,14 @@
# ReportGeneration rules

import pandas
from io import StringIO

from collections import OrderedDict
from sunbeamlib import reports

rule all_reports:
input:
TARGET_REPORT

def parse_trim_summary_paired(f):
    """Parse Trimmomatic's paired-end summary line from an open log handle.

    Scans for the line starting with 'Input Read' and pulls out the five
    integer counts on it. Returns an OrderedDict keyed by
    ('input', 'both_kept', 'fwd_only', 'rev_only', 'dropped') with the
    counts as strings, or None (implicitly) if no such line is found.
    """
    for line in f.readlines():
        if line.startswith('Input Read'):
            # Each count is the digits following a run of non-digits and a colon
            vals = re.findall('\D+\: (\d+)', line)
            keys = ('input', 'both_kept','fwd_only','rev_only','dropped')
            return(OrderedDict(zip(keys, vals)))

def parse_trim_summary_single(f):
    """Parse Trimmomatic's single-end summary line from an open log handle.

    Like the paired-end variant but single-end logs report only three
    counts: ('input', 'kept', 'dropped'). Returns an OrderedDict of those
    counts as strings, or None (implicitly) if no 'Input Read' line exists.
    """
    for line in f:
        if line.startswith('Input Read'):
            vals = re.findall('\D+\: (\d+)', line)
            keys = ('input', 'kept', 'dropped')
            return(OrderedDict(zip(keys, vals)))

def parse_decontam_log(f):
    """Read a two-line tab-delimited decontamination log into an OrderedDict.

    Line 1 supplies the keys (column headers), line 2 the values.
    """
    keys = f.readline().rstrip().split('\t')
    vals = f.readline().rstrip().split('\t')
    return(OrderedDict(zip(keys,vals)))

def summarize_qual_decontam(tfile, dfile):
    """Return a dataframe for summary information for trimmomatic and decontam rule"""
    # Sample name: Trimmomatic log basename up to its '.out' suffix
    tname = os.path.basename(tfile).split('.out')[0]
    # NOTE(review): dname is computed but never used below
    dname = os.path.basename(dfile).split('.txt')[0]
    with open(tfile) as tf:
        with open(dfile) as jf:
            # Paired/single parsing is chosen from the global Snakemake config
            if Cfg['all']['paired_end']:
                trim_data = parse_trim_summary_paired(tf)
            else:
                trim_data = parse_trim_summary_single(tf)

            decontam_data = parse_decontam_log(jf)
    # Debug trace of both parsed summaries
    sys.stderr.write("trim data: {}\n".format(trim_data))
    sys.stderr.write("decontam data: {}\n".format(decontam_data))
    # One row, indexed by sample name, trim counts followed by decontam counts
    return(pandas.DataFrame(OrderedDict(trim_data, **(decontam_data)), index=[tname]))

rule preprocess_report:
"""Combines the information from multiple preprocessing steps"""
Expand All @@ -57,25 +24,13 @@ rule preprocess_report:
output:
str(QC_FP/'reports'/'preprocess_summary.tsv')
run:
summary_list = [summarize_qual_decontam(q, d) for q, d in
zip(input.trim_files, input.decontam_files)]
reports = pandas.concat(summary_list)
reports.to_csv(output[0], sep='\t', index_label='Samples')

def parse_fastqc_quality(filename):
with open(filename) as f:
report = f.read()
tableString = re.search(
'\>\>Per base sequence quality.*?\n(.*?)\n\>\>END_MODULE',
report, re.DOTALL).group(1)
paired_end = Cfg['all']['paired_end']
summary_list = [
reports.summarize_qual_decontam(q, d, paired_end) for q, d in
zip(input.trim_files, input.decontam_files)]
_reports = pandas.concat(summary_list)
_reports.to_csv(output[0], sep='\t', index_label='Samples')

f_s = StringIO(tableString)
df = pandas.read_csv(
f_s, sep='\t', usecols=['#Base', 'Mean'], index_col='#Base')
sample_name = os.path.basename(filename.split('_fastqc')[0])
df.columns=[sample_name]
f_s.close()
return(df)

rule fastqc_report:
""" make fastqc reports """
Expand All @@ -86,6 +41,6 @@ rule fastqc_report:
output:
str(QC_FP/'reports'/'fastqc_quality.tsv')
run:
quality_list = [parse_fastqc_quality(file) for file in input.files]
quality_list = [reports.parse_fastqc_quality(file) for file in input.files]
quality_table = pandas.concat(quality_list, axis=1).transpose()
quality_table.to_csv(output[0],sep="\t",index_label="Samples")
25 changes: 18 additions & 7 deletions sunbeamlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,34 +54,41 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):
"""

if isinstance(fnames, str):
raise ValueError("need a list of filenames, not a string")
raise SampleFormatError("need a list of filenames, not a string")
if len(fnames) == 1:
raise ValueError("need a list of filenames, not just one")
raise SampleFormatError("need a list of filenames, not just one")
if len(set(fnames)) == 1:
raise ValueError("all filenames are the same")
raise SampleFormatError("all filenames are the same")

splits = [re.split(split_pattern, fname) for fname in fnames]

if len(set([len(p) for p in splits])) > 1:
raise ValueError("files have inconsistent numbers of _ or . characters")
raise SampleFormatError("files have inconsistent numbers of _ or . characters")

elements = []
variant_idx = []

# A special case when paired-end and only two files:
# invariant regions may be sample names (since only one sample)
potential_single_sample = len(fnames) == 1 and paired_end

for i, parts in enumerate(zip(*splits)):
items = set(parts)
# If they're all the same, it's a common part; so add it to the element
# list unchanged
if len(items) == 1:
if items.issubset({"fastq", ".", "_", "gz", "fq"}):
elements.append(parts[0])
elif len(items) == 1 and not potential_single_sample:
elements.append(parts[0])
else:
if paired_end:
# If all the items in a split end with 1 or 2, and only have
# one char preceding that that's the same among all items,
# then it's like a read-pair identifier.
# OR all the items are 1 or 2 and the only things in a split,
# then it's likely a read-pair identifier.
if set(_[-1] for _ in items) == {'1', '2'}:
prefixes = set(_[:-1] for _ in items)
if len(prefixes) == 1 and all(len(p) == 1 for p in prefixes):
if prefixes == {''} or (len(prefixes) == 1 and all(len(p) == 1 for p in prefixes)):
prefix = parts[0][:-1]
elements.append(prefix)
elements.append("{rp}")
Expand All @@ -95,7 +102,11 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):
elements[_min:_max+1] = ["{sample}"]
return "".join(elements)

class MissingMatePairError(Exception):
    """Raised when paired-end reads are assumed but a mate file cannot be found."""
    pass

class SampleFormatError(Exception):
    """Raised when sample filenames cannot be parsed into a consistent format string."""
    pass

def _verify_path(fp):
if not fp:
Expand Down
58 changes: 57 additions & 1 deletion sunbeamlib/reports.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import warnings
warnings.filterwarnings('ignore', '.*experimental.*')
from pathlib import Path
from collections import Counter
from collections import Counter, OrderedDict
import re
import os
import sys

import pandas
from io import StringIO
from Bio import SeqIO
from Bio import SearchIO
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -30,6 +35,57 @@ def blast_contig_summary(xml_files):

def blast_hits(blast_xml_files):
return Counter(c['query'] for c in blast_summary(blast_xml_files))

def parse_trim_summary_paired(f):
    """Parse Trimmomatic's paired-end summary line from an open log handle.

    Scans for the line starting with 'Input Read' and extracts the five
    integer counts reported on it.

    :param f: open text handle over a paired-end Trimmomatic log
    :returns: OrderedDict keyed by ('input', 'both_kept', 'fwd_only',
        'rev_only', 'dropped') with counts as strings, or None if no
        'Input Read' line is present
    """
    # Iterate the handle lazily instead of materializing it with readlines()
    for line in f:
        if line.startswith('Input Read'):
            # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+;
            # each count is the digits after a run of non-digits and ': '
            vals = re.findall(r'\D+: (\d+)', line)
            keys = ('input', 'both_kept', 'fwd_only', 'rev_only', 'dropped')
            return OrderedDict(zip(keys, vals))

def parse_trim_summary_single(f):
    """Parse Trimmomatic's single-end summary line from an open log handle.

    Single-end logs report three counts on the 'Input Read' line.

    :param f: open text handle over a single-end Trimmomatic log
    :returns: OrderedDict keyed by ('input', 'kept', 'dropped') with counts
        as strings, or None if no 'Input Read' line is present
    """
    for line in f:
        if line.startswith('Input Read'):
            # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+
            vals = re.findall(r'\D+: (\d+)', line)
            keys = ('input', 'kept', 'dropped')
            return OrderedDict(zip(keys, vals))

def parse_decontam_log(f):
    """Read a two-line, tab-delimited decontamination log.

    The first line holds the column names and the second their values;
    the two rows are zipped into an OrderedDict.

    :param f: open text handle positioned at the start of the log
    :returns: OrderedDict mapping header fields to value fields
    """
    header_fields = f.readline().rstrip().split('\t')
    value_fields = f.readline().rstrip().split('\t')
    return OrderedDict(zip(header_fields, value_fields))

def summarize_qual_decontam(tfile, dfile, paired_end):
    """Return a dataframe for summary information for trimmomatic and decontam rule.

    :param tfile: path to the Trimmomatic log; its basename up to '.out'
        becomes the row index (sample name)
    :param dfile: path to the two-line tab-delimited decontamination log
    :param paired_end: True selects the paired-end Trimmomatic parser,
        False the single-end one
    :returns: one-row pandas.DataFrame indexed by sample name, with the
        trim counts followed by the decontam counts as columns
    """
    tname = os.path.basename(tfile).split('.out')[0]
    # Single combined context manager replaces the original nested withs;
    # the unused 'dname' local was dropped.
    with open(tfile) as tf, open(dfile) as jf:
        if paired_end:
            trim_data = parse_trim_summary_paired(tf)
        else:
            trim_data = parse_trim_summary_single(tf)
        decontam_data = parse_decontam_log(jf)
    # Debug trace of both parsed summaries (preserved from the original)
    sys.stderr.write("trim data: {}\n".format(trim_data))
    sys.stderr.write("decontam data: {}\n".format(decontam_data))
    return pandas.DataFrame(OrderedDict(trim_data, **decontam_data), index=[tname])

def parse_fastqc_quality(filename):
    """Extract per-base mean quality scores from a FastQC report file.

    Pulls the 'Per base sequence quality' module table out of the report
    and keeps only the '#Base' and 'Mean' columns.

    :param filename: path to a FastQC data file; the sample name is the
        basename of the path up to '_fastqc'
    :returns: single-column pandas.DataFrame (column = sample name) of
        mean quality scores, indexed by '#Base'
    :raises AttributeError: if the report lacks the quality module
    """
    with open(filename) as f:
        report = f.read()
    # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+
    table_string = re.search(
        r'\>\>Per base sequence quality.*?\n(.*?)\n\>\>END_MODULE',
        report, re.DOTALL).group(1)

    df = pandas.read_csv(
        StringIO(table_string), sep='\t', usecols=['#Base', 'Mean'],
        index_col='#Base')
    # Name the quality column after the sample so reports can be concatenated
    sample_name = os.path.basename(filename.split('_fastqc')[0])
    df.columns = [sample_name]
    return df




Expand Down
16 changes: 10 additions & 6 deletions sunbeamlib/scripts/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import ruamel.yaml
from pathlib import Path

from .list_samples import build_sample_list
from .list_samples import build_sample_list, MissingMatePairError, SampleFormatError
from sunbeamlib import config

def main(argv=sys.argv):
Expand Down Expand Up @@ -94,12 +94,16 @@ def main(argv=sys.argv):
is_single_end = args.single_end)
sys.stderr.write(
"New sample list written to {}\n".format(samplelist_file))
except ValueError as e:
except SampleFormatError as e:
raise SystemExit(
"Error: could not create sample list (reason: {}). Provide a "
"format string using --format, use `sunbeam list_samples` or "
"create manually (see user guide).".format(e))

"Error: could not create sample list. Specify correct sample filename"
" format using --format.\n Reason: {}".format(e))
except MissingMatePairError as e:
raise SystemExit(
"Error: assuming paired-end reads, but could not find mates. Specify "
"--single-end if not paired-end, or provide sample name format "
"using --format."
"\n Reason: {}".format(e))

def check_existing(path, force=False):
if path.is_dir():
Expand Down

0 comments on commit ce2f035

Please sign in to comment.