Skip to content

Commit

Permalink
Merge pull request #148 from sunbeam-labs/dev
Browse files Browse the repository at this point in the history
Bugfixes
  • Loading branch information
eclarke committed May 24, 2018
2 parents ce2f035 + 1339f46 commit cc0c6ad
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 17 deletions.
4 changes: 4 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ To get started, see our [documentation](http://sunbeam.readthedocs.io)!

### Changelog:

#### v1.2.1 (May 24, 2018)

- Minor bugfixes

#### v1.2.0 (May 2, 2018)

- Low-complexity reads are now removed by default rather than masked
Expand Down
42 changes: 28 additions & 14 deletions sunbeamlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,32 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):

if isinstance(fnames, str):
raise SampleFormatError("need a list of filenames, not a string")
if len(fnames) == 1:
raise SampleFormatError("need a list of filenames, not just one")
if len(set(fnames)) == 1:
if len(fnames) > 1 and len(set(fnames)) == 1:
raise SampleFormatError("all filenames are the same")
if len(set(fnames)) == 0:
raise SampleFormatError("no files in directory!")

splits = [re.split(split_pattern, fname) for fname in fnames]
splits = [list(reversed(re.split(split_pattern, fname))) for fname in fnames]

if len(fnames) == 1:
sys.stderr.write("Only one sample found; defaulting to {sample}.fastq.gz\n")
return "{sample}.fastq.gz"

if len(set([len(p) for p in splits])) > 1:
raise SampleFormatError("files have inconsistent numbers of _ or . characters")
sys.stderr.write("Warning: samples have inconsistent numbers of _ or . characters\n")

elements = []
variant_idx = []

# A special case when paired-end and only two files:
# invariant regions may be sample names (since only one sample)
potential_single_sample = len(fnames) == 1 and paired_end
potential_single_sample = len(fnames) == 2 and paired_end

for i, parts in enumerate(zip(*splits)):
items = set(parts)
# If they're all the same, it's a common part; so add it to the element
# list unchanged

if items.issubset({"fastq", ".", "_", "gz", "fq"}):
elements.append(parts[0])
elif len(items) == 1 and not potential_single_sample:
Expand All @@ -88,19 +93,28 @@ def guess_format_string(fnames, paired_end=True, split_pattern="([_\.])"):
# then it's likely a read-pair identifier.
if set(_[-1] for _ in items) == {'1', '2'}:
prefixes = set(_[:-1] for _ in items)
if prefixes == {''} or (len(prefixes) == 1 and all(len(p) == 1 for p in prefixes)):
prefix = parts[0][:-1]
elements.append(prefix)
NO_PREFIX = prefixes == {''}
ALL_SAME_PREFIX = len(prefixes) == 1
ONE_CHAR_PREFIX = all(len(p) == 1 for p in prefixes)
I_OR_R_PREFIX = prefixes == {'I', 'R'}
if NO_PREFIX or (ALL_SAME_PREFIX and ONE_CHAR_PREFIX) or I_OR_R_PREFIX:
if I_OR_R_PREFIX:
prefix = 'R'
else:
prefix = parts[0][:-1]
elements.append("{rp}")
elements.append(prefix)
continue
variant_idx.append(i)
elements.append("{sample}")

# Combine multiple variant elements
_min = min(variant_idx)
_max = max(variant_idx)
elements[_min:_max+1] = ["{sample}"]
return "".join(elements)
if len(variant_idx) > 0:
_min = min(variant_idx)
_max = max(variant_idx)
elements[_min+1:_max+2] = ["{sample}"]
return "".join(reversed(elements))
else:
raise SampleFormatError("No variable regions identified")

class MissingMatePairError(Exception):
pass
Expand Down
6 changes: 3 additions & 3 deletions sunbeamlib/scripts/list_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def main(argv=sys.argv):
" format using --format. \n\tReason: {}".format(e))
except MissingMatePairError as e:
raise SystemExit(
"Detected paired-end reads, but could not find mates. Specify "
"--single-end if not paired-end, or provide sample name format "
"Assuming paired-end reads, but could not find mates. Specify "
"--single_end if not paired-end, or provide sample name format "
"using --format."
"\n\tReason: {}".format(e))

Expand Down Expand Up @@ -65,7 +65,7 @@ def build_sample_list(data_fp, format_str, output_file, is_single_end):
", ".join(no_match)))

if len(samples) == 0:
raise ("no samples matching the given format found.")
raise SampleFormatError("no samples matching the given format found.")

sys.stderr.write("Found {} samples in {}.\n".format(len(samples), data_fp))
fieldnames = ["sample", "1", "2"]
Expand Down
38 changes: 38 additions & 0 deletions tests/test_suite.bash
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,41 @@ function test_pair_concordance {
fi
done
}

# Test that we can guess a variety of sample names correctly
# Correct behavior for only two samples
function test_guess_with_two_samples {
    # Create a directory holding exactly two files that differ only in a
    # trailing 1/2, run `sunbeam list_samples` on it, and require that its
    # stderr (teed into ./out.txt) contains the format string
    # {sample}_{rp}.fastq.gz.
    #
    # Returns grep's exit status: 0 when the expected format string was
    # reported, non-zero otherwise.
    local data_dir="$TEMPDIR/only_two_samples"
    local status=0

    mkdir -p "$data_dir"
    touch "$data_dir/sample_1.fastq.gz" \
          "$data_dir/sample_2.fastq.gz"

    # NOTE(review): the >(tee ...) process substitution runs asynchronously,
    # so out.txt may not be completely written when grep reads it; consider
    # redirecting stderr straight to a file if this test proves flaky.
    sunbeam list_samples "$data_dir" 2> >(tee out.txt >&2)

    # -F: match the format string literally ('.' must not act as a wildcard).
    # Remember the result so the cleanup below cannot mask a failure.
    grep -F '{sample}_{rp}.fastq.gz' out.txt || status=$?

    # Remove the fixture directory, as the sibling guess tests do.
    rm -r "$data_dir"
    return $status
}

# Correct behavior for samples with inconsistent _ or .
function test_guess_with_inconsistent_samples {
    # Create read pairs whose sample names contain different numbers of '_'
    # separators (asdf_123 vs. asddf), run `sunbeam list_samples` on them, and
    # require that its stderr (teed into ./out.txt) contains the format string
    # {sample}_R{rp}.fastq.gz.
    #
    # Returns grep's exit status: 0 when the expected format string was
    # reported, non-zero otherwise.
    local data_dir="$TEMPDIR/inconsistent_samples"
    local status=0

    mkdir -p "$data_dir"
    touch "$data_dir/asdf_123_R1.fastq.gz" \
          "$data_dir/asdf_123_R2.fastq.gz" \
          "$data_dir/asddf_R1.fastq.gz" \
          "$data_dir/asddf_R2.fastq.gz"

    # NOTE(review): the >(tee ...) process substitution runs asynchronously,
    # so out.txt may not be completely written when grep reads it; consider
    # redirecting stderr straight to a file if this test proves flaky.
    sunbeam list_samples "$data_dir" 2> >(tee out.txt >&2)

    # -F: match the format string literally ('.' must not act as a wildcard).
    # Capture grep's status: previously the trailing `rm -r` became the
    # function's return value and hid a failed match from callers that do not
    # run under `set -e`.
    grep -F '{sample}_R{rp}.fastq.gz' out.txt || status=$?

    rm -r "$data_dir"
    return $status
}

# Correct behavior for folders that still have index files
function test_guess_with_index_files_present {
    # Create read files (R1/R2) alongside index files (I1/I2) for two sample
    # names, run `sunbeam list_samples` on them, and require that its stderr
    # (teed into ./out.txt) contains the format string
    # {sample}_R{rp}.fastq.gz.
    #
    # Returns grep's exit status: 0 when the expected format string was
    # reported, non-zero otherwise.
    local data_dir="$TEMPDIR/idx_files_present"
    local status=0

    mkdir -p "$data_dir"
    touch "$data_dir/asdf_123_R1.fastq.gz" \
          "$data_dir/asdf_123_R2.fastq.gz" \
          "$data_dir/asddf_R1.fastq.gz" \
          "$data_dir/asddf_R2.fastq.gz" \
          "$data_dir/asdf_123_I1.fastq.gz" \
          "$data_dir/asdf_123_I2.fastq.gz" \
          "$data_dir/asddf_I1.fastq.gz" \
          "$data_dir/asddf_I2.fastq.gz"

    # NOTE(review): the >(tee ...) process substitution runs asynchronously,
    # so out.txt may not be completely written when grep reads it; consider
    # redirecting stderr straight to a file if this test proves flaky.
    sunbeam list_samples "$data_dir" 2> >(tee out.txt >&2)

    # -F: match the format string literally ('.' must not act as a wildcard).
    # Remember the result so the cleanup below cannot mask a failure.
    grep -F '{sample}_R{rp}.fastq.gz' out.txt || status=$?

    # Cleanup was previously left commented out, leaking the fixture
    # directory; remove it like the sibling guess tests do.
    rm -r "$data_dir"
    return $status
}

0 comments on commit cc0c6ad

Please sign in to comment.