/
deepseq_preprocessing_wrapper.py
executable file
·481 lines (431 loc) · 33.3 KB
/
deepseq_preprocessing_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python2.7
""" Preprocessing of insertional-mutant deepseq reads: strip first bases (from adapter) and cassette sequence, filter by length, collapse to unique (all stages are optional and customizable).
1) Strip a given start sequence from each read; reads that didn't start with that sequence go in a separate file.
Uses my own trim_prefix function (used to be done with fastx_trimmer, but that didn't check what the sequence was)
2) Strip the cassette sequence from middle/end of read, and grab only the remaining reads that are within the expected length range; reads that don't contain the cassette or are too short/long go in a separate file.
This is a wrapper around the cutadapt command-line program (or my modified version of it).
3) Collapse identical read sequences to single reads (encoding the original readcount in the header) to avoid aligning the same sequence multiple times. This discards the quality info (i.e. converts file from fastq to fasta), if it hasn't been done earlier.
This is a wrapper around the fastx_collapser command-line program that does exactly that.
My program keeps track of read-counts and metadata and saves it all in a metadata file.
Output - there are actually four outfiles generated (with names based on the -o option - calling the value X here):
- sequence file - <X>.fq or .fa, depending on input format etc - contains the final sequences
after trimming the start sequence, the cassette sequence (cutadapt) and collapsing to unique (fastx_collapser)
- metadata file - <X>_info.txt - contains general information:
exact command used, date/time, system, user, etc
read-length distribution before/after each step (may be optional),
summary output from the wrapped programs (normally printed to stdout),
sequence counts from each stage, with the total count/percentage of discarded sequences
- <X>_wrong-start.fa - sequences that didn't start with the start sequence given in the -F option;
collapsed to unique if -C option.
- <X>_no-cassette.fa - sequences that didn't contain the cassette sequence given in the -A option;
collapsed to unique if -C option.
- temporary/intermediate files - <X>_tmp* - contain intermediate sequence data (after start-trimming but
before cassette-stripping, and after cassette-stripping but before collapsing to unique if -C option given).
Removed after program finishes running with no errors, unless option -k is used.
-- Weronika Patena, Jonikas Lab, Carnegie Institution, 2011-2012
USAGE: deepseq_preprocessing_wrapper.py [options] infile -o outfile_basename """
# standard library
from __future__ import division
import sys, os
import unittest
# my modules
from general_utilities import write_header_data, run_command_print_info_output, value_and_percentages
from basic_seq_utilities import parse_fasta, write_fasta_line, name_seq_generator_from_fasta_fastq
import seq_count_and_lengths
# maps the -e option choices to the corresponding fastx_toolkit quality-encoding flags; '' lets the tool autodetect
FASTQ_ENCODINGS_FASTX_TOOLKIT = {'auto': '', 'sanger': '-Q33', 'illumina': '-Q64'}
### 5' and 3' ends of the different cassette variants we have
# NOTE(review): the mixed case looks deliberate (uppercase GTTGGA start vs lowercase remainder) - confirm with author;
#  downstream code uppercases all sequences before handing them to fastx_toolkit anyway.
CASSETTE_pMJ013b_5prime = 'GTTGGAaccaatcgtcacacgagccctcgtcagaaacacgtctccgccac'
CASSETTE_pMJ015b_5prime = 'GTTGGAgacaatcgtcacacgagccctcgtcagaaacacgtctccgccac' # ga after GTTGGA is different from pMJ013b
CASSETTE_pMJ007_5prime = 'GTTGGAaccaatcgtcacacgagccctcgtcagaaacacgtctccgccac' # same as pMJ013b
CASSETTE_pMJ013b_3prime = 'GTTGGAgcgcggggcgtgttgcagcggcagggaagagcagctgatgatga'
CASSETTE_pMJ015b_3prime = 'GTTGGAgcgcggggcgtgttgcagcggcagggaagagcagctgatgatga' # same as pMJ013b
CASSETTE_pMJ007_3prime = '' # DOES NOT GIVE 3' product!
# default adapters passed to cutadapt (-5/-3 options) are the first CASSETTE_DEFAULT_LENGTH bases
#  of the pMJ013b cassette ends
CASSETTE_DEFAULT_LENGTH = 12
CASSETTE_DEFAULT_5prime = CASSETTE_pMJ013b_5prime[:CASSETTE_DEFAULT_LENGTH]
CASSETTE_DEFAULT_3prime = CASSETTE_pMJ013b_3prime[:CASSETTE_DEFAULT_LENGTH]
def check_readcount(infile, OUTFILE=None, printing=True, description=None,
                    total_read_number_only=False, input_collapsed_to_unique=False):
    """ Make sure infile exists and is fa/fq (sys.exit if not); print read count and length distribution
    to OUTFILE and/or stdout. Return total number of reads.

    OUTFILE should be an open file handle, or None to skip writing; description (if given) is included
    in the printed header line.  If total_read_number_only, only the total count is gathered (faster);
    if input_collapsed_to_unique, read counts are taken from collapsed-read headers rather than
    counting each sequence as one read.
    """
    if not os.path.exists(infile):
        sys.exit("No %s file found! Exiting program. You should examine and/or remove the intermediate files."%infile)
    # seq_count_and_lengths autodetects fasta/fastq; element [2] of its return value is the list of output text lines
    read_count_data = seq_count_and_lengths.main([infile], total_read_number_only, input_collapsed_to_unique,
                                                 include_zeros=False, verbosity=1, OUTPUT=None)[2]
    if description is not None:     header = "# %s file (%s) data:"%(description, infile)
    else:                           header = "# %s file data:"%infile
    output = "%s\n%s"%(header, ''.join(read_count_data))
    if OUTFILE is not None:     OUTFILE.write(output+'\n')
    # parenthesized print for consistency with the __main__ block below (identical behavior in python2)
    if printing:                print(output)
    # the last seq_count_and_lengths output line is expected to have the total count as its second space-separated field
    read_count = int(read_count_data[-1].split(' ')[1])
    return read_count
def _trim_prefix_single(seqname, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE=None):
    """ If seq starts with prefix_bases (case-insensitive), write the trimmed remainder to TRIMMED_OUTFILE
    and return 1; otherwise write the full seq to WRONG_PREFIX_OUTFILE (if given) and return 0.
    All output sequences are uppercased.
    """
    seq_upper = seq.upper()
    # guard clause: no prefix match - route the untouched read to the "wrong prefix" file, if any
    if not seq_upper.startswith(prefix_bases.upper()):
        if WRONG_PREFIX_OUTFILE is not None:
            # uppercase because some fastx_toolkit tools give errors on lowercase bases
            write_fasta_line(seqname, seq_upper, WRONG_PREFIX_OUTFILE)
        return 0
    # prefix matched - drop it and write the rest (uppercase, for the same fastx_toolkit reason)
    write_fasta_line(seqname, seq_upper[len(prefix_bases):], TRIMMED_OUTFILE)
    return 1
# MAYBE-TODO add an option that specifies the number/percentage of allowed mismatches?
# MAYBE-TODO might be worth making this a separate program
def trim_prefix(prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile=os.devnull,
                INFOFILE=None, verbosity=1):
    """ Trim prefix_bases from seqs in infile and print to trimmed_outfile; print other seqs to wrong_prefix_outfile.
    Reads fasta or fastq files; outputs fasta files only.
    For each seq in infile, if seq starts with prefix_bases (case-insensitive), trim them and print result
    to trimmed_outfile; otherwise print full seq to wrong_prefix_outfile.
    wrong_prefix_outfile may be None, in which case the untrimmed seqs are discarded (sent to os.devnull).
    INFOFILE should be an open file handle to print summary info to, or None;
    verbosity governs how much is printed to stdout.
    Returns the (N_trimmed, N_untrimmed) counts.
    """
    # FIX: the docstring always allowed wrong_prefix_outfile=None, but open(None) would raise -
    #  map None to os.devnull explicitly so the documented contract actually works.
    if wrong_prefix_outfile is None:
        wrong_prefix_outfile = os.devnull
    text = "### Trimming %s from start of each sequence in %s (output to %s, untrimmed to %s)\n"%(prefix_bases, infile,
                                                                                    trimmed_outfile, wrong_prefix_outfile)
    # MAYBE-TODO modify so it can output fastq too?
    if INFOFILE is not None:    INFOFILE.write(text+'\n')
    if verbosity>0:             print(text)
    N_trimmed, N_untrimmed = 0, 0
    with open(trimmed_outfile, 'w') as TRIMMED_OUTFILE:
        with open(wrong_prefix_outfile, 'w') as WRONG_PREFIX_OUTFILE:
            # MAYBE-TODO right now if wrong_prefix_outfile==None, /dev/null is used - it would be faster with a custom file-like object that doesn't touch the OS, but I'm not sure how to write one so it can be opened! See general_utilities.FAKE_OUTFILE for an already open one.
            name_seq_generator = name_seq_generator_from_fasta_fastq(infile, verbosity>2)
            for name,seq in name_seq_generator:
                if_trimmed = _trim_prefix_single(name, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE)
                if if_trimmed:  N_trimmed += 1
                else:           N_untrimmed += 1
    N_total = N_trimmed + N_untrimmed
    text = "Trimmed sequences: %s\nUntrimmed sequences: %s\n"%(value_and_percentages(N_trimmed, [N_total]),
                                                               value_and_percentages(N_untrimmed, [N_total]))
    if INFOFILE is not None:    INFOFILE.write(text+'\n')
    if verbosity>1:             print(text)
    return N_trimmed, N_untrimmed
def define_option_parser():
    """ Populates and returns an optparse option parser object, with __doc__ as the usage string.

    Option groups: basic functionality (-F/-5/-3/-A/-C/-e), extra readcount checks (-n/-u),
    outfiles (-o/-k), verbosity (-q/-Q/-v), and built-in tests (-t/-T).
    """
    from optparse import OptionParser
    parser = OptionParser(__doc__)
    ### basic functionality options
    # NOTE: 'NONE' string sentinels (rather than '') are used where '' would be a meaningful value - see -A help below.
    parser.add_option('-F','--first_bases_to_trim', metavar='SEQ|NONE', default='ACTA',
                      help="Sequence to trim from the beginning of each read - reads that don't start with that sequence "
                          +"will end up in another file. Set to NONE to not trim any bases. Default %default.")
    parser.add_option('-5','--adapter_5prime', metavar='SEQ', default=CASSETTE_DEFAULT_5prime,
                      help='adapter sequence(s) for 5\' flanking regions, to pass to cutadapt. Default "%default".'
                          +' Set to "" if no 5\' flanking regions are expected. To run cutadapt, one or both of -3/-5 must be set.'
                          +' Can include multiple comma-separated sequence variants - the best match will be used for each read.')
    parser.add_option('-3','--adapter_3prime', metavar='SEQ', default=CASSETTE_DEFAULT_3prime,
                      help='adapter sequence(s) for 3\' flanking regions, to pass to cutadapt. Default "%default".'
                          +' Set to "" if no 3\' flanking regions are expected. See -5 option help for more detail.')
    parser.add_option('-A','--other_cutadapt_options', metavar='"TEXT"', default='-e 0.1 -O 10 -n 1 -m 20 -M 21',
                      help="Other options to pass to cutadapt (except adapter sequences - those are set with -5/-3), "
                          +'as a quoted string. Set to NONE to not run cutadapt. Default "%default". '
                          +'For help on available options, run "cutadapt -h" on the command-line.')
    parser.add_option('-C','--collapse_to_unique', action="store_true", default=False,
                      help="Run output through fastx_collapser to collapse all identical-sequence reads to unique ones. "
                          +"(converts fastq to fasta) (default %default).")
    parser.add_option('-e','--fastq_encoding', type='choice', choices=FASTQ_ENCODINGS_FASTX_TOOLKIT.keys(), default='auto',
                      metavar='|'.join(FASTQ_ENCODINGS_FASTX_TOOLKIT.keys()),
                      help="fastq quality encoding for fastx_toolkit (default %default).")
    # MAYBE-TODO add options for commonly-used cutadapt options (fastx_collapser doesn't have any); if I do that, make sure that either specific options OR the general option (-A) is provided and used, NOT BOTH, and print the information for the user!
    ### extra readcount/length check options
    parser.add_option('-n','--total_read_number_only', action="store_true", default=False,
                      help="Only check the total read number after each step, not read length counts (default %default).")
    # NOTE(review): -u is defined here but doesn't appear to be consulted anywhere in main() as visible - confirm.
    parser.add_option('-u','--dont_check_uncollapsed_reads', action="store_true", default=False,
                      help="Don't check whether the read counts after fastx_collapser (uncollapsed based on header info) "
                          +"match the ones before (by default this check is done and prints a warning if failed, but it "
                          +"takes a while on long files).")
    ### outfile options
    parser.add_option('-o','--outfile_basename', metavar='X', default='test_preprocessing_output',
                      help="Basename for the two outfiles (which will be X.fa or X.fq, and X_info.txt). Default %default.")
    parser.add_option('-k', '--keep_tmpfiles', action="store_true", default=False,
                      help="Don't delete the temporary/intermediate files (all will have the same prefix as the outfile, "
                          +"plus _tmp*; for exact names see full commands printed to stdout and info file).")
    # MAYBE-TODO make option to generate and keep both the collapsed-to-unique file and the uncollapsed file? That would be useful - may want it as a default. Give them sensible names, like _full and _unique.
    # MAYBE-TODO the seq_count_and_lengths step etc could be optional, to speed things up - make option for that? To do things quickly without intermediate checks, use a single command with pipes, instead of multiple commands with intermediate files? But that would make disentangling the stuff printed by each command complicated...
    ### printing options
    # verbosity: 0 = nothing, 1 = commands + summary (default), 2 = everything including per-step length counts
    parser.set_defaults(verbosity=1)
    parser.add_option('-q', '--quiet', action="store_const", const=1, dest='verbosity',
                      help="Only print commands and the output summary to STDOUT.")
    parser.add_option('-Q', '--very_quiet', action="store_const", const=0, dest='verbosity',
                      help="Don't print anything to STDOUT.")
    parser.add_option('-v', '--verbose', action="store_const", const=2, dest='verbosity',
                      help="Print read-length counts at each step, and each program output, to STDOUT.")
    ### test options
    parser.add_option('-t','--test_functionality', action='store_true', default=False,
                      help="Run the built-in unit test suite (ignores all other options/arguments; default %default).")
    parser.add_option('-T','--test_run', action='store_true', default=False,
                      help="Run on a test input file, check output against reference files. "
                          + "Ignores all other options/arguments. (default %default).")
    return parser
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Pipeline: 0) sanity-check infile; 1) trim constant first bases (trim_prefix); 2) strip cassette
    sequence with cutadapt_mod, separately for 5'/3' ends; 3) optionally collapse to unique with
    fastx_collapser.  Counts and summaries are written to <outfile_basename>_info.txt throughout.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    # adapter sequences must come from -5/-3, not hidden inside the -A passthrough string
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do that by collapsing to unique!  MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    # keep the *_original name so the real tmpfile can be removed later even if trimmed_tmpfile
    #  gets reassigned to infile below (when -F NONE skips the trimming step)
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {'': options.outfile_basename + '_no-cassette-tmpfile.fa'}
        cutadapt_tmpfiles = {'': options.outfile_basename + '_cutadapt-tmpfile.fa'}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        ends = "5' 3'".split()
        outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
        no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime")
                                for end in ends}
        cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #     (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            # NOTE(review): the positional True here lands in the 'description' slot of check_readcount;
            #  harmless since printing=False/OUTFILE=None, but it looks unintended - confirm.
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        #     NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #      to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the
        #      standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #      (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it (pass the start-trimmed file straight through under the '' key)
        if not if_running_cutadapt:
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                #  (strip quotes/spaces the shell or user may have left in the option value first)
                adapter_seqs = adapter_seqs.replace('"','').replace("'",'').replace(' ','')
                if not adapter_seqs:  continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(['-a %s'%seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                # route untrimmed/too-short/too-long reads to the no-cassette tmpfile, unless the user
                #  already set those outputs explicitly in -A
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                               "cutadapt output", options.total_read_number_only, False)
                # NOTE(review): True lands in the 'description' slot here too - see note in step 1.
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # NOTE(review): if this assert fires, the message formatting itself raises TypeError -
                #  "(%s+%s != %s)" has 3 placeholders but gets 4 arguments; worth fixing.
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            #  (a read counts as "no cassette" only if NEITHER end's cutadapt run trimmed it)
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:                no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:     pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            # NOTE(review): same mis-formatted assert message as above (argument count mismatch, and the
            #  end_type/count values are listed in a different order than the text implies) - confirm and fix.
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):     os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            # no collapsing - just rename whichever cutadapt tmpfiles exist into their final outfile names
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):    continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                                                "fastx_collapser output", options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                # NOTE(review): the -u/--dont_check_uncollapsed_reads option claims to skip this check,
                #  but it's not consulted here - confirm whether a guard was lost.
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity:   print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                        (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        # when cutadapt was skipped, cutadapt_readcount has the single key 'all' - the loops below still work
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type,
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()), [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type],
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[wrong_start_file], [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[no_cassette_file], [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0:   print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):     os.remove(tmpfile)
def do_test_run():
    """ Test run: run the script on test infiles, compare the outputs against saved reference files. """
    from testing_utilities import run_functional_tests
    data_dir = 'test_data'
    complex_infile = data_dir + '/INPUT_raw-fastq_complex-5prime.fq'
    simple_infile = data_dir + '/INPUT_raw-fastq_simple-both-ends.fq'
    # each test case is a (testname, test_description, arg_string, infiles) tuple
    test_cases = [
        ('full-not-unique', "testing all 'bad' cases, 5' end only, without collapsing to unique", '-3 ""', [complex_infile]),
        ('full-unique', "testing all 'bad' cases, 5' end only, WITH collapsing to unique for all outfiles", '-3 "" -C', [complex_infile]),
        ('end-5prime', "simpler test, 5' end only (collapsing to unique)", '-3 "" -C', [simple_infile]),
        ('end-3prime', "simpler test, 3' end only (collapsing to unique)", '-5 "" -C', [simple_infile]),
        ('ends-both', "simpler test, 5' and 3' ends (collapsing to unique)", '-C', [simple_infile]),
        # this test uses the standard 5' and 3' cassette seqs as two alternative 5' ones, so the 5' output is like ends-both 5'+3'
        ('ends-two-5prime', "simpler test, two 5' adapters (collapsing to unique)", '-5 %s,%s -3 "" -C'%(
            CASSETTE_DEFAULT_5prime,CASSETTE_DEFAULT_3prime), [simple_infile]),
    ]
    # MAYBE-TODO currently some of the tests are weird because fastx_collapser changes the order of the sequences for no good reason (see <IGNORE> lines in some of the files)
    # convert into (testname, description, arg_and_infile_string) format: prefix the names, force very-quiet mode,
    #  and fold the infiles into the argument string
    converted_cases = []
    for testname, descr, test_args, infiles in test_cases:
        converted_cases.append(('pre__'+testname, descr, test_args+' -Q '+' '.join(infiles)))
    # main takes (args, options), not the (parser, options, args) that run_functional_tests provides
    def argument_converter(parser, options, args):
        return (args, options)
    # use my custom function to run all the tests, auto-detect reference files, compare to output.
    return run_functional_tests(converted_cases, define_option_parser(), main, data_dir,
                                argument_converter=argument_converter, outfile_option='-o',
                                append_to_outfilenames='')
class Testing(unittest.TestCase):
""" Unit-tests this module. """
def test__(self):
sys.exit("NO UNIT-TESTS FOR THIS MODULE")
# MAYBE-TODO add unit-tests
if __name__=='__main__':
    parser = define_option_parser()
    options,args = parser.parse_args()

    # if run with -t option, do unit tests and quit
    if options.test_functionality:
        print("*** You used the -t option - ignoring all other options/arguments, running the built-in test suite. ***")
        # to run tests for another file, have to use TextTestRunner, not unittest.main -  make a test suite with
        #   autodetection of all tests (see http://docs.python.org/library/unittest.html#unittest.TestLoader)
        #print("\n * unit-tests for the ______ module")
        #test_suite_1 = unittest.defaultTestLoader.loadTestsFromModule(______
        #unittest.TextTestRunner(verbosity=1).run(test_suite_1)
        # to run tests for current module, just run unittest.main, passing it only the filename
        #   (by default it takes all of sys.argv and complains about options/arguments it can't recognize)
        print("\n * unit-tests for this module (%s)"%sys.argv[0])
        unittest.main(argv=[sys.argv[0]])   # unittest.main automatically runs sys.exit()

    # if run with -T option, run the canned functional test cases and quit with their result code
    if options.test_run:
        print("*** You used the -T option - ignoring all other options and running the built-in example test runs. ***")
        test_result = do_test_run()
        sys.exit(test_result)

    # otherwise pass the arguments to the main function
    main(args, options)