-
Notifications
You must be signed in to change notification settings - Fork 0
/
mutant_count_alignments.py
executable file
·615 lines (546 loc) · 43.5 KB
/
mutant_count_alignments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
#!/usr/bin/env python2.7
""" Take a deepseq alignment file; group the reads into insertional mutants.
Output a line-per-mutant file containing position info, gene annotation data (optional), total/perfect read count, number of distinct sequences, and some of the sequences/counts (optional), and a summary of read/mutant counts etc.
Also output a line-per-gene file containing gene ID/name/position/annotation and the number and read-counts of mutants that were inserted into that gene (optional, NOT IMPLEMENTED YET).
Output files are in simple tab-separated plaintext format, with all header/summary lines starting with #.
Grouping reads into mutants is currently done by alignment position (other options such as sequence clustering or grouping 1bp-adjacent positions together may be implemented later).
The input file should be a SAM-format deepseq alignment file created by bowtie, novoalign, or other deepseq aligner programs (tested mainly on bowtie), with optionally a metadata file created by my deepseq_alignment_wrapper.py or deepseq_preprocessing_wrapper.py scripts.
The program assumes the SAM file contains only unique matches (i.e. each read was reported as aligning to at most one genomic location).
It also only deals with single-end alignments at the moment.
-- Weronika Patena, Jonikas Lab, Carnegie Institution, 2011
USAGE: mutant_count_alignments.py [options] infile1 [infile2 infile3 ...] outfile """
# basic library
import sys, os, time
import unittest
import pickle
# other packages
import HTSeq
# my modules
from general_utilities import write_header_data
from testing_utilities import run_functional_tests
import mutant_analysis_classes
import mutant_Carette # TODO this should be optional...
def do_test_run():
""" Test run: run script on test infile, compare output to reference file."""
test_folder = "test_data_v0"
aln_infile0 = "%s/INPUT_alignment0_old-format.sam"%test_folder
aln_infile1 = "%s/INPUT_alignment1_genomic-unique.sam"%test_folder
aln_infile2 = "%s/INPUT_alignment2_for-genes.sam"%test_folder
aln_infile3 = "%s/INPUT_alignment3_for-merging.sam"%test_folder
gff_genefile = "%s/INPUT_gene-data-1_all-cases.gff3"%test_folder
dataset_to_remove = "%s/INPUT_mutants_to_remove.txt"%test_folder
test_runs = [
('cassette-end-5prime', "-e 5prime -r forward -n3 -L", [aln_infile1]),
('cassette-end-3prime', "-e 3prime -r forward -n3 -L", [aln_infile1]),
('read-direction-reverse', "-r reverse -e 5prime -n3 -L", [aln_infile1]),
('unknown-as-match', "--treat_unknown_as_match -e 5prime -r forward -n3 -L", [aln_infile1]),
('dont-count-cassette', "-l -e 5prime -r forward -n3 -L", [aln_infile1]),
('ignore-cassette', "-c -e 5prime -r forward -n3 -L", [aln_infile1]),
('separate-cassette', "-C -e 5prime -r forward -n3 -L", [aln_infile1]),
('sorted-by-count', "-o read_count -e 5prime -r forward -n3 -L", [aln_infile1]),
('with-gene-info_merged', "-e 5prime -r forward -g %s -n0"%gff_genefile, [aln_infile2]),
('with-gene-info_unmerged', "-B -e 5prime -r forward -g %s -n0"%gff_genefile, [aln_infile2]),
('multiple-infiles', "-e 5prime -r forward -n0 -L", [aln_infile1,aln_infile2]),
('dont-merge-tandems', "-n0", [aln_infile3]),
('merge-adjacent-none', "-n0 -Q -Y0", [aln_infile3]),
('merge-adjacent1-r3', "-MQ -D1 -w3 -Y0 -n0", [aln_infile3]),
('merge-adjacent1-r1', "-MQ -D1 -w1 -Y0 -n0", [aln_infile3]),
('merge-adjacent2-r3', "-MQ -D2 -w3 -Y0 -n0", [aln_infile3]),
('remove-from-other-all', "-x %s -n0"%dataset_to_remove, [aln_infile2]),
('remove-from-other-min4', "-x %s -z4 -n0"%dataset_to_remove, [aln_infile2]),
('remove-from-other-perfect', "-x %s -p -z4 -n0"%dataset_to_remove, [aln_infile2]),
('remove-not-other-all', "-X %s -n0"%dataset_to_remove, [aln_infile2]),
('remove-not-other-min4', "-X %s -Z4 -n0"%dataset_to_remove, [aln_infile2]),
('remove-not-other-perfect', "-X %s -P -Z4 -n0"%dataset_to_remove, [aln_infile2]),
('old-infile-format', "-e 5prime -r forward -n3 -L", [aln_infile0]),
]
# TODO add run-test for removing data from multiple files?
# MAYBE-TODO add run-test for a metadata file with 5' and 3' read counts?
# MAYBE-TODO add run-tests for other mutant-merging options? But they all have pretty good unit-tests.
# MAYBE-TODO add run-test for --gene_annotation_file?
# MAYBE-TODO add run-test for --input_collapsed_to_unique? Or is that unit-tested already?
# convert tests into (testname, arg_and_infile_string) format, adding the options that are always used
test_names_and_args = [('count-aln__'+testname, test_args+' -q '+' '.join(infiles))
for testname,test_args,infiles in test_runs]
parser = define_option_parser()
argument_converter = lambda parser,options,args: (args[:-1], args[-1], options)
return run_functional_tests(test_names_and_args, parser, main, test_folder,
argument_converter=argument_converter, append_to_outfilenames='.txt')
class Testing(unittest.TestCase):
""" Unit-tests this module. """
def test__(self):
print "NO UNIT-TESTS FOR THIS MODULE"
######### Main function code #########
def define_option_parser():
""" Populates and returns an optparse option parser object, with __doc__ as the usage string."""
from optparse import OptionParser
parser = OptionParser(__doc__)
# taken: aAbBcC-De---g-h---jJ--lLmMn-o-pPqQr---tTuUvVwWxXyYzZ
# free: ------d--EfF-G-HiI--kK-----N-O-----RsS--------------
### test options
parser.add_option('-t','--test_functionality', action='store_true', default=False,
help="Run the built-in unit test suite (ignores all other options/arguments; default %default).")
parser.add_option('-T','--test_run', action='store_true', default=False,
help="Run on a test input file, check output against reference files. "
+ "Ignores all other options/arguments. (default %default).")
### functionality options
parser.add_option('-e', '--read_cassette_end', choices=mutant_analysis_classes.SEQ_ENDS, default='5prime',
metavar='|'.join(mutant_analysis_classes.SEQ_ENDS),
help="Which end of the cassette are the sequenced reads from? (default %default).")
parser.add_option('-r','--read_direction', choices=mutant_analysis_classes.SEQ_DIRECTIONS, default='forward',
metavar='|'.join(mutant_analysis_classes.SEQ_DIRECTIONS),
help="Is the read in the forward or reverse direction compared to the cassette? (default %default).")
parser.add_option('--Carette', action="store_true", default=False,
help="Is this data from the Carette protocol instead of the MmeI one? (default %default).")
parser.add_option('-D', '--adjacent_max_distance', type='int', default=1, metavar='N',
help="Count/merge adjacent mutants only if they're at most N bases distant; (default %default)")
parser.add_option('-M', '--merge_adjacent_mutants', action="store_true", default=False,
help="Merge adjacent mutants if they satisfy the -W/-w constraints (default %default)")
parser.add_option('-W', '--merge_adjacent_leave_mutants', default='auto', metavar='N',
help="For -M: how many mutants to leave unmerged: a number, or 'auto' (based on opposite-strand "
+"adjacent mutants); ignored if -w option is given. (default %default)")
parser.add_option('-w', '--merge_adjacent_count_ratio', type='int', default=None, metavar='K',
help="For -M: merge mutants only if one has at least K times fewer reads than the other (default %default)")
parser.add_option('-Q', '--merge_opposite_tandem_mutants', action="store_true", default=False,
help="Merge opposite-strand same-position mutants (tail-to-tail-tandems) "
+"if they satisfy the -Y/-y contraints. (default %default)")
parser.add_option('-Y', '--merge_opposite_leave_mutants', default='auto', metavar='N',
help="For -Q: how many mutants to leave unmerged: a number, or 'auto' (based on opposite-strand "
+"adjacent mutants); ignored if -y option is given, (default %default)")
parser.add_option('-y', '--merge_opposite_count_ratio', type='int', default=None, metavar='K',
help="For -Q: merge mutants only if the readcounts are within Kx of each other. (default %default)")
parser.add_option('--merge_mutant_choice_method', choices=['by_ratio','random'], default='by_ratio', metavar='by_ratio|random',
help="For -M and -Q: if using non-zero -W/-Y (and NOT using -w/-y), should the mutants to merge be chosen "
+"randomly, or by highest/lowest readcount ratio? (default %default)")
parser.add_option('-j', '--merge_in_cassette', action='store_true', default=False,
help="For adjacent and tandem merging/counts: include mutants in cassette (default %default)")
parser.add_option('-J', '--merge_in_other_chrom', action='store_true', default=False,
help="For adjacent and tandem merging/counts: include mutants in non-cassette non-nuclear chromosomes "
+"(like chloroplast and mitochondrial) (default %default)")
parser.add_option('-x', '--remove_mutants_from_file', metavar='FILE1,FILE2',
help='Remove all mutants present in FILE1,FILE2,etc from the datasets (see -z/-p for read count cutoff).'
+' Any number of files is allowed - must be comma-separated without spaces.')
parser.add_option('-z', '--remove_from_file_readcount_min', type='int', default=1, metavar='M',
help='When applying -x, only remove mutants with at least N reads in FILE (default %default).')
parser.add_option('-p', '--remove_from_file_min_is_perfect', action='store_true', default=False,
help='When applying -x with -z M, compare M to perfect readcount, not total. (default %default).')
parser.add_option('-X', '--remove_mutants_not_from_file', metavar='FILE',
help='Remove all mutants NOT present in FILE from the datasets (see -Z/-P for read count cutoff).')
parser.add_option('-Z', '--remove_not_from_file_readcount_min', type='int', default=1, metavar='M',
help='When applying -X, only remove mutants with at least N reads in FILE (default %default).')
parser.add_option('-P', '--remove_not_from_file_min_is_perfect', action='store_true', default=False,
help='When applying -X with -z M, compare M to perfect readcount, not total. (default %default).')
parser.add_option('-c', '--ignore_cassette', action='store_true', default=False,
help="Ignore reads aligning to cassette (just print total count in the header as removed) "
+"(default %default)")
parser.add_option('-C', '--separate_cassette', action='store_true', default=False,
help="Like -c, but also add a *_cassette.txt file with ONLY reads aligning to cassette "
+"(and print total non-cassette count in the header of that file as removed) (default %default)")
parser.add_option('-l', '--dont_count_cassette', action='store_true', default=False,
help="Don't give separate cassette read/mutant totals in the header; (default %default)")
parser.add_option('-L', '--dont_count_other', action='store_true', default=False,
help="Don't give separate read/mutant totals in the header for 'strange' chromosomes "
+"(not cassette and not named chromosome* or scaffold*; (default %default)")
# extremely minor functionality options, do we even care??
parser.add_option('--treat_unknown_as_match', action="store_true", default=False,
help="When counting perfect reads, treat undefined alignment regions as matches (default %default)")
parser.add_option('--dont_treat_unknown_as_match', action="store_false", dest='treat_unknown_as_match',
help="Turn -u off.")
# MAYBE-TODO add user-provided mutation-count cutoffs like in old deepseq_count_alignments.py, instead of just all reads and perfet reads? Currently useless, since we're only allowing one mutation in bowtie. parser.add_option('-m', '--mutation_cutoffs', default="1,3,10", metavar="<comma-separated-int-list>")
### input options
parser.add_option('-u','--input_collapsed_to_unique', action='store_true', default=False,
help="Use to get correct original total read counts if the data was collapsed to unique sequences "
+"using fastx_collapser before alignment (default %default).")
parser.add_option('-U','--input_not_collapsed_to_unique', action='store_false', dest="input_collapsed_to_unique",
help="Turn -c off.")
parser.add_option('-m', '--input_metadata_file', default='AUTO', metavar='FILE',
help="File containing preprocessing and alignment metadata (scripts/options used etc). "
+"Can be a filename, AUTO for <infile_basename>_info.txt (warning will be raised if not found), "
+"or NONE to not look for a metadata file at all. Default %default.")
### gene-finding and gene-annotation options
parser.add_option('-g', '--gene_position_reference_file', default=None, metavar='FILE',
help="File to use to look up gene IDs based on chromosomal location (default %default)")
parser.add_option('--detailed_gene_features', action="store_true", default=True,
help="Find out what part of the gene (UTR,intron,exon) a mutant hit, based on the -g file "
+"(default %default). May take a lot of memory - increase --N_detail_run_groups to fix that.")
parser.add_option('--no_detailed_gene_features', action="store_false", dest='detailed_gene_features',
help="Turns --detailed_gene_features off.")
parser.add_option('--N_detail_run_groups', type="int", default=5, metavar='N',
help="How many passes to split reading the detailed_gene_features into (default %default) "
+"- may take a lot of memory (and CPU) if read in a single pass; too many passes waste CPU.")
# MAYBE-TODO add a "flank" option (with variable size), to catch mutants that are in the flanks of genes? Do we care?
# MAYBE-TODO add a "negative flank" option (with variable size), to ignore mutants that are in the start/end of genes?
parser.add_option('-A', '--gene_annotation_file', default=None, metavar='FILE',
help="Tab-separated file to use to look up gene names/descriptions from IDs (default %default)")
parser.add_option('-a', '--annotation_file_standard_type', type='int', default=None, metavar='V',
help="Use if file provided in -A is a standard Phytozome annotation file (missing a header): "
+"use value 4 for the one with chlamy v4.3 genome, or value 5 for v5 genome (default %default)")
### output format options
parser.add_option('-n', '--N_sequences_per_group', type='int', default=2, metavar='N',
help="How many most common sequences should be shown per group? (default %default)")
parser.add_option('-o', '--sort_data_key', choices=['position','read_count','none'], default='position',
metavar='position|read_count|none', help="Sort the output data: by alignment position, read count, "
+"or don't sort at all (default %default) - sorting may be slow for large datasets!")
parser.add_option('-B', '--dont_merge_boundary_features', action='store_true', default=False,
help="In the summary, count all feature-boundary cases separately instead of together "
+"(default %default)")
parser.add_option('-V', '--verbosity_level', action="store_true", default=1,
help="How much information to print to STDOUT: 0 - nothing, 1 - summary only, "
+"2 - summary and progress reports. (Default %default).")
parser.add_option('-q', '--quiet', action="store_const", const=0, dest='verbosity_level', help="Equivalent to -V 0.")
parser.add_option('-v', '--verbose', action="store_const", const=2, dest='verbosity_level', help="Equivalent to -V 2.")
return parser
def get_info_from_metadata_files(infiles, input_metadata_file, cassette_end, verbosity_level):
""" Parse metadata files to get wrong-format and unaligned/multiple read counts (or 'unknown' if cannot be determined).
Returns a tuple of the following counts: discarded, wrong_start, no_cassette, non_aligned, unaligned, multiple_aligned.
Any of them can be 'unknown', meaning it could not be determined.
It looks for the information in the metadata file for each infile; if it can't find or parse the metadata file for
any infile, the final count is unknown.
Special treatment for the non_aligned value for old-format files, in which that count is not in metadata at all.
"""
# if the option specified no metadata files, total discarded readcount cannot be determined
if input_metadata_file == 'NONE':
return 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
# make sure the -m option has a value that will work with the number of infiles
if len(infiles)>1 and not input_metadata_file=='AUTO':
print "Warning: when multiple input files are given, the -m option must be NONE or AUTO - ignoring other value."
return 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
# get the read counts for each infile; only return the total at the end, if all values are found
counts_per_file = {'discarded':[], 'wrong_start':[], 'no_cassette':[], 'other_end':[], 'unaligned':[], 'multiple_aligned':[]}
file_formats = []
for infile in infiles:
file_format = '?'
# take the actual input_metadata_file value as the file name, or infer it from the infile name if 'AUTO'
if input_metadata_file == 'AUTO':
infile_base = os.path.splitext(infile)[0]
if infile_base.endswith('_genomic-unique'): infile_base = infile_base[:-len('_genomic-unique')]
curr_input_metadata_file = infile_base + '_info.txt'
if verbosity_level>1:
print 'Automatically determining metadata input file name: %s'%curr_input_metadata_file
else:
curr_input_metadata_file = input_metadata_file
if verbosity_level>1:
print 'Metadata input file name provided in options: %s'%curr_input_metadata_file
# if file is missing, readcounts cannot be determined
if not os.path.exists(curr_input_metadata_file):
if verbosity_level>0:
print 'Warning: metadata input file %s not found! Proceeding without it.'%curr_input_metadata_file
counts_per_file['discarded'].append('unknown')
counts_per_file['wrong_start'].append('unknown')
counts_per_file['no_cassette'].append('unknown')
counts_per_file['unaligned'].append('unknown')
counts_per_file['multiple_aligned'].append('unknown')
file_formats.append(file_format)
continue
# go through the metadata file to find the lines with various pieces of useful information
# (if the line isn't found, the related readcount cannot be determined)
### total discarded read count (there are two possible formats of that line, old and new)
for line in open(curr_input_metadata_file):
if line.startswith('## final "bad" reads'): # new discarded-read line format
counts_per_file['discarded'].append(int(line.split(':\t')[1].split(' (')[0]))
file_format = 'new'
break
if line.startswith('## reads removed: '): # old discarded-read line format
counts_per_file['discarded'].append(int(line.split()[3]))
file_format = 'old'
break
else: # in a for-else loop the else is executed if the for wasn't ended with break, i.e. the line wasn't found
if verbosity_level>0:
print("Warning: metadata input file %s didn't contain discarded read count line! "%curr_input_metadata_file
+"Proceeding without it.")
counts_per_file['discarded'].append('unknown')
file_formats.append(file_format)
### all the other information only shows up in new-format files - in old-format just assume unknown
# (except the other_end value, which was always 0 in old-format because old-format lacked that dual-end processing)
if file_format=='old':
counts_per_file['wrong_start'].append('unknown')
counts_per_file['no_cassette'].append('unknown')
counts_per_file['other_end'].append(0)
counts_per_file['unaligned'].append('unknown')
counts_per_file['multiple_aligned'].append('unknown')
else:
### wrong-start discarded read count (new-format only)
for line in open(curr_input_metadata_file):
if line.startswith('# "bad" read count (wrong-start)'):
counts_per_file['wrong_start'].append(int(line.split(':\t')[1].split(' (')[0]))
break
else:
if verbosity_level>0:
print("Warning: metadata file %s didn't contain wrong-start readcount line! "%curr_input_metadata_file
+"Proceeding without it.")
counts_per_file['wrong_start'].append('unknown')
### no-cassette discarded read count (new-format only)
for line in open(curr_input_metadata_file):
if line.startswith('# "bad" read count (no-cassette)'):
counts_per_file['no_cassette'].append(int(line.split(':\t')[1].split(' (')[0]))
break
else:
if verbosity_level>0:
print("Warning: metadata file %s didn't contain no-cassette readcount line! "%curr_input_metadata_file
+"Proceeding without it.")
counts_per_file['no_cassette'].append('unknown')
### other-end read count (new-format only, optional)
# this line is not present if there weren't any other-end reads, so if it's not, we'll save 0 instead of unknown,
# since it means a 0, not an unknown value (and old-format files didn't have this functionality so it's always 0 there)
other_end = [x for x in mutant_analysis_classes.SEQ_ENDS if not x==cassette_end][0].replace("prime","'")
for line in open(curr_input_metadata_file):
if line.startswith('# "good" %s read count after cassette stripping'%other_end):
counts_per_file['other_end'].append(int(line.split(':\t')[1].split(' (')[0]))
break
else:
if verbosity_level>0:
print("Warning: metadata file %s didn't contain other-end readcount line! "%curr_input_metadata_file
+"Proceeding without it.")
counts_per_file['other_end'].append(0)
# for the purposes of this_end readcounts, other_end readcounts should be included in the discarded count
# (UNLESS the discarded or other_end count is unknown, then just set discarded to that.)
if 'unknown' in (counts_per_file['discarded'][-1], counts_per_file['other_end'][-1]):
counts_per_file['discarded'][-1] = 'unknown'
else:
counts_per_file['discarded'][-1] += counts_per_file['other_end'][-1]
### unaligned read count (new-format only)
for line in open(curr_input_metadata_file):
if line.startswith('# unaligned: '):
counts_per_file['unaligned'].append(int(line.split(': ')[1].split(' (')[0]))
break
else:
if verbosity_level>0:
print("Warning: metadata file %s didn't contain unaligned readcount line! "%curr_input_metadata_file
+"Proceeding without it.")
counts_per_file['unaligned'].append('unknown')
### multiple-aligned read count (new-format only)
for line in open(curr_input_metadata_file):
if line.startswith('# multiple-genomic: '):
counts_per_file['multiple_aligned'].append(int(line.split(': ')[1].split(' (')[0]))
break
else:
if verbosity_level>0:
print("Warning: metadata file %s "%curr_input_metadata_file
+"didn't contain genomic-multiple readcount line! Proceeding without it.")
counts_per_file['multiple_aligned'].append('unknown')
# MAYBE-TODO also grab the "good" readcounts (from preprocessing and alignment) and make sure they match
# the numbers we see in the dataset?
# The calculation for total non-aligned is a bit complicated, because:
# for new-format files that information comes from the metadata (and split into unaligned/multiple subcategories),
# but for the old-format files it comes from the data infile itself (without the category split.
# So for each old-format file, the general nonaligned count can be assumed to be 0 (and the real number will be
# (added during actual infile processing), but the unaligned/multiple counts stay unknown
counts_per_file['total_non_aligned'] = []
for file_format, unaligned_count, multiple_count in zip(file_formats, counts_per_file['unaligned'],
counts_per_file['multiple_aligned']):
if file_format=='old': counts_per_file['total_non_aligned'].append(0)
elif 'unknown' in (unaligned_count, multiple_count): counts_per_file['total_non_aligned'].append('unknown')
else: counts_per_file['total_non_aligned'].append(\
unaligned_count+multiple_count)
# do sums for each count category: of the length of the list is wrong or unknown is on the list, return unknown:
# if some metadata files were missing the discarded counts line, total discarded readcount cannot be determined
# if the discarded read counts for all infiles were found, return the sum as the total discarded read count
total_counts = {}
for key, list_of_counts in counts_per_file.iteritems():
if len(list_of_counts)<len(infiles) or ('unknown' in list_of_counts):
if verbosity_level>0:
print "Warning: %s read count not found for some files! Ignoring all values, keeping 'unknown'."%key
total_counts[key] = 'unknown'
else:
total_counts[key] = sum(list_of_counts)
return (total_counts['discarded'], total_counts['wrong_start'], total_counts['no_cassette'], total_counts['other_end'],
total_counts['total_non_aligned'], total_counts['unaligned'], total_counts['multiple_aligned'])
# TODO merge this with a similar code bit in mutant_join_datasets.py to minimize code duplication
def save_dataset_files(dataset, outfile, verbosity_level=0, if_pickle=True, count_cassette=True, count_other=True,
merge_boundary_features=True, sort_data_by='position', N_sequences_per_mutant=5, options="N/A"):
""" Print summary and data to output file; optionally print summary to stdout; optionally pickle dataset to picklefile.
The options argument is only used to be printed in the header to make it clear how the file was generated -
it should be the applicable optparse options object if there is one, or a text message otherwise.
"""
# print summary info to stdout if desired
if verbosity_level>1: print "\nDATA SUMMARY:"
if verbosity_level>0: dataset.print_summary(merge_boundary_features=merge_boundary_features,
count_cassette=count_cassette, count_other=count_other)
# print full data to outfile
if verbosity_level>1: print "printing output - time %s."%time.ctime()
with open(outfile,'w') as OUTFILE:
write_header_data(OUTFILE,options)
OUTFILE.write("### SUMMARY:\n")
dataset.print_summary(OUTFILE, line_prefix="# ", header_prefix="## ", merge_boundary_features=merge_boundary_features,
count_cassette = count_cassette, count_other=count_other)
OUTFILE.write("### HEADER AND DATA:\n")
dataset.print_data(OUTPUT=OUTFILE, sort_data_by=sort_data_by, N_sequences=N_sequences_per_mutant,
header_line=True, header_prefix='# ')
# print pickled dataset to picklefile, if desired
if if_pickle:
outfile_basename = os.path.splitext(outfile)[0]
pickled_outfile = outfile_basename + '.pickle'
with open(pickled_outfile,'w') as PICKLEFILE:
pickle.dump(dataset, PICKLEFILE, 0)
def main(infiles, outfile, options):
""" Run the main functionality of the module (see module docstring for more information), excluding testing.
The options argument should be generated by an optparse parser.
"""
### parse/process/reformat some options
options.ignore_cassette |= options.separate_cassette
options.count_cassette = not options.dont_count_cassette
options.count_other = not options.dont_count_other
options.merge_boundary_features = not options.dont_merge_boundary_features
# MAYBE-TODO change outfile to a folder? Since it'll have three things in it now...
outfile_basename = os.path.splitext(outfile)[0]
mutant_merging_outfile = outfile_basename + '_merging-info.txt'
# MAYBE-TODO let -C take an optional argument to put the cassette files elsewhere?
cassette_outfile = outfile_basename + '_cassette.txt'
cassette_merging_outfile = outfile_basename + '_cassette_merging-info.txt'
### generate empty alignment set object with basic read position/orientation properties defined by options
if options.Carette: dataset_class = mutant_Carette.Insertional_mutant_pool_dataset_Carette
else: dataset_class = mutant_analysis_classes.Insertional_mutant_pool_dataset
all_alignment_data = dataset_class(options.read_cassette_end, options.read_direction=='reverse')
if options.separate_cassette:
cassette_alignment_data = dataset_class(options.read_cassette_end, options.read_direction=='reverse')
# MAYBE-TODO refactor the whole bunch of "if options.separate_cassette:" clauses to avoid code duplication?
### parse preprocessing/alignment metadata file to get discarded/not-aligned/etc readcounts, pass to all_alignment_data
# (all_alignment_data initializes them to 'unkown', so if file is not given or can't be found/parsed, do nothing)
N_discarded, N_wrong_start, N_no_cassette, N_other_end, N_non_aligned, N_unaligned, N_multiple = \
get_info_from_metadata_files(infiles, options.input_metadata_file, options.read_cassette_end, options.verbosity_level)
if 'unknown' not in (N_wrong_start, N_no_cassette):
assert N_discarded == N_wrong_start+N_no_cassette+N_other_end,\
"Discarded subtotals don't add up to discarded total! %s+%s+%s != %s"%(N_wrong_start,N_no_cassette,
N_other_end,N_discarded)
all_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
if options.separate_cassette:
cassette_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
all_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
if options.separate_cassette:
cassette_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
# MAYBE-TODO also get the final total number of reads from the metadata infile and make sure it's the same
# as the number of processed reads I get from all_alignment_data.print_summary()?
### parse input file and store data - the add_alignment_reader_to_data function here does pretty much all the work!
for infile in infiles:
# if this is a new-style *_genomic-unique.sam file and has a matching *_cassette.sam file, parse that file too
part_infiles = [infile]
if infile.endswith('_genomic-unique.sam'):
cassette_file = infile[:-len('_genomic-unique.sam')] + '_cassette.sam'
if os.path.exists(cassette_file):
part_infiles.append(cassette_file)
for part_infile in part_infiles:
# initialize a parser for the SAM infile
if options.verbosity_level>1:
print "parsing input file %s - time %s."%(part_infile, time.ctime())
infile_reader = HTSeq.SAM_Reader(part_infile)
# fill the new alignment set object with data from the infile parser
all_alignment_data.add_alignment_reader_to_data(infile_reader,
uncollapse_read_counts = options.input_collapsed_to_unique,
ignore_cassette = options.ignore_cassette, cassette_only = False,
treat_unknown_as_match = options.treat_unknown_as_match)
if options.separate_cassette:
cassette_alignment_data.add_alignment_reader_to_data(infile_reader,
uncollapse_read_counts = options.input_collapsed_to_unique,
ignore_cassette = False, cassette_only = True,
treat_unknown_as_match = options.treat_unknown_as_match)
### optionally remove mutants based on another dataset - BEFORE adjacent mutant counting/merging
# remove mutants that ARE present in another file (also do it for cassette mutants if those are separate)
if options.remove_mutants_from_file:
for other_file in options.remove_mutants_from_file.split(','):
other_dataset = mutant_analysis_classes.read_mutant_file(other_file)
all_alignment_data.remove_mutants_in_other_dataset(other_dataset,
readcount_min=options.remove_from_file_readcount_min, perfect_reads=options.remove_from_file_min_is_perfect)
if options.separate_cassette:
cassette_alignment_data.remove_mutants_in_other_dataset(other_dataset,
readcount_min=options.remove_from_file_readcount_min, perfect_reads=options.remove_from_file_min_is_perfect)
# remove mutants that are NOT present in another file (also do it for cassette mutants if those are separate)
# TODO should I implement using multiple files here too?
if options.remove_mutants_not_from_file:
other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_not_from_file)
all_alignment_data.remove_mutants_not_in_other_dataset(other_dataset,
readcount_min=options.remove_not_from_file_readcount_min, perfect_reads=options.remove_not_from_file_min_is_perfect)
if options.separate_cassette:
cassette_alignment_data.remove_mutants_not_in_other_dataset(other_dataset,
readcount_min=options.remove_not_from_file_readcount_min, perfect_reads=options.remove_not_from_file_min_is_perfect)
### optionally merge some mutant categories
with open(mutant_merging_outfile, 'w') as MERGEFILE:
with open(cassette_merging_outfile, 'w') as CASSETTE_MERGEFILE:
# 1) adjacent same-strand mutants (since they're probably just artifacts of indels during deepseq/PCR)
if options.merge_adjacent_mutants:
try: leave_N_mutants = int(options.merge_adjacent_leave_mutants)
except ValueError: leave_N_mutants = options.merge_adjacent_leave_mutants
if options.merge_adjacent_count_ratio is not None: leave_N_mutants = 'use_ratio'
all_alignment_data.merge_adjacent_mutants(merge_max_distance=options.adjacent_max_distance,
leave_N_mutants=leave_N_mutants, min_count_ratio = options.merge_adjacent_count_ratio,
leave_method=options.merge_mutant_choice_method, merge_cassette_chromosomes = options.merge_in_cassette,
merge_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
if options.merge_in_cassette and options.separate_cassette:
cassette_alignment_data.merge_adjacent_mutants(merge_max_distance = options.adjacent_max_distance,
leave_N_mutants=leave_N_mutants, min_count_ratio = options.merge_adjacent_count_ratio,
leave_method=options.merge_mutant_choice_method, merge_cassette_chromosomes = True,
merge_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
# 2) opposite-strand same-position mutants (since they're probably just tail-to-tail cassette tandems)
if options.merge_opposite_tandem_mutants:
try: leave_N_mutants = int(options.merge_opposite_leave_mutants)
except ValueError: leave_N_mutants = options.merge_opposite_leave_mutants
if options.merge_opposite_count_ratio is not None: leave_N_mutants = 'use_ratio'
all_alignment_data.merge_opposite_tandem_mutants(leave_N_mutants=leave_N_mutants,
max_count_ratio = options.merge_opposite_count_ratio, leave_method=options.merge_mutant_choice_method,
merge_cassette_chromosomes = options.merge_in_cassette,
merge_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
if options.merge_in_cassette and options.separate_cassette:
cassette_alignment_data.merge_opposite_tandem_mutants(leave_N_mutants=leave_N_mutants,
max_count_ratio = options.merge_opposite_count_ratio, leave_method=options.merge_mutant_choice_method,
merge_cassette_chromosomes = True, merge_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
# 3) count adjacent mutants, even if not doing any merging
# Actually there's always a count done after each merge, but that count doesn't print the details to anything,
# so just run it again here with detail-printing - MAYBE-TODO find a more efficient way instead of counting twice?
all_alignment_data.count_adjacent_mutants(max_distance_to_print = options.adjacent_max_distance,
max_distance_to_count = 10000, count_cassette_chromosomes = options.merge_in_cassette,
count_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
if options.separate_cassette:
cassette_alignment_data.count_adjacent_mutants(max_distance_to_print = options.adjacent_max_distance,
max_distance_to_count = 10, count_cassette_chromosomes = True,
count_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
# MAYBE-TODO make an option for max_distance_to_count? I'm using a much lower one for cassette because it's so dense.
# since there's no optional as/with statement, just remove the cassette_merging_outfile if unwanted
if not options.separate_cassette:
os.remove(cassette_merging_outfile)
### optionally parse gene position/info files and look up the genes for each mutant in the data
if options.gene_position_reference_file is not None:
genefile = options.gene_position_reference_file
if options.verbosity_level>1: print "adding genes from file %s to mutant data - time %s."%(genefile, time.ctime())
all_alignment_data.find_genes_for_mutants(genefile, detailed_features=options.detailed_gene_features,
N_run_groups=options.N_detail_run_groups,
verbosity_level=options.verbosity_level)
# if we have gene info, optionally also add annotation
if options.gene_annotation_file:
if options.verbosity_level>1:
print "adding gene annotation from file %s - time %s."%(options.gene_annotation_file, time.ctime())
all_alignment_data.add_gene_annotation(options.gene_annotation_file,
if_standard_Phytozome_file=options.annotation_file_standard_type, print_info=(options.verbosity_level >= 2))
### output data to files
save_dataset_files(all_alignment_data, outfile, options.verbosity_level, True, options.count_cassette, options.count_other,
options.merge_boundary_features, options.sort_data_key, options.N_sequences_per_group, options)
# TODO write some info about all the other files that go with this one (pickle, merging-info, *cassette*)
if options.separate_cassette:
save_dataset_files(cassette_alignment_data, cassette_outfile, 0, True, False, False,
options.merge_boundary_features, options.sort_data_key, options.N_sequences_per_group, options)
# TODO probably write some extra bit of info about this being the cassette-file to MAINFILENAME
if __name__ == "__main__":
""" Allows both running and importing of this file. """
parser = define_option_parser()
(options, args) = parser.parse_args()
# if ran with -t option, do unit tests and quit
if options.test_functionality:
print("*** You used the -t option - ignoring all other options/arguments, running the built-in test suite. ***")
print("\n * unit-tests for the mutant_analysis_classes.py module")
# to run tests for another file, have to use TextTestRunner, not unittest.main - make a test suite with
# autodetection of all tests (see http://docs.python.org/library/unittest.html#unittest.TestLoader)
test_suite_1 = unittest.defaultTestLoader.loadTestsFromModule(mutant_analysis_classes)
unittest.TextTestRunner(verbosity=1).run(test_suite_1)
# to run tests for current module, just run unittest.main, passing it only the filename
# (by default it takes all of sys.argv and complains about options/arguments it can't recognize)
print("\n * unit-tests for this module (%s)"%sys.argv[0])
unittest.main(argv=[sys.argv[0]]) # unittest.main automatically runs sys.exit()
if options.test_run:
print("*** You used the -T option - ignoring all other options and running the built-in example test runs. ***")
test_result = do_test_run()
sys.exit(test_result)
# otherwise parse the arguments and run main function
if len(args)<2:
parser.print_help()
sys.exit("\nError: at least one infile and exactly one outfile are required!")
outfile = args[-1]
infiles = args[:-1]
main(infiles, outfile, options)