/
selenoprofiles_3.py
executable file
·5465 lines (4962 loc) · 356 KB
/
selenoprofiles_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python -u
__author__ = "Marco Mariotti"
__email__ = "marco.mariotti@crg.eu"
__licence__ = "GPLv3"
__version__ = "3.6a"
global temp_folder; global split_folder
from string import *
import sys
import traceback
from types import MethodType
from commands import *
sys.path.insert(0, "/users/rg/mmariotti/libraries/")
sys.path.append('/users/rg/mmariotti/scripts')
from MMlib import *
try: import annotations.GO.Parsers.oboparser as oboparser
except: sys.exc_clear()
allowed_output_formats=['p2g', 'fasta', 'gff', 'gtf', 'cds', 'secis', 'three_prime', 'five_prime', 'dna', 'introns', 'aligned_cds']
allowed_output_formats_descriptions={'fasta':'fasta file with the predicted protein sequence',
'gff' : 'gff file with the genomic coordinates of the prediction',
'gtf': 'analog to gff, but with this syntax for the last field: gene_id "id_for_prediction"; transcript_id "id_for_prediction"; (...)',
'three_prime': 'fasta file with the sequence downstream of the prediction. Its length is defined by the three_prime_length parameter in the main configuration file',
'cds':'fasta file with the coding sequence of the prediction. If frameshifts are predicted, the frameshifts-causing nucleotides are excluded. If the prediction is complete at 3\', the stop codon is included',
'secis':'file with the SECISes found downstream of the prediction (for selenoprotein search - requires SECISearch3)',
'p2g':'selenoprofiles standard output. It contains all essential information for the prediction, including the query-target alignment, the genomic coordinates etc ',
'dna':'fasta file with the full predicted gene sequence, including the intronic sequences (and frameshifts if any)',
'introns':'fasta file with the sequence of the introns, split into different fasta headers',
'five_prime':'fasta file with the sequence upstream of the prediction. Its length can defined by the five_prime_length option or setting it as parameter in the main configuration file',
'aligned_cds': 'fasta file with a pairwise alignment between the coding sequence and a fake back translation of the blast_query_master sequence where gaps in the original alignment have been converted to X. useful to obtain alignments of cds between different predictions without realigning'}
help_msg="""Selenoprofiles version """+str(__version__)+""" for profile-based prediction of protein families in nucleotide databases. In a single run, selenoprofiles can search multiple profiles in a single target.
usage: Selenoprofiles results_folder target_file -s "species name" -p ARG [other options]
As ARG of -p, you must provide one or more profiles (comma separated, if multiple). Arguments can be filenames (aligned fasta files) or profile names. If they are profile names, files called like profile_name.fa or profile_name.fasta must be present in the profiles folder defined in your main configuration file. Profile alignments are normally built with default options: a profile_name.fa.config file containing the profile settings is created. See script selenoprofiles_build_profile.py to build a profile with non-default options.
The following routines are normally executed only if their output files are not found. You can force their execution specifying either the short or long option. Forcing a routine forces also the execution of all subsequent steps. Note that this may cause certain files to be overwritten, but none will be deleted.
-B || -blast Run blast and filtering of its output. All next steps are for each blast hit (merged by colinearity)
-E || -exonerate Run cyclic_exonerate
-G || -genewise Run genewise. For the blast hits with empty exonerate outputs, genewise is run if the option genewise_to_be_sure is active.
-C || -choose For each hit, choose the best prediction among available ones (blast, exonerate, genewise)
-F || -filter Filter results according to p2g_filtering property of profile, then to p2g_refiltering property.
-D || -database Store filtered results in a sqlite database
-O || -output Produce output files according to output options (see below or in the manual).
Before outputing, results are stored in a SQLite database inside the results_folder. If selenoprofiles finds in the database the results from a previous run, it will load them instead of running the pipeline, unless any routine prior to output is specified as above.
Output files will be created inside a folder called like: results_folder/species_name.target_file_name/output/
These output formats are built-in in selenoprofiles:
"""+join([ (format+':').ljust(2+max([len(i) for i in allowed_output_formats ]))+allowed_output_formats_descriptions.setdefault(format, 'no description available for this method') for format in allowed_output_formats], '\n')+"""
Use option -output_FORMAT to activate the corresponding output for each prediction. For example option -output_gff will produce gff files.
To see the default active options, see your main configuration file.
Additionally, if a least one prediction is output, a fasta alignment called PROFILE.ali is created: this contains the sequences of the profile along with all predictions for this family in this target.
A few other options:
-no_splice || -N for use on RNA sequences or bacterial genomes. Genewise is deactivated in this mode
-genetic_code + use a non-standard genetic code; see NCBI codes; implies -tblastn (see -help full)
-test prints the slave programs and modules available in this selenoprofiles installation, then quits
-print_opt print currently active options
-h full display full list of options and of accessory programs
Please refer to the manual, available at http://big.crg.cat/services/selenoprofiles
If selenoprofiles has been useful for your research, please cite:
Mariotti M, Guigo R. Selenoprofiles: profile-based scanning of eukaryotic genome sequences for selenoprotein genes.
Bioinformatics. 2010 Nov 1;26(21):2656-63."""
full_help="""
####### Full list of other options #######
Options with + require an argument. Options can be specified either in command line with syntax -option value or in the configuration file as option = value. If no argument is required for an option, in the configuration file you must use value=1. To deactivate an option which is active by default, use -option 0.
## system and global configuration
-bin_folder + folder where the executables run by selenoprofiles are searched
-profiles_folder + folder where the profile alignments are searched
-temp + temporary folder. A folder with random name is created here, used and deleted at the end of the computation
-save_chromosomes active by default. Selenoprofiles tries to recycle the single-sequence fasta files extracted from the genome, to minimize computation. These files are kept in a subfolder of the folder specified with -temp. Turn off to save disk space
-no_colors disable printing in colors to atty terminals. Put "no_colors=1" in your configuration file to set this as default
-GO_obo_file + path to the gene_ontology_ext.obo file used in GO tools-based filtering (see manual)
## prediction programs
-dont_exonerate do not run exonerate. Not recommended.
-dont_genewise do not run genewise. Use to reduce the time required for computation
-genewise_to_be_sure active by default. When exonerate produce no output or its prediction does not overlap the seed blast hit, genewise is run, seeded using the blast hits coordinates. Turn off this option not to run genewise in these cases, to reduce the time required for computation
-no_blast do not allow choosing a blast prediction (over a genewise or exonerate prediction). Use this if an accurate splice site prediction is crucial for you
-tblastn use simple tblastn (single query) instead of psitblastn (profile based PSSM)
-exonerate_extension + nt lenght of extension used on both sides by the cyclic exonerate procedure (see paper or manual)
-genewise_extension + nt length of extension used on both sides when running genewise on gene boundaries defined by exonerate
-genewise_tbs_extension + nt length of extension used on both sides when running genewise blast hits for which exonerate produced no output (only if option -genewise_to_be_sure is active)
-blast_opt/exonerate_opt/genewise_opt "+" command line options always used when running psitblastn/exonerate/genewise. Additional profile-specific options are also available (see profile attributes below)
## database
-max_attempts_database + if you're running multiple parallel instances of selenoprofiles on the same target, sometimes selenoprofiles will find the database locked. This sets the maximum number of times that selenoprofiles will try again before crashing.
-sleep_time + when the database is found locked, this is the time in seconds separating two consecutive attempts
-full_db normally, selenoprofiles will store in the database only the results passing all filters. If this option is active, all results are written, allowing for fast retrieving (see also option -state)
-no_db do not store anything in the database, and exit after the filtering step. This option make sense only if you want to heavily parallelize the use of selenoprofiles, when thousands of accesses to the database may become a problem; this option allow to produce all intermediate text files, but will produce no output. Selenoprofiles must be run again without this option when files for all profiles are ready, so that results will be stored in the database, redundancy of results will be removed and output will be produced
-stop exit after storing results to the database, producing no output files. When all results are ready in the database, you should run selenoprofiles again with option -merge to remove redundancy and get output.
-merge force running the procedure to remove inter-profile redundancy of all results in the database, and proceed to output
-no_merge force skipping the procedure to remove inter-profile redundancy
## additional output options (see above for basics)
-five_prime_length + length in nucleotide used when outputing the five prime sequence (output option -output_five_prime must be active)
-three_prime_length + length in nucleotide used when outputing the three prime sequence (output option -output_three_prime must be active)
-outfolder + use this as output folder, instead of results_folder/species_name.target_file_name/output/
-output_ali active by default. A fasta alignment file is produced for each family comprising all filtered results. Turn off to speed up outputing
-output_FORMAT_file + redirect all output of the specified format (see above for available formats) to the argument file
-output_filter "+" argument is a python expression evaluating a p2g result, named x. Only the results for which the evaluation is true are output. e.g: -output_filter "x.chromosome=='chr1'" Note: if option -full_db is active, the filtered out results will also be considered unless specifically excluded by the output filter
-state + determines what kind of filtering label the results must have to be output. Possible values: kept, filtered, refiltered, redundant, overlapping. Multiple values can be provided comma-separated. Note: this option will work only if option -full_db was active when writing results to the database
-fasta_add "+" argument is a python expression manipulating a p2g result, named x. The expression is evaluated to a string which is added to the fasta header used for the result (for example in the fasta output, cds output, ali output etc). Useful to quickly add custom information to the output
## miscellaneous
-fam_list + file with list of profiles, one per line. This is an alternative way to provide the list of profiles that overrides option -p
-name + specify manually the target file name. Normally it is derived from the file name
-log + write a copy of the output normally printed to screen into a file provided as argument
-add + provide a file with python code that is executed before the pipeline flow. It can be used to customize selenoprofiles (see manual)
-clean remove intermediate files (blast, exonerate, genewise etc), to save disk space. Use only for non-parallelized run of selenoprofiles, or making sure that results are stored in the db before cleaning
-filtered_blast_file + write an output file with all blast hits passing the blast filter. Mostly for refining filters
-blast_filtering_warning active by default. When too many blast hits pass filtering for a profile, normally the program prints a warning and goes on with the first blast hits encountered. If you turn this option off, in these cases the program will crash instead
-debug if an error occurs, instead of crashing directly, it prints a preview of the error, then waits for keyboard input before exiting. Useful when debugging to inspect the files in the temporary folder (deleted when exiting)
-print_commands print to screen every bash command before running it. Extremely verbose
#### Families set ####
If you often run the same list of families, you may want to define them with a keyword. This is done by adding a line in the configuration file with the syntax:
families_set.NAME = fam1,fam2,fam3
You can then use such defined keywords (NAME in the example) as argument(s) of the -p option. Families set definition can include other sets names. Examples: families_set.first_set = fam1,fam2,fam3 families_set.extended_set = first_set,fam4,fam5
#### Actions ####
Actions are a fast way to customize the pipeline workflow for specific purposes. Each action is performed on all predictions active at a specific moment of the pipeline, depending on the action category. The possible categories of actions are post_blast_filter, post_blast, post_blast_merge, pre_choose, pre_filtering, post_filtering, pre_output.
Actions can be specified either in the command line or in the main configuration file, like other options.
The syntax for command line actions is: -ACTION.category.name "+", where name is a user-defined name for the action and the argument is a piece of python code that will be executed, in which you can refer to the active p2g prediction as x.
Actions can be used to modify/improve predictions (see for example the default actions in the main configuration file) or to add custom output to the log of selenoprofiles.
Example of command line action: (prints the sequence of results labelled as pseudogenes)
-ACTION.post_filtering.print_pseudo " if x.label=='pseudo': print x.output_id()+' '+ x.protein() "
See selenoprofiles manual for more information.
####### Profile attributes #######
These attributes are read from the .config file of each profile alignment, and can be manually added to such .config file, or chosen when running selenoprofiles_build_profile.py.
In the .config file, attributes are set with the syntax attribute_name = value
All attributes not specified for a certain profile are assumed to be defaults read from the main configuration file, there in the syntax attribute_name.DEFAULT = value
If you want to use the same value for many profiles, then you should set attributes with keywords. In this case you need to define an attribute keyword name in the main configuration file with the syntax attribute_name.KEYWORD_NAME = value ; then, you can use the keyword in the .config file of any profile using the syntax attribute_name = KEYWORD_NAME ; as an example, see the SELENO keyword for attributes blast_options, exonerate_options and genewise_options in the main configuration file. This particular keyword is used for selenoprotein families.
### Full list of attributes (all attributes require an argument)
.name profile name, normally derived from the profile alignment filename [compulsory]
.blast_filtering python expression evaluated to decide if a blast hit (named x) passes blast filtering; e.g. x.evalue < 1e-2
.p2g_filtering python expression evaluated to decide if a p2g hit (named x) passes filtering. This filter is applied after the prediction choice step; e.g. x.coverage() >= 0.5
.p2g_refiltering python expression evaluated to decide if a p2g hit (named x) passes refiltering. This filter is applied after the p2g_refiltering to allow two types of discarded hits, filtered and refiltered.
.max_blast_hits maximum number of blast hits allowed for this profile. See also option blast_filtering_warning.
.queries defines which sequences in the profile are elegible as queries for exonerate and genewise. There are several possible syntaxes:
all all sequences are queries (default)
[...] you provide directly the list of queries titles, or indices (starting from 0) in the ordered titles list
last_query:TITLE you provide the last title in the ordered titles list that will be labelled as query. This may be a incomplete title: it just have to be found in a title
best:0.XX you provide the proportion of the full list of queries in the ordered list that will be labelled
NOTE for selenoprotein families: in the first, third and fourth cases, those titles for which at least one U position in the alignment contains a gap in the sequence are not considered as queries, unless the you prefix the value of the queries attribute with "not_only_U:"
Examples of valid values for attribute queries: best:0.5 last_query:XP_854030 not_only_U:all not_only_U:best:0.8
.clustering_seqid before running psitblastn, the profile alignment is split into clusters, and a search is performed for each of them, to ensure adequate sensitivity. This is the maximum sequence identity within each cluster
.max_column_gaps within each cluster, a blast query is computed by building a consensus of sequences in the cluster. This sets the maximum proportion of gaps in a column of a blast cluster, above which the position will not be present in the blast query
.tags list of tags (strings in perl-like regexp syntax) used by the tag_score method, for tag blast filtering (see manual). e.g. ['deiodinase', '\\WDI\\d ']
.neutral_tags list of tags scored neutrally in the tag_score method. This allows skipping non-informative titles without penalty
.tag_db path to the fasta file used as database for the tag blast procedures, used when tag_score and go_score methods are invoked. Normally the same nr database can be used for all profiles but you may differentiate them to speed up or improve the process
.tag_blast_options command line options used when running tag blast (blastp agains tag_db) for this profile
.go_terms list of GO terms (as "GO:XXXXXXX" strings) used by the go_score method, for gene ontology/tag blast based filtering (see manual). e.g. ['GO:0004364']
.uniref2go_db path to the file mapping the GO terms to the proteins in the tag_db. Used for go_score
.blast_options/exonerate_options/genewise_options command line options used when running psitblastn/exonerate/genewise for this profile
####### Accessory programs #######
This pipeline comes with a few accessory programs:
selenoprofiles_build_profile.py to build a custom profile alignment, or inspect its features for filtering, such as AWSI distribution or associated GO terms
selenoprofiles_database.py to manipulate a selenoprofiles sqlite database, to clean it, or for fast output of stored results
selenoprofiles_join_alignments.py useful to collect an alignment for profiles searched in multiple targets, typically different species
selenoprofiles_tree_drawer.py for graphical output of the predicted genes along a species tree
"""
terminal_colors={'routine':'blue', 'database':'magenta', 'profile':'green'}
set_MMlib_var('colored_keywords', {'ERROR':'red,underscore', 'WARNING':'red'})
def load(config_filename='/users/rg/mmariotti/scripts/selenoprofiles_3.config', args={}):
"""Load all global variables later used, reading configuration file and command line options. """
### initialising command line options and initialising file / objects lists . opt is a dictionary the all the command_line options. (0: False 1:True in case of boolean values). The -config FILE option allows to specify a different configuration file. Every option in the configuration file is read, then eventually replaced by the correspondant command line option.
for i in range(len(sys.argv)):
if sys.argv[i] == "-config": config_filename=sys.argv[i+1]
command_line_synonyms={'B':'blast', 'E':'exonerate','G':'genewise','C':'choose', 'F':'filter', 'O':'output', 'P':'profile', 'S':'species', 's':'species', 'p':'profile', 'D':'database', 'no_splicing':'no_splice', 'N':'no_splice'}
non_config_options=['t', 'v', 'name', 'blast', 'exonerate', 'genewise', 'filter', 'choose', 'output', 'fam_list', 'print_commands', 'species', 'dont_exonerate', 'dont_genewise', 'debug', 'log', 'state', 'filtered_blast_file', 'output_filter', 'add', 'clean', 'fasta_add', 'outfolder', 'five_prime_length', 'database', 'no_blast', 'stop', 'no_merge', 'no_db', 'merge', 'test', 'no_splice', 'genetic_code', 'tblastn']
for keyword in allowed_output_formats: non_config_options.extend(['output_'+keyword+'_file', 'output_'+keyword])
# reading configuration file
def_opt= configuration_file(config_filename)
# complete list of global variables
global families_sets, keywords, opt, sleep_time, max_attempts_database, temp_folder, split_folder, bin_folder, profiles_folder, target_file, reference_genome_filename, three_prime_length, five_prime_length, profiles_names, profiles_hash, results_folder, target_name, target_species, target_results_folder, results_db_file, results_db, actions, blast_folder, exonerate_folder, genewise_folder, prediction_choice_folder, filtered_list_folder, output_folder, blast_nr_folder, target_file_index, chromosome_length_file, exonerate_extension, genewise_extension, genewise_tbs_extension, blast_options, exonerate_options, genewise_options, genewise_tbs_options, output_file_handlers, max_chars_per_column
# setting sets of families, defined by keywords starting with families_set. in the config file. e.g families_set.eukaryotic = sps,GPx,MsrA,DI,15-kDa,Fep15
families_sets={} # example: key_ machinery -> value_ ['sps', 'eEFsec', 'pstk', ... ]
if def_opt.has_key('families_set'):
for k in def_opt['families_set']:
families_sets[ k ] = [del_white(word) for word in def_opt['families_set'][k].split(',')]
# setting filtering defaults or keywords functions. here, we define dynamically functions in this environment, reading their text representation (python code) from the configuration file
for x in [x for x in def_opt if '.' in x]: del def_opt[x]
keywords={}; keywords_text={}
for category in profile_alignment.parameters:
keywords[category]={}
if not def_opt.has_key(category): raise notracebackException, "ERROR configuration file lacks default value for "+category+' (example: '+category+'.DEFAULT = value ) '
keywords_text[category]=def_opt[category]
for kword in def_opt[category]: # one is surely DEFAULT, then more can be defined.
function_text=def_opt[category][kword]
if 'filtering' in category: function_text='lambda x:'+function_text
elif 'options' in category or '_db' in category: function_text='"""'+function_text+'"""'
try: exec('keywords[category][kword]='+str(function_text))
except:
printerr("selenoprofiles ERROR can't assign function "+str(function_text)+' to keyword '+str(kword)+' in category '+str(category), 1)
raise
#preparing to read command line options
for i in non_config_options:
if not i in def_opt: def_opt[i]=0
## the "if args" construct is to allow running this "load" function from inside an external python script. Normally, options are read from command line
if args:
opt=options()
for k in def_opt:
if args.has_key(k): opt[k]=args[k]
else: opt[k]=def_opt[k]
else: opt=command_line(def_opt, help_msg, ['r','t'], synonyms=command_line_synonyms, tolerated_regexp=['ACTION.*']+[p+'.*' for p in profile_alignment.parameters], strict= notracebackException, advanced={'full':full_help} ); #### reading options from command line
# allowing to specify profile parameters on the command line
for x in opt:
if '.' in x:
category=x[:x.find('.')]
kword= x[x.find('.')+1:]
value=opt[x]
keywords_text[category][kword]=str(value)
function_text=value
if 'filtering' in category: function_text='lambda x:'+function_text
elif 'options' in category or '_db' in category: function_text='"""'+function_text+'"""'
try: exec('keywords[category][kword]='+str(function_text))
except:
printerr("selenoprofiles ERROR can't assign function "+str(function_text)+' to keyword '+str(kword)+' in category '+str(category), 1)
raise
set_MMlib_var('opt', opt);
sleep_time=opt['sleep_time']
max_attempts_database=opt['max_attempts_database']
if opt['log']:
if opt['log']==1: raise notracebackException, "selenoprofiles ERROR an log output must be provided with option -log"
global log_file; log_file=open(opt['log'], 'w'); set_MMlib_var('log_file', log_file)
# set options and parameters
temp_folder=Folder(random_folder(opt['temp'])); test_writeable_folder(temp_folder, 'Please provide a different folder with option -temp'); set_MMlib_var('temp_folder', temp_folder);
if opt['save_chromosomes']: split_folder=Folder(opt['temp'])
else: split_folder=temp_folder
test_writeable_folder(split_folder, 'split_folder'); set_MMlib_var('split_folder', split_folder)
bin_folder=Folder(opt['bin_folder']); set_MMlib_var('bin_folder', bin_folder)
profiles_folder=Folder(opt['profiles_folder']); check_directory_presence(profiles_folder, 'profiles_folder', notracebackException)
if opt['genetic_code']:
set_genetic_code(opt['genetic_code'])
opt['tblastn']=1
if not opt['dont_exonerate']:
exonerate_version=float(bash('exonerate --version')[1].split('\n')[0].split()[-1][:3])
if exonerate_version<2.4:
raise notracebackException, "ERROR exonerate version detected: {}\nAlternative genetic codes are bugged in exonerate versions <2.4!\nPlease download and install exonerate version 2.4.0 if you want to use -genetic_code".format(exonerate_version)
for k in keywords['blast_options']: keywords['blast_options'][k]+= ' -D '+str(opt['genetic_code'])
for k in keywords['exonerate_options']: keywords['exonerate_options'][k]+=' --geneticcode '+str(opt['genetic_code'])
for k in keywords['genewise_options']: keywords['genewise_options'][k]=keywords['genewise_options'][k].format(GENETIC_CODE=opt['genetic_code'])
else:
for k in keywords['genewise_options']: keywords['genewise_options'][k]=keywords['genewise_options'][k].format(GENETIC_CODE=1)
###############
### test routine
if opt['test']:
write('\n'); write('TEST', how=terminal_colors['routine']); write(' routine', 1)
write('Slave programs:', 1)
write(' blastall (ncbi blast) : ')
b=bash('blastall ')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! blastall is compulsory in the selenoprofiles pipeline, please install it!', 1)
write(' blastpgp (ncbi blast) : ')
b=bash('blastpgp --help')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! blastpgp is compulsory in the selenoprofiles pipeline, please install it!', 1)
write(' formatdb (ncbi blast) : ')
b=bash('formatdb --help')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! formatdb is compulsory in the selenoprofiles pipeline, please install it!', 1)
write(' exonerate (exonerate pkg) : ')
b=bash('exonerate ')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! you could still run using option -dont_exonerate' , 1)
write(' fastaindex (exonerate pkg) : ')
b=bash('fastaindex ')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! the fasta suite by exonerate is compulsory for the selenoprofiles pipeline, please install it! ' , 1)
write(' fastafetch (exonerate pkg) : ')
b=bash('fastafetch ')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! the fasta suite by exonerate is compulsory for the selenoprofiles pipeline, please install it! ' , 1)
write(' fastasubseq (exonerate pkg) : ')
b=bash('fastasubseq ')
if b[0]==256: write('found', 1)
else: write('NOT FOUND! the fasta suite by exonerate is compulsory for the selenoprofiles pipeline, please install it! ' , 1)
write(' genewise (Wise2 package) : ')
b=bash('genewise ')
if b[0]==16128: write('found', 1)
else: write('NOT FOUND! you could still run using option -dont_genewise' , 1)
write(' SECISearch3 (Seblastian) : ')
b=bash('Seblastian.py ')
if b[0]==2048: write('found', 1)
else: write('NOT FOUND! SECISearch is recommended to search for selenoproteins; you can ignore this if you want to search your custom profiles' , 1)
### ##
write('\nPython modules and miscellaneous:', 1)
write(' oboparser (module for GO) : ')
try:
import annotations.GO.Parsers.oboparser as oboparser
write('found', 1)
except:
write('NOT FOUND! you cannot use method go_score() for filtering. This precludes the use of the built-in selenoprotein profiles', 1)
sys.exc_clear()
write(' obofile (GO structure) : ')
if is_file(opt['GO_obo_file']): write('found', 1)
else: write('NOT FOUND! you cannot use method go_score() for filtering. This precludes the use of the built-in selenoprotein profiles', 1)
write(' uniref2go db (GO annotation) : ')
if is_file( keywords ['uniref2go_db']['DEFAULT'] ): write('found', 1)
else: write('NOT FOUND! You cannot use methods go_score(), unless you defined a different uniref2go_db attribute for that profile. This precludes the use of the built-in selenoprotein profiles', 1)
write(' uniref database (tag/GO blast) : ')
if is_file( keywords ['tag_db']['DEFAULT'] ): write('found', 1)
else: write('NOT FOUND! You cannot use methods tag_score() and go_score(), unless you defined a different tag_db attribute for that profile. This precludes the use of the built-in selenoprotein profiles', 1)
write(' Pylab (interactive graphs) : ')
try:
import Pylab
write('found', 1)
except:
write('NOT FOUND! you cannot use the graphical tools ( -d or -D ) of selenoprofiles_build_profile.py', 1)
sys.exc_clear()
write(' ete2 (interactive trees) : ')
try:
import ete2
write('found', 1)
except:
write('NOT FOUND! you cannot use the results viewer program called selenoprofiles_tree_drawer.py', 1)
sys.exc_clear()
sys.exit()
###############
##### setting target
target_file= opt['t']; check_file_presence(target_file, 'target file', notracebackException)
target_file=abspath(target_file)
reference_genome_filename= target_file; set_MMlib_var('reference_genome_filename', target_file) #to allow some [gene].fasta_sequence() calls without specifying the genome file. see [blasthit].place_selenocysteine() using [blasthit].cds()
three_prime_length=opt['three_prime_length']; set_MMlib_var('three_prime_length', three_prime_length)
five_prime_length=opt['five_prime_length']; set_MMlib_var('five_prime_length', five_prime_length)
if opt['output_three_prime'] and not three_prime_length: raise notracebackException, "ERROR output_three_prime is active but three_prime_length is 0 or not defined!"
if opt['output_five_prime'] and not five_prime_length: raise notracebackException, "ERROR output_five_prime is active but five_prime_length is 0 or not defined!"
# setting routine cascade
next_must_be_active=0
for o in ['blast', 'exonerate', 'genewise', 'choose', 'filter','database', 'output']:
if opt[o]: next_must_be_active=1
elif next_must_be_active: opt[o]=1
if opt['print_commands']: set_MMlib_var('print_commands', 1)
## determining families
if opt['fam_list']:
write('Reading families list from file: '+str(opt['fam_list']), 1)
check_file_presence(opt['fam_list'], 'families list file', notracebackException)
opt['profile']= join( [line.strip() for line in open(opt['fam_list'], 'r')], ',')
# here I change each set of families with the list of its members, iteratively
profile_files=opt['profile'].split(',')
profiles_names=[]; profiles_hash={}
index_profile=0
while any ([families_sets.has_key(p) for p in profile_files]):
this_profile=profile_files[index_profile]
while families_sets.has_key(this_profile):
profile_files[index_profile:index_profile+1]= families_sets[this_profile]
this_profile=profile_files[index_profile]
index_profile+=1
# removing redundancy in profiles names
profiles_files_length=len(profile_files)
for i in range(profiles_files_length-1, -1, -1):
profile_file=profile_files[i]
if profile_file in profile_files[:i]:
profile_files.pop(i)
# changing profile names with profile file
write('/'+'-'*119, 1)
for f in range(len(profile_files)):
this_profile=profile_files[f]
if this_profile == 'NONE': continue
this_profile_found=False
for ext in ['', '.fa', '.fasta', '.ali']:
for profiles_folder_attempt in ['', profiles_folder]:
file_attempt=profiles_folder_attempt+this_profile+ext
if not this_profile_found and is_file(file_attempt):
profile_files[f]= file_attempt
this_profile_found=True
if not this_profile_found: raise notracebackException, "selenoprofiles ERROR can't find profile: "+this_profile
else:
write( '|-'+'-'*abs(f%8-4)+' load profile '+profile_files[f], 1)
profile_ali=profile_alignment(profile_files[f])
profiles_names.append(profile_ali.name)
if profiles_hash.has_key(profile_ali.name): raise notracebackException, "selenoprofiles ERROR profile names must be unique! This was found twice: "+profile_ali.name
profiles_hash[profile_ali.name] = profile_ali
write('\\'+'-'*119, 1)
# filtering state
if not opt['state']: opt['state']='kept'
## determining target name and output folders
results_folder=Folder(opt['r']);
if not opt['outfolder']: test_writeable_folder(results_folder, 'selenoprofiles results folder')
if opt['name']: target_name=opt['name']
else:
target_name=base_filename(target_file)
if target_name.split('.')[-1] in ['fasta', 'fa']: target_name=join(target_name.split('.')[:-1], '.')
target_name=replace_chars(target_name, '.', '_')
#species_library=opt['species_library']; check_file_presence(species_library, 'species_library'); set_MMlib_var('species_library', species_library) ## this will be used for getting species name
## database for output, linking to species
target_species=None; set_species_from_previous_run=False
if not target_name=='genome' and not opt['species']:
### no species information provided!
# checking if there's a results subfolder with results on the same target, so we can use the same species
service('no species provided. searching existing folders for a previous run on the same target... ')
target_links_found=bbash('find '+results_folder+' -maxdepth 2 -mindepth 2 -name link_target.fa -type l ').split('\n')
if target_links_found==['']: target_links_found=[]
for possible_target_link in target_links_found:
try:
possible_target_file= dereference(possible_target_link)
if possible_target_file == target_file:
masked_species_name = possible_target_link.split('/')[-2].split('.')[0]
#taxid, library_species_name= get_species_from_library( unmask_characters( replace(masked_species_name, '_', ' ') ) )
#target_species=species( library_species_name ); target_species.taxid = taxid
target_species=unmask_species( masked_species_name )
write('Previous run on the same target found. Species name: '+str(target_species), 1)
set_species_from_previous_run=True
break
except: raise
if not set_species_from_previous_run:
## if we didn't find it, we set species to unidentified, printing a warning
target_species='unidentified'
printerr('WARNING no species name provided! Using: "unidentified"', 1)
else:
### something provided; either the target is called genome, so we can derive the species name from the folder, or the option -species (-s) was provided. setting variable species_name, which may be converted to ncbi syntax
if target_name=='genome' and not opt['species']: target_species=abspath(target_file).split('/')[-2]
elif opt['species']: target_species=opt['species']
else: target_species='unidentified'
if '_' in target_species or '{ch' in target_species: target_species=unmask_species(target_species)
# try:
# taxid, library_species_name= get_species_from_library( species_name )
# if not library_species_name in [species_name, unmask_characters(species_name), unmask_characters(replace_chars(species_name, '_', ' '))]:
# printerr('WARNING species name '+species_name+' changed to ncbi-defined scientific name: '+library_species_name, 1)
# target_species=species( library_species_name )
# target_species.taxid=taxid
# except Exception:
# raise notracebackException, 'ERROR species not recognized! Only ncbi taxonomy names are accepted'
## if species name is unidentified, we make a uniq target name to be sure not to overwrite results
if target_species=='unidentified': target_name=uniq_id_for_file(target_file)
target_results_folder=Folder(results_folder + mask_species(target_species) + '.'+target_name) #if you change this, change above as well, in the end of mv command
results_db_file=target_results_folder+'results.sqlite'
if not opt['no_db']:
results_db=selenoprofiles_db(results_db_file)
if not results_db.has_table('results'):
results_db.initialise_db() #creating database if necessary
write('DATABASE', how=terminal_colors['database']); write(' initialising: '+results_db_file, 1)
results_db.update_to_last_version() # attempt updating database from last versions of selenoprofiles if necessary
link_target_file=target_results_folder+'link_target.fa'
if not is_file(link_target_file):
write('Creating link: '+link_target_file+' --> '+target_file, 1)
bash('rm '+link_target_file) # if we entered here because link is broken, cleaning up and relinking
bbash('ln -fs '+target_file+' '+link_target_file)
# preparing actions hash. keys of actions are categories (pre_filtering, post_filtering), and their values are hashes. Keys of these hashes are ids (can be anything) which are evaluated in alphabetical order (the sorted() function is used). Values corresponding to such keys are strings which are executed in the python environment, denoting the chosen_hit object with "x"
actions={} ; action_categories=['pre_blast_filter', 'post_blast', 'post_blast_merge', 'pre_choose', 'pre_filtering', 'post_filtering', 'pre_output']
if not def_opt.has_key('ACTION'): printerr('WARNING no action is specified in the config file', 1)
else: actions=def_opt['ACTION']
for category in action_categories:
if not actions.has_key(category): actions[category]={}
#expanding with action defined in command line
for k in opt:
if k.startswith('ACTION.'):
try:
category= k.split('.')[1]
if not category in action_categories: raise Exception
action_id= k.split('.')[2]
except: raise Exception, "selenoprofiles ERROR actions must have the form -ACTION.category.id \"action code\" (command line) or ACTION.category.id = \"action code\" (configuration file), where category must be one of "+join(action_categories, ', ')+" and id is any non-null string."
actions[ category ] [ action_id ]= opt[k]
# setting folders depending on keep_ options
blast_folder=Folder(target_results_folder+'blast')
exonerate_folder=Folder(target_results_folder+'exonerate')
genewise_folder=Folder(target_results_folder+'genewise')
prediction_choice_folder=Folder(target_results_folder+'prediction_choice')
filtered_list_folder=Folder(target_results_folder+'filtering')
output_folder=Folder(target_results_folder+'output')
if opt['outfolder']: output_folder=Folder(opt['outfolder']); test_writeable_folder(output_folder, 'outfolder');
blast_nr_folder=Folder(target_results_folder+'tag_blast')
## indexing and formatting if necessary
# fastaindex
target_file_index = join( target_file.split('.')[:-1],'.' )+'.index'; index_in_progress_file=join( target_file.split('.')[:-1],'.' )+'.index_in_progress'
if not is_file(target_file_index) and not is_file(index_in_progress_file):
write_to_file('look in '+temp_folder, index_in_progress_file) ## creating file to indicate we're computing the index
write('Indexing '+target_file+' with fastaindex ... ', 1)
try:
target_file_index = fastaindex(target_file, silent=True)
bbash('rm '+index_in_progress_file)
except:
bash('rm '+index_in_progress_file)
raise
elif is_file(index_in_progress_file):
index_waited=0
while is_file(index_in_progress_file):
write('Another instance of selenoprofiles is computing the index of '+target_file+' ; waiting ...', 1)
index_waited+=1
time.sleep(sleep_time)
service('waited: '+str(sleep_time*index_waited)+' seconds ')
if not is_file(target_file_index): raise notracebackException, "ERROR after waiting, the index was not found : "+target_file_index
# formatdb
formatdb_in_progress_file= join( target_file.split('.')[:-1],'.' )+'.formatdb_in_progress'
if bash('ls '+target_file+'.*nin')[0]:
write_to_file('look in '+temp_folder, formatdb_in_progress_file) ## creating file to indicate we're computing with formatdb
write('Formatting '+target_file+' with formatdb ... ', 1)
try:
formatdb(target_file, is_protein=False, silent=True)
bbash('rm '+formatdb_in_progress_file)
except:
bash('rm '+formatdb_in_progress_file)
raise
elif is_file(formatdb_in_progress_file):
index_waited=0
while is_file(formatdb_in_progress_file):
write('Another instance of selenoprofiles is formatting '+target_file+' ; waiting ...', 1)
index_waited+=1
time.sleep(sleep_time)
service('waited: '+str(sleep_time*index_waited)+' seconds ')
if bash('ls '+target_file+'.*nin')[0]: raise notracebackException, "ERROR after waiting, the file produced by formatdb were not found : "+target_file+'.*nin'
#chromosome lenghts
chromosome_length_file=join( target_file.split('.')[:-1],'.' )+'.lengths'; chromosome_length_in_progress_file= join( target_file.split('.')[:-1],'.' )+'.lengths_in_progress'
if not is_file(chromosome_length_file) and not is_file(chromosome_length_in_progress_file):
write_to_file('look in '+temp_folder, chromosome_length_in_progress_file) ## creating file to indicate we're computing the index
write('Computing length of all sequences in '+target_file+' --> '+chromosome_length_file, 1)
try:
bbash('fastalength '+target_file+ ' > '+temp_folder+'chrom_lengths && mv '+temp_folder+'chrom_lengths '+chromosome_length_file)
bbash('rm '+chromosome_length_in_progress_file)
except:
bbash('rm '+chromosome_length_in_progress_file)
raise
elif is_file(chromosome_length_in_progress_file):
index_waited=0
while is_file(chromosome_length_in_progress_file):
write('Another instance of selenoprofiles is computing the length of the sequences in '+target_file+' ; waiting ...', 1)
index_waited+=1
time.sleep(sleep_time)
service('waited: '+str(sleep_time*index_waited)+' seconds ')
if not is_file(chromosome_length_file): raise notracebackException, "ERROR after waiting, the sequence lengths file was not found : "+chromosome_length_file
# handling no_splice option
if opt['no_splice']: opt['dont_genewise']=1
# determining parameter options of programs: blast, exonerate, genewise. These are the default options, but they can be overriden by the profile options.
exonerate_extension=opt['exonerate_extension']
genewise_extension=opt['genewise_extension']
genewise_tbs_extension=opt['genewise_tbs_extension']
blast_options={}; blast_options_string_split=str(opt['blast_opt']).split()
while blast_options_string_split:
try:
option_name = blast_options_string_split.pop(0)[1:]; value = blast_options_string_split.pop(0) #######
if value[0] in '\'"':
done=0
while len(value)==1 or value[-1] != value[0] :
value+=blast_options_string_split.pop(0)+' '
blast_options[option_name]=value
except: raise notracebackException, 'selenoprofiles ERROR parsing blast options: '+blast_options_string_split
exonerate_options={}; exonerate_options_string_split=str(opt['exonerate_opt']).split()
while exonerate_options_string_split:
if len(exonerate_options_string_split)==1: raise notracebackException, "selenoprofiles exonerate_opt ERROR the words number is not even... can't find a value for option: "+str(exonerate_options_string_split[0])
option_name = exonerate_options_string_split.pop(0)[1:]; value = exonerate_options_string_split.pop(0)
exonerate_options[option_name]=value
genewise_options={}; genewise_options_string_split=str(opt['genewise_opt']).split()
while genewise_options_string_split:
if len(genewise_options_string_split)==1: raise notracebackException, "selenoprofiles genewise_opt ERROR the words number is not even... can't find a value for option: "+str(genewise_options_string_split[0])
option_name = genewise_options_string_split.pop(0)[1:]; value = genewise_options_string_split.pop(0)
genewise_options[option_name]=value
genewise_tbs_options=genewise_options.copy()
# preparing filehandlers for output files like fasta, gff, gtf.. if any has been specified in command_line ( e.g. -output_fasta_file any_file.fa )
output_file_handlers={} # will host the filehandlers to write in specified output files, if any
for keyword in allowed_output_formats:
if opt['output_'+keyword+'_file']:
output_file_handlers[keyword]= open( opt['output_'+keyword+'_file'], 'w')
try: output_file_handlers[keyword]= open( opt['output_'+keyword+'_file'], 'w')
except:
printerr('ERROR can\t open option -'+'output_'+keyword+'_file'+' file for writing: '+opt['output_'+keyword+'_file'])
raise
# for pretty printing
max_chars_per_column={'id':18+4, 'chromosome':10, 'strand':1}
## computing summary of current options
summary=''
summary+=' Options '.center(120, '#')+'\n'
#summary+='command line options: '
output_options= []
for k in sorted(opt.keys()):
#if def_opt.has_key(k) and opt[k]!=def_opt[k]: summary+='-'+str(k)+' '+str(opt[k])+' '
if k.startswith('output_') and opt[k]: output_options.append(k)
#summary+='\n'
summary+='| output folder: '+results_folder+'\n'
summary+='| target file: '+target_file+'\n'
summary+='| target species: '+str(target_species)+'\n' # (taxid:'+str(target_species.taxid)+')\n'
summary+='| profiles list: '+join(profiles_names, ' ')+'\n'
summary+='| configuration file: '+config_filename+'\n'
summary+='| temporary folder: '+temp_folder+'\n'
# program options/ filtering / db
summary+='|\n########## Filtering procedures, program options, profile attributes defined:\n'
for category in sorted(keywords_text.keys()):
summary+="| "+category.ljust(19)
for k_index, keyword in enumerate(keywords_text[category]):
if k_index>0: summary+="\n| "+' '*19
summary+='| '+keyword+': '+str(keywords_text[category][keyword]).ljust(25)+' '
summary+='\n'
# actions
summary+='|\n########## Active actions:\n'
for category in actions:
for action_id in actions[category]:
summary+=('| '+category+'.'+str(action_id)).ljust(19)+' = '+actions[category][action_id]+'\n'
if not actions: summary+='| None\n'
other_options=''
for k in opt:
if not k in {'__synonyms__':1, 't':1, 'r':1, 'species':1, 'temp':1, 'config':1, 'profile':1} and not k.startswith('ACTION') and (not k.startswith('output_') or k.endswith('_file')) and (k!='state' or opt[k]!='kept') and (not k in def_opt or opt[k]!=def_opt[k]):
other_options+='\n| '+k+': '+str(opt[k])+''
if other_options: summary+='|\n########## Routines and other non-default options:'+other_options+'\n'
summary+='|\n########## Output options: '
if output_options:
for o in output_options: summary+=o.split('output_')[1]+' '
else: summary+='None'
summary+='\n'+'#'*120
return summary
def mask_species(species_name): return replace( mask_characters ( species_name ), ' ', '_')
def unmask_species(species_name): return unmask_characters ( replace( species_name , '_', ' ') )
def load_chromosome_lengths(chromosome_length_file, max_chars=0):
"""Utility to load chromosome lenghts from a fastalength output file and also set it as a MMlib variable; also performing controls on the file """
global chromosome_lengths; chromosome_lengths={}
for line in open(chromosome_length_file, 'r'):
fasta_identifier = line.split()[1]
length=int(line.split()[0])
if chromosome_lengths.has_key(fasta_identifier):
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR the target file has a duplicate fasta identifier! ("+line.split()[1]+') Please modify it and rerun. Note: remove the .index and *.fa.n* blast formatting files after changing the target file'
if length==0:
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR the target file has a length zero entry! ("+line.split()[1]+') Please modify it and rerun. Note: remove the .index and *.fa.n* blast formatting files after changing the target file'
if is_number(fasta_identifier) and fasta_identifier[0]=='0':
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR the target file has a numeric fasta identifier starting with zero! ("+line.split()[1]+') This would cause an unexpected blast behavior. Please modify this or these ids and rerun. Note: remove the .index and *.fa.n* blast formatting files after changing the target file'
if ':subseq(' in fasta_identifier:
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR with fasta header: "+fasta_identifier+' ; this was generated by fastasubseq and will cause unexpected behavior of this program, since it is using fastasubseq itself to cut sequences. Please clean the titles in your target file from ":subseq(" tags. Note: remove the .index and *.fa.n* blast formatting files after changing the target file '
if max_chars and len(fasta_identifier)>max_chars:
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR with fasta header: "+fasta_identifier+' is too long. The maximum length for a fasta identifier (first word of the title) is '+str(max_chars)+' characters. Please clean the titles in your target file. Note: remove the .index and *.fa.n* blast formatting files after changing the target file'
if '/' in fasta_identifier:
bash('rm '+chromosome_length_file)
raise notracebackException, "ERROR with fasta header: "+fasta_identifier+' has forbidden character: "/" \nPlease clean the titles in your target file. Note: remove the .index and *.fa.n* blast formatting files after changing the target file'
chromosome_lengths[fasta_identifier]=length
set_MMlib_var('chromosome_lengths', chromosome_lengths)
############################################################################################
################################################3
### MAIN PROGRAM START
def main():
write('', 1);
write('| '); write('Selenoprofiles v'+str(__version__), how='reverse'); write(' Host: '+bbash('echo $HOSTNAME')+' Date: '+bbash('date'), 1)
options_summary= load() #loading variables and getting a nice summary to show
write('\n'+options_summary+'\n', 1)
if opt['add']: # with -add options, the user can add code to be executed right here. This can include the definition of function that will be used in the customized steps
added_file=opt['add']
check_file_presence(added_file, 'added code file')
all_lines_of_added_file=open(added_file, 'r').readlines()
try: exec(join(all_lines_of_added_file, ''))
except: printerr('add ERROR importing code from file : '+ added_file); raise
global max_chars_per_column; global blast_nr_folder_profile_subfolder
# computing length of all chromosomes in the target, or just loading it from chromosome_length_file
write('Loading length of all chromosomes from '+chromosome_length_file, 1)
load_chromosome_lengths(chromosome_length_file)
for k in chromosome_lengths:
if len(k)> max_chars_per_column['chromosome']: max_chars_per_column['chromosome']=len(k)
all_results_are_loaded_from_db = True
chromosomes_with_results={} #filled after filtering, when writing in database
skipped_profiles={} # taken off the profiles_names list
for profile_index, family in enumerate(profiles_names):
try:
# load profile
profile_ali=profiles_hash[family]
if len(family)+18>max_chars_per_column['id']: max_chars_per_column['id']=len(family)+18
write('\n'+profile_ali.summary(), 1, how=terminal_colors['profile'])
### important data structures to keep links to all loaded data:
blast_hits_hash={} #original_blast_id : -> (super)blasthit_object
exonerate_hits_hash={} #original_blast_id : -> exonerate_object
genewise_hits_hash={} #original_blast_id : -> genewise_object
blast_hits_of_empty_exonerates_hash={} #index collection
considered_indexes=[] #index collection
chosen_predictions={} #original_blast_it: -> p2g_hit object chosen (blast, exonerate or genewise)
if profile_ali.nseq()>1:
if not profile_ali.has_title('BLAST_QUERY_MASTER'): # it could have it already if the profile has been built now
profile_ali.add( 'BLAST_QUERY_MASTER', profile_ali.blast_queries().seq_of('BLAST_QUERY_MASTER') )
clusters_relative_positions = profile_ali.clusters_relative_positions() # to convert quickly positions on any cluster query to the corresponding position on the blast master query, we precompute all of them and put them into a hash.
blast_folder_profile_subfolder=Folder(blast_folder+family)
#check if this target has already been scanned for this profile.
if not opt['no_db']: db_has_results_for_this_profile= results_db.has_results_for_profile(family)
if not opt['no_db'] and db_has_results_for_this_profile in ['ONGOING', 'UNCHECKED', 'WAITING']:
raise notracebackException, "ERROR the search for profile: "+family+' is already being computed by another instance of selenoprofiles! If this is not true, use the script selenoprofiles_database.py to clean the file results.sqlite'
elif opt['no_db'] or db_has_results_for_this_profile=='NO' or any( [opt[o] for o in ['blast', 'exonerate', 'genewise', 'choose', 'filter', 'database']] ):
all_results_are_loaded_from_db= False
if not opt['no_db']:
results_db.set_has_results_for_profile(family, 'ONGOING')
results_db.save()
#########
## BLAST
blast_parsers=[]
write('\n'); write('BLAST:', how=terminal_colors['routine']); write(' psitblastn of profile '+family+' --> '+blast_folder_profile_subfolder+' ', 1)
#### change for tblastn
for cluster_index in range( profile_ali.n_clusters() ): #index is 0 based
blast_outfile= blast_folder_profile_subfolder+family+'.psitblastn.'+str(cluster_index+1)
cluster_profile_ali= profile_ali.clusters()[cluster_index]
## debug
#cluster_profile_ali.display(temp_folder+'cluster_profile_ali.fa')
#profile_ali.display(temp_folder+'profile_ali.fa')
#raw_input(temp_folder+'profile_ali.fa '+temp_folder+'cluster_profile_ali.fa')
if profile_ali.nseq()==1:
write( ('cluster'+str(cluster_index+1)).ljust(15)+(' (1 seq) --tblastn--').ljust(40) )
if opt['blast'] or not is_file(blast_outfile) or not is_valid_blast_output(blast_outfile):
write('R> '+blast_outfile, 1)
blast_parsers.append( tblastn(profile_ali, target_file, outfile=blast_outfile, blast_options=blast_options) )
else:
write('L> '+blast_outfile, 1)
blast_parsers.append( parse_blast(blast_outfile) )
else:
write( ('cluster'+str(cluster_index+1)).ljust(15)+(' ('+str(cluster_profile_ali.nseq()-1)+' seq'+ 's'*int(cluster_profile_ali.nseq()>2) +')').ljust(24) )
if opt['blast'] or not is_file(blast_outfile) or not is_valid_blast_output(blast_outfile):
write(' R> '+blast_outfile, 1)
if not opt['tblastn']:
blast_parsers.append( psitblastn(cluster_profile_ali, target_file, outfile=blast_outfile, blast_options=blast_options) )
else:
blast_parsers.append( multi_tblastn(cluster_profile_ali, target_file, outfile=blast_outfile, blast_options=blast_options) )
else:
write(' L> '+blast_outfile, 1)
blast_parsers.append( parse_blast(blast_outfile) )
if opt['filtered_blast_file']: filtered_blast_file_h=open(opt['filtered_blast_file'], 'w')
## filtering blast hits depending on the options on the profile
blast_hits=[] ; blast_hit_index=1
for cluster_id, blast_hits_parser in enumerate(blast_parsers):
cluster_id+=1 #to have it 1-based
blast_queries_alignment= alignment() #alignment of blast_query_master and of the query for this cluster. Used to speed up the process of blast_h.set_query_to_master (see below)
blast_queries_alignment.add('q', profile_ali.blast_queries().seq_of('BLAST_QUERY_'+str(cluster_id)))
blast_queries_alignment.add('BLAST_QUERY_MASTER', profile_ali.blast_queries().seq_of('BLAST_QUERY_MASTER'))
blast_queries_alignment.remove_useless_gaps()
for blast_h in blast_hits_parser:
if not chromosome_lengths.has_key(blast_h.chromosome) and is_number(blast_h.chromosome.split('_')[0]) and base_filename(target_file) in blast_h.chromosome:
raise notracebackException, "selenoprofiles ERROR the chromosome of the blast hit is not recognized. This is a known bug of blast which happens when the fasta titles in the target have \"|\" characters, but are not gis codes from ncbi. Please, reformat your target file (taking care also that all chromosome names are unique) and rerun selenoprofiles."
if blast_hit_index> profile_ali.max_blast_hits_number_value():
##too many blast hits. now a little trick to avoid a painful error message that gawk will print to screen since we wouldn't get to the end of the blaster_parser
done_end_of_parser=False
while not done_end_of_parser:
try: blast_hits_parser.next()
except StopIteration: done_end_of_parser=True
if opt['blast_filtering_warning']:
printerr("profile "+family+" WARNING too many blast hits! you should set a more strict blast filtering for this family. Now keep running using only with the first "+str(profile_ali.max_blast_hits_number_value())+' blast hits', 1)
break
else:
raise skipprofileException, 'profile '+family+' ERROR too many blast hits! you should change the blast filtering for this family or set a higher value for parameter max_blast_hits in its profile configuration file. See blast filtering chapter in the manual for details. To keep running with the first max_blast_hits blast hits, use option -blast_filtering_warning'
blast_h.target=target_file; blast_h.profile=profile_ali; blast_h.species=target_species; blast_h.cluster_id=cluster_id
if profile_ali.nseq()>1: blast_h.set_query_to_master(clusters_relative_positions, blast_queries_alignment)
### CHANGED; now called later unless necessary (sec_is_aligned); it was: blast_h.place_selenocysteine() ## modifying in place the blast hit to have Us for selenocysteine, also in the target. After this, there are U both in query and target (here only in UGAs aligned to U in query)
x=blast_h
for action_id in sorted(actions['pre_blast_filter'].keys()):
try: exec(actions['pre_blast_filter'][action_id]) ### running action
except:
printerr('selenoprofiles ERROR trying to run pre_blast_filter action: '+str(actions['pre_blast_filter'][action_id]) +' on result:\n '+str(x), 1)
raise
#write('CTL '+blast_h.chromosome+' '+blast_h.positions_summary()+' '+str(round(blast_h.weighted_seq_identity_with_profile(), 4)), 1)
if profile_ali.blast_filtering_eval()(blast_h): #filtering blast hits
blast_h.place_selenocysteine()
blast_h.id = str(blast_hit_index); blast_h.query.id = str(blast_hit_index)+'_query'
blast_hits.append(blast_h)
blast_hit_index+=1
if opt['filtered_blast_file']: print >> filtered_blast_file_h, 'BLASTID:'+blast_h.id+'\n', blast_h
x=blast_h
for action_id in sorted(actions['post_blast'].keys()):
#write('id: '+str(index_id)+' checking pre action '+str(action_id)+' : '+actions['pre_filtering'][action_id], 1)
try: exec(actions['post_blast'][action_id]) ### running action
except:
printerr('selenoprofiles ERROR trying to run post_blast action: '+str(actions['post_blast'][action_id]) +' on result with index '+str(blast_hit_index), 1)
raise
## sometimes blast gives an output changing the chromosome names to titles like "1_genome.fa" , where 1 is the index in the file where the fasta title appears, and genome.fa is the target name. This happens presumably because of unwanted characters in the fasta titles. user should remove them by himself
if opt['filtered_blast_file']: filtered_blast_file_h.close()
if profile_ali.n_clusters()>1:
write("-- Removing blast hits overlapping from different clusters. before: "+str(len(blast_hits))+ " ; after: ", )
blast_hits= remove_overlapping_genes(blast_hits, cmp_fn=choose_among_overlapping_p2gs_intrafamily, phase=True)
write(str(len(blast_hits)), 1)
if not opt['no_splice']:
# merging by colinearity blast hits, meaning: if A is downstream of B, and also the query of A is downstream of the query of B, they're merged.
write("-- Merging by colinearity blast hits. before: "+str(len(blast_hits))+ " ; after: ", )
superblast_hits = merge_p2g_hits_by_colinearity( blast_hits, post_function=merge_p2g_hits_by_colinearity_post_function_blast_hits, sequence_collection=profile_ali )
write(str(len(superblast_hits)), 1)
del blast_hits
else:
superblast_hits = blast_hits
write("-- Number of filtered blast hits: "+str(len(blast_hits)), 1)
# superblasthits is actually a collection of both blasthits and superblasthits. You can use any blasthit method nonetheless
for superblast_hit in superblast_hits:
blast_hits_hash[superblast_hit.id] = superblast_hit
considered_indexes.append(int(superblast_hit.id))
x=superblast_hit
for action_id in sorted(actions['post_blast_merge'].keys()):
#write('id: '+str(index_id)+' checking pre action '+str(action_id)+' : '+actions['pre_filtering'][action_id], 1)
try: exec(actions['post_blast_merge'][action_id]) ### running action
except:
printerr('selenoprofiles ERROR trying to run post_blast_merge action: '+str(actions['post_blast_merge'][action_id]) +' on result with index '+str(x.id), 1)
raise
del superblast_hits #so I am deleting from memory those which were merged, but not the other ones cause they will be linked in blast_hits_hash
considered_indexes.sort()
##############
## EXONERATE
if opt['dont_exonerate']:
for hit_index in considered_indexes: blast_hits_of_empty_exonerates_hash[hit_index]=1
else:
exonerate_folder_profile_subfolder=Folder(exonerate_folder+family)
write('\n'); write('EXONERATE:', how=terminal_colors['routine']) ; write(' exonerate for profile '+family+' --> '+exonerate_folder+' # R> stands for run, L> for load', 1)
exonerate_mode={False:'p2g', True:'p2d'}[opt['no_splice']]
for hit_index in considered_indexes:
exonerate_outfile=exonerate_folder_profile_subfolder+family+'.'+str(hit_index)+'.exonerate'
superblast_hit= blast_hits_hash[str(hit_index)]
if opt['exonerate'] or not is_file(exonerate_outfile): #deciding if run exonerate or not
### running cyclic_exonerate, obtaining an (super)exonerate object. It can be empty (in this case its boolean evaluation will be False, and it will have a "error_message" attribute).
# the id of the exonerate hit is set to the id of the original blas hit here below.
# to output, a short description is printed after the output file, like this:
#sps.1.exonerate -> on:scaffold_6 strand:+ positions:1818342-1819187,1819199-18192912
exonerate_hit = cyclic_exonerate(profile_ali, target_file, outfile=exonerate_outfile, seed=superblast_hit, extension=exonerate_extension, exonerate_options=exonerate_options, mode=exonerate_mode, merge_multiple=not opt['no_splice'])
run_or_load_code='R'
else:
#loading file
exonerate_hit = superexoneratehit(); error_message=exonerate_hit.load(exonerate_outfile, seed=superblast_hit, merge_multiple=not opt['no_splice'])
if not exonerate_hit: exonerate_hit.error_message= error_message
run_or_load_code='L'
exonerate_hit.id=superblast_hit.id; exonerate_hit.query.id= exonerate_hit.id+'_query' #setting id property of exonerate object to its hit_index
exonerate_hit.profile=profile_ali; exonerate_hit.target= target_file; exonerate_hit.species=target_species
if exonerate_hit: short_description= exonerate_hit.header(no_species=True, no_id=True, compress=True)
else: short_description=exonerate_hit.error_message
write(description_format(exonerate_outfile.split('/')[-1]+' '+run_or_load_code+'> '+short_description), 1 )
exonerate_hits_hash[superblast_hit.id] = exonerate_hit
if not exonerate_hit:
#insert exhaustive exonerate code here ############
blast_hits_of_empty_exonerates_hash[hit_index]=1
if not considered_indexes: write(' -- no blast hit passed filtering --', 1)
else:
duplicated_exonerate_hits=[] #this will contain the list of exonerate identical or overlapping to some other exonerate hit
remove_overlapping_genes(exonerate_hits_hash.values(), scoring=len, phase=False, out_removed_genes=duplicated_exonerate_hits) #dropping output.. we're interested in the removed ones
duplicated_exonerate_hits_ids=sorted([int(e.id) for e in duplicated_exonerate_hits])
write('-- Removing duplicated exonerate hit'+'s'*int(len(duplicated_exonerate_hits_ids)>1)+' with id'+'s'*int(len(duplicated_exonerate_hits_ids)>1)+' : ')
if duplicated_exonerate_hits_ids:
write(join([str(i) for i in duplicated_exonerate_hits_ids], ', '), 1)
remove_items_from_list(considered_indexes, duplicated_exonerate_hits_ids, inplace=True) #removing duplicates from considered_indexes
else: write('(None)', 1)
###################
## GENEWISE
if opt['dont_genewise']: pass
else:
genewise_folder_profile_subfolder = Folder(genewise_folder+family)
write( '\n'); write('GENEWISE:', how=terminal_colors['routine']); write(' genewise for profile '+family+' --> '+genewise_folder+' # R> stands for run, L> for load', 1)
if not considered_indexes: write(' -- no blast hit passed filtering --', 1)
for i_i, hit_index in enumerate(considered_indexes): #i_i is the index of the index... it is not used.
if not hit_index in blast_hits_of_empty_exonerates_hash: #it means we have a non-empty exonerate output for this hit_index
genewise_outfile=genewise_folder_profile_subfolder+family+'.'+str(hit_index)+'.genewise' ; current_outfile=genewise_outfile
exonerate_seed_hit= exonerate_hits_hash[str(hit_index)]
if opt['genewise'] or not is_file(genewise_outfile): #### run genewise!
genewise_hit = genewise(profile_ali, target_file, outfile=genewise_outfile, seed=exonerate_seed_hit, extension=genewise_extension, genewise_options=genewise_options)
run_or_load_code='R'
else: # or load genewise file
genewise_hit = genewisehit(genewise_outfile)
run_or_load_code='L'
genewise_hit.id = str(hit_index); genewise_hit.query.id = genewise_hit.id+'_query' #setting id property of genewise object to its hit_index
genewise_hits_hash[genewise_hit.id] = genewise_hit
genewise_hit.profile=profile_ali
genewise_hit.species=target_species
genewise_hit.target=target_file
if genewise_hit: short_description= genewise_hit.header(no_species=True, no_id=True, compress=True)
else: short_description=genewise_hit.error_message
else: # genewise to be sure. the correspondent exonerate output was empty
if opt['genewise_to_be_sure']:
genewise_tbs_outfile=genewise_folder_profile_subfolder+family+'.'+str(hit_index)+'.genewise_tbs' ; current_outfile=genewise_tbs_outfile
blast_seed_hit=blast_hits_hash[str(hit_index)]
if opt['genewise'] or not is_file(genewise_tbs_outfile): #### run genewise tbs!
genewise_hit = genewise(profile_ali, target_file, outfile=genewise_tbs_outfile, seed=blast_seed_hit, extension=genewise_tbs_extension, genewise_options=genewise_tbs_options)
run_or_load_code='R'
else: #or load genewise tbs file
genewise_hit = genewisehit(genewise_tbs_outfile)
run_or_load_code='L'
genewise_hit.id = blast_seed_hit.id; genewise_hit.query.id = genewise_hit.id+'_query' #setting id property of genewise object to its hit_index
genewise_hits_hash[genewise_hit.id] = genewise_hit
genewise_hit.profile=profile_ali
genewise_hit.species=target_species
genewise_hit.target=target_file
if genewise_hit: short_description= genewise_hit.header(no_species=True, no_id=True, compress=True)
else: short_description=genewise_hit.error_message
#both in tbs and in normal routine: print summary
if opt['genewise_to_be_sure'] or not hit_index in blast_hits_of_empty_exonerates_hash:
genewise_hit.check_alignment ()
write(description_format(current_outfile.split('/')[-1]+' '+run_or_load_code+'> '+short_description), 1)
write('', 1)
###############
## choosing prediction and labelling
chosen_predictions_reasons_why={}
chosen_predictions_outfile=prediction_choice_folder+family+'.tab'
#running pre_choose actions. This must be here, and not inside the next if, to ensure they are executed even if the prediction choose file has already been produced
for index_id in considered_indexes:
exonerate_hit=False; genewise_hit=False; blast_hit=False
blast_hit= blast_hits_hash[str(index_id)]
if exonerate_hits_hash.has_key(str(index_id)): exonerate_hit= exonerate_hits_hash[str(index_id)]
if genewise_hits_hash.has_key(str(index_id)): genewise_hit= genewise_hits_hash[str(index_id)]
candidates=[blast_hit]
if exonerate_hit: candidates.append(exonerate_hit)
if genewise_hit: candidates.append(genewise_hit)
for action_id in sorted(actions['pre_choose'].keys()):
for x in candidates:
try: exec(actions['pre_choose'][action_id]) ### running action
except:
printerr('selenoprofiles ERROR trying to run pre_choose action: '+str(actions['pre_choose'][action_id]) +' on result with index '+str(index_id)+' and prediction program '+x.prediction_program(), 1)
raise
write('CHOOSE:', how=terminal_colors['routine']); write(' choosing among available predictions, assigning label --> '+chosen_predictions_outfile+' ')
indexes_to_remove={} ## keep track of the predictions we want to drop immediately after the choose step. uniq example on creation date: when the only available prediction is from blast, and you don't want blast predictions according to options
if opt['choose'] or not is_file(chosen_predictions_outfile):
write('RUN', 1)
chosen_predictions_outfile_text=''
if not considered_indexes: write(' -- no blast hit passed filtering --', 1)
for index_id in considered_indexes:
#recovering the three predictions
exonerate_hit=False; genewise_hit=False; blast_hit=False
blast_hit= blast_hits_hash[str(index_id)]
if exonerate_hits_hash.has_key(str(index_id)): exonerate_hit= exonerate_hits_hash[str(index_id)]
if genewise_hits_hash.has_key(str(index_id)): genewise_hit= genewise_hits_hash[str(index_id)]
candidates=[blast_hit]
if opt['no_blast']: candidates=[]
if exonerate_hit: candidates.append(exonerate_hit)
if genewise_hit: candidates.append(genewise_hit)
chosen_hit, reason_why = choose_prediction(candidates) ## choosing prediction here. see choose_prediction function for details
chosen_predictions[str(index_id)]=chosen_hit
chosen_predictions_reasons_why[str(index_id)]=reason_why
if not chosen_hit:
label='empty'
indexes_to_remove[index_id]=True
else:
#assigning label
if profile_ali.sec_pos(): label= assign_label_selenoprotein_families(chosen_hit)
else: label= assign_label_non_selenoprotein_families(chosen_hit)
chosen_hit.label=label ## assigning label property here
chosen_predictions_outfile_text+=str(index_id)+'\t'+chosen_hit.prediction_program()+'\t'+reason_why+'\t'+label+'\n'
# writing file with results from this step
if chosen_predictions_outfile_text: chosen_predictions_outfile_text=chosen_predictions_outfile_text[:-1] #removing last \n
write_to_file(chosen_predictions_outfile_text, chosen_predictions_outfile)