-
Notifications
You must be signed in to change notification settings - Fork 82
/
trim_msa_based_on_mask.py
executable file
·81 lines (65 loc) · 3.18 KB
/
trim_msa_based_on_mask.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
from __future__ import print_function
__prog_name__ = 'trim_msa_based_on_mask.py'
__prog_desc__ = 'Trim an MSA based on a prexisting mask. We assumed the mask has the same length as the MSA'
__author__ = 'Pierre Chaumeil'
__copyright__ = 'Copyright 2017'
__credits__ = ['Pierre Chaumeil']
__license__ = 'GPL3'
__version__ = '0.0.1'
__maintainer__ = 'Pierre Chaumeil'
__email__ = 'p.chaumeil@uq.edu.au'
__status__ = 'Development'
import sys
import argparse
from biolib.seq_io import read_fasta
class MSATrimmer(object):
def __init__(self):
"""Initialization."""
pass
def run(self, msa, mask, outf):
outfwriter = open(outf, 'w')
dict_genomes = read_fasta(msa, False)
with open(mask, 'r') as f:
maskstr = f.readline()
print(maskstr)
print(len(maskstr))
from future.utils import iteritems
for k, v in dict_genomes.iteritems():
aligned_seq = ''.join([v[i] for i in range(0, len(maskstr)) if maskstr[i] == '1'])
fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
outfwriter.write(fasta_outstr)
outfwriter.close()
if __name__ == '__main__':
print(__prog_name__ + ' v' + __version__ + ': ' + __prog_desc__)
print(' by ' + __author__ + ' (' + __email__ + ')' + '\n')
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--msa', help='Untrimmed multi sequence alignment.')
parser.add_argument('--mask', help='Mask file generated by gtdb.')
parser.add_argument('--output_file', help='Output file.')
args = parser.parse_args()
try:
msatrimmer = MSATrimmer()
msatrimmer.run(args.msa, args.mask, args.output_file)
except SystemExit:
print("\nControlled exit resulting from an unrecoverable error or warning.")
except:
print("\nUnexpected error:", sys.exc_info()[0])
raise