Permalink
Browse files

Fixed double > in names, fixed \r\n at name ends

	modified:   .gitignore
	modified:   README.md
	modified:   bin/cmdfatool.py
	modified:   fatool/fa.py
	modified:   fatool/sequence.py
	modified:   fatool/tests/test_sequence.py
  • Loading branch information...
1 parent 0ff0ed9 commit a2b810b550b3a7c01291bf258a41296ac7ee91bd @blazejmarciniak blazejmarciniak committed Aug 4, 2016
Showing with 125 additions and 113 deletions.
  1. +3 −1 .gitignore
  2. +25 −1 README.md
  3. +44 −48 bin/cmdfatool.py
  4. +3 −2 fatool/fa.py
  5. +46 −59 fatool/sequence.py
  6. +4 −2 fatool/tests/test_sequence.py
View
@@ -6,4 +6,6 @@
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
-/*.egg
+/*.egg
+
+build/
View
@@ -197,7 +197,7 @@ optional arguments:
user who ran the script; it will be noted in the report
- cutNameMarker:
+ cutNameMarker
usage: cmdfatool.py cutNameMarker [-h] -f FAFILE -m MARKER -l LENGTH
@@ -210,3 +210,27 @@ optional arguments:
-l LENGTH, --length LENGTH length of cut
--keepMarker KEEPMARKER whether to keep the marker or not, default 1 (Yes)
-o OUTPUT, --output OUTPUT output file default: output.fa
+
+ translateDNA2Proteins
+
+usage: cmdfatool.py translateDNA2Proteins [-h] -f FAFILE [-o OUTPUT]
+ [--startCodons [STARTCODONS [STARTCODONS ...]]]
+ [--stopCodons [STOPCODONS [STOPCODONS ...]]]
+ [--tdict {STD,VMTO,YMTO,BAPP}]
+ [--nss] [--report REPORT]
+ [--operator [OPERATOR [OPERATOR ...]]]
+
+optional arguments:
+ -h, --help show this help message and exit
+ -f FAFILE, --fafile FAFILE file to show statistics, usually *.fa
+ -o OUTPUT, --output OUTPUT output file default: output.fa
+ --startCodons [STARTCODONS [STARTCODONS ...]] list of start codons separated by spaces
+ --stopCodons [STOPCODONS [STOPCODONS ...]] list of stop codons separated by spaces
+ --tdict {STD,VMTO,YMTO,BAPP}
+ Which dictionary to use for translation: STD - standard,
+ VMTO - Vertebrate Mitochondrial, YMTO - Yeast
+ Mitochondrial, BAPP - Bacterial Archaeal Plant and
+ Plastid
+ --nss No Start Stop
+ --report REPORT report results into file; if not supplied, stdout
+ --operator [OPERATOR [OPERATOR ...]] user who ran the script; it will be noted in the report
View
@@ -130,23 +130,14 @@ def main():
sub_trn_d2p.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa')
sub_trn_d2p.add_argument('--startCodons', help='list of start codons separated by space bar', nargs='*', type=str)
sub_trn_d2p.add_argument('--stopCodons', help='list of stop codons separated by space bar', nargs='*', type=str)
+ sub_trn_d2p.add_argument(
+ '--tdict', help='Which dictionary use for translation: STD - standard, VMTO - Vertebrate Mitochondrial, YMTO - Yeast Mitochondrial, BAPP - Bacterial Archaeal Plant and Plastid',
+ type=str, choices=['STD', 'VMTO', 'YMTO', 'BAPP'], default = 'STD'
+ )
sub_trn_d2p.add_argument('--nss', help='No Start Stop', action='store_true')
sub_trn_d2p.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w'))
sub_trn_d2p.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str)
sub_trn_d2p.set_defaults(func=translate_dna_to_protein)
- '''
- sub_fap = subparsers.add_parser('findPrimer', help='show statistics of fa file')
- sub_fap.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True)
- sub_fap.add_argument('--start', help='strat codon 5\'', type=str, required=True)
- sub_fap.add_argument('--stop', help='stop codon 3\'', type=str, required=True)
- sub_fap.add_argument('--minlen', help='minimum length (detfault 50bp)', type=str, default=50)
- sub_fap.add_argument('--maxlen', help='max length (detfault 1000bp)', type=str, default=1000
- sub_fap.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w'))
- sub_fap.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str)
- sub_fap.set_defaults(func=find_primers)
- '''
- #parser.add_argument('--operator', help='user who have fired script it will be noted in report', type=str)
- #parser.add_argument('--report', help='log file if not supplied stdout', type=argparse.FileType('w'))
args = parser.parse_args()
@@ -203,7 +194,7 @@ def cut_fa(args):
def extract_names(args):
logger = logging.getLogger(__name__)
- logger.setLevel(logging.info)
+ logger.setLevel(logging.INFO)
logger.info('command: extractNames starting')
rep = str(make_log_header('extractNames', args.operator))
fafile = args.fafile
@@ -213,7 +204,7 @@ def extract_names(args):
names = fa.show_names()
with output as o:
for r in names:
- o.write('>'+r)
+ o.write(r+'\n')
rep += 'Number of neames founded:\t' + str(len(names))
rep += '\n\n------------------------------------------------------'
rep += '\nFinished:\t'+str(datetime.datetime.now())
@@ -239,7 +230,7 @@ def extract_contigs(args):
else:
result_fa.write(args.output)
rep += '\nContigs to remove:\t'+str(len(elist))
- rep += '\Extracted contigs:\t'+str(len(result_ta.contigs))
+ rep += '\Extracted contigs:\t'+str(len(result_fa.contigs))
rep += '\n\n------------------------------------------------------'
rep += '\nFinished:\t'+str(datetime.datetime.now())
if args.report:
@@ -429,67 +420,72 @@ def cut_name_pattern(args):
for r in fa.contigs:
r.leave_name_after_marker(args.marker, args.length, args.keepMarker)
fa.write(args.output)
+
+def print_frame_output(r_dict):
+ i = 0
+ otp = ''
+ for f in r_dict:
+ otp += 'FRAME:\t'+str(i+1)+'\n'
+ otp += '\nBEFORE:\t '+f[0]
+ otp += '\nTRANSLATION:\n\n'+f[1]
+ otp += '\n\nAFTER:\t '+f[2]
+ otp += '\n------------------------------------------------\n'
+ i+=1
+ return otp
def translate_dna_to_protein(args):
rep = str(make_log_header('translate2protein', args.operator))
fa = Fa.load_from_file(args.fafile)
+ if args.tdict == 'STD':
+ tdict = Sequence.tdict_standard
+ elif args.tdict == 'VMTO':
+ tdict = Sequence.tdict_vertebrate_mitochondrial
+ elif args.tdict == 'YMTO':
+ tdict = Sequence.tdict_yeast_mitochondrial
+ elif args.tdict == '????????':
+ tdict = Sequence.tdict_standard
+ elif args.tdict == 'BAPP':
+ tdict = Sequence.tdict_bacterial_archaeal_plant_plastid
+ else:
+ print 'applied dictionary is not valid!'
+ exit(1)
+
r_dict = {}
otp = ''
if args.nss:
for r in fa.contigs:
- r_dict = r.translate2protein({})
+ r_dict = r.translate2protein(tdict)
otp += '\n=============================\n'+r.name+'\n=============================\n'
- otp += '\nFORWARD\n'
- i = 0
- for f in r_dict['fwd']:
- otp += 'FRAME:\t'+str(i+1)+'\n'
- otp += 'BEFORE:\t '+f[0]
- otp += 'TRANSLATION:\n '+f[1]
- otp += 'AFTER:\t '+f[2]
- otp += '\n------------------------------------------------\n'
- i+=1
- otp += '\nREVERS\n'
- otp += '\n------------------------------------------------\n'
- i = 0
- for f in r_dict['rev']:
- otp += 'FRAME:\t'+str(i+1)+'\n'
- otp += 'BEFORE:\t '+f[0]
- otp += 'TRANSLATION:\n '+f[1]
- otp += 'AFTER:\t '+f[2]
- otp += '\n------------------------------------------------\n'
- i+=1
+ otp += '\nFORWARD\n\n'
+ otp += print_frame_output(r_dict['fwd'])
+ otp += '\n'+'='*15+'\n'
+ otp += '\nREVERS\n\n'
+ otp += print_frame_output(r_dict['rev'])
rep += otp
else:
- tdict = {
- 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
- 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
- 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
- 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
- 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
- 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
- }
for r in fa.contigs:
r_dict = r.translate2protein_in_range(args.startCodons, args.stopCodons, tdict)
otp += '\n=============================\n'+r.name+'\n=============================\n'
- otp += 'FORWARD\n'
+ otp += 'FORWARD\n\n'
i = 0
for f in r_dict['fwd']:
otp += 'FRAME:\t'+str(i+1)+'\n'
for k in f:
otp += '\n'+k[0]+' start: '+str(k[1])
otp += '\n------------------------------------------------\n'
- otp += '\n=================================================\n'
- otp += 'REVERS\n'
+ otp += '\n'+'='*15+'\n'
+ i += 1
+ otp += 'REVERS\n\n'
i = 0
for f in r_dict['rev']:
otp += 'FRAME:\t'+str(i+1)+'\n'
for k in f:
otp += '\n'+k[0]+' start: '+str(k[1])
otp += '\n------------------------------------------------\n'
- otp += '\n=================================================\n'
+ i += 1
rep += otp
fa.write(args.output)
View
@@ -52,7 +52,7 @@ def load_content(content):
nc = content.split('>')
contigs_list = []
for r in nc[1:]:
- contigs_list.append(Sequence('>'+r.split('\n', 1)[0], re.sub('^>.*\n', '', '>'+r.rstrip())))
+ contigs_list.append(Sequence('>'+r.split('\n', 1)[0].rstrip(), re.sub('^>.*\n', '', '>'+r.rstrip())))
return contigs_list
def write(self, fafile):
@@ -97,11 +97,12 @@ def show_names(self):
def extract(self, contigs_name_list):
+ print contigs_name_list
new_contig_list = []
for r in contigs_name_list:
if r in self.contigs_idx:
new_contig_list.append(self.contigs[self.contigs_idx[r]])
- return Fa(new_contig_list, 'extr_'+self.name)
+ return Fa(new_contig_list, '>extr_'+self.name)
def remove(self, contigs_name_list):
new_contig_list = []
View
@@ -8,6 +8,50 @@
class Sequence(object):
+ # 1
+ tdict_standard = {
+ 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+ 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
+ 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
+ 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
+ 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+ 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
+ }
+
+ start_standard = ['ATG', 'TTG', 'CTG']
+
+ standard_stop = ['TAA', 'TAG', 'TGA']
+
+ # 2
+ tdict_vertebrate_mitochondrial = {
+ 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+ 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I',
+ 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
+ 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'*', 'AGG':'*', 'CGA':'R', 'CGC':'R', 'CGG':'R',
+ 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+ 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*'
+ }
+
+ # 3
+ tdict_yeast_mitochondrial = {
+ 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+ 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I',
+ 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'T', 'CTC':'T', 'CTG':'T', 'CTT':'T', 'ATG':'M', 'AAC':'N', 'AAT':'N',
+ 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
+ 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+ 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*'
+ }
+
+ # 11
+ tdict_bacterial_archaeal_plant_plastid = {
+ 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+ 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
+ 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
+ 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
+ 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+ 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
+ }
+
def __init__(self, name, seq):
if Sequence.validate_name_string(name):
self.name = name
@@ -30,47 +74,7 @@ def validate_seq(self):
'''
validates general seqence not specified for DNA or others.
'''
- # pattern to find not allowed chars.
- pattern = re.compile('[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]')
- if pattern.search(self.seq):
- if re.search('(\d+)', self.seq):
- seq_array = self.seq.split('\n')
- new_array = [] # array to store new sequence
- for r in seq_array:
- r = r.lstrip() # removing ' ' from beginings and ends
- nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa
- new_array.append(nr)
-
- end_of_seq_array = len(seq_array)
- # if min. two lines calculate expected line length
- if end_of_seq_array > 1:
- line_length = int(new_array[1][0])-int(new_array[0][0])
-
- # validate ecah block (between " ") of sequence ()
- i = 0
- while i < end_of_seq_array:
- if not re.search('(\d+)', new_array[i][0]):
- return 7 # line doesn't starts with digit
- if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1):
- return 0 # bad line length
- for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit
- if len(r) != 10: # block not eq 10
- if len(r) < 10: # if less it can be ok if last elem of last line
- if(i == end_of_seq_array - 1):
- if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem.
- return 0 # not last elem of last line
- else:
- return 0 # not last line
- else:
- return 0 # block not eq 10
- if pattern.search(r):
- return 0
- i += 1
- else:
- return 0 # digit is not first char
- # return pattern.search(self.seq) but nan error code returned before
- return 1
- return 1 # valid
+ return Sequence.generic_validate(self.seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]')
@staticmethod
def generic_validate(seq, domain):
@@ -199,7 +203,6 @@ def cut(self, length, step):
def cut_name(self, length, start = 0):
self.name = self.name[start:length]
- print self.name
def leave_name_after_marker(self, mark, length = 0, keep_marker = 1):
m = re.search(re.escape(mark), self.name)
@@ -305,7 +308,6 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
while i+3 <= stop_pos:
ret += Sequence.translate(seq[i:i+3], tdict)
if re.match(p_stop, seq[i:i+3]):
- #print 'exiting on: '+seq[i:i+3]
i = i+3
break
else:
@@ -322,14 +324,6 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
return [frame1, frame2, frame3]
def translate2protein_in_range(self, start, stop, tdict):
- tdict = {
- 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
- 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
- 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
- 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
- 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
- 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
- }
f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict)
r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict)
@@ -353,14 +347,7 @@ def translate2protein_generic(seq, tdict):
return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,'')]
def translate2protein(self, tdict):
- tdict = {
- 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
- 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
- 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
- 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
- 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
- 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
- }
+
f = Sequence.translate2protein_generic(self.seq, tdict)
r = Sequence.translate2protein_generic(self.reverse().seq, tdict)
return {'fwd':f, 'rev':r}
Oops, something went wrong.

0 comments on commit a2b810b

Please sign in to comment.