diff --git a/README.md b/README.md index a6ff1da..715f31b 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ optional arguments: --operator OPERATOR user who have fired script it will be noted in log - extractNames: + extractNames usage: cmdfatool.py extractNames [-h] -f FAFILE [-o OUTPUT] [--report REPORT] [--operator OPERATOR] @@ -79,7 +79,7 @@ optional arguments: --operator OPERATOR user who have fired script it will be noted in log - extractContigs: + extractContigs usage: cmdfatool.py extractContigs [-h] -f FAFILE --list LIST -o OUTPUT [--report REPORT] [--operator OPERATOR] @@ -126,7 +126,7 @@ optional arguments: --operator OPERATOR user who have fired script it will be noted in log - split + split usage: cmdfatool.py split [-h] -f FAFILE -d OUTPUTDIR [--report REPORT] [--operator OPERATOR] @@ -139,7 +139,7 @@ optional arguments: --operator OPERATOR user who have fired script it will be noted in log - reverse + reverse usage: cmdfatool.py reverse [-h] -f FAFILE -o OUTPUT [--report REPORT] [--operator OPERATOR] @@ -152,7 +152,7 @@ optional arguments: --operator OPERATOR user who have fired script it will be noted in log - validate + validate usage: cmdfatool.py validate [-h] -f FAFILE -t TYPE [--details] @@ -164,7 +164,7 @@ optional arguments: --details set if you want to see detaild validation info - stats + stats usage: cmdfatool.py stats [-h] -f FAFILE [--report REPORT] [--operator [OPERATOR [OPERATOR ...]]] @@ -175,7 +175,7 @@ optional arguments: --report REPORT log file if not supplied stdout --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log - findPrimer: + findPrimer usage: cmdfatool.py findPrimer [-h] -f FAFILE --start START --stop STOP --mode {FF,FR} [--minlen MINLEN] [--maxlen MAXLEN] @@ -186,9 +186,9 @@ optional arguments: -h, --help show this help message and exit -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa - --start START strat codon 5' - --stop STOP stop codon 3' - --mode {FF,FR} FF (start forward, stop forward) or FR (start 5' stop 3') + --start START first sequence to be found + --stop STOP last sequence to be found + --mode {FF,FR} FF (start - forward orientated, stop - forward orientated) or FR (start - forward orientated, stop - reverse orientated) --minlen MINLEN minimum length (detfault 50bp) --maxlen MAXLEN max length (detfault 1000bp) --mml MML mismatch level number of allowed missmatches in primers (detfault 0) @@ -209,4 +209,4 @@ optional arguments: -m MARKER, --marker MARKER marker that indicates start of cut -l LENGTH, --length LENGTH length of cut --keepMarker KEEPMARKER weather to keep marker or not default 1 (Yes) - -o OUTPUT, --output OUTPUT output file default: output.fa \ No newline at end of file + -o OUTPUT, --output OUTPUT output file default: output.fa diff --git a/bin/cmdfatool.py b/bin/cmdfatool.py index d45754e..00554f7 100644 --- a/bin/cmdfatool.py +++ b/bin/cmdfatool.py @@ -125,6 +125,15 @@ def main(): sub_lnam.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) sub_lnam.set_defaults(func=cut_name_pattern) + sub_trn_d2p = subparsers.add_parser('translateDNA2Proteins', help='display translation to proteins') + sub_trn_d2p.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_trn_d2p.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') + sub_trn_d2p.add_argument('--startCodons', help='list of start codons separated by space bar', nargs='*', type=str) + sub_trn_d2p.add_argument('--stopCodons', help='list of stop codons separated by space bar', nargs='*', type=str) + sub_trn_d2p.add_argument('--nss', help='No Start Stop', action='store_true') + sub_trn_d2p.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) + sub_trn_d2p.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) + sub_trn_d2p.set_defaults(func=translate_dna_to_protein) ''' sub_fap = subparsers.add_parser('findPrimer', help='show statistics of fa file') sub_fap.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) @@ -421,6 +430,77 @@ def cut_name_pattern(args): r.leave_name_after_marker(args.marker, args.length, args.keepMarker) fa.write(args.output) +def translate_dna_to_protein(args): + rep = str(make_log_header('translate2protein', args.operator)) + fa = Fa.load_from_file(args.fafile) + r_dict = {} + otp = '' + if args.nss: + for r in fa.contigs: + r_dict = r.translate2protein({}) + otp += '\n=============================\n'+r.name+'\n=============================\n' + otp += '\nFORWARD\n' + i = 0 + for f in r_dict['fwd']: + otp += 'FRAME:\t'+str(i+1)+'\n' + otp += 'BEFORE:\t '+f[0] + otp += 'TRANSLATION:\n '+f[1] + otp += 'AFTER:\t '+f[2] + otp += '\n------------------------------------------------\n' + i+=1 + otp += '\nREVERS\n' + otp += '\n------------------------------------------------\n' + i = 0 + for f in r_dict['rev']: + otp += 'FRAME:\t'+str(i+1)+'\n' + otp += 'BEFORE:\t '+f[0] + otp += 'TRANSLATION:\n '+f[1] + otp += 'AFTER:\t '+f[2] + otp += '\n------------------------------------------------\n' + i+=1 + rep += otp + + else: + tdict = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + for r in fa.contigs: + + r_dict = r.translate2protein_in_range(args.startCodons, args.stopCodons, tdict) + otp += '\n=============================\n'+r.name+'\n=============================\n' + otp += 'FORWARD\n' + i = 0 + + for f in r_dict['fwd']: + otp += 'FRAME:\t'+str(i+1)+'\n' + for k in f: + otp += '\n'+k[0]+' start: '+str(k[1]) + otp += '\n------------------------------------------------\n' + otp += '\n=================================================\n' + otp += 'REVERS\n' + i = 0 + for f in r_dict['rev']: + otp += 'FRAME:\t'+str(i+1)+'\n' + for k in f: + otp += '\n'+k[0]+' start: '+str(k[1]) + otp += '\n------------------------------------------------\n' + otp += '\n=================================================\n' + rep += otp + + fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.report: + with args.report as log_file: + log_file.write(rep) + else: + print rep + def cut_name(args): pass diff --git a/fatool/sequence.py b/fatool/sequence.py index b9015b4..1facb01 100644 --- a/fatool/sequence.py +++ b/fatool/sequence.py @@ -286,13 +286,12 @@ def translate2protein_in_range_generic(seq, start, stop, tdict): # creating pattern to find stop codons for r in stop: p_stop += r+'|' - p_stop = '('+p.rstrip('|')+')' + p_stop = '('+p_stop.rstrip('|')+')' - # match for start contigs m = re.finditer(p, seq) # there will be stored latest string position for each frame - frame_iterator[0,0,0] + frame_iterator = [0,0,0] stop_pos = len(seq) # where to stop searching if no stopcodon found @@ -303,9 +302,10 @@ def translate2protein_in_range_generic(seq, start, stop, tdict): # set i for start position of current start contig i = r.start() ret = '' - while i+3 <= stop: + while i+3 <= stop_pos: ret += Sequence.translate(seq[i:i+3], tdict) if re.match(p_stop, seq[i:i+3]): + #print 'exiting on: '+seq[i:i+3] i = i+3 break else: @@ -321,18 +321,36 @@ def translate2protein_in_range_generic(seq, start, stop, tdict): return [frame1, frame2, frame3] + def translate2protein_in_range(self, start, stop, tdict): + tdict = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + + f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict) + r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict) + + return {'fwd':f, 'rev':r} + + @staticmethod def translate2protein_generic(seq, tdict): # +5 to secure all frames f1 = '' f2 = '' f3 = '' + i = 0 while i+5 < len(seq): f1 += Sequence.translate(seq[i:i+3], tdict) f2 += Sequence.translate(seq[i+1:i+4], tdict) f3 += Sequence.translate(seq[i+2:i+5], tdict) + i = i + 3 - return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,)] + return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,'')] def translate2protein(self, tdict): tdict = { @@ -348,7 +366,7 @@ def translate2protein(self, tdict): return {'fwd':f, 'rev':r} @staticmethod - def translate(contig, tdict): + def translate(codon, tdict): if codon in tdict: return tdict[codon] else: