Permalink
Browse files

Merge branch 'master' of github.com:BiobankLab/FA_TOOL

Conflicts:
	README.md

	modified:   README.md
	modified:   bin/cmdfatool.py
	modified:   fatool/sequence.py
  • Loading branch information...
2 parents f455fa2 + a7a6333 commit 65dca7247265393d2874e2745430d5e1e9978158 @blazejmarciniak blazejmarciniak committed Jun 27, 2016
Showing with 115 additions and 17 deletions.
  1. +11 −11 README.md
  2. +80 −0 bin/cmdfatool.py
  3. +24 −6 fatool/sequence.py
View
@@ -66,7 +66,7 @@ optional arguments:
--operator OPERATOR user who have fired script it will be noted in log
- extractNames:
+ extractNames
usage: cmdfatool.py extractNames [-h] -f FAFILE [-o OUTPUT] [--report REPORT]
[--operator OPERATOR]
@@ -79,7 +79,7 @@ optional arguments:
--operator OPERATOR user who have fired script it will be noted in log
- extractContigs:
+ extractContigs
usage: cmdfatool.py extractContigs [-h] -f FAFILE --list LIST -o OUTPUT
[--report REPORT] [--operator OPERATOR]
@@ -126,7 +126,7 @@ optional arguments:
--operator OPERATOR user who have fired script it will be noted in log
- split
+ split
usage: cmdfatool.py split [-h] -f FAFILE -d OUTPUTDIR [--report REPORT]
[--operator OPERATOR]
@@ -139,7 +139,7 @@ optional arguments:
--operator OPERATOR user who have fired script it will be noted in log
- reverse
+ reverse
usage: cmdfatool.py reverse [-h] -f FAFILE -o OUTPUT [--report REPORT]
[--operator OPERATOR]
@@ -152,7 +152,7 @@ optional arguments:
--operator OPERATOR user who have fired script it will be noted in log
- validate
+ validate
usage: cmdfatool.py validate [-h] -f FAFILE -t TYPE [--details]
@@ -164,7 +164,7 @@ optional arguments:
--details set if you want to see detaild validation info
- stats
+ stats
usage: cmdfatool.py stats [-h] -f FAFILE [--report REPORT]
[--operator [OPERATOR [OPERATOR ...]]]
@@ -175,7 +175,7 @@ optional arguments:
--report REPORT log file if not supplied stdout
--operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log
- findPrimer:
+ findPrimer
usage: cmdfatool.py findPrimer [-h] -f FAFILE --start START --stop STOP --mode
{FF,FR} [--minlen MINLEN] [--maxlen MAXLEN]
@@ -186,9 +186,9 @@ optional arguments:
-h, --help show this help message and exit
-f FAFILE, --fafile FAFILE
file to show statistics usualy *.fa
- --start START strat codon 5'
- --stop STOP stop codon 3'
- --mode {FF,FR} FF (start forward, stop forward) or FR (start 5' stop 3')
+ --start START first sequence to be found
+ --stop STOP last sequence to be found
+ --mode {FF,FR} FF (start - forward orientated, stop - forward orientated) or FR (start - forward orientated, stop - reverse orientated)
--minlen MINLEN minimum length (detfault 50bp)
--maxlen MAXLEN max length (detfault 1000bp)
--mml MML mismatch level number of allowed missmatches in primers (detfault 0)
@@ -209,4 +209,4 @@ optional arguments:
-m MARKER, --marker MARKER marker that indicates start of cut
-l LENGTH, --length LENGTH length of cut
--keepMarker KEEPMARKER weather to keep marker or not default 1 (Yes)
- -o OUTPUT, --output OUTPUT output file default: output.fa
+ -o OUTPUT, --output OUTPUT output file default: output.fa
View
@@ -125,6 +125,15 @@ def main():
sub_lnam.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str)
sub_lnam.set_defaults(func=cut_name_pattern)
+ sub_trn_d2p = subparsers.add_parser('translateDNA2Proteins', help='display translation to proteins')
+ sub_trn_d2p.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True)
+ sub_trn_d2p.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa')
+ sub_trn_d2p.add_argument('--startCodons', help='list of start codons separated by space bar', nargs='*', type=str)
+ sub_trn_d2p.add_argument('--stopCodons', help='list of stop codons separated by space bar', nargs='*', type=str)
+ sub_trn_d2p.add_argument('--nss', help='No Start Stop', action='store_true')
+ sub_trn_d2p.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w'))
+ sub_trn_d2p.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str)
+ sub_trn_d2p.set_defaults(func=translate_dna_to_protein)
'''
sub_fap = subparsers.add_parser('findPrimer', help='show statistics of fa file')
sub_fap.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True)
@@ -421,6 +430,77 @@ def cut_name_pattern(args):
r.leave_name_after_marker(args.marker, args.length, args.keepMarker)
fa.write(args.output)
def _format_whole_frames(frames):
    """Render whole-sequence translations, one section per reading frame.

    Each item of ``frames`` is indexed as ``(before, translation, after)``.
    Frames are numbered from 1 in the order they are supplied.
    """
    separator = '\n------------------------------------------------\n'
    chunks = []
    for number, frame in enumerate(frames, 1):
        chunks.append('FRAME:\t' + str(number) + '\n')
        # Each field ends with a newline so the next label starts on its own
        # line instead of being glued to the previous value.
        chunks.append('BEFORE:\t ' + frame[0] + '\n')
        chunks.append('TRANSLATION:\n ' + frame[1] + '\n')
        chunks.append('AFTER:\t ' + frame[2])
        chunks.append(separator)
    return ''.join(chunks)


def _format_ranged_frames(frames):
    """Render start/stop-delimited translations, one section per reading frame.

    Each item of ``frames`` is an iterable of hits; a hit is indexed as
    ``hit[0]`` (the text printed for the hit) and ``hit[1]`` (printed after
    the ``start:`` label).
    """
    separator = '\n------------------------------------------------\n'
    chunks = []
    # enumerate() supplies the frame number; the previous hand-rolled counter
    # was never incremented here, so every frame was reported as frame 1.
    for number, hits in enumerate(frames, 1):
        chunks.append('FRAME:\t' + str(number) + '\n')
        for hit in hits:
            chunks.append('\n' + hit[0] + ' start: ' + str(hit[1]))
        chunks.append(separator)
    chunks.append('\n=================================================\n')
    return ''.join(chunks)


def translate_dna_to_protein(args):
    """Handle the ``translateDNA2Proteins`` sub-command.

    Loads the FASTA file, translates every contig in the forward and reverse
    direction and builds a plain-text report, which is written to
    ``args.report`` when given and printed to stdout otherwise.

    Attributes read from ``args``:
        operator     -- passed to ``make_log_header`` for the report header
        fafile       -- input handed to ``Fa.load_from_file``
        nss          -- when true, translate whole sequences ("No Start
                        Stop"); otherwise translate between start and stop
                        codons
        startCodons  -- start codons for the start/stop mode
        stopCodons   -- stop codons for the start/stop mode
        output       -- handed to ``fa.write``
        report       -- optional writable file object for the report
    """
    rep = str(make_log_header('translate2protein', args.operator))
    fa = Fa.load_from_file(args.fafile)
    otp = ''
    if args.nss:
        for r in fa.contigs:
            # An empty table is passed on purpose here, exactly as before.
            r_dict = r.translate2protein({})
            otp += '\n=============================\n'+r.name+'\n=============================\n'
            otp += '\nFORWARD\n'
            otp += _format_whole_frames(r_dict['fwd'])
            otp += '\nREVERS\n'
            otp += '\n------------------------------------------------\n'
            otp += _format_whole_frames(r_dict['rev'])
    else:
        tdict = {
            'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
            'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
            'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
            'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
        }
        # NOTE(review): startCodons/stopCodons appear to have no default where
        # the sub-command is declared, so they may be None here -- confirm the
        # callee copes with that or make the options required.
        for r in fa.contigs:
            r_dict = r.translate2protein_in_range(args.startCodons, args.stopCodons, tdict)
            otp += '\n=============================\n'+r.name+'\n=============================\n'
            otp += 'FORWARD\n'
            otp += _format_ranged_frames(r_dict['fwd'])
            otp += 'REVERS\n'
            otp += _format_ranged_frames(r_dict['rev'])
    rep += otp

    fa.write(args.output)
    rep += '\n\n------------------------------------------------------'
    rep += '\nFinished:\t'+str(datetime.datetime.now())
    if args.report:
        with args.report as log_file:
            log_file.write(rep)
    else:
        # Parenthesised single argument: same output under the Python 2 print
        # statement, and also valid as a Python 3 call.
        print(rep)

+
def cut_name(args):
pass
View
@@ -286,13 +286,12 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
# creating pattern to find stop codons
for r in stop:
p_stop += r+'|'
- p_stop = '('+p.rstrip('|')+')'
+ p_stop = '('+p_stop.rstrip('|')+')'
- # match for start contigs
m = re.finditer(p, seq)
# there will be stored latest string position for each frame
- frame_iterator[0,0,0]
+ frame_iterator = [0,0,0]
stop_pos = len(seq) # where to stop searching if no stopcodon found
@@ -303,9 +302,10 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
# set i for start position of current start contig
i = r.start()
ret = ''
- while i+3 <= stop:
+ while i+3 <= stop_pos:
ret += Sequence.translate(seq[i:i+3], tdict)
if re.match(p_stop, seq[i:i+3]):
+ #print 'exiting on: '+seq[i:i+3]
i = i+3
break
else:
@@ -321,18 +321,36 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
return [frame1, frame2, frame3]
def translate2protein_in_range(self, start, stop, tdict):
    """Translate this sequence between start and stop codons, both directions.

    start -- start codons, forwarded to translate2protein_in_range_generic
    stop  -- stop codons, forwarded to translate2protein_in_range_generic
    tdict -- codon -> amino-acid letter table; when empty or None the
             built-in table below is used instead

    Returns a dict with the result for ``self.seq`` under 'fwd' and the
    result for ``self.reverse().seq`` under 'rev'.
    """
    # Only fall back to the built-in table when the caller supplied none;
    # previously the argument was always overwritten and therefore ignored.
    if not tdict:
        tdict = {
            'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
            'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
            'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
            'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
        }

    # Forward is evaluated before reverse, as in the original statement order.
    f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict)
    r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict)

    return {'fwd': f, 'rev': r}


+
+
@staticmethod
def translate2protein_generic(seq, tdict):
    """Translate ``seq`` in the three reading frames (offsets 0, 1 and 2).

    seq   -- sequence to translate; it is sliced into 3-character codons
    tdict -- codon table handed to Sequence.translate for every codon

    Returns a list of three tuples ``(before, translation, after)``, one per
    frame: ``before`` is the part of ``seq`` skipped in front of the frame,
    ``translation`` is the concatenated result of translating every complete
    codon of the frame, and ``after`` is the trailing remainder too short to
    form a codon.
    """
    frames = []
    for offset in range(3):
        # Length covered by complete codons in this frame; clamped at zero
        # for sequences shorter than the offset itself.
        usable = max((len(seq) - offset) // 3 * 3, 0)
        end = offset + usable
        protein = ''.join(
            [Sequence.translate(seq[i:i + 3], tdict) for i in range(offset, end, 3)]
        )
        # Deriving before/after from the real frame bounds keeps them correct
        # for every sequence length, and each frame reports its own
        # translation (the third frame used to repeat the second one).
        frames.append((seq[:offset], protein, seq[end:]))
    return frames
def translate2protein(self, tdict):
tdict = {
@@ -348,7 +366,7 @@ def translate2protein(self, tdict):
return {'fwd':f, 'rev':r}
@staticmethod
- def translate(contig, tdict):
+ def translate(codon, tdict):
if codon in tdict:
return tdict[codon]
else:

0 comments on commit 65dca72

Please sign in to comment.