diff --git a/build/lib/fatool/__init__.py b/build/lib/fatool/__init__.py deleted file mode 100644 index 22fb0c7..0000000 --- a/build/lib/fatool/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .sequence import Sequence -from .fa import Fa -import fuzzy \ No newline at end of file diff --git a/build/lib/fatool/fa.py b/build/lib/fatool/fa.py deleted file mode 100644 index dfbad5a..0000000 --- a/build/lib/fatool/fa.py +++ /dev/null @@ -1,216 +0,0 @@ -# -*- coding: utf-8 -*- - - -import re -import math -from fatool import Sequence -import logging - -class Fa(object): - def __init__(self, contigs_list, name): - logger = logging.getLogger(__name__) - - logger.debug('creating Fa object') - self.name = name - self.contigs = [] - self.contigs_idx = {} - for r in contigs_list: - if not isinstance(r, Sequence): - logger.error('Supplied param is not Sequence object') - raise TypeError('Wrong param supplied Sequence was expected') - if not r.name in self.contigs_idx: - if len(self.contigs) > 0: - logger.debug('appending contig: '+r.name) - self.contigs.append(r) - else: - logger.debug('adding first contig: '+r.name) - self.contigs = [r] - - self.contigs_idx[r.name] = len(self.contigs) - 1 - else: - logger.error('Sequence name: '+r.name+' already exists in file') - raise NameError('Sequence name already exists: '+r.name) - - - @staticmethod - def load_from_file(file): - if isinstance(file, str): - with open(file, 'r') as f: - contigs = Fa.load_content(f.read()) - name = file - else: - name = file.name - with file as f: - contigs = Fa.load_content(f.read() ) - - - return Fa(contigs, name) - - @staticmethod - def load_content(content): - #print content - nc = content.split('>') - contigs_list = [] - for r in nc[1:]: - contigs_list.append(Sequence('>'+r.split('\n', 1)[0], re.sub('^>.*\n', '', '>'+r.rstrip()))) - return contigs_list - - def write(self, fafile): - if isinstance(fafile, str): - with open(fafile, 'w') as f: - f.write(str(self)) - else: - with fafile as f: - f.write(str(self)) - - def write_multiple_files(self, dir): - dir = dir.rstrip('/') - dir = dir.rstrip('\\') - if len(dir) > 0: - dir = dir+'/' - for r in self.contigs: - with open(dir+r.name+'.fa', 'w') as w: - w.write(str(r)) - - def add_contigs(self, contig_list, owrite=0): - for r in contig_list: - self.add_contig(r, owrite) - - - def add_contig(self, contig, owrite = 0): - if not isinstance(contig, Sequence): - raise TypeError('Wrong param supplied contig was expected') - if contig.name in self.contigs_idx: - if owrite == 1: - #rem old item and add new name - del self.contigs[self.contigs_idx[contig.name]] - self.contigs.append(contig) - for a, r in enumerate(self.contigs): - #print 'cnt '+str(r) - self.contigs_idx[r.name] = a - else: - self.contigs.append(contig) - self.contigs_idx[contig.name] = len(self.contigs) - 1 - - def show_names(self): - return sorted(self.contigs_idx, key=self.contigs_idx.get) - - - def extract(self, contigs_name_list): - new_contig_list = [] - for r in contigs_name_list: - if r in self.contigs_idx: - new_contig_list.append(self.contigs[self.contigs_idx[r]]) - return Fa(new_contig_list, 'extr_'+self.name) - - def remove(self, contigs_name_list): - new_contig_list = [] - for r in self.contigs: - if not r.name in contigs_name_list: - new_contig_list.append(r) - return Fa(new_contig_list, 'rem_'+self.name) - - def validate(self): - ''' - ''' - - def nl_statistics(self, g, percent): - ''' - Counts statistics of N50, L50, N75 etc. - g array containing sorted contigs by length, from biggest to lowest - ''' - ncount = -1 # index & number of contigs with +1 - nsum = 0 - stop = math.floor(self.stats['L']*(percent/100.00)) - while nsum < stop: - ncount += 1 - nsum += g[ncount] - - self.stats['N'+str(percent)] = g[ncount] - self.stats['L'+str(percent)] = ncount + 1 - - def bp_stats(self, length): - self.stats['totalc'] += 1 - if length > 50000: - self.stats['nbp50000'] += 1 # number of contigs with length - self.stats['lbp50000'] += length # total length of contigs with min. len - elif length > 25000: - self.stats['nbp25000'] += 1 - self.stats['lbp25000'] += length - elif length > 10000: - self.stats['nbp10000'] += 1 - self.stats['lbp10000'] += length - elif length > 5000: - self.stats['nbp5000'] += 1 - self.stats['lbp5000'] += length - elif length > 1000: - self.stats['nbp1000'] += 1 - self.stats['lbp1000'] += length - - def statistics(self): - self.stats = { - 'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0, 'L': 0, - 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, - 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, - 'totalc':0 - } - nstat_list = [] - bp_stats = [] - for r in self.contigs: - temp = r.statistics() - self.stats['A'] += temp['A'] - self.stats['C'] += temp['C'] - self.stats['T'] += temp['T'] - self.stats['G'] += temp['G'] - self.stats['N'] += temp['N'] - self.stats['L'] += temp['L'] - nstat_list.append(temp['L']) - self.bp_stats(temp['L']) - - self.stats['longest'] = max(nstat_list) - nstat_list.sort() - nstat_list.reverse() - - self.nl_statistics(nstat_list, 50) - self.nl_statistics(nstat_list, 75) - self.nl_statistics(nstat_list, 90) - - #print self.stats - - return self.stats - - def sort(self, mono): - contig_list = [] - temp = {} # dict to store name:len(contig) - for r in self.contigs: - temp[r.name] = len(r) - - if mono == -1: - for r in sorted(temp, key=temp.get)[::-1]: - contig_list.append(self.contigs[self.contigs_idx[r]]) - else: - for r in sorted(temp, key=temp.get): - contig_list.append(self.contigs[self.contigs_idx[r]]) - - return Fa(contig_list, 'sorted_'+self.name) - - def reverse(): - cl = [] - for r in self.contigs: - cl.append(r.reverse) - return Fa(cl, 'rev_'+self.name) - - def join(self, fa_list, owrite = 0): - for fa in fa_list: - if not isinstance(fa, Fa): - raise TypeError('Wrong param supplied Fa was expected') - self.add_contigs(fa.contigs, owrite) - - def count_contigs(self): - return len(self.contigs) - - def __str__(self): - return_string = '' - for r in self.contigs: - return_string += str(r) - return return_string diff --git a/build/lib/fatool/fuzzy.py b/build/lib/fatool/fuzzy.py deleted file mode 100644 index 1e41376..0000000 --- a/build/lib/fatool/fuzzy.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- - -#import math -import logging - -def find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0): - i = hs_start_pos # start iterate from start position - start = hs_start_pos # start of founded region at begining start of search - mmatch_count = 0 # missmatch counter - needle_len = len(needle) - j = 0 # needle iterator - while i < len(hstack): - if hstack[i] != needle[j]: - mmatch_count += 1 - if mmatch_count > missmatch_level: - # if missmatch level oversized back to strat + 1 and start again - i -= j - # needle iterator restart (-1) because it will be increased in a moment - j = -1 - # new start = start + 1 - start = i+1 - #print 'start = '+str(start) - # reset mmatch_count - mmatch_count = 0 - i += 1 - j += 1 - # if needle iterator = len of needle match found return it. - if j >= needle_len: - return (start,i,mmatch_count) - -def find_all_aprox_matches(needle, hstack, missmatch_level, hs_start_pos): - ret_list = [] # list of matches to return - i = hs_start_pos # start iteration from start position - needle_len = len(needle) - while i+needle_len <= len(hstack): - r = find_aprox_match_iter(needle, hstack, missmatch_level, i) - # match found append to list strat new look in start + 1 position - if r: - ret_list.append(r) - i = r[0]+1 - # match not found - no more maches in hstack - else: - break - return ret_list - -# return string from between two aproximated motifs -def find_motif_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0): - start = 0 - stop = 0 - start = find_aprox_match_iter(start_motif, hstack, missmatch_level, hs_start_pos = 0) - stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) - if start and stop: - return hstack[start[1]:stop[0]] - -def find_all_motifs_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0, len_min = 0, len_max = float('inf')): - i = hs_start_pos - start = 0 - stop = 0 - ret_list = [] - logger = logging.getLogger(__name__) - #logger.setLevel(logging.DEBUG) - logger.debug([start_motif, stop_motif, hstack, missmatch_level, hs_start_pos, len_min, len_max]) - logger.debug(hstack) - - while i <= len(hstack): - start = find_aprox_match_iter(start_motif, hstack, missmatch_level, i) - stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) - if start and stop: - if stop[1] - start[0] >= len_min and stop[1] - start[0] <= len_max: - ret_list.append(hstack[start[0]:stop[1]]) - i = start[0]+1 - else: - break - logger.debug(ret_list) - return ret_list - -def find_motif(needle, hstack, missmatch_level, hs_start_pos = 0): - r = 0 - r = find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0) - if r: - return hstack[r[0]:r[1]] - -def find_all_motifs(needle, hstack, missmatch_level, hs_start_pos = 0): - i = hs_start_pos - ret_list = [] - while i <= len(hstack): - r = find_aprox_match_iter(needle, hstack, missmatch_level, i ) - if r: - ret_list.append(hstack[r[0]:r[1]]) - i = r[0]+1 - else: - break - return ret_list \ No newline at end of file diff --git a/build/lib/fatool/sequence.py b/build/lib/fatool/sequence.py deleted file mode 100644 index 1facb01..0000000 --- a/build/lib/fatool/sequence.py +++ /dev/null @@ -1,425 +0,0 @@ -# -*- coding: utf-8 -*- - -from string import maketrans -from collections import Counter -import fuzzy -import re -import logging - - -class Sequence(object): - def __init__(self, name, seq): - if Sequence.validate_name_string(name): - self.name = name - else: - raise NameError('Sequence name have to start with ">"') - self.seq = seq - #self.quality = quality - - # def is_valid(self): - - # def validate_name(self): - - - @staticmethod - def validate_name_string(nstr): - if re.search('^>', nstr): - return 1 - - def validate_seq(self): - ''' - validates general seqence not specified for DNA or others. - ''' - # pattern to find not allowed chars. - pattern = re.compile('[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') - if pattern.search(self.seq): - if re.search('(\d+)', self.seq): - seq_array = self.seq.split('\n') - new_array = [] # array to store new sequence - for r in seq_array: - r = r.lstrip() # removing ' ' from beginings and ends - nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa - new_array.append(nr) - - end_of_seq_array = len(seq_array) - # if min. two lines calculate expected line length - if end_of_seq_array > 1: - line_length = int(new_array[1][0])-int(new_array[0][0]) - - # validate ecah block (between " ") of sequence () - i = 0 - while i < end_of_seq_array: - if not re.search('(\d+)', new_array[i][0]): - return 7 # line doesn't starts with digit - if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): - return 0 # bad line length - for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit - if len(r) != 10: # block not eq 10 - if len(r) < 10: # if less it can be ok if last elem of last line - if(i == end_of_seq_array - 1): - if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. - return 0 # not last elem of last line - else: - return 0 # not last line - else: - return 0 # block not eq 10 - if pattern.search(r): - return 0 - i += 1 - else: - return 0 # digit is not first char - # return pattern.search(self.seq) but nan error code returned before - return 1 - return 1 # valid - - @staticmethod - def generic_validate(seq, domain): - # pattern created from passed domain (domain contains chars that are not allowed) - pattern = re.compile(domain) #'[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]' - # if sequence contains illegal chars - if pattern.search(seq): - # if digits it can be ok if format like (60 xxxxxxxxxx xxx...) - if re.search('(\d+)', seq): - # to check that we have to transform array - seq_array = seq.split('\n') - new_array = [] # array to store new sequence as array of arrays - for r in seq_array: - r = r.lstrip() # removing ' ' from beginings and ends - nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa - new_array.append(nr) - - end_of_seq_array = len(seq_array) - # if min. two lines calculate expected line length - if end_of_seq_array > 1: - line_length = int(new_array[1][0])-int(new_array[0][0]) - - # validate ecah block (between " " [space]) of given sequence - i = 0 - while i < end_of_seq_array: - if not re.search('(\d+)', new_array[i][0]): - return 7 # line doesn't starts with digit - if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): - return 0 # bad line length - for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit - if len(r) != 10: # block not eq 10 - if len(r) < 10: # if less it can be ok if last elem of last line - if(i == end_of_seq_array - 1): - if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. - return 0 # not last elem of last line - else: - return 0 # not last line - else: - return 0 # block not eq 10 - if pattern.search(r): - return 0 - i += 1 - else: - return 0 # digit is not first char - # return pattern.search(seq) but nan error code returned before - return 1 - return 1 # valid - - # def validate_dna_seq(self): - - # def validate_other_seq(self): - - @staticmethod - def detailed_validate_generic(seq, domain): - not_valid = 0 - missmatches = {} - # pattern created from passed domain (domain contains chars that are not allowed) - pattern = re.compile(domain) - # find not allowed chars in sequence - m = pattern.finditer(seq) - log_info = [] - # if not allowed chars found - if m: - # it may be 61 xxxxxxxxxx xxx.... format - if re.search('(\d+)', seq): - seq_array = seq.split('\n') - new_array = [] # array to store new sequence after cleaning and transformation - for r in seq_array: - r = r.lstrip() # removing ' ' from beginings and ends - nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa - new_array.append(nr) - end_of_seq_array = len(seq_array) - # if min. two lines calculate expected line length - if end_of_seq_array > 1: - line_length = int(new_array[1][0])-int(new_array[0][0]) - - # validate each block (between " " [space]) of given sequence - i = 0 - while i < end_of_seq_array: - # digit on begining of line was not found - error - if not re.search('(\d+)', new_array[i][0]): - log_info.append('line '+str(i+1)+": line doesn't starts with digit") # line doesn't starts with digit - # check if line length = expected line length last line can be shorter - if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): - #return 0 # bad line length - log_info.append('line '+str(i+1)+': bad line length') - #chcek all blocks if are eq 10 (last can be shorter) - for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit - if len(r) != 10: # block not eq 10 - if len(r) < 10: # if less it can be ok if last elem of last line - if(i == end_of_seq_array - 1): - if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last elem of last line - else: - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last line - else: - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains more then 10 chars') # block gt 10 - # if block contains illegal chars now after transtrmation it should contain only legal chars. - if pattern.search(r): - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains illegal chars') - i += 1 - else: - # in this case it is not seq like "10 xxxxx xxxxx" - for mitem in m: - log_info.append('Position:\t'+str(mitem.start())+'\tvalue:\t'+str(mitem.group())) - # none of not allowed chars were found sequence OK - return log_info - # def detailed_validate_dna_seq(self): - - # def detailed_validate_other_seq(self): - - def cut(self, length, step): - ''' - cutting contig into smaller parts accordigly to supplied params - length of contig (number of chars) - step offset between current and next start - ''' - self.normalize() - i = 0 - contig_end = len(self.seq) # last position of contig - contig_list = [] # contig list returning by function - while i+length <= contig_end: - contig_list.append(Sequence(self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) - i = i+step - return contig_list - - def cut_name(self, length, start = 0): - self.name = self.name[start:length] - print self.name - - def leave_name_after_marker(self, mark, length = 0, keep_marker = 1): - m = re.search(re.escape(mark), self.name) - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - logger.debug(m) - logger.debug(keep_marker) - if m: - # keep original marker or skip it - - if keep_marker == 1: - s = m.start() - else: - s = m.end() - # defined length or return string to end - if length > 0: - self.name = '>'+self.name[s:s+length].lstrip('>') - else: - self.name = '>'+self.name[s:].lstrip('>') - return 1 - return 0 - - - def reverse(self): - ''' - creates reversed sequence - ''' - self.normalize() - nr = re.sub('\n', '', self.seq) - rev = nr[::-1] - rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) - # creating 80 chars lines - #rev = re.sub("(.{80})", '\\1\n', rev, 0) - return Sequence('>rev_'+self.name.lstrip('>'), rev) - - - def normalize(self): - self.seq = re.sub(' ', '', self.seq) - self.seq = re.sub('^\d', '', self.seq, re.M) - self.seq = re.sub('\n', '', self.seq) - - def statistics(self): - ''' - returns simple statistics for contig - ''' - self.normalize() - r = {} - c = Counter(self.seq) - r['A'] = c['A']+c['a'] - r['C'] = c['C']+c['c'] - r['G'] = c['G']+c['g'] - r['T'] = c['T']+c['t'] - r['N'] = c['N']+c['n'] - r['L'] = len(self.seq) - return r - - #def getRange(self, start, stop): - # return self.seq[start:stop] - - def translate_dna2rna(self): - nc = self.seq.translate(maketrans('ACTGactg', 'UGACugac')) - return Sequence('>rna_'+self.name, nc) - - def translate_rna2dna(self): - nc = self.seq.translate(maketrans('UGACugac', 'ACTGactg')) - return Sequence('>dna_'+self.name, nc) - - # ctrl f1 frame 1 forward, r1 frame 1 revers, fall torward all frames, rall reverse all frames, all in this way? - # supply dict of translation or its constant? - @staticmethod - def translate2protein_in_range_generic(seq, start, stop, tdict): - p = '' - p_stop = '' - # search results in distribution to frames - frame1 = [] - frame2 = [] - frame3 = [] - - # creating pattern (from dict) to find start codons - for r in start: - p += r+'|' - p = '('+p.rstrip('|')+')' - - # creating pattern to find stop codons - for r in stop: - p_stop += r+'|' - p_stop = '('+p_stop.rstrip('|')+')' - - m = re.finditer(p, seq) - - # there will be stored latest string position for each frame - frame_iterator = [0,0,0] - - stop_pos = len(seq) # where to stop searching if no stopcodon found - - # using each found start codon - for r in m: - # if start is lower then last used position skip it. - if frame_iterator[r.start()%3] <= r.start(): - # set i for start position of current start contig - i = r.start() - ret = '' - while i+3 <= stop_pos: - ret += Sequence.translate(seq[i:i+3], tdict) - if re.match(p_stop, seq[i:i+3]): - #print 'exiting on: '+seq[i:i+3] - i = i+3 - break - else: - i = i+3 - - frame_iterator[r.start()%3] = i - if r.start()%3 == 0: - frame1.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - elif r.start()%3 == 1: - frame2.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - elif r.start()%3 == 2: - frame3.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - - return [frame1, frame2, frame3] - - def translate2protein_in_range(self, start, stop, tdict): - tdict = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' - } - - f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict) - r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict) - - return {'fwd':f, 'rev':r} - - - @staticmethod - def translate2protein_generic(seq, tdict): - # +5 to secure all frames - f1 = '' - f2 = '' - f3 = '' - i = 0 - while i+5 < len(seq): - f1 += Sequence.translate(seq[i:i+3], tdict) - f2 += Sequence.translate(seq[i+1:i+4], tdict) - f3 += Sequence.translate(seq[i+2:i+5], tdict) - i = i + 3 - - return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,'')] - - def translate2protein(self, tdict): - tdict = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' - } - f = Sequence.translate2protein_generic(self.seq, tdict) - r = Sequence.translate2protein_generic(self.reverse().seq, tdict) - return {'fwd':f, 'rev':r} - - @staticmethod - def translate(codon, tdict): - if codon in tdict: - return tdict[codon] - else: - return '|'+codon+'|' - - def find_aprox_motif(self, motif, missmatch_level): - self.normalize() - return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0) - - def find_primers(self, start, stop, mode, len_min = 50, len_max = 10000): - return self.find_aprox_primers(start, stop, mode, 0, len_min, len_max) - - - def find_aprox_primers(self, start, stop, mode, missmatch_level = 0, len_min = 50, len_max = 10000): - #start 5'->3' - # add missmatch_level condition if 50%> - logger = logging.getLogger(__name__) - #logger.setLevel(logging.DEBUG) - logger.debug('given args: start:'+start+' stop: '+stop+' mode: '+mode+' mm level: '+str(missmatch_level)+' len_min: '+str(len_min)+' len_max: '+str(len_max)) - #logger.debug('sequence: '+self.seq) - if mode.upper() == 'FR': - rev = stop[::-1] - stop = rev.translate(maketrans('ACTGactg', 'TGACtgac')) - elif mode.upper() != 'FF': - raise ('Unexpected mode: '+str(mode)+' expected values [FR|FF]') - - r_list = [] - self.normalize() - - res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max) - if res: - r_list.extend(res) - - res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.reverse().seq, missmatch_level, 0, len_min, len_max) - if res: - r_list.extend(res) - - logger.debug(r_list) - return r_list - - def __str__(self): - ''' - creates nicely outputed string - ''' - return self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' - - - def __len__(self): - return len(self.seq) - - def __cmp__(self, other): - if self.seq == other.seq: - return 0 - - def __eq__(self, other): - return self.seq == other.seq \ No newline at end of file diff --git a/build/scripts-2.7/cmdfatool.py b/build/scripts-2.7/cmdfatool.py deleted file mode 100644 index 00554f7..0000000 --- a/build/scripts-2.7/cmdfatool.py +++ /dev/null @@ -1,515 +0,0 @@ -# -*- coding: utf-8 -*- - - -import sys -import argparse -import re -import datetime -from string import maketrans -from fatool import * -from decimal import * -import logging - - -def main(): - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger(__name__) - logger.setLevel(logging.INFO) - #logger.setLevel(logging.DEBUG) - parser = argparse.ArgumentParser() - #parser.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - parser.add_argument('-v', '--version', help='display version number and exit', action='version', version='%(prog)s 0.3.1') - subparsers = parser.add_subparsers(title='fatool commands', help='each has own params, for more details use: command -h') - - sub_cut = subparsers.add_parser('cut', help='split supplied sequence into smaller parts, according to given params') - sub_cut.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True) - sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') - sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1) - sub_cut.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_cut.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_cut.set_defaults(func=cut_fa) - - sub_en = subparsers.add_parser('extractNames', help='extracting contigs names only') - sub_en.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w')) - sub_en.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_en.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_en.set_defaults(func=extract_names) - - sub_ec = subparsers.add_parser('extractContigs', help='extracting contigs specified in file (output in new file)') - sub_ec.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) - sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True) - sub_ec.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_ec.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true') - sub_ec.set_defaults(func=extract_contigs) - - sub_rc = subparsers.add_parser('remContigs', help='removing contigs specified in file (output in new file)') - sub_rc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) - sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True) - sub_rc.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_rc.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_rc.set_defaults(func=remove_contigs) - - sub_jc = subparsers.add_parser('join', help='joining two or more files, yet not verifing duplicates') - sub_jc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_jc.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'), required=True) - sub_jc.add_argument('--files', help='files to be joined', nargs='*', type=argparse.FileType('r')) - sub_jc.add_argument('--overwrite', help='if set owerwrites contigs with same name', action='store_true') - sub_jc.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_jc.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_jc.set_defaults(func=join) - - sub_sc = subparsers.add_parser('split', help='each cotig saved into separate file') - sub_sc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_sc.add_argument('-d', '--outputDir', help='output directory where splited contigs will be saved', type=str, required=True) - sub_sc.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_sc.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_sc.set_defaults(func=split_contigs) - - sub_r = subparsers.add_parser('reverse', help='reverse all sequences in file') - sub_r.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_r.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=argparse.FileType('w'), required=True) - sub_r.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_r.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - sub_r.set_defaults(func=reverse) - - sub_v = subparsers.add_parser('validate', help='validates fa file') - sub_v.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_v.add_argument('-t', '--type', help='type of sequence 0 - general, 1 DNA, 2 - amino', type=int, required=True) - sub_v.add_argument('--details', help='set if you want to see detaild validation info', action='store_true') - sub_v.set_defaults(func=validate) - - sub_s = subparsers.add_parser('stats', help='show statistics of fa file') - sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_s.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_s.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_s.set_defaults(func=statistics) - ''' - sub_fm = subparsers.add_parser('findMotif', help='display motifs position in contig') - sub_fm.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_fm.add_argument('--mml', help='mismatch level number of allowed missmatches in primers (detfault 0)', type=str, default=0) - sub_fm.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_fm.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_fm.set_defaults(func=find_motif) - ''' - sub_fp = subparsers.add_parser('findPrimer', help='display list of founded primers') - sub_fp.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_fp.add_argument('--start', help='strat codon 5\'', type=str, required=True) - sub_fp.add_argument('--stop', help='stop codon 3\'', type=str, required=True) - sub_fp.add_argument('--mode', help='FF (start forward, stop forward) or FR (start 5\' stop 3\')', type=str, choices=['FF', 'FR'], default = 'FR', required=True) - sub_fp.add_argument('--minlen', help='minimum length (detfault 50bp)', type=int, default=50) - sub_fp.add_argument('--maxlen', help='max length (detfault 1000bp)', type=int, default=1000) - sub_fp.add_argument('--mml', help='mismatch level number of allowed missmatches in primers (detfault 0)', type=int, default=0) - sub_fp.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_fp.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_fp.set_defaults(func=find_primers) - - sub_cn = subparsers.add_parser('cutName', help='cuts name from position to given length') - sub_cn.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_cn.add_argument('--start', help='start of cut', type=int, required=True) - sub_cn.add_argument('-l', '--length', help='length of cut', type=int, required=True) - sub_cn.set_defaults(func=cut_name) - - sub_lnam = subparsers.add_parser('cutNameMarker', help='cuts name leaving defined number of chars after begining of marker') - sub_lnam.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_lnam.add_argument('-m', '--marker', help='marker that indicates start of cut', type=str, required=True) - sub_lnam.add_argument('-l', '--length', help='length of cut', type=int, required=True) - sub_lnam.add_argument('--keepMarker', help='weather to keep marker or not default 1 (Yes)', type=int, required=True) - sub_lnam.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') - #sub_lnam.add_argument('-d', '--outputDir', help='output directory where multiple contigs will be saved', type=str) - sub_lnam.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_lnam.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_lnam.set_defaults(func=cut_name_pattern) - - sub_trn_d2p = subparsers.add_parser('translateDNA2Proteins', help='display translation to proteins') - sub_trn_d2p.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_trn_d2p.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') - sub_trn_d2p.add_argument('--startCodons', help='list of start codons separated by space bar', nargs='*', type=str) - sub_trn_d2p.add_argument('--stopCodons', help='list of stop codons separated by space bar', nargs='*', type=str) - sub_trn_d2p.add_argument('--nss', help='No Start Stop', action='store_true') - sub_trn_d2p.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_trn_d2p.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_trn_d2p.set_defaults(func=translate_dna_to_protein) - ''' - sub_fap = subparsers.add_parser('findPrimer', help='show statistics of fa file') - sub_fap.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) - sub_fap.add_argument('--start', help='strat codon 5\'', type=str, required=True) - sub_fap.add_argument('--stop', help='stop codon 3\'', type=str, required=True) - sub_fap.add_argument('--minlen', help='minimum length (detfault 50bp)', type=str, default=50) - sub_fap.add_argument('--maxlen', help='max length (detfault 1000bp)', type=str, default=1000 - sub_fap.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) - sub_fap.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) - sub_fap.set_defaults(func=find_primers) - ''' - #parser.add_argument('--operator', help='user who have fired script it will be noted in report', type=str) - #parser.add_argument('--report', help='log file if not supplied stdout', type=argparse.FileType('w')) - - args = parser.parse_args() - - - args.func(args) - -def resolve_operator(operator_arg_list): - # makes prity print of opoerator - op = '' - for r in operator_arg_list: - op += r+' ' - return op.rstrip() - -def make_log_header(cmd, op): - stats_rep = '\n-------------------------------------------------------------' - stats_rep +='\ncmdfatool '+str(cmd)+' \n\nstarted:\t'+str(datetime.datetime.now()) - if op: - stats_rep += '\nOperator:\t'+resolve_operator(op) - stats_rep += '\n-------------------------------------------------------------\n' - return stats_rep - - -def cut_fa(args): - #logging.basicConfig(level=logging.ERROR) - #logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger(__name__) - - logger.setLevel(logging.DEBUG) - logger.debug('debug mode started') - logger.info('command: cut starting') - rep = str(make_log_header('cut', args.operator)) - - fafile = args.fafile - output = args.output - split_range = args.range - step = args.step - - f = Fa.load_from_file(fafile) - logger.info('file: '+fafile.name+' loaded') - contig_list = [] - for r in f.contigs: - contig_list += r.cut(split_range, step) - logger.info('cutted contigs added from conting: '+r.name) - result_fa = Fa(contig_list, 'splited') - logger.info('trying to write file') - result_fa.write(output) - logger.info('file written') - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - - -def extract_names(args): - logger = logging.getLogger(__name__) - logger.setLevel(logging.info) - logger.info('command: extractNames starting') - rep = str(make_log_header('extractNames', args.operator)) - fafile = args.fafile - output = args.output - - fa = Fa.load_from_file(fafile) - names = fa.show_names() - with output as o: - for r in names: - o.write('>'+r) - rep += 'Number of neames founded:\t' + str(len(names)) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - -def extract_contigs(args): - # default all extracted contigs in one file - # with flag multifile save each contig to separate file - - rep = str(make_log_header('extractContigs', args.operator)) - - fa = Fa.load_from_file(args.fafile) - rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) - - #file with contigs names one per line - with args.list as cntgs: - elist = [c.strip() for c in cntgs] - result_fa = fa.extract(elist) - if( args.multifile): - result_fa.write_multiple_files(args.output) - else: - result_fa.write(args.output) - rep += '\nContigs to remove:\t'+str(len(elist)) - rep += '\Extracted contigs:\t'+str(len(result_ta.contigs)) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - -def remove_contigs(args): - # contigs from list are removed, others saved to file - rep = str(make_log_header('remContigs', args.operator)) - fa = Fa.load_from_file(args.fafile) - rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) - # file that contains list of contigs one per line - with args.list as cntgs: - rlist = [c.strip() for c in cntgs] - rep += 'Number of contigs to remove:\t'+len(rlist) - result_fa = fa.remove(rlist) - rep += 'Number of contigs after remove:\t'+str(len(fa.contigs)) - rep += 'Contigs removed:\t'+str(len(fa.contigs) - len(result_fa.contigs)) - result_fa.write(args.output) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(stats_rep) - else: - print stats_rep - - -def join(args): - # joins contig from multiple files - rep = str(make_log_header('join', args.operator)) - fa = Fa.load_from_file(args.fafile) - fa_list = [] - contigs_to_add = 0 - # list of Fa files to join. - for r in args.files: - if len(r) > 0: - fa2add = Fa.load_from_file(r) - fa_list.append(fa2add) - contigs_to_add += fa2add.count_contigs() - rep += '\nOrginal contigs number:\t'+Fa.count_contigs() - rep += '\nTotal files to join with orginal file:\t'+len(args.files) - rep += '\nTotal contigs to add:\t'+str(contigs_to_add) - fa.join(fa_list, args.overwrite) - rep += '\nNumber of contigs after join:\t'+str(fa.count_contigs()) - fa.write(args.output) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(stats_rep) - else: - print stats_rep - -def split_contigs(args): - #writes each contig in single file - rep = str(make_log_header('split', args.operator)) - fa = Fa.load_from_file(args.fafile) - fa.write_multiple_files(args.output) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - - -def statistics(args): - # returns statistics of fa file - stats_rep = str(make_log_header('stats', args.operator)) - fa = Fa.load_from_file(args.fafile) - stats = fa.statistics() - stats_rep += '\n\nNumber of N:\t'+str(stats['N']) - stats_rep += '\nNumber of A:\t'+str(stats['A']) - stats_rep += '\nNumber of C:\t'+str(stats['C']) - stats_rep += '\nNumber of T:\t'+str(stats['T']) - stats_rep += '\nNumber of G:\t'+str(stats['G']) - getcontext().rounding = ROUND_05UP - getcontext().prec = 4 - stats_rep += '\nGC[%] (0.5 up):\t'+str(Decimal(stats['G']+stats['C'])/stats['L']*Decimal(100.00)) - stats_rep += '\n\nTotal length:\t'+str(stats['L']) - stats_rep += '\nTotal contigs:\t'+str(stats['totalc']) - stats_rep += '\n\ncontigs 1000-5000bp:\t'+str(stats['nbp1000']) - stats_rep += '\ncontigs 1000-5000bp length:\t'+str(stats['lbp1000']) - stats_rep += '\ncontigs 5001-10000bp:\t'+str(stats['nbp5000']) - stats_rep += '\ncontigs 5001-10000bp length:\t'+str(stats['lbp5000']) - stats_rep += '\ncontigs 10001-25000bp:\t'+str(stats['nbp10000']) - stats_rep += '\ncontigs 10001-25000bp length:\t'+str(stats['lbp10000']) - stats_rep += '\ncontigs 25001-50000bp:\t'+str(stats['nbp25000']) - stats_rep += '\ncontigs 25001-50000bp length:\t'+str(stats['lbp25000']) - stats_rep += '\ncontigs 50001+bp:\t'+str(stats['nbp50000']) - stats_rep += '\ncontigs 50001+bp length:\t'+str(stats['lbp50000']) - stats_rep += '\n\ncontigs > 1000bp:\t'+str(stats['nbp1000']+stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 1000bp length:\t'+str(stats['lbp1000']+stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 5000bp:\t'+str(stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 5000bp length:\t'+str(stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 10000bp:\t'+str(stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 10000bp length:\t'+str(stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 25000bp:\t'+str(stats['nbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 25000bp length:\t'+str(stats['lbp25000']+stats['nbp50000']) - stats_rep += '\ncontigs > 50000bp:\t'+str(stats['nbp50000']) - stats_rep += '\ncontigs > 50000bp length:\t'+str(stats['nbp50000']) - stats_rep += '\nLongest contig:\t'+str(stats['longest']) - stats_rep += '\n\nN50:\t'+str(stats['N50']) - stats_rep += '\nL50:\t'+str(stats['L50']) - stats_rep += '\nN75:\t'+str(stats['N75']) - stats_rep += '\nL75:\t'+str(stats['L75']) - stats_rep += '\nN90:\t'+str(stats['N90']) - stats_rep += '\nL90:\t'+str(stats['L90']) - stats_rep += '\n\n------------------------------------------------------' - stats_rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(stats_rep) - else: - print stats_rep - - -def validate(args): - # check if fa is valid - rep = str(make_log_header('validate', args.operator)) - fa = Fa.load_from_file(args.fafile) - result_list = {} - if args.details: - for r in fa.contigs: - result_list[r.name] = Sequence.detailed_validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') - else: - for r in fa.contigs: - result_list[r.name] = Sequence.validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') - #print result_list - - for r in result_list: - rep += r +'\n' - - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - - -def reverse(args): - rep = str(make_log_header('reverse', args.operator)) - fa = Fa.load_from_file(args.fafile) - fa.reverse() - fa.write(args.output) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - - -def find_motif(args): - print 'not available yet' - pass - -def find_primers(args): - rep = str(make_log_header('reverse', args.operator)) - fa = Fa.load_from_file(args.fafile) - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - logger.debug(args) - rep = '' - for r in fa.contigs: - rep += '\n================\n\t\t'+r.name+'\n' - for q in r.find_aprox_primers(args.start, args.stop, str(args.mode), int(args.mml), args.minlen, args.maxlen): - rep += q+'\n' - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - -def cut_name_pattern(args): - rep = str(make_log_header('cutNameMarker', args.operator)) - fa = Fa.load_from_file(args.fafile) - for r in fa.contigs: - r.leave_name_after_marker(args.marker, args.length, args.keepMarker) - fa.write(args.output) - -def translate_dna_to_protein(args): - rep = str(make_log_header('translate2protein', args.operator)) - fa = Fa.load_from_file(args.fafile) - r_dict = {} - otp = '' - if args.nss: - for r in fa.contigs: - r_dict = r.translate2protein({}) - otp += '\n=============================\n'+r.name+'\n=============================\n' - otp += '\nFORWARD\n' - i = 0 - for f in r_dict['fwd']: - otp += 'FRAME:\t'+str(i+1)+'\n' - otp += 'BEFORE:\t '+f[0] - otp += 'TRANSLATION:\n '+f[1] - otp += 'AFTER:\t '+f[2] - otp += '\n------------------------------------------------\n' - i+=1 - otp += '\nREVERS\n' - otp += '\n------------------------------------------------\n' - i = 0 - for f in r_dict['rev']: - otp += 'FRAME:\t'+str(i+1)+'\n' - otp += 'BEFORE:\t '+f[0] - otp += 'TRANSLATION:\n '+f[1] - otp += 'AFTER:\t '+f[2] - otp += '\n------------------------------------------------\n' - i+=1 - rep += otp - - else: - tdict = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' - } - for r in fa.contigs: - - r_dict = r.translate2protein_in_range(args.startCodons, args.stopCodons, tdict) - otp += '\n=============================\n'+r.name+'\n=============================\n' - otp += 'FORWARD\n' - i = 0 - - for f in r_dict['fwd']: - otp += 'FRAME:\t'+str(i+1)+'\n' - for k in f: - otp += '\n'+k[0]+' start: '+str(k[1]) - otp += '\n------------------------------------------------\n' - otp += '\n=================================================\n' - otp += 'REVERS\n' - i = 0 - for f in r_dict['rev']: - otp += 'FRAME:\t'+str(i+1)+'\n' - for k in f: - otp += '\n'+k[0]+' start: '+str(k[1]) - otp += '\n------------------------------------------------\n' - otp += '\n=================================================\n' - rep += otp - - fa.write(args.output) - rep += '\n\n------------------------------------------------------' - rep += '\nFinished:\t'+str(datetime.datetime.now()) - if args.report: - with args.report as log_file: - log_file.write(rep) - else: - print rep - -def cut_name(args): - pass - - -if __name__ == '__main__': - exit(main()) - - - - - -