Permalink
Browse files

Builds removed

	modified:   build/lib/fatool/fa.py
	modified:   build/lib/fatool/fuzzy.py
	modified:   build/lib/fatool/sequence.py
	modified:   build/scripts-2.7/cmdfatool.py
  • Loading branch information...
1 parent 65dca72 commit 0065ac7b5e73e44c52c9b30755cf7fc9f42996cd @blazejmarciniak blazejmarciniak committed Jun 27, 2016
Showing with 312 additions and 110 deletions.
  1. +11 −3 build/lib/fatool/fa.py
  2. +9 −22 build/lib/fatool/fuzzy.py
  3. +76 −26 build/lib/fatool/sequence.py
  4. +216 −59 build/scripts-2.7/cmdfatool.py
View
@@ -4,27 +4,34 @@
import re
import math
from fatool import Sequence
+import logging
class Fa(object):
def __init__(self, contigs_list, name):
    """Build a Fa container from a collection of Sequence objects.

    Args:
        contigs_list: iterable of Sequence objects; they are stored in
            the order given.
        name: label kept on the container as ``self.name``.

    Raises:
        TypeError: if an element of ``contigs_list`` is not a Sequence.
        NameError: if two elements carry the same ``name``.
    """
    logger = logging.getLogger(__name__)

    logger.debug('creating Fa object')
    self.name = name
    self.contigs = []
    # contig name -> position of that contig in self.contigs
    self.contigs_idx = {}
    for r in contigs_list:
        if not isinstance(r, Sequence):
            logger.error('Supplied param is not Sequence object')
            raise TypeError('Wrong param supplied Sequence was expected')
        if r.name in self.contigs_idx:
            logger.error('Sequence name: %s already exists in file', r.name)
            raise NameError('Sequence name already exists: '+r.name)
        # Appending to an empty list is the same as assigning [r], so a
        # single append covers both the first and the following contigs;
        # only the debug message differs.
        if self.contigs:
            logger.debug('appending contig: %s', r.name)
        else:
            logger.debug('adding first contig: %s', r.name)
        self.contigs.append(r)
        self.contigs_idx[r.name] = len(self.contigs) - 1
- # self.stats{'A':0,'C':0,'T':0,'G':0,'N':0, 'L':0, }
+
+
@staticmethod
def load_from_file(file):
if isinstance(file, str):
@@ -110,6 +117,7 @@ def validate(self):
def nl_statistics(self, g, percent):
'''
Counts statistics of N50, L50, N75 etc.
+ g array containing sorted contigs by length, from biggest to lowest
'''
ncount = -1 # index & number of contigs with +1
nsum = 0
View
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#import math
+import logging
def find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0):
i = hs_start_pos # start iterate from start position
@@ -11,8 +12,6 @@ def find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0):
while i < len(hstack):
if hstack[i] != needle[j]:
mmatch_count += 1
- #print mmatch_count
- #print 'j = '+str(j)
if mmatch_count > missmatch_level:
# if missmatch level oversized back to strat + 1 and start again
i -= j
@@ -41,18 +40,15 @@ def find_all_aprox_matches(needle, hstack, missmatch_level, hs_start_pos):
i = r[0]+1
# match not found - no more maches in hstack
else:
- #print 'not found'
break
return ret_list
# return the substring found between two approximately matched motifs
def find_motif_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0):
    """Return the part of ``hstack`` lying between two fuzzy-matched motifs.

    The start motif is searched from ``hs_start_pos``; the stop motif is
    searched from the end of the start match. The returned slice runs from
    the end of the start match to the beginning of the stop match.

    Args:
        start_motif: motif opening the range.
        stop_motif: motif closing the range.
        hstack: sequence to search in.
        missmatch_level: mismatch limit passed to find_aprox_match_iter.
        hs_start_pos: index in ``hstack`` where the search begins.

    Returns:
        The slice between the two motifs, or None when either motif is
        not found.
    """
    # The match result is used as a (begin, end) index pair and tested for
    # truthiness, as the other call sites in this module do.
    start = find_aprox_match_iter(start_motif, hstack, missmatch_level, hs_start_pos)
    if not start:
        # without a start match there is no position to search the stop from
        return None
    stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1])
    if not stop:
        return None
    return hstack[start[1]:stop[0]]
@@ -61,21 +57,21 @@ def find_all_motifs_in_aprox_range(start_motif, stop_motif, hstack, missmatch_le
start = 0
stop = 0
ret_list = []
- print 'hstack in fuzzy'
- print hstack
+ logger = logging.getLogger(__name__)
+ #logger.setLevel(logging.DEBUG)
+ logger.debug([start_motif, stop_motif, hstack, missmatch_level, hs_start_pos, len_min, len_max])
+ logger.debug(hstack)
+
while i <= len(hstack):
start = find_aprox_match_iter(start_motif, hstack, missmatch_level, i)
stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1])
- #print start,stop
if start and stop:
- #print 'start + stop found'
- if stop[0] - start[1] > len_min and stop[0] - start[1] < len_max:
- #print 'match valid'
- ret_list.append(hstack[start[1]:stop[0]])
+ if stop[1] - start[0] >= len_min and stop[1] - start[0] <= len_max:
+ ret_list.append(hstack[start[0]:stop[1]])
i = start[0]+1
- #print i
else:
break
+ logger.debug(ret_list)
return ret_list
def find_motif(needle, hstack, missmatch_level, hs_start_pos = 0):
@@ -85,22 +81,13 @@ def find_motif(needle, hstack, missmatch_level, hs_start_pos = 0):
return hstack[r[0]:r[1]]
def find_all_motifs(needle, hstack, missmatch_level, hs_start_pos = 0):
    """Collect every fuzzy occurrence of ``needle`` in ``hstack``.

    Searching restarts one position after the beginning of each hit, so
    overlapping occurrences are reported too.

    Args:
        needle: motif to look for.
        hstack: sequence to search in.
        missmatch_level: mismatch limit passed to find_aprox_match_iter.
        hs_start_pos: index in ``hstack`` where the search begins.

    Returns:
        List of matched slices of ``hstack`` in order of appearance
        (empty when nothing matches).
    """
    found = []
    pos = hs_start_pos
    while pos <= len(hstack):
        hit = find_aprox_match_iter(needle, hstack, missmatch_level, pos)
        if not hit:
            # no further occurrence in the rest of hstack
            break
        begin, end = hit[0], hit[1]
        found.append(hstack[begin:end])
        pos = begin + 1
    return found
@@ -4,12 +4,13 @@
from collections import Counter
import fuzzy
import re
+import logging
class Sequence(object):
def __init__(self, name, seq):
if Sequence.validate_name_string(name):
- self.name = name.lstrip('>')
+ self.name = name
else:
raise NameError('Sequence name have to start with ">"')
self.seq = seq
@@ -133,7 +134,7 @@ def detailed_validate_generic(seq, domain):
log_info = []
# if not allowed chars found
if m:
- # it may be 60 xxxxxxxxxx xxx.... format
+ # it may be 61 xxxxxxxxxx xxx.... format
if re.search('(\d+)', seq):
seq_array = seq.split('\n')
new_array = [] # array to store new sequence after cleaning and transformation
@@ -146,7 +147,7 @@ def detailed_validate_generic(seq, domain):
if end_of_seq_array > 1:
line_length = int(new_array[1][0])-int(new_array[0][0])
- # validate ecah block (between " " [space]) of given sequence
+ # validate each block (between " " [space]) of given sequence
i = 0
while i < end_of_seq_array:
# digit on begining of line was not found - error
@@ -192,9 +193,35 @@ def cut(self, length, step):
contig_end = len(self.seq) # last position of contig
contig_list = [] # contig list returning by function
while i+length <= contig_end:
- contig_list.append(Sequence('>'+self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length])))
+ contig_list.append(Sequence(self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length])))
i = i+step
return contig_list
+
def cut_name(self, length, start = 0):
    """Shorten the sequence name in place to ``self.name[start:length]``.

    Args:
        length: exclusive end index of the slice kept from the name.
        start: index of the first character kept (default 0).

    NOTE(review): ``length`` is used as an absolute end index, so with a
    non-zero ``start`` fewer than ``length`` characters remain. Confirm
    whether ``start + length`` was intended.
    """
    # No print here: this is library code and its stdout may be the
    # FASTA output stream itself.
    self.name = self.name[start:length]
+
def leave_name_after_marker(self, mark, length = 0, keep_marker = 1):
    """Trim the sequence name so that it starts at the first ``mark``.

    The name is rewritten in place and always gets exactly one leading
    '>' (any '>' characters at the start of the kept part are stripped
    first).

    Args:
        mark: literal text to look for in the name.
        length: when greater than 0, keep only this many characters
            from the cut position; otherwise keep the rest of the name.
        keep_marker: 1 keeps ``mark`` in the new name, any other value
            starts the new name right after it.

    Returns:
        1 when the marker was found and the name changed, 0 otherwise.
    """
    # The logger level is left alone: forcing DEBUG here would override
    # whatever the application configured.
    logger = logging.getLogger(__name__)
    pos = self.name.find(mark)
    logger.debug('marker %r position: %d, keep_marker: %r', mark, pos, keep_marker)
    if pos < 0:
        return 0

    # keep original marker or skip it
    begin = pos if keep_marker == 1 else pos + len(mark)
    # defined length or the string to its end
    if length > 0:
        kept = self.name[begin:begin+length]
    else:
        kept = self.name[begin:]
    self.name = '>'+kept.lstrip('>')
    return 1
+
def reverse(self):
'''
@@ -206,7 +233,7 @@ def reverse(self):
rev = rev.translate(maketrans('ACTGactg', 'TGACtgac'))
# creating 80 chars lines
#rev = re.sub("(.{80})", '\\1\n', rev, 0)
- return Sequence('>rev_'+self.name, rev)
+ return Sequence('>rev_'+self.name.lstrip('>'), rev)
def normalize(self):
@@ -251,21 +278,20 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
frame2 = []
frame3 = []
- # creating pattern to find start codons
+ # creating pattern (from dict) to find start codons
for r in start:
p += r+'|'
p = '('+p.rstrip('|')+')'
# creating pattern to find stop codons
for r in stop:
p_stop += r+'|'
- p_stop = '('+p.rstrip('|')+')'
+ p_stop = '('+p_stop.rstrip('|')+')'
- # match for start contigs
m = re.finditer(p, seq)
# there will be stored latest string position for each frame
- frame_iterator[0,0,0]
+ frame_iterator = [0,0,0]
stop_pos = len(seq) # where to stop searching if no stopcodon found
@@ -276,9 +302,10 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
# set i for start position of current start contig
i = r.start()
ret = ''
- while i+3 <= stop:
+ while i+3 <= stop_pos:
ret += Sequence.translate(seq[i:i+3], tdict)
if re.match(p_stop, seq[i:i+3]):
+ #print 'exiting on: '+seq[i:i+3]
i = i+3
break
else:
@@ -294,18 +321,36 @@ def translate2protein_in_range_generic(seq, start, stop, tdict):
return [frame1, frame2, frame3]
def translate2protein_in_range(self, start, stop, tdict):
    """Translate the regions between start and stop codons, on both strands.

    Args:
        start: collection of start codons.
        stop: collection of stop codons.
        tdict: codon -> amino-acid mapping. A non-empty dict is used as
            given; any other value selects the built-in table below
            (previously the argument was always overwritten).

    Returns:
        Dict with keys 'fwd' and 'rev' holding the results of
        translate2protein_in_range_generic for ``self.seq`` and for
        ``self.reverse().seq`` respectively.
    """
    if not isinstance(tdict, dict) or not tdict:
        # fall back to the built-in codon table ('*' marks stop codons)
        tdict = {
            'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
            'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I',
            'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R',
            'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*'
        }

    f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict)
    r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict)

    return {'fwd':f, 'rev':r}
+
+
@staticmethod
def translate2protein_generic(seq, tdict):
    """Translate ``seq`` in the three forward reading frames.

    Args:
        seq: nucleotide string.
        tdict: codon -> amino-acid mapping handed to Sequence.translate.

    Returns:
        List of three tuples, one per frame (offset 0, 1, 2):
        ``('', f1, seq[-2:])``, ``(seq[0:1], f2, seq[-1:])`` and
        ``(seq[0:2], f3, '')``, where f1..f3 are the translated strings.
    """
    f1 = ''
    f2 = ''
    f3 = ''
    i = 0
    # +5 so the third-frame codon (i+2 .. i+5) is complete as well
    # NOTE(review): the strict '<' skips the last window when
    # len(seq) == i+5; confirm whether '<=' was intended.
    while i+5 < len(seq):
        f1 += Sequence.translate(seq[i:i+3], tdict)
        f2 += Sequence.translate(seq[i+1:i+4], tdict)
        f3 += Sequence.translate(seq[i+2:i+5], tdict)
        i = i + 3

    # the third tuple carries the third frame (it used to repeat f2)
    return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f3,'')]
def translate2protein(self, tdict):
tdict = {
@@ -321,7 +366,7 @@ def translate2protein(self, tdict):
return {'fwd':f, 'rev':r}
@staticmethod
- def translate(contig, tdict):
+ def translate(codon, tdict):
if codon in tdict:
return tdict[codon]
else:
@@ -331,37 +376,42 @@ def find_aprox_motif(self, motif, missmatch_level):
self.normalize()
return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0)
- def find_aprox_primers(self, start, stop, missmatch_level = 0, len_min = 50, len_max = 10000):
def find_primers(self, start, stop, mode, len_min = 50, len_max = 10000):
    """Search for primer-delimited fragments with no mismatches allowed.

    Delegates to find_aprox_primers with a mismatch level of 0 and returns
    its result unchanged.
    """
    exact_match = 0
    return self.find_aprox_primers(start, stop, mode, exact_match, len_min, len_max)
+
+
def find_aprox_primers(self, start, stop, mode, missmatch_level = 0, len_min = 50, len_max = 10000):
    """Find fragments delimited by two primers, allowing mismatches.

    The sequence is normalized (``self.normalize()``), then searched as
    it is and once more on ``self.reverse().seq``.

    Args:
        start: start primer, 5'->3'.
        stop: stop primer. In 'FR' mode it is reversed and complemented
            before searching; in 'FF' mode it is used as given.
        mode: 'FR' or 'FF' (case-insensitive).
        missmatch_level: mismatch limit passed to the fuzzy search.
        len_min: lower length bound passed to the fuzzy search.
        len_max: upper length bound passed to the fuzzy search.

    Returns:
        List of matched fragments from both searches (may be empty).

    Raises:
        ValueError: if ``mode`` is neither 'FR' nor 'FF'.
    """
    # TODO: add missmatch_level condition if 50%>
    logger = logging.getLogger(__name__)
    logger.debug('given args: start:%s stop: %s mode: %s mm level: %s len_min: %s len_max: %s',
                 start, stop, mode, missmatch_level, len_min, len_max)

    mode_key = str(mode).upper()
    if mode_key == 'FR':
        # reverse-complement the stop primer
        stop = stop[::-1].translate(maketrans('ACTGactg', 'TGACtgac'))
    elif mode_key != 'FF':
        # a bare string cannot be raised; use a real exception type
        raise ValueError('Unexpected mode: '+str(mode)+' expected values [FR|FF]')

    r_list = []
    self.normalize()
    res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max)
    if res:
        r_list.extend(res)

    res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.reverse().seq, missmatch_level, 0, len_min, len_max)
    if res:
        r_list.extend(res)

    logger.debug(r_list)
    return r_list
def __str__(self):
    '''
    Render the record as text: the name line, then the sequence broken
    into lines of 80 characters, ending with a single newline.
    '''
    wrapped = re.sub("(.{80})", '\\1\n', self.seq, 0)
    # When the sequence length is a multiple of 80 the wrapping already
    # ends with a newline; adding another one would emit a blank line.
    if not wrapped.endswith('\n'):
        wrapped += '\n'
    return self.name+'\n'+wrapped
def __len__(self):
Oops, something went wrong.

0 comments on commit 0065ac7

Please sign in to comment.