From 396276980e6e3d109747e0a253ec02fd9998ccfa Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 24 Jun 2016 00:50:30 +0200 Subject: [PATCH] Added cutNameMarkers functionality. modified: README.md modified: bin/cmdfatool.py modified: fatool/fa.py modified: fatool/sequence.py modified: fatool/tests/test_sequence.py --- README.md | 36 ++++++++++++++++++++++++++++++ bin/cmdfatool.py | 52 ++++++++++++++++++++++++++++++++++++------- fatool/fa.py | 1 - fatool/sequence.py | 34 ++++++++++++++++++++++++---- fatool/tests/test_sequence.py | 29 ++++++++++++++++++++---- 5 files changed, 135 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3e2ed0d..3230b6d 100644 --- a/README.md +++ b/README.md @@ -174,3 +174,39 @@ optional arguments: -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa --report REPORT log file if not supplied stdout --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log + + findPrimer: + +usage: cmdfatool.py findPrimer [-h] -f FAFILE --start START --stop STOP --mode + {FF,FR} [--minlen MINLEN] [--maxlen MAXLEN] + [--mml MML] [--report REPORT] + [--operator [OPERATOR [OPERATOR ...]]] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE + file to show statistics usually *.fa + --start START start codon 5' + --stop STOP stop codon 3' + --mode {FF,FR} FF (start forward, stop forward) or FR (start 5' stop 3') + --minlen MINLEN minimum length (default 50bp) + --maxlen MAXLEN max length (default 1000bp) + --mml MML mismatch level number of allowed mismatches in primers (default 0) + --report REPORT report results into file if not supplied stdout + --operator [OPERATOR [OPERATOR ...]] + user who have fired script it will be noted in report + + + cutNameMarker: + + +usage: cmdfatool.py cutNameMarker [-h] -f FAFILE -m MARKER -l LENGTH + --keepMarker KEEPMARKER [-o OUTPUT] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile 
FAFILE file to show statistics usually *.fa + -m MARKER, --marker MARKER marker that indicates start of cut + -l LENGTH, --length LENGTH length of cut + --keepMarker KEEPMARKER whether to keep marker or not default 1 (Yes) + -o OUTPUT, --output OUTPUT output file default: output.fa \ No newline at end of file diff --git a/bin/cmdfatool.py b/bin/cmdfatool.py index 30abff2..8227e24 100644 --- a/bin/cmdfatool.py +++ b/bin/cmdfatool.py @@ -88,25 +88,43 @@ def main(): sub_s.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) sub_s.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) sub_s.set_defaults(func=statistics) - - sub_fm = subparsers.add_parser('findMotif', help='finding given motif; display motif and its position in contig') + ''' + sub_fm = subparsers.add_parser('findMotif', help='display motifs position in contig') sub_fm.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) sub_fm.add_argument('--mml', help='mismatch level number of allowed missmatches in primers (detfault 0)', type=str, default=0) sub_fm.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) sub_fm.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) sub_fm.set_defaults(func=find_motif) - + ''' sub_fp = subparsers.add_parser('findPrimer', help='display list of founded primers') sub_fp.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) sub_fp.add_argument('--start', help='strat codon 5\'', type=str, required=True) sub_fp.add_argument('--stop', help='stop codon 3\'', type=str, required=True) - sub_fp.add_argument('--mode', help='FF (start forward, stop forward) or FR (start 5\' stop 3\')', type=str, choices=['FF', 'FR'], required=True) + 
sub_fp.add_argument('--mode', help='FF (start forward, stop forward) or FR (start 5\' stop 3\')', type=str, choices=['FF', 'FR'], default = 'FR', required=True) sub_fp.add_argument('--minlen', help='minimum length (detfault 50bp)', type=int, default=50) sub_fp.add_argument('--maxlen', help='max length (detfault 1000bp)', type=int, default=1000) sub_fp.add_argument('--mml', help='mismatch level number of allowed missmatches in primers (detfault 0)', type=int, default=0) sub_fp.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) sub_fp.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) sub_fp.set_defaults(func=find_primers) + + sub_cn = subparsers.add_parser('cutName', help='cuts name from position to given length') + sub_cn.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_cn.add_argument('--start', help='start of cut', type=int, required=True) + sub_cn.add_argument('-l', '--length', help='length of cut', type=int, required=True) + sub_cn.set_defaults(func=cut_name) + + sub_lnam = subparsers.add_parser('cutNameMarker', help='cuts name leaving defined number of chars after begining of marker') + sub_lnam.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_lnam.add_argument('-m', '--marker', help='marker that indicates start of cut', type=str, required=True) + sub_lnam.add_argument('-l', '--length', help='length of cut', type=int, required=True) + sub_lnam.add_argument('--keepMarker', help='weather to keep marker or not default 1 (Yes)', type=int, required=True) + sub_lnam.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') + #sub_lnam.add_argument('-d', '--outputDir', help='output directory where multiple contigs will be saved', type=str) + 
sub_lnam.add_argument('--report', help='report results into file if not supplied stdout', type=argparse.FileType('w')) + sub_lnam.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) + sub_lnam.set_defaults(func=cut_name_pattern) + ''' sub_fap = subparsers.add_parser('findPrimer', help='show statistics of fa file') sub_fap.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) @@ -122,9 +140,7 @@ def main(): #parser.add_argument('--report', help='log file if not supplied stdout', type=argparse.FileType('w')) args = parser.parse_args() - #if args.version: - # print version - # exit(0) + args.func(args) @@ -366,17 +382,21 @@ def reverse(args): fa = Fa.load_from_file(args.fafile) fa.reverse() fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) if args.report: with args.report as log_file: log_file.write(rep) else: print rep + def find_motif(args): print 'not available yet' pass def find_primers(args): + rep = str(make_log_header('reverse', args.operator)) fa = Fa.load_from_file(args.fafile) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -386,8 +406,24 @@ def find_primers(args): rep += '\n================\n\t\t'+r.name+'\n' for q in r.find_aprox_primers(args.start, args.stop, str(args.mode), int(args.mml), args.minlen, args.maxlen): rep += q+'\n' + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.report: + with args.report as log_file: + log_file.write(rep) + else: + print rep + +def cut_name_pattern(args): + rep = str(make_log_header('cutNameMarker', args.operator)) + fa = Fa.load_from_file(args.fafile) + for r in fa.contigs: + r.leave_name_after_marker(args.marker, args.length, args.keepMarker) + fa.write(args.output) - print rep +def cut_name(args): 
+ pass + if __name__ == '__main__': exit(main()) diff --git a/fatool/fa.py b/fatool/fa.py index 1aaba5f..dfbad5a 100644 --- a/fatool/fa.py +++ b/fatool/fa.py @@ -11,7 +11,6 @@ def __init__(self, contigs_list, name): logger = logging.getLogger(__name__) logger.debug('creating Fa object') - logger.info('but info works') self.name = name self.contigs = [] self.contigs_idx = {} diff --git a/fatool/sequence.py b/fatool/sequence.py index 3d2b11c..b9015b4 100644 --- a/fatool/sequence.py +++ b/fatool/sequence.py @@ -10,7 +10,7 @@ class Sequence(object): def __init__(self, name, seq): if Sequence.validate_name_string(name): - self.name = name.lstrip('>') + self.name = name else: raise NameError('Sequence name have to start with ">"') self.seq = seq @@ -193,9 +193,35 @@ def cut(self, length, step): contig_end = len(self.seq) # last position of contig contig_list = [] # contig list returning by function while i+length <= contig_end: - contig_list.append(Sequence('>'+self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) + contig_list.append(Sequence(self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) i = i+step return contig_list + + def cut_name(self, length, start = 0): + self.name = self.name[start:length] + print self.name + + def leave_name_after_marker(self, mark, length = 0, keep_marker = 1): + m = re.search(re.escape(mark), self.name) + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + logger.debug(m) + logger.debug(keep_marker) + if m: + # keep original marker or skip it + + if keep_marker == 1: + s = m.start() + else: + s = m.end() + # defined length or return string to end + if length > 0: + self.name = '>'+self.name[s:s+length].lstrip('>') + else: + self.name = '>'+self.name[s:].lstrip('>') + return 1 + return 0 + def reverse(self): ''' @@ -207,7 +233,7 @@ def reverse(self): rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) # creating 80 chars lines #rev = re.sub("(.{80})", '\\1\n', rev, 
0) - return Sequence('>rev_'+self.name, rev) + return Sequence('>rev_'+self.name.lstrip('>'), rev) def normalize(self): @@ -367,7 +393,7 @@ def __str__(self): ''' creates nicely outputed string ''' - return '>'+self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' + return self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' def __len__(self): diff --git a/fatool/tests/test_sequence.py b/fatool/tests/test_sequence.py index 251f808..cf6c1cb 100644 --- a/fatool/tests/test_sequence.py +++ b/fatool/tests/test_sequence.py @@ -12,7 +12,7 @@ def setUp(self): def test_setUpSequence(self): c = Sequence('>name', 'ACTGactg') self.assertTrue( isinstance(c, Sequence) ) - self.assertEqual(c.name, 'name') + self.assertEqual(c.name, '>name') self.assertEqual(c.seq, 'ACTGactg') def test_contig_str(self): @@ -376,11 +376,32 @@ def test_find_aprox_primers(self): 'TTTAGCACTGATAGCCACTTGATCCACATCGTTAACGGTAATATAGCCAGTCCAATGTGAGG', ] - for r in c.find_aprox_primers('TTTT', 'GGGG', 'FF', 1,60,65): - print r - + #for r in c.find_aprox_primers('TTTT', 'GGGG', 'FF', 1,60,65): + # print r + self.assertEqual(t_TTTT_GGGG_FF_60_65, c.find_aprox_primers('TTTT', 'CCCC', 'FR', 1,60,65)) + def test_leave_name_after_marker(self): + c = Sequence('>test_something_special_gene=qwerty_ready', 'ACTGTACGGA') + self.assertEqual(1, c.leave_name_after_marker('gene=', 20)) + self.assertEqual('>gene=qwerty_ready', c.name) + #self.assertEqual() + c2 = Sequence('>test_something_special_gene=qwerty_ready', 'ACTGTACGGA') + self.assertEqual(1, c2.leave_name_after_marker('gene=', 11)) + self.assertEqual('>gene=qwerty', c2.name) + c = Sequence('>test_something_special_gene=qwerty_ready', 'ACTGTACGGA') + self.assertEqual(1, c.leave_name_after_marker('gene=')) + self.assertEqual('>gene=qwerty_ready', c.name) + c2 = Sequence('>test_something_special_gene=qwerty_ready', 'ACTGTACGGA') + self.assertEqual(1, c2.leave_name_after_marker('gene=', 6, 0)) + self.assertEqual('>qwerty', c2.name) + + def 
test_cut_name(self): + c = Sequence('>test_something_special_gene=qwerty_ready', 'ACTGTACGGA') + c.cut_name(5) + self.assertEqual('>test', c.name) + c.cut_name(10) + self.assertEqual('>test', c.name) if __name__ == "__main__":