Permalink
Browse files

fatool.py added version 0.0.1 extractNames, cut, extractContigs working

remContigs need testing
	modified:   README.md
	new file:   fatool.py
  • Loading branch information...
1 parent 71832ac commit 4381090f244e85ff3609a26496300b60199f30e2 @blazejmarciniak blazejmarciniak committed Apr 6, 2016
Showing with 255 additions and 1 deletion.
  1. +53 −1 README.md
  2. +202 −0 fatool.py
View
@@ -1,3 +1,55 @@
# FA_TOOL
-Command line tool in python. It operates on fa/fasta etc. files.
+Command line tool in python. It operates on fa/fasta etc. files. version: 0.0.1
+
+fatool.py [-h] -f FAFILE {cut,extractNames,extractContigs,remContigs}
+
+optional arguments:
+ -h, --help show this help message and exit
+ -f FAFILE, --fafile FAFILE file to be cut, usually *.fa
+
+fatool commands:
+ {cut,extractNames,extractContigs,remContigs} each has own params, for more details use: command -h
+
+cut:
+
+ usage: fatool.py cut [-h] -r RANGE [-o OUTPUT] [-s STEP] [--log LOG]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ -r RANGE, --range RANGE length of each cut sequence fragment
+ -o OUTPUT, --output OUTPUT output file default: output.fa
+ -s STEP, --step STEP step length default: 1
+ --log LOG log file if not supplied stdout
+
+extractContigs:
+
+ usage: fatool.py extractContigs [-h] --list LIST -o OUTPUT [--log LOG] [--multifile]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --list LIST file containing list of contigs one contig per line
+ -o OUTPUT, --output output file; if --multifile is set output directory
+ --log LOG log file if not supplied stdout
+ --multifile if this flag is set each contig will be saved in a separate file
+
+
+extractNames:
+
+ usage: fatool.py extractNames [-h] [-o OUTPUT] [--log LOG]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ -o OUTPUT, --output output file if not supplied stdout
+ --log LOG log file if not supplied stdout
+
+remContigs:
+
+ usage: fatool.py remContigs [-h] --list LIST -o OUTPUT [--log LOG]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --list LIST file containing list of contigs one contig per line
+ -o OUTPUT, --output output file (required)
+ --log LOG log file if not supplied stdout
+
View
202 fatool.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import argparse
+import re
+
+
def main(argv=None):
    """Parse the command line and run the selected fatool sub-command.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, so existing ``main()`` callers
            are unaffected.

    Each sub-parser registers its handler through ``set_defaults(func=...)``;
    after parsing, that handler is called with the parsed namespace.
    The global options (-f, --operator, --log) belong to the top-level parser
    and therefore must be given before the sub-command name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True)

    subparsers = parser.add_subparsers(
        title='facutter commands',
        help='each has own params, for more details use: command -h',
        dest='command',
    )
    # Python 3 treats sub-commands as optional by default; without this the
    # script would reach args.func with no handler set and crash with an
    # AttributeError instead of printing a usage error.
    subparsers.required = True

    sub_cut = subparsers.add_parser('cut')
    sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True)
    sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa')
    sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1)
    sub_cut.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
    sub_cut.set_defaults(func=cut_fa)

    sub_en = subparsers.add_parser('extractNames')
    sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'))
    sub_en.set_defaults(func=extract_names)

    sub_ec = subparsers.add_parser('extractContigs')
    sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True)
    # Kept as a plain string: it is a file path, or a directory with --multifile.
    sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True)
    sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true')
    sub_ec.set_defaults(func=extract_contigs)

    sub_rc = subparsers.add_parser('remContigs')
    sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True)
    sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True)
    sub_rc.set_defaults(func=remove_contigs)

    parser.add_argument('--operator', help='user who have fired script it will be noted in log', type=str)
    parser.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))

    args = parser.parse_args(argv)
    args.func(args)
+
+
def make_log(content, lfile):
    """Write *content* to the open log handle *lfile*.

    The handle is used as a context manager, so it is closed once the text
    has been written and cannot be reused for a second call.
    """
    with lfile as log_handle:
        log_handle.write(content)
+
+
def cut_fa(args):
    """Cut the sequence of a FASTA file into fixed-length fragments.

    The first line of the input (the title line) is dropped and every
    remaining line is joined into one sequence after removing CR/LF
    characters. A window of ``args.range`` characters is then moved over the
    sequence in increments of ``args.step``; each window is written as its own
    record with a ``> frag <first> : <last>`` title (1-based, inclusive).
    A trailing part shorter than ``args.range`` is not written.

    Args:
        args: parsed namespace providing ``fafile`` (open readable handle),
            ``output`` (open writable handle), ``range`` (fragment length)
            and ``step`` (window increment).

    Raises:
        ValueError: if ``range`` or ``step`` is smaller than 1. A step of 0
            or less would never advance the window and loop forever.
    """
    fafile = args.fafile
    output = args.output
    split_range = args.range
    step = args.step

    # Both handles arrive already open; the with-block closes them on every
    # path, including the validation error below.
    with fafile as f, output as o:
        if split_range < 1 or step < 1:
            raise ValueError('range and step must be positive integers')

        print('step used: ' + str(step))

        # Skip the title line, then glue the sequence lines together in one
        # pass (join avoids re-copying the growing string on every line).
        next(f, None)
        fa = ''.join(line.replace('\r', '').replace('\n', '') for line in f)

        coe = len(fa)  # one past the last sequence position
        i = 0
        while i + split_range <= coe:
            o.write('> frag ' + str(i + 1) + ' : ' + str(i + split_range) + '\n' + fa[i:i + split_range] + '\n')
            i += step
+
+
def extract_names(args):
    """Copy every FASTA title line (a line starting with '>') to the output.

    Args:
        args: parsed namespace providing ``fafile`` (open readable handle)
            and ``output`` (open writable handle, or ``None`` for stdout).

    When no output file is given a notice is printed first and the title
    lines follow on stdout. Lines are emitted exactly as read, so stdout and
    file output have the same layout (each line already ends with its own
    newline; adding another one would produce blank lines between names).
    """
    fafile = args.fafile
    output = args.output

    with fafile as f:
        titles = (line for line in f if line.startswith('>'))
        if output is None:
            print('no output defined results will be print on stdout\n')
            # sys.stdout is written to directly and deliberately not closed.
            for title in titles:
                sys.stdout.write(title)
        else:
            with output as o:
                for title in titles:
                    o.write(title)
+
+
def _contig_pattern(name):
    """Return a compiled regex whose group 1 is the whole record named *name*."""
    if name.startswith('>'):
        header = re.escape(name)
    else:
        # List item given without '>': accept both '>name' and '> name'.
        header = '> ?' + re.escape(name)
    # A record ends right before the next title line or at end of input. The
    # lookahead keeps the following '>' out of the match and lets the last
    # record of the file match as well.
    return re.compile('(' + header + r'\n[A-Za-z\n]*)(?=>|\Z)')


def extract_contigs(args):
    """Extract the contigs named in a list file from a FASTA file.

    Args:
        args: parsed namespace providing ``fafile`` and ``list`` (open
            readable handles), ``output`` (file path, or directory path when
            ``multifile`` is set), ``multifile`` (bool) and ``log`` (open
            writable handle or ``None``).

    By default all extracted records are written to the single file
    ``output``. With ``multifile`` each record goes to
    ``<output>/<name>.fa``, where the characters ``> * \\ ? < /`` are removed
    from the name. A list item matches a title line that equals it exactly,
    with or without the leading '>' (and an optional space after it); the
    sequence part may contain only letters and newlines.

    A summary (number of list items, number of extracted contigs and every
    list item that was not found) is written to ``log`` when given,
    otherwise printed to stdout.
    """
    fafile = args.fafile
    elist = args.list
    log = args.log
    output = args.output

    print('extracting contigs')

    with elist as cntgs, fafile as f:
        content = f.read()
        # Blank lines carry no contig name and are not counted as list items.
        names = [item for item in (line.strip() for line in cntgs) if item]

    lcounter = len(names)
    excounter = 0
    not_found = []

    # In single-file mode the output is created even when nothing matches.
    single = None if args.multifile else open(output, 'w')
    try:
        for name in names:
            m = _contig_pattern(name).search(content)
            if m is None:
                not_found.append(name)
                continue
            excounter += 1
            record = m.group(1)
            # The last record of the input may lack a final newline; add one
            # so that a record written after it starts on its own line.
            if not record.endswith('\n'):
                record += '\n'
            if single is None:
                opt = re.sub(r'[>*\\?</]', '', name)
                with open(output + '/' + opt + '.fa', 'w') as o:
                    o.write(record)
            else:
                single.write(record)
    finally:
        if single is not None:
            single.close()

    missing_report = ''
    if not_found:
        missing_report = (
            '\nContigs not found:\n============================================\n'
            + ''.join('contig not found: ' + name + '\n' for name in not_found)
        )

    if log:
        log_content = '\nfatools\nlist items:\t' + str(lcounter) + '\nextracted contigs:\t' + str(excounter) + '\n'
        make_log(log_content + missing_report, log)
    else:
        print('list items: ' + str(lcounter) + '; extracted contigs: ' + str(excounter))
        if missing_report:
            print(missing_report)
+
+
def _removal_pattern(name):
    """Return a compiled regex matching the whole record named *name*."""
    if name.startswith('>'):
        header = re.escape(name)
    else:
        # List item given without '>': accept both '>name' and '> name'.
        header = '> ?' + re.escape(name)
    # Stop right before the next title line or at end of input, so the
    # following record keeps its '>' and the last record can be removed too.
    return re.compile(header + r'\n[A-Za-z\n]*(?=>|\Z)')


def remove_contigs(args):
    """Write a copy of a FASTA file without the contigs named in a list file.

    Args:
        args: parsed namespace providing ``fafile`` and ``list`` (open
            readable handles), ``output`` (path of the file to write) and
            ``log`` (open writable handle or ``None``).

    A list item matches a title line that equals it exactly, with or without
    the leading '>' (and an optional space after it); the sequence part may
    contain only letters and newlines. The number of list items and the
    number of items that removed at least one record are written to ``log``
    when given, otherwise printed to stdout.
    """
    fafile = args.fafile
    rlist = args.list
    output = args.output
    log = args.log

    # counters for list items and removed contigs
    lcounter = rem_counter = 0
    with rlist as cntgs, fafile as f, open(output, 'w') as o:
        content = f.read()
        for line in cntgs:
            name = line.strip()
            if not name:
                # Blank lines carry no contig name.
                continue
            lcounter += 1
            # subn searches the whole content and reports how many records
            # were dropped, so the counter reflects what was really removed.
            content, hits = _removal_pattern(name).subn('', content)
            if hits:
                rem_counter += 1
        o.write(content)

    if log:
        make_log('fatool - remContigs:\n list items:\t' + str(lcounter) + '\ncontings rmoved:\t' + str(rem_counter), log)
    else:
        print('list items:\t' + str(lcounter) + '\ncontings rmoved:\t' + str(rem_counter))
+
+
def split_contigs(args):
    """Placeholder: no splitting is implemented yet; *args* is ignored.

    Always returns 1.
    """
    placeholder_result = 1
    return placeholder_result
+
+
def statistics(args):
    """Placeholder: no statistics are computed yet; *args* is ignored.

    Always returns 1.
    """
    placeholder_result = 1
    return placeholder_result
+
+
if __name__ == '__main__':
    # sys.exit is always available; the bare exit() helper is injected by the
    # site module and is missing under `python -S` or in frozen builds.
    sys.exit(main())

0 comments on commit 4381090

Please sign in to comment.