Permalink
Browse files

Fixed ISSUE 1

new functionality join, split
	modified:   README.md
	modified:   fatool.py
  • Loading branch information...
1 parent d7cdadc commit 0b72e89a8144dad0300196b664362c0746241397 @blazejmarciniak blazejmarciniak committed Apr 8, 2016
Showing with 128 additions and 43 deletions.
  1. +58 −11 README.md
  2. +70 −32 fatool.py
View
@@ -1,15 +1,46 @@
-# FA_TOOL
+NAME
+====
+fatool
-Command line tool in python. It operates on fa/fasta etc. files. version: 0.0.1
+VERSION
+=======
-fatool.py [-h] -f FAFILE {cut,extractNames,extractContigs,remContigs}
+0.1.0
+
+LICENSE
+=======
+
+INTRODUCTION
+============
+
+Command-line tool written in Python 2.7. It operates on FASTA files (*.fa, *.fasta, etc.). Version: 0.1.0
+
+
+PREREQUISITES
+=============
+PYTHON 2.7
+
+COMMAND LINE
+============
+
+usage: fatool.py [-h] -f FAFILE [--operator OPERATOR] [--log LOG]
+ {cut,extractNames,extractContigs,remContigs,join,split} ...
optional arguments:
-h, --help show this help message and exit
- -f FAFILE, --fafile FAFILE file to be cut usualy *.fa
+ -f FAFILE, --fafile file to be cut, usually *.fa
+ --operator OPERATOR user who ran the script; it will be noted in the log
+ --log LOG log file if not supplied stdout
-fatool commands:
- {cut,extractNames,extractContigs,remContigs} each has own params, for more details use: command -h
+facutter commands:
+ {cut,extractNames,extractContigs,remContigs,join,split} each has own params
+
+ cut split supplied sequence into smaller parts, according to given params
+ extractNames extracting contigs names only
+ extractContigs extracting contigs specified in file (output in new file)
+ join joining two or more files; duplicates are not verified yet
+ remContigs removing contigs specified in file (output in new file)
+ split each contig saved into a separate file
cut:
@@ -29,7 +60,7 @@ extractContigs:
optional arguments:
-h, --help show this help message and exit
--list LIST file containing list of contigs one contig per line
- -o OUTPUT, --output output file; if --multifile is set output directory
+ -o OUTPUT, --output output file; if --multifile is set output directory
--log LOG log file if not supplied stdout
--multifile if this flag is set each contig will be saved in a separate file
@@ -42,14 +73,30 @@ extractNames:
-h, --help show this help message and exit
-o OUTPUT, --output output file if not supplied stdout
--log LOG log file if not supplied stdout
+
+join:
+
+ usage: fatool.py join [-h] -o OUTPUT [--files [FILES [FILES ...]]]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ -o OUTPUT, --output OUTPUT output file
+ --files [FILES [FILES ...]] files to be joined
+
+remContigs:
- remContigs:
-
-usage: fatool.py remContigs [-h] --list LIST -o OUTPUT [--log LOG]
+ usage: fatool.py remContigs [-h] --list LIST -o OUTPUT [--log LOG]
optional arguments:
-h, --help show this help message and exit
--list LIST file containing list of contigs one contig per line
-o OUTPUT, --output output file if not supplied stdout
--log LOG log file if not supplied stdout
-
+
+split:
+
+ usage: fatool.py split [-h] -d OUTPUTDIR
+
+ optional arguments:
+ -h, --help show this help message and exit
+ -d OUTPUTDIR, --outputDir output directory where split contigs will be saved
View
102 fatool.py
@@ -3,6 +3,7 @@
import sys
import argparse
import re
+import datetime
def main():
@@ -11,31 +12,40 @@ def main():
subparsers = parser.add_subparsers(title='facutter commands', help='each has own params, for more details use: command -h')
- sub_cut = subparsers.add_parser('cut')
+ sub_cut = subparsers.add_parser('cut', help='split supplied sequence into smaller parts, according to given params')
sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True)
sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa')
sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1)
sub_cut.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
sub_cut.set_defaults(func=cut_fa)
- sub_en = subparsers.add_parser('extractNames')
+ sub_en = subparsers.add_parser('extractNames', help='extracting contigs names only')
sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'))
# sub_en.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
sub_en.set_defaults(func=extract_names)
- sub_ec = subparsers.add_parser('extractContigs')
+ sub_ec = subparsers.add_parser('extractContigs', help='extracting contigs specified in file (output in new file)')
sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True)
sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True)
# sub_ec.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true')
sub_ec.set_defaults(func=extract_contigs)
- sub_rc = subparsers.add_parser('remContigs')
+ sub_rc = subparsers.add_parser('remContigs', help='removing contigs specified in file (output in new file)')
sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True)
sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True)
# sub_rc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
sub_rc.set_defaults(func=remove_contigs)
-
+
+ sub_jc = subparsers.add_parser('join', help='joining two or more files, yet not verifing duplicates')
+ sub_jc.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'), required=True)
+ sub_jc.add_argument('--files', help='files to be joined', nargs='*', type=argparse.FileType('r'))
+ sub_jc.set_defaults(func=join)
+
+ sub_sc = subparsers.add_parser('split', help='each cotig saved into separate file')
+ sub_sc.add_argument('-d', '--outputDir', help='output directory where splited contigs will be saved', type=str, required=True)
+ sub_sc.set_defaults(func=split_contigs)
+
parser.add_argument('--operator', help='user who have fired script it will be noted in log', type=str)
parser.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w'))
@@ -48,6 +58,15 @@ def make_log(content, lfile):
f.write(content)
def make_pattern(r):
    """Build the regular expression used to locate one contig in FASTA text.

    ``r`` is a single contig name as read from a list file; surrounding
    whitespace (including the trailing newline) is ignored.  The name may be
    given with or without its leading '>'.  When the '>' is missing, both the
    '>name' and '> name' header spellings are accepted.

    The returned pattern captures, as group 1, the header line followed by
    every sequence line made of ASCII letters.  An optional '>' that starts
    the next contig is consumed after the group, so callers that substitute
    the whole match with '>' keep the following header intact.
    """
    name = r.strip()
    # The name is always escaped: contig identifiers routinely contain
    # characters such as '.', '|' or '+' that are regex metacharacters.
    escaped = re.escape(name)
    sequence = '\n[A-Za-z\n]*'
    # '>?' replaces the former '[\Z>]?': inside a character class '\Z' is not
    # an end-of-string anchor, so the class only ever meant "optional '>'".
    if name.startswith('>'):
        return '(' + escaped + sequence + ')>?'
    return '(> ' + escaped + sequence + '|>' + escaped + sequence + ')>?'
+
+
def cut_fa(args):
fafile = args.fafile
@@ -66,14 +85,11 @@ def cut_fa(args):
with output as o:
coe = len(fa) # end of fa position
i = 0
- # N = step*8000
while i + split_range <= coe:
# while curent position + length of frag is less or equal postion of the last char in file.
o.write('> frag ' + str(i + 1) + ' : ' + str(i + split_range) + '\n' + str(fa[i:i + split_range]) + '\n')
i = i + step
- # print dot every N split to show that script does not heng.
- # if(i%N == 0): sys.stdout.write('.')
def extract_names(args):
@@ -94,13 +110,21 @@ def extract_names(args):
if pat.match(r):
o.write(r)
def make_file_name(r, suffix):
    """Turn a contig header into a file name.

    Surrounding whitespace is stripped from ``r`` and the characters
    '>', '*', '\\', '?', '<' and '/' are removed.  When ``suffix`` is
    non-empty it is appended after a dot; otherwise the cleaned name is
    returned on its own.
    """
    # Raw string: the previous literal relied on invalid escape sequences
    # ('\*', '\?', '\<', '\/') that only worked by accident.
    name = re.sub(r'[>*\\?</]', '', r.strip())
    if suffix:
        return name + '.' + suffix
    return name
+
def extract_contigs(args):
# default all extracted contigs in one file
# with flag multifile save each contig to separate file
fafile = args.fafile
elist = args.list
log = args.log
+ log_content = '\nfatools extractContigs\tstarted:\t'+str(datetime.datetime.now())+'\t'
# counters: extracted contigs and list items
excounter = lcounter = 0
@@ -113,19 +137,15 @@ def extract_contigs(args):
with elist as cntgs, fafile as f:
content = f.read()
for r in cntgs:
+ #print r
lcounter = lcounter + 1
# check if list item is with '>' important to create pattern.
- if re.match('^>', r.strip()):
- pattern = '('+re.escape(r.strip())+'\n[A-Za-z\n]*)>'
- else:
- pattern = '(> '+re.escape(r.strip())+'\n[A-Za-z\n]*|>'+r.strip()+'\n[A-Za-z\n]*)>'
-
- m = re.search(pattern, content)
+
+ m = re.search(make_pattern(r), content)
if m:
excounter = excounter + 1
- opt = re.sub('[>\*\\\?\<\/]', '', r.strip())
- with open(output+'/'+opt+'.fa', 'w') as o:
+ with open(output+'/'+make_file_name(r,'fa'), 'w') as o:
o.write(m.group(1))
else:
# log_content = log_content + 'contig not found: ' + r
@@ -138,12 +158,8 @@ def extract_contigs(args):
for r in cntgs:
lcounter = lcounter + 1
# check if list item is with '>' important to create pattern.
- if re.match('^>', r.strip()):
- pattern = '('+re.escape(r.strip())+'\n[A-Za-z\n]*)>'
- else:
- pattern = '(> '+re.escape(r.strip())+'\n[A-Za-z\n]*|>'+r.strip()+'\n[A-Za-z\n]*)>'
- m = re.search(pattern, content)
+ m = re.search(make_pattern(r), content)
if m:
excounter = excounter + 1
@@ -153,9 +169,12 @@ def extract_contigs(args):
log_not_found = 'contig not found: ' + r
if(log):
- log_content = '\nfatools\nlist items:\t'+str(lcounter)+'\nextracted contigs:\t'+str(excounter)+'\n'
+ log_content = log_content + 'stoped:\t'+str(datetime.datetime.now())+'\n'
+ if args.operator:
+ log_content = log_content + 'operator:\t'+args.operator+'\n'
+ log_content = log_content + '='*15+'\nlist items:\t'+str(lcounter)+'\nextracted contigs:\t'+str(excounter)+'\n'
if log_not_found:
- log_content = log_content + '\nContigs not found:\n============================================\n'+log_not_found
+ log_content = log_content + '\nContigs not found:\n'+'='*15+'\n'+log_not_found
make_log(log_content, log)
else:
print 'list items: '+str(lcounter)+'; extracted contigs: '+str(excounter)
@@ -174,24 +193,43 @@ def remove_contigs(args):
content = f.read()
for r in cntgs:
lcounter = lcounter + 1
- # check if list item is with '>' important to create pattern.
- if re.match('^>', r.strip()):
- pattern = '('+re.escape(r.strip())+'\n[A-Za-z\n]*)>'
- else:
- pattern = '(> '+re.escape(r.strip())+'\n[A-Za-z\n]*|>'+r.strip()+'\n[A-Za-z\n]*)>'
- if re.match(pattern, content):
+ if re.match(make_pattern, content):
rem_counter = rem_counter + 1
- content = re.sub(pattern, '>', content)
- o.write(content)
+ content = re.sub(make_pattern, '>', content)
+ #rstrip removes last > left after removing last contig
+ o.write(content.rstrip('>'))
if(log):
make_log('fatool - remContigs:\n list items:\t'+str(lcounter)+'\ncontings rmoved:\t'+str(rem_counter), log)
else:
print 'list items:\t'+str(lcounter)+'\ncontings rmoved:\t'+str(rem_counter)
def join(args):
    """Concatenate the main FASTA file with the files given by ``--files``.

    Reads ``args.fafile`` first, then appends every open file in
    ``args.files`` in order.  Before each append the text gathered so far is
    right-stripped and a single newline is inserted, so files are separated
    by exactly one line break.  The result is written to ``args.output``.
    Duplicated contigs are not detected.
    """
    with args.fafile as f:
        content = f.read()
    # argparse leaves ``files`` as None when --files is omitted (nargs='*'
    # without a default), so treat that the same as an empty list.
    for extra in args.files or []:
        with extra as j:
            content = content.rstrip() + '\n' + j.read()
    with args.output as o:
        o.write(content)
+
+
+
def split_contigs(args):
    """Write every contig of the input FASTA file to its own file.

    The whole of ``args.fafile`` is read and split on '>'.  Each contig is
    written, with its '>' restored, to a file inside ``args.outputDir``.
    The file name is derived from the contig's header line by
    ``make_file_name`` with the 'fa' suffix, so two contigs whose headers
    clean up to the same name end up in the same file (the later one wins).
    """
    # Imported locally so this function does not depend on a module-level
    # import of os.
    import os

    with args.fafile as f:
        content = f.read()
    # Anything before the first '>' is not a contig, hence the [1:] slice.
    for record in content.split('>')[1:]:
        header = record.split('\n', 1)[0]
        target = os.path.join(args.outputDir, make_file_name(header, 'fa'))
        with open(target, 'w') as o:
            o.write('>' + record)
+
+
def statistics(args):

0 comments on commit 0b72e89

Please sign in to comment.