From 6af1b970cdcae15a101c3ea0629806d083abf2d9 Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 12 Jun 2016 21:59:20 +0200 Subject: [PATCH] Script changed into python package, tests added. modified: README.md new file: bin/cmdfatool.py new file: build/lib/fatool/__init__.py new file: build/lib/fatool/fa.py new file: build/lib/fatool/fuzzy.py new file: build/lib/fatool/sequence.py new file: build/scripts-2.7/cmdfatool.py deleted: fatool.py new file: fatool/__init__.py new file: fatool/fa.py new file: fatool/fuzzy.py new file: fatool/sequence.py new file: fatool/tests/__init__.py new file: fatool/tests/test_fa.py new file: fatool/tests/test_sequence.py new file: setup.py --- README.md | 175 +++++++++++++------ bin/cmdfatool.py | 358 +++++++++++++++++++++++++++++++++++++++ build/lib/fatool/__init__.py | 3 + build/lib/fatool/fa.py | 208 +++++++++++++++++++++++ build/lib/fatool/fuzzy.py | 106 ++++++++++++ build/lib/fatool/sequence.py | 375 +++++++++++++++++++++++++++++++++++++++++ build/scripts-2.7/cmdfatool.py | 358 +++++++++++++++++++++++++++++++++++++++ fatool.py | 320 ----------------------------------- fatool/__init__.py | 3 + fatool/fa.py | 208 +++++++++++++++++++++++ fatool/fuzzy.py | 106 ++++++++++++ fatool/sequence.py | 375 +++++++++++++++++++++++++++++++++++++++++ fatool/tests/__init__.py | 0 fatool/tests/test_fa.py | 193 +++++++++++++++++++++ fatool/tests/test_sequence.py | 211 +++++++++++++++++++++++ setup.py | 14 ++ 16 files changed, 2641 insertions(+), 372 deletions(-) create mode 100644 bin/cmdfatool.py create mode 100644 build/lib/fatool/__init__.py create mode 100644 build/lib/fatool/fa.py create mode 100644 build/lib/fatool/fuzzy.py create mode 100644 build/lib/fatool/sequence.py create mode 100644 build/scripts-2.7/cmdfatool.py delete mode 100644 fatool.py create mode 100644 fatool/__init__.py create mode 100644 fatool/fa.py create mode 100644 fatool/fuzzy.py create mode 100644 fatool/sequence.py create mode 100644 fatool/tests/__init__.py create mode 100644 fatool/tests/test_fa.py create mode 100644 fatool/tests/test_sequence.py create mode 100644 setup.py diff --git a/README.md b/README.md index 68adefb..7c4f5de 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,20 @@ NAME ==== -FA_TOOL +fatool + +DESCRIPTION +=========== + +Tool for analyze and manipulate fasta files VERSION ======= -0.1.0 +0.2.1 LICENSE ======= - -Specified in LICENSE.md file +APACHE 2.0 Specified in LICENSE.md file INTRODUCTION ============ @@ -25,80 +29,147 @@ PYTHON 2.7 COMMAND LINE ============ -usage: fatool.py [-h] -f FAFILE [--operator OPERATOR] [--log LOG] - {cut,extractNames,extractContigs,remContigs,join,split} ... +usage: cmdfatool.py [-h] [-v] + {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} optional arguments: -h, --help show this help message and exit - -f FAFILE, --fafile file to be cut usualy *.fa - --operator OPERATOR user who have fired script it will be noted in log - --log LOG log file if not supplied stdout + -v, --version display version number and exit -facutter commands: - {cut,extractNames,extractContigs,remContigs,join,split} each has own params +fatool commands: + {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} each has own params, for more details use: command -h cut split supplied sequence into smaller parts, according to given params - extractNames extracting contigs names only into list txt file - extractContigs extracting contigs specified in file based on contigs names list (output in new file) + extractNames extracting contigs names only + extractContigs extracting contigs specified in file (output in new file) + remContigs removing contigs specified in file (output in new file) join joining two or more files, yet not verifing duplicates - remContigs removing contigs specified in file based on contigs names list (output in new file) split each cotig saved into separate file - -cut: - - usage: fatool.py cut [-h] -r RANGE [-o OUTPUT] [-s STEP] [--log LOG] + reverse reverse all sequences in file + validate validates fa file + stats show statistics of fa file - optional arguments: - -h, --help show this help message and exit - -r RANGE, --range RANGE cutted sequence length - -o OUTPUT, --output OUTPUT output file default: output.fa - -s STEP, --step STEP step length default: 1 - --log LOG log file if not supplied stdout + + cut: + +usage: cmdfatool.py cut [-h] -f FAFILE -r RANGE [-o OUTPUT] [-s STEP] + [--log LOG] [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -r RANGE, --range RANGE cutted sequence length + -o OUTPUT, --output OUTPUT output file default: output.fa + -s STEP, --step STEP step length default: 1 + --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + -extractContigs: + extractNames: - usage: fatool.py extractContigs [-h] --list LIST -o OUTPUT [--log LOG] [--multifile] +usage: cmdfatool.py extractNames [-h] -f FAFILE [-o OUTPUT] [--log LOG] + [--operator OPERATOR] - optional arguments: +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + extractContigs: + +usage: cmdfatool.py extractContigs [-h] -f FAFILE --list LIST -o OUTPUT + [--log LOG] [--operator OPERATOR] + [--multifile] + +optional arguments: -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa --list LIST file containing list of contigs one contig per line - -o OUTPUT, --output output file; if --multifile is set output directory + -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory --log LOG log file if not supplied stdout - --multifile if this flag is set each contig will be saved inseparate file + --operator OPERATOR user who have fired script it will be noted in log + --multifile if this flag is set each contig will be saved in + separate file + + + remContigs + +usage: cmdfatool.py remContigs [-h] -f FAFILE --list LIST -o OUTPUT + [--log LOG] [--operator OPERATOR] +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + --list LIST file containing list of contigs one contig per line + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log -extractNames: - usage: fatool.py extractNames [-h] [-o OUTPUT] [--log LOG] + join + +usage: cmdfatool.py join [-h] -f FAFILE -o OUTPUT + [--files [FILES [FILES ...]]] [--overwrite] + [--log LOG] [--operator OPERATOR] - optional arguments: +optional arguments: -h, --help show this help message and exit - -o OUTPUT, --output output file if not supplied stdout + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --files [FILES [FILES ...]] files to be joined + --overwrite if set owerwrites contigs with same name --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + split -join: - - usage: fatool.py join [-h] -o OUTPUT [--files [FILES [FILES ...]]] +usage: cmdfatool.py split [-h] -f FAFILE -d OUTPUTDIR [--log LOG] + [--operator OPERATOR] - optional arguments: - -h, --help show this help message and exit - -o OUTPUT, --output OUTPUT output file - --files [FILES [FILES ...]] files to be joined - -remContigs: +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -d OUTPUTDIR, --outputDir OUTPUTDIR output directory where splited contigs will be saved + --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + - usage: fatool.py remContigs [-h] --list LIST -o OUTPUT [--log LOG] + reverse + +usage: cmdfatool.py reverse [-h] -f FAFILE -o OUTPUT [--log LOG] + [--operator OPERATOR] - optional arguments: +optional arguments: -h, --help show this help message and exit - --list LIST file containing list of contigs one contig per line - -o OUTPUT, --output output file if not supplied stdout + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory --log LOG log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + validate + +usage: cmdfatool.py validate [-h] -f FAFILE -t TYPE [--details] -split: - - usage: fatool.py split [-h] -d OUTPUTDIR +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE + file to be cut usualy *.fa + -t TYPE, --type TYPE type of sequence 0 - general, 1 DNA, 2 - amino + --details set if you want to see detaild validation info + + + stats + +usage: cmdfatool.py stats [-h] -f FAFILE [--log LOG] + [--operator [OPERATOR [OPERATOR ...]]] - optional arguments: - -h, --help show this help message and exit - -d OUTPUTDIR, --outputDir output directory where splited contigs will be saved +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa + --log LOG log file if not supplied stdout + --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log diff --git a/bin/cmdfatool.py b/bin/cmdfatool.py new file mode 100644 index 0000000..2ec7a47 --- /dev/null +++ b/bin/cmdfatool.py @@ -0,0 +1,358 @@ +# -*- coding: utf-8 -*- + + +import sys +import argparse +import re +import datetime +from string import maketrans +# from fatool import Contig +from fatool import * +from decimal import * + + +def main(): + parser = argparse.ArgumentParser() + #parser.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + parser.add_argument('-v', '--version', help='display version number and exit', action='version', version='%(prog)s 0.2.1') + subparsers = parser.add_subparsers(title='fatool commands', help='each has own params, for more details use: command -h') + + sub_cut = subparsers.add_parser('cut', help='split supplied sequence into smaller parts, according to given params') + sub_cut.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True) + sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') + sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1) + sub_cut.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_cut.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_cut.set_defaults(func=cut_fa) + + sub_en = subparsers.add_parser('extractNames', help='extracting contigs names only') + sub_en.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w')) + sub_en.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_en.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_en.set_defaults(func=extract_names) + + sub_ec = subparsers.add_parser('extractContigs', help='extracting contigs specified in file (output in new file)') + sub_ec.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) + sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True) + sub_ec.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_ec.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true') + sub_ec.set_defaults(func=extract_contigs) + + sub_rc = subparsers.add_parser('remContigs', help='removing contigs specified in file (output in new file)') + sub_rc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) + sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True) + sub_rc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_rc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_rc.set_defaults(func=remove_contigs) + + sub_jc = subparsers.add_parser('join', help='joining two or more files, yet not verifing duplicates') + sub_jc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_jc.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'), required=True) + sub_jc.add_argument('--files', help='files to be joined', nargs='*', type=argparse.FileType('r')) + sub_jc.add_argument('--overwrite', help='if set owerwrites contigs with same name', action='store_true') + sub_jc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_jc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_jc.set_defaults(func=join) + + sub_sc = subparsers.add_parser('split', help='each cotig saved into separate file') + sub_sc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_sc.add_argument('-d', '--outputDir', help='output directory where splited contigs will be saved', type=str, required=True) + sub_sc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_sc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_sc.set_defaults(func=split_contigs) + + sub_r = subparsers.add_parser('reverse', help='reverse all sequences in file') + sub_r.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_r.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=argparse.FileType('w'), required=True) + sub_r.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_r.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_r.set_defaults(func=reverse) + + sub_v = subparsers.add_parser('validate', help='validates fa file') + sub_v.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_v.add_argument('-t', '--type', help='type of sequence 0 - general, 1 DNA, 2 - amino', type=int, required=True) + sub_v.add_argument('--details', help='set if you want to see detaild validation info', action='store_true') + sub_v.set_defaults(func=validate) + + sub_s = subparsers.add_parser('stats', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=statistics) + + sub_s = subparsers.add_parser('findMotif', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=find_motif) + + sub_s = subparsers.add_parser('findPrimer', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--start', help='strat codon 5\'', type=str, required=True) + sub_s.add_argument('--stop', help='stop codon 3\'', type=str, required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=find_primers) + + #parser.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + #parser.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + + args = parser.parse_args() + #if args.version: + # print version + # exit(0) + + args.func(args) + +def resolve_operator(operator_arg_list): + # makes prity print of opoerator + op = '' + for r in operator_arg_list: + op += r+' ' + return op.rstrip() + +def make_log_header(cmd, op): + stats_rep = '\n-------------------------------------------------------------' + stats_rep +='\ncmdfatool '+str(cmd)+' \n\nstarted:\t'+str(datetime.datetime.now()) + if op: + stats_rep += '\nOperator:\t'+resolve_operator(op) + stats_rep += '\n-------------------------------------------------------------\n' + return stats_rep + + +def cut_fa(args): + rep = str(make_log_header('cut', args.operator)) + + fafile = args.fafile + output = args.output + split_range = args.range + step = args.step + + f = Fa.load_from_file(fafile) + contig_list = [] + for r in f.contigs: + contig_list.join(r.cut(split_range, step)) + result_fa = Fa(contig_list, 'splited') + result_fa.write(output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + + +def extract_names(args): + rep = str(make_log_header('extractNames', args.operator)) + fafile = args.fafile + output = args.output + + fa = Fa.load_from_file(fafile) + names = fa.show_names() + with output as o: + for r in names: + o.write('>'+r) + rep += 'Number of neames founded:\t' + str(len(names)) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + +def extract_contigs(args): + # default all extracted contigs in one file + # with flag multifile save each contig to separate file + + rep = str(make_log_header('extractContigs', args.operator)) + + fa = Fa.load_from_file(args.fafile) + rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) + + #file with contigs names one per line + with args.list as cntgs: + elist = [c.strip() for c in cntgs] + result_fa = fa.extract(elist) + if( args.multifile): + result_fa.write_multiple_files(args.output) + else: + result_fa.write(args.output) + rep += '\nContigs to remove:\t'+str(len(elist)) + rep += '\Extracted contigs:\t'+str(len(result_ta.contigs)) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + +def remove_contigs(args): + # contigs from list are removed, others saved to file + rep = str(make_log_header('remContigs', args.operator)) + fa = Fa.load_from_file(args.fafile) + rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) + # file that contains list of contigs one per line + with args.list as cntgs: + rlist = [c.strip() for c in cntgs] + rep += 'Number of contigs to remove:\t'+len(rlist) + result_fa = fa.remove(rlist) + rep += 'Number of contigs after remove:\t'+str(len(fa.contigs)) + rep += 'Contigs removed:\t'+str(len(fa.contigs) - len(result_fa.contigs)) + result_fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + + +def join(args): + # joins contig from multiple files + rep = str(make_log_header('join', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa_list = [] + contigs_to_add = 0 + # list of Fa files to join. + for r in args.files: + if len(r) > 0: + fa2add = Fa.load_from_file(r) + fa_list.append(fa2add) + contigs_to_add += fa2add.count_contigs() + rep += '\nOrginal contigs number:\t'+Fa.count_contigs() + rep += '\nTotal files to join with orginal file:\t'+len(args.files) + rep += '\nTotal contigs to add:\t'+str(contigs_to_add) + fa.join(fa_list, args.overwrite) + rep += '\nNumber of contigs after join:\t'+str(fa.count_contigs()) + fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + +def split_contigs(args): + #writes each contig in single file + rep = str(make_log_header('split', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa.write_multiple_files(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + + +def statistics(args): + # returns statistics of fa file + stats_rep = str(make_log_header('stats', args.operator)) + fa = Fa.load_from_file(args.fafile) + stats = fa.statistics() + stats_rep += '\n\nNumber of N:\t'+str(stats['N']) + stats_rep += '\nNumber of A:\t'+str(stats['A']) + stats_rep += '\nNumber of C:\t'+str(stats['C']) + stats_rep += '\nNumber of T:\t'+str(stats['T']) + stats_rep += '\nNumber of G:\t'+str(stats['G']) + getcontext().rounding = ROUND_05UP + getcontext().prec = 4 + stats_rep += '\nGC[%] (0.5 up):\t'+str(Decimal(stats['G']+stats['C'])/stats['L']*Decimal(100.00)) + stats_rep += '\n\nTotal length:\t'+str(stats['L']) + stats_rep += '\nTotal contigs:\t'+str(stats['totalc']) + stats_rep += '\n\ncontigs 1000-5000bp:\t'+str(stats['nbp1000']) + stats_rep += '\ncontigs 1000-5000bp length:\t'+str(stats['lbp1000']) + stats_rep += '\ncontigs 5001-10000bp:\t'+str(stats['nbp5000']) + stats_rep += '\ncontigs 5001-10000bp length:\t'+str(stats['lbp5000']) + stats_rep += '\ncontigs 10001-25000bp:\t'+str(stats['nbp10000']) + stats_rep += '\ncontigs 10001-25000bp length:\t'+str(stats['lbp10000']) + stats_rep += '\ncontigs 25001-50000bp:\t'+str(stats['nbp25000']) + stats_rep += '\ncontigs 25001-50000bp length:\t'+str(stats['lbp25000']) + stats_rep += '\ncontigs 50001+bp:\t'+str(stats['nbp50000']) + stats_rep += '\ncontigs 50001+bp length:\t'+str(stats['lbp50000']) + stats_rep += '\n\ncontigs > 1000bp:\t'+str(stats['nbp1000']+stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 1000bp length:\t'+str(stats['lbp1000']+stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 5000bp:\t'+str(stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 5000bp length:\t'+str(stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 10000bp:\t'+str(stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 10000bp length:\t'+str(stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 25000bp:\t'+str(stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 25000bp length:\t'+str(stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 50000bp:\t'+str(stats['nbp50000']) + stats_rep += '\ncontigs > 50000bp length:\t'+str(stats['nbp50000']) + stats_rep += '\nLongest contig:\t'+str(stats['longest']) + stats_rep += '\n\nN50:\t'+str(stats['N50']) + stats_rep += '\nL50:\t'+str(stats['L50']) + stats_rep += '\nN75:\t'+str(stats['N75']) + stats_rep += '\nL75:\t'+str(stats['L75']) + stats_rep += '\nN90:\t'+str(stats['N90']) + stats_rep += '\nL90:\t'+str(stats['L90']) + stats_rep += '\n\n------------------------------------------------------' + stats_rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + + +def validate(args): + # check if fa is valid + rep = str(make_log_header('validate', args.operator)) + fa = Fa.load_from_file(args.fafile) + result_list = {} + if args.details: + for r in fa.contigs: + result_list[r.name] = Sequence.detailed_validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + else: + for r in fa.contigs: + result_list[r.name] = Sequence.validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + #print result_list + + for r in result_list: + rep += r +'\n' + + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + + +def reverse(args): + rep = str(make_log_header('reverse', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa.reverse() + fa.write(args.output) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + +def find_motif(args): + print 'not available yet' + pass + +def find_primers(args): + print 'not available yet' + pass + + +if __name__ == '__main__': + exit(main()) + + + + + + diff --git a/build/lib/fatool/__init__.py b/build/lib/fatool/__init__.py new file mode 100644 index 0000000..22fb0c7 --- /dev/null +++ b/build/lib/fatool/__init__.py @@ -0,0 +1,3 @@ +from .sequence import Sequence +from .fa import Fa +import fuzzy \ No newline at end of file diff --git a/build/lib/fatool/fa.py b/build/lib/fatool/fa.py new file mode 100644 index 0000000..827df1b --- /dev/null +++ b/build/lib/fatool/fa.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + + +import re +import math +from fatool import Sequence + +class Fa(object): + def __init__(self, contigs_list, name): + #print contigs_list + # do poprawki + self.name = name + self.contigs = [] + self.contigs_idx = {} + for r in contigs_list: + if not isinstance(r, Sequence): + raise TypeError('Wrong param supplied Sequence was expected') + if not r.name in self.contigs_idx: + if len(self.contigs) > 0: + self.contigs.append(r) + else: + self.contigs = [r] + + self.contigs_idx[r.name] = len(self.contigs) - 1 + else: + raise NameError('Sequence name already exists: '+r.name) + # self.stats{'A':0,'C':0,'T':0,'G':0,'N':0, 'L':0, } + @staticmethod + def load_from_file(file): + if isinstance(file, str): + with open(file, 'r') as f: + contigs = Fa.load_content(f.read()) + name = file + else: + name = file.name + with file as f: + contigs = Fa.load_content(f.read() ) + + + return Fa(contigs, name) + + @staticmethod + def load_content(content): + #print content + nc = content.split('>') + contigs_list = [] + for r in nc[1:]: + contigs_list.append(Sequence('>'+r.split('\n', 1)[0], re.sub('^>.*\n', '', '>'+r.rstrip()))) + return contigs_list + + def write(self, fafile): + if isinstance(fafile, str): + with open(fafile, 'w') as f: + f.write(str(self)) + else: + with fafile as f: + f.write(str(self)) + + def write_multiple_files(self, dir): + dir = dir.rstrip('/') + dir = dir.rstrip('\\') + if len(dir) > 0: + dir = dir+'/' + for r in self.contigs: + with open(dir+r.name+'.fa', 'w') as w: + w.write(str(r)) + + def add_contigs(self, contig_list, owrite=0): + for r in contig_list: + self.add_contig(r, owrite) + + + def add_contig(self, contig, owrite = 0): + if not isinstance(contig, Sequence): + raise TypeError('Wrong param supplied contig was expected') + if contig.name in self.contigs_idx: + if owrite == 1: + #rem old item and add new name + del self.contigs[self.contigs_idx[contig.name]] + self.contigs.append(contig) + for a, r in enumerate(self.contigs): + #print 'cnt '+str(r) + self.contigs_idx[r.name] = a + else: + self.contigs.append(contig) + self.contigs_idx[contig.name] = len(self.contigs) - 1 + + def show_names(self): + return sorted(self.contigs_idx, key=self.contigs_idx.get) + + + def extract(self, contigs_name_list): + new_contig_list = [] + for r in contigs_name_list: + if r in self.contigs_idx: + new_contig_list.append(self.contigs[self.contigs_idx[r]]) + return Fa(new_contig_list, 'extr_'+self.name) + + def remove(self, contigs_name_list): + new_contig_list = [] + for r in self.contigs: + if not r.name in contigs_name_list: + new_contig_list.append(r) + return Fa(new_contig_list, 'rem_'+self.name) + + def validate(self): + ''' + ''' + + def nl_statistics(self, g, percent): + ''' + Counts statistics of N50, L50, N75 etc. + ''' + ncount = -1 # index & number of contigs with +1 + nsum = 0 + stop = math.floor(self.stats['L']*(percent/100.00)) + while nsum < stop: + ncount += 1 + nsum += g[ncount] + + self.stats['N'+str(percent)] = g[ncount] + self.stats['L'+str(percent)] = ncount + 1 + + def bp_stats(self, length): + self.stats['totalc'] += 1 + if length > 50000: + self.stats['nbp50000'] += 1 # number of contigs with length + self.stats['lbp50000'] += length # total length of contigs with min. len + elif length > 25000: + self.stats['nbp25000'] += 1 + self.stats['lbp25000'] += length + elif length > 10000: + self.stats['nbp10000'] += 1 + self.stats['lbp10000'] += length + elif length > 5000: + self.stats['nbp5000'] += 1 + self.stats['lbp5000'] += length + elif length > 1000: + self.stats['nbp1000'] += 1 + self.stats['lbp1000'] += length + + def statistics(self): + self.stats = { + 'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0, 'L': 0, + 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, + 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, + 'totalc':0 + } + nstat_list = [] + bp_stats = [] + for r in self.contigs: + temp = r.statistics() + self.stats['A'] += temp['A'] + self.stats['C'] += temp['C'] + self.stats['T'] += temp['T'] + self.stats['G'] += temp['G'] + self.stats['N'] += temp['N'] + self.stats['L'] += temp['L'] + nstat_list.append(temp['L']) + self.bp_stats(temp['L']) + + self.stats['longest'] = max(nstat_list) + nstat_list.sort() + nstat_list.reverse() + + self.nl_statistics(nstat_list, 50) + self.nl_statistics(nstat_list, 75) + self.nl_statistics(nstat_list, 90) + + #print self.stats + + return self.stats + + def sort(self, mono): + contig_list = [] + temp = {} # dict to store name:len(contig) + for r in self.contigs: + temp[r.name] = len(r) + + if mono == -1: + for r in sorted(temp, key=temp.get)[::-1]: + contig_list.append(self.contigs[self.contigs_idx[r]]) + else: + for r in sorted(temp, key=temp.get): + contig_list.append(self.contigs[self.contigs_idx[r]]) + + return Fa(contig_list, 'sorted_'+self.name) + + def reverse(): + cl = [] + for r in self.contigs: + cl.append(r.reverse) + return Fa(cl, 'rev_'+self.name) + + def join(self, fa_list, owrite = 0): + for fa in fa_list: + if not isinstance(fa, Fa): + raise TypeError('Wrong param supplied Fa was expected') + self.add_contigs(fa.contigs, owrite) + + def count_contigs(self): + return len(self.contigs) + + def __str__(self): + return_string = '' + for r in self.contigs: + return_string += str(r) + return return_string diff --git a/build/lib/fatool/fuzzy.py b/build/lib/fatool/fuzzy.py new file mode 100644 index 0000000..2177397 --- /dev/null +++ b/build/lib/fatool/fuzzy.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +#import math + +def find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0): + i = hs_start_pos # start iterate from start position + start = hs_start_pos # start of founded region at begining start of search + mmatch_count = 0 # missmatch counter + needle_len = len(needle) + j = 0 # needle iterator + while i < len(hstack): + if hstack[i] != needle[j]: + mmatch_count += 1 + #print mmatch_count + #print 'j = '+str(j) + if mmatch_count > missmatch_level: + # if missmatch level oversized back to strat + 1 and start again + i -= j + # needle iterator restart (-1) because it will be increased in a moment + j = -1 + # new start = start + 1 + start = i+1 + #print 'start = '+str(start) + # reset mmatch_count + mmatch_count = 0 + i += 1 + j += 1 + # if needle iterator = len of needle match found return it. + if j >= needle_len: + return (start,i,mmatch_count) + +def find_all_aprox_matches(needle, hstack, missmatch_level, hs_start_pos): + ret_list = [] # list of matches to return + i = hs_start_pos # start iteration from start position + needle_len = len(needle) + while i+needle_len <= len(hstack): + r = find_aprox_match_iter(needle, hstack, missmatch_level, i) + # match found append to list strat new look in start + 1 position + if r: + ret_list.append(r) + i = r[0]+1 + # match not found - no more maches in hstack + else: + #print 'not found' + break + return ret_list + +# return string from between two aproximated motifs +def find_motif_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0): + start = 0 + stop = 0 + #print 'startm: '+start_motif+'\tstop_motif: '+stop_motif + start = find_aprox_match_iter(start_motif, hstack, missmatch_level, hs_start_pos = 0) + stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) + #print start,stop + if start and stop: + return hstack[start[1]:stop[0]] + +def find_all_motifs_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0, len_min = 0, len_max = float('inf')): + i = hs_start_pos + start = 0 + stop = 0 + ret_list = [] + print 'hstack in fuzzy' + print hstack + while i <= len(hstack): + start = find_aprox_match_iter(start_motif, hstack, missmatch_level, i) + stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) + #print start,stop + if start and stop: + #print 'start + stop found' + if stop[0] - start[1] > len_min and stop[0] - start[1] < len_max: + #print 'match valid' + ret_list.append(hstack[start[1]:stop[0]]) + i = start[0]+1 + #print i + else: + break + return ret_list + +def find_motif(needle, hstack, missmatch_level, hs_start_pos = 0): + r = 0 + r = find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0) + if r: + return hstack[r[0]:r[1]] + +def find_all_motifs(needle, hstack, missmatch_level, hs_start_pos = 0): + #print 'fuzzy.find_all_motifs' + #print needle + #print hstack + #print missmatch_level + #print hs_start_pos + i = hs_start_pos + ret_list = [] + while i <= len(hstack): + r = find_aprox_match_iter(needle, hstack, missmatch_level, i ) + #print r + if r: + #print 'founded: ',r + ret_list.append(hstack[r[0]:r[1]]) + #ret_list = [hstack[r[0]:r[1]]] + i = r[0]+1 + else: + break + #print ret_list + return ret_list \ No newline at end of file diff --git a/build/lib/fatool/sequence.py b/build/lib/fatool/sequence.py new file mode 100644 index 0000000..4e21fa6 --- /dev/null +++ b/build/lib/fatool/sequence.py @@ -0,0 +1,375 @@ +# -*- coding: utf-8 -*- + +from string import maketrans +from collections import Counter +import fuzzy +import re + + +class Sequence(object): + def __init__(self, name, seq): + if Sequence.validate_name_string(name): + self.name = name.lstrip('>') + else: + raise NameError('Sequence name have to start with ">"') + self.seq = seq + #self.quality = quality + + # def is_valid(self): + + # def validate_name(self): + + + @staticmethod + def validate_name_string(nstr): + if re.search('^>', nstr): + return 1 + + def validate_seq(self): + ''' + validates general seqence not specified for DNA or others. + ''' + # pattern to find not allowed chars. + pattern = re.compile('[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + if pattern.search(self.seq): + if re.search('(\d+)', self.seq): + seq_array = self.seq.split('\n') + new_array = [] # array to store new sequence + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " ") of sequence () + i = 0 + while i < end_of_seq_array: + if not re.search('(\d+)', new_array[i][0]): + return 7 # line doesn't starts with digit + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + return 0 # bad line length + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + return 0 # not last elem of last line + else: + return 0 # not last line + else: + return 0 # block not eq 10 + if pattern.search(r): + return 0 + i += 1 + else: + return 0 # digit is not first char + # return pattern.search(self.seq) but nan error code returned before + return 1 + return 1 # valid + + @staticmethod + def generic_validate(seq, domain): + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) #'[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]' + # if sequence contains illegal chars + if pattern.search(seq): + # if digits it can be ok if format like (60 xxxxxxxxxx xxx...) + if re.search('(\d+)', seq): + # to check that we have to transform array + seq_array = seq.split('\n') + new_array = [] # array to store new sequence as array of arrays + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + if not re.search('(\d+)', new_array[i][0]): + return 7 # line doesn't starts with digit + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + return 0 # bad line length + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + return 0 # not last elem of last line + else: + return 0 # not last line + else: + return 0 # block not eq 10 + if pattern.search(r): + return 0 + i += 1 + else: + return 0 # digit is not first char + # return pattern.search(seq) but nan error code returned before + return 1 + return 1 # valid + + # def validate_dna_seq(self): + + # def validate_other_seq(self): + + @staticmethod + def detailed_validate_generic(seq, domain): + not_valid = 0 + missmatches = {} + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) + # find not allowed chars in sequence + m = pattern.finditer(seq) + log_info = [] + # if not allowed chars found + if m: + # it may be 60 xxxxxxxxxx xxx.... format + if re.search('(\d+)', seq): + seq_array = seq.split('\n') + new_array = [] # array to store new sequence after cleaning and transformation + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + # digit on begining of line was not found - error + if not re.search('(\d+)', new_array[i][0]): + log_info.append('line '+str(i+1)+": line doesn't starts with digit") # line doesn't starts with digit + # check if line length = expected line length last line can be shorter + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + #return 0 # bad line length + log_info.append('line '+str(i+1)+': bad line length') + #chcek all blocks if are eq 10 (last can be shorter) + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last elem of last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains more then 10 chars') # block gt 10 + # if block contains illegal chars now after transtrmation it should contain only legal chars. + if pattern.search(r): + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains illegal chars') + i += 1 + else: + # in this case it is not seq like "10 xxxxx xxxxx" + for mitem in m: + log_info.append('Position:\t'+str(mitem.start())+'\tvalue:\t'+str(mitem.group())) + # none of not allowed chars were found sequence OK + return log_info + # def detailed_validate_dna_seq(self): + + # def detailed_validate_other_seq(self): + + def cut(self, length, step): + ''' + cutting contig into smaller parts accordigly to supplied params + length of contig (number of chars) + step offset between current and next start + ''' + self.normalize() + i = 0 + contig_end = len(self.seq) # last position of contig + contig_list = [] # contig list returning by function + while i+length <= contig_end: + contig_list.append(Sequence('>'+self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) + i = i+step + return contig_list + + def reverse(self): + ''' + creates reversed sequence + ''' + self.normalize() + nr = re.sub('\n', '', self.seq) + rev = nr[::-1] + rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + # creating 80 chars lines + #rev = re.sub("(.{80})", '\\1\n', rev, 0) + return Sequence('>rev_'+self.name, rev) + + + def normalize(self): + self.seq = re.sub(' ', '', self.seq) + self.seq = re.sub('^\d', '', self.seq, re.M) + self.seq = re.sub('\n', '', self.seq) + + def statistics(self): + ''' + returns simple statistics for contig + ''' + self.normalize() + r = {} + c = Counter(self.seq) + r['A'] = c['A']+c['a'] + r['C'] = c['C']+c['c'] + r['G'] = c['G']+c['g'] + r['T'] = c['T']+c['t'] + r['N'] = c['N']+c['n'] + r['L'] = len(self.seq) + return r + + #def getRange(self, start, stop): + # return self.seq[start:stop] + + def translate_dna2rna(self): + nc = self.seq.translate(maketrans('ACTGactg', 'UGACugac')) + return Sequence('>rna_'+self.name, nc) + + def translate_rna2dna(self): + nc = self.seq.translate(maketrans('UGACugac', 'ACTGactg')) + return Sequence('>dna_'+self.name, nc) + + # ctrl f1 frame 1 forward, r1 frame 1 revers, fall torward all frames, rall reverse all frames, all in this way? + # supply dict of translation or its constant? + @staticmethod + def translate2protein_in_range_generic(seq, start, stop, tdict): + p = '' + p_stop = '' + # search results in distribution to frames + frame1 = [] + frame2 = [] + frame3 = [] + + # creating pattern to find start codons + for r in start: + p += r+'|' + p = '('+p.rstrip('|')+')' + + # creating pattern to find stop codons + for r in stop: + p_stop += r+'|' + p_stop = '('+p.rstrip('|')+')' + + # match for start contigs + m = re.finditer(p, seq) + + # there will be stored latest string position for each frame + frame_iterator[0,0,0] + + stop_pos = len(seq) # where to stop searching if no stopcodon found + + # using each found start codon + for r in m: + # if start is lower then last used position skip it. + if frame_iterator[r.start()%3] <= r.start(): + # set i for start position of current start contig + i = r.start() + ret = '' + while i+3 <= stop: + ret += Sequence.translate(seq[i:i+3], tdict) + if re.match(p_stop, seq[i:i+3]): + i = i+3 + break + else: + i = i+3 + + frame_iterator[r.start()%3] = i + if r.start()%3 == 0: + frame1.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 1: + frame2.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 2: + frame3.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + + return [frame1, frame2, frame3] + + @staticmethod + def translate2protein_generic(seq, tdict): + # +5 to secure all frames + f1 = '' + f2 = '' + f3 = '' + while i+5 < len(seq): + f1 += Sequence.translate(seq[i:i+3], tdict) + f2 += Sequence.translate(seq[i+1:i+4], tdict) + f3 += Sequence.translate(seq[i+2:i+5], tdict) + + return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,)] + + def translate2protein(self, tdict): + tdict = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + f = Sequence.translate2protein_generic(self.seq, tdict) + r = Sequence.translate2protein_generic(self.reverse().seq, tdict) + return {'fwd':f, 'rev':r} + + @staticmethod + def translate(contig, tdict): + if codon in tdict: + return tdict[codon] + else: + return '|'+codon+'|' + + def find_aprox_motif(self, motif, missmatch_level): + self.normalize() + return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0) + + def find_aprox_primers(self, start, stop, missmatch_level = 0, len_min = 50, len_max = 10000): + #start 5'->3' + # add missmatch_level condition if 50%> + rev = stop[::-1] + new_stop = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + r_list = [] + self.normalize() + #print '\nAfter normailzation' + #print self.seq + + res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + + rev = start[::-1] + new_start = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + #print 'new_seq in sequence\n' + #print new_seq.seq + res = fuzzy.find_all_motifs_in_aprox_range(new_start, stop, self.seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + print 'Sequence.find_aprox_primers', + for s in r_list: + print s+'\n' + return r_list + + def __str__(self): + ''' + creates nicely outputed string + ''' + return '>'+self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' + + + def __len__(self): + return len(self.seq) + + def __cmp__(self, other): + if self.seq == other.seq: + return 0 + + def __eq__(self, other): + return self.seq == other.seq \ No newline at end of file diff --git a/build/scripts-2.7/cmdfatool.py b/build/scripts-2.7/cmdfatool.py new file mode 100644 index 0000000..2ec7a47 --- /dev/null +++ b/build/scripts-2.7/cmdfatool.py @@ -0,0 +1,358 @@ +# -*- coding: utf-8 -*- + + +import sys +import argparse +import re +import datetime +from string import maketrans +# from fatool import Contig +from fatool import * +from decimal import * + + +def main(): + parser = argparse.ArgumentParser() + #parser.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + parser.add_argument('-v', '--version', help='display version number and exit', action='version', version='%(prog)s 0.2.1') + subparsers = parser.add_subparsers(title='fatool commands', help='each has own params, for more details use: command -h') + + sub_cut = subparsers.add_parser('cut', help='split supplied sequence into smaller parts, according to given params') + sub_cut.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True) + sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') + sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1) + sub_cut.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_cut.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_cut.set_defaults(func=cut_fa) + + sub_en = subparsers.add_parser('extractNames', help='extracting contigs names only') + sub_en.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w')) + sub_en.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_en.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_en.set_defaults(func=extract_names) + + sub_ec = subparsers.add_parser('extractContigs', help='extracting contigs specified in file (output in new file)') + sub_ec.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) + sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True) + sub_ec.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_ec.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true') + sub_ec.set_defaults(func=extract_contigs) + + sub_rc = subparsers.add_parser('remContigs', help='removing contigs specified in file (output in new file)') + sub_rc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) + sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True) + sub_rc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_rc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_rc.set_defaults(func=remove_contigs) + + sub_jc = subparsers.add_parser('join', help='joining two or more files, yet not verifing duplicates') + sub_jc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_jc.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'), required=True) + sub_jc.add_argument('--files', help='files to be joined', nargs='*', type=argparse.FileType('r')) + sub_jc.add_argument('--overwrite', help='if set owerwrites contigs with same name', action='store_true') + sub_jc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_jc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_jc.set_defaults(func=join) + + sub_sc = subparsers.add_parser('split', help='each cotig saved into separate file') + sub_sc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_sc.add_argument('-d', '--outputDir', help='output directory where splited contigs will be saved', type=str, required=True) + sub_sc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_sc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_sc.set_defaults(func=split_contigs) + + sub_r = subparsers.add_parser('reverse', help='reverse all sequences in file') + sub_r.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_r.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=argparse.FileType('w'), required=True) + sub_r.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_r.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + sub_r.set_defaults(func=reverse) + + sub_v = subparsers.add_parser('validate', help='validates fa file') + sub_v.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) + sub_v.add_argument('-t', '--type', help='type of sequence 0 - general, 1 DNA, 2 - amino', type=int, required=True) + sub_v.add_argument('--details', help='set if you want to see detaild validation info', action='store_true') + sub_v.set_defaults(func=validate) + + sub_s = subparsers.add_parser('stats', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=statistics) + + sub_s = subparsers.add_parser('findMotif', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=find_motif) + + sub_s = subparsers.add_parser('findPrimer', help='show statistics of fa file') + sub_s.add_argument('-f', '--fafile', help='file to show statistics usualy *.fa', type=argparse.FileType('r'), required=True) + sub_s.add_argument('--start', help='strat codon 5\'', type=str, required=True) + sub_s.add_argument('--stop', help='stop codon 3\'', type=str, required=True) + sub_s.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + sub_s.add_argument('--operator', help='user who have fired script it will be noted in log', nargs='*', type=str) + sub_s.set_defaults(func=find_primers) + + #parser.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) + #parser.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) + + args = parser.parse_args() + #if args.version: + # print version + # exit(0) + + args.func(args) + +def resolve_operator(operator_arg_list): + # makes prity print of opoerator + op = '' + for r in operator_arg_list: + op += r+' ' + return op.rstrip() + +def make_log_header(cmd, op): + stats_rep = '\n-------------------------------------------------------------' + stats_rep +='\ncmdfatool '+str(cmd)+' \n\nstarted:\t'+str(datetime.datetime.now()) + if op: + stats_rep += '\nOperator:\t'+resolve_operator(op) + stats_rep += '\n-------------------------------------------------------------\n' + return stats_rep + + +def cut_fa(args): + rep = str(make_log_header('cut', args.operator)) + + fafile = args.fafile + output = args.output + split_range = args.range + step = args.step + + f = Fa.load_from_file(fafile) + contig_list = [] + for r in f.contigs: + contig_list.join(r.cut(split_range, step)) + result_fa = Fa(contig_list, 'splited') + result_fa.write(output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + + +def extract_names(args): + rep = str(make_log_header('extractNames', args.operator)) + fafile = args.fafile + output = args.output + + fa = Fa.load_from_file(fafile) + names = fa.show_names() + with output as o: + for r in names: + o.write('>'+r) + rep += 'Number of neames founded:\t' + str(len(names)) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + +def extract_contigs(args): + # default all extracted contigs in one file + # with flag multifile save each contig to separate file + + rep = str(make_log_header('extractContigs', args.operator)) + + fa = Fa.load_from_file(args.fafile) + rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) + + #file with contigs names one per line + with args.list as cntgs: + elist = [c.strip() for c in cntgs] + result_fa = fa.extract(elist) + if( args.multifile): + result_fa.write_multiple_files(args.output) + else: + result_fa.write(args.output) + rep += '\nContigs to remove:\t'+str(len(elist)) + rep += '\Extracted contigs:\t'+str(len(result_ta.contigs)) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + +def remove_contigs(args): + # contigs from list are removed, others saved to file + rep = str(make_log_header('remContigs', args.operator)) + fa = Fa.load_from_file(args.fafile) + rep += 'Number of contigs in orginal file:\t'+str(len(fa.contigs)) + # file that contains list of contigs one per line + with args.list as cntgs: + rlist = [c.strip() for c in cntgs] + rep += 'Number of contigs to remove:\t'+len(rlist) + result_fa = fa.remove(rlist) + rep += 'Number of contigs after remove:\t'+str(len(fa.contigs)) + rep += 'Contigs removed:\t'+str(len(fa.contigs) - len(result_fa.contigs)) + result_fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + + +def join(args): + # joins contig from multiple files + rep = str(make_log_header('join', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa_list = [] + contigs_to_add = 0 + # list of Fa files to join. + for r in args.files: + if len(r) > 0: + fa2add = Fa.load_from_file(r) + fa_list.append(fa2add) + contigs_to_add += fa2add.count_contigs() + rep += '\nOrginal contigs number:\t'+Fa.count_contigs() + rep += '\nTotal files to join with orginal file:\t'+len(args.files) + rep += '\nTotal contigs to add:\t'+str(contigs_to_add) + fa.join(fa_list, args.overwrite) + rep += '\nNumber of contigs after join:\t'+str(fa.count_contigs()) + fa.write(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + +def split_contigs(args): + #writes each contig in single file + rep = str(make_log_header('split', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa.write_multiple_files(args.output) + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + + +def statistics(args): + # returns statistics of fa file + stats_rep = str(make_log_header('stats', args.operator)) + fa = Fa.load_from_file(args.fafile) + stats = fa.statistics() + stats_rep += '\n\nNumber of N:\t'+str(stats['N']) + stats_rep += '\nNumber of A:\t'+str(stats['A']) + stats_rep += '\nNumber of C:\t'+str(stats['C']) + stats_rep += '\nNumber of T:\t'+str(stats['T']) + stats_rep += '\nNumber of G:\t'+str(stats['G']) + getcontext().rounding = ROUND_05UP + getcontext().prec = 4 + stats_rep += '\nGC[%] (0.5 up):\t'+str(Decimal(stats['G']+stats['C'])/stats['L']*Decimal(100.00)) + stats_rep += '\n\nTotal length:\t'+str(stats['L']) + stats_rep += '\nTotal contigs:\t'+str(stats['totalc']) + stats_rep += '\n\ncontigs 1000-5000bp:\t'+str(stats['nbp1000']) + stats_rep += '\ncontigs 1000-5000bp length:\t'+str(stats['lbp1000']) + stats_rep += '\ncontigs 5001-10000bp:\t'+str(stats['nbp5000']) + stats_rep += '\ncontigs 5001-10000bp length:\t'+str(stats['lbp5000']) + stats_rep += '\ncontigs 10001-25000bp:\t'+str(stats['nbp10000']) + stats_rep += '\ncontigs 10001-25000bp length:\t'+str(stats['lbp10000']) + stats_rep += '\ncontigs 25001-50000bp:\t'+str(stats['nbp25000']) + stats_rep += '\ncontigs 25001-50000bp length:\t'+str(stats['lbp25000']) + stats_rep += '\ncontigs 50001+bp:\t'+str(stats['nbp50000']) + stats_rep += '\ncontigs 50001+bp length:\t'+str(stats['lbp50000']) + stats_rep += '\n\ncontigs > 1000bp:\t'+str(stats['nbp1000']+stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 1000bp length:\t'+str(stats['lbp1000']+stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 5000bp:\t'+str(stats['nbp5000']+stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 5000bp length:\t'+str(stats['lbp5000']+stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 10000bp:\t'+str(stats['nbp10000']+stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 10000bp length:\t'+str(stats['lbp10000']+stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 25000bp:\t'+str(stats['nbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 25000bp length:\t'+str(stats['lbp25000']+stats['nbp50000']) + stats_rep += '\ncontigs > 50000bp:\t'+str(stats['nbp50000']) + stats_rep += '\ncontigs > 50000bp length:\t'+str(stats['nbp50000']) + stats_rep += '\nLongest contig:\t'+str(stats['longest']) + stats_rep += '\n\nN50:\t'+str(stats['N50']) + stats_rep += '\nL50:\t'+str(stats['L50']) + stats_rep += '\nN75:\t'+str(stats['N75']) + stats_rep += '\nL75:\t'+str(stats['L75']) + stats_rep += '\nN90:\t'+str(stats['N90']) + stats_rep += '\nL90:\t'+str(stats['L90']) + stats_rep += '\n\n------------------------------------------------------' + stats_rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(stats_rep) + else: + print stats_rep + + +def validate(args): + # check if fa is valid + rep = str(make_log_header('validate', args.operator)) + fa = Fa.load_from_file(args.fafile) + result_list = {} + if args.details: + for r in fa.contigs: + result_list[r.name] = Sequence.detailed_validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + else: + for r in fa.contigs: + result_list[r.name] = Sequence.validate_generic(r, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + #print result_list + + for r in result_list: + rep += r +'\n' + + rep += '\n\n------------------------------------------------------' + rep += '\nFinished:\t'+str(datetime.datetime.now()) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + + +def reverse(args): + rep = str(make_log_header('reverse', args.operator)) + fa = Fa.load_from_file(args.fafile) + fa.reverse() + fa.write(args.output) + if args.log: + with args.log as log_file: + log_file.write(rep) + else: + print rep + +def find_motif(args): + print 'not available yet' + pass + +def find_primers(args): + print 'not available yet' + pass + + +if __name__ == '__main__': + exit(main()) + + + + + + diff --git a/fatool.py b/fatool.py deleted file mode 100644 index d92f83c..0000000 --- a/fatool.py +++ /dev/null @@ -1,320 +0,0 @@ -# -*- coding: utf-8 -*- - -import sys -import argparse -import re -import datetime -from string import maketrans - - -def main(): - parser = argparse.ArgumentParser() - #parser.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - - subparsers = parser.add_subparsers(title='facutter commands', help='each has own params, for more details use: command -h') - - sub_cut = subparsers.add_parser('cut', help='split supplied sequence into smaller parts, according to given params') - sub_cut.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_cut.add_argument('-r', '--range', help='cutted sequence length', type=int, required=True) - sub_cut.add_argument('-o', '--output', help='output file default: output.fa', type=argparse.FileType('w'), default='output.fa') - sub_cut.add_argument('-s', '--step', help='step length default: 1', type=int, default=1) - sub_cut.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_cut.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_cut.set_defaults(func=cut_fa) - - sub_en = subparsers.add_parser('extractNames', help='extracting contigs names only') - sub_en.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_en.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w')) - sub_en.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_en.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_en.set_defaults(func=extract_names) - - sub_ec = subparsers.add_parser('extractContigs', help='extracting contigs specified in file (output in new file)') - sub_ec.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_ec.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) - sub_ec.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=str, required=True) - sub_ec.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_ec.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_ec.add_argument('--multifile', help='if this flag is set each contig will be saved in separate file', action='store_true') - sub_ec.set_defaults(func=extract_contigs) - - sub_rc = subparsers.add_parser('remContigs', help='removing contigs specified in file (output in new file)') - sub_rc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_rc.add_argument('--list', help='file containing list of contigs one contig per line', type=argparse.FileType('r'), required=True) - sub_rc.add_argument('-o', '--output', help='output file if not supplied stdout', type=str, required=True) - sub_rc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_rc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_rc.set_defaults(func=remove_contigs) - - sub_jc = subparsers.add_parser('join', help='joining two or more files, yet not verifing duplicates') - sub_jc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_jc.add_argument('-o', '--output', help='output file if not supplied stdout', type=argparse.FileType('w'), required=True) - sub_jc.add_argument('--files', help='files to be joined', nargs='*', type=argparse.FileType('r')) - sub_jc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_jc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_jc.set_defaults(func=join) - - sub_sc = subparsers.add_parser('split', help='each cotig saved into separate file') - sub_sc.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_sc.add_argument('-d', '--outputDir', help='output directory where splited contigs will be saved', type=str, required=True) - sub_sc.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_sc.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_sc.set_defaults(func=split_contigs) - - sub_r = subparsers.add_parser('reverse', help='reverse all sequences in file') - sub_r.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_r.add_argument('-o', '--output', help='output file; if --multifile is set output directory', type=argparse.FileType('w'), required=True) - sub_r.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - sub_r.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - sub_r.set_defaults(func=reverse) - - sub_v = subparsers.add_parser('validate', help='validates fa file') - sub_v.add_argument('-f', '--fafile', help='file to be cut usualy *.fa', type=argparse.FileType('r'), required=True) - sub_v.add_argument('-t', '--type', help='type of sequence 0 - general, 1 DNA, 2 - amino', type=int, required=True) - sub_v.add_argument('--detailed', help='set if you want to see detaild validation info', action='store_true') - sub_v.set_defaults(func=validate) - - #parser.add_argument('--operator', help='user who have fired script it will be noted in log', type=str) - #parser.add_argument('--log', help='log file if not supplied stdout', type=argparse.FileType('w')) - - args = parser.parse_args() - args.func(args) - - -def make_log(content, lfile): - with lfile as f: - f.write(content) - - -# function prepares pattern for contig search -def make_pattern(r): - if re.match('^>', r.strip()): - pattern = '('+re.escape(r.strip())+'\n[A-Za-z\n]*)[\Z>]?' - else: - pattern = '(> '+re.escape(r.strip())+'\n[A-Za-z\n]*|>'+r.strip()+'\n[A-Za-z\n]*)[\Z>]?' - return pattern - - -def cut_fa(args): - - fafile = args.fafile - output = args.output - split_range = args.range - step = args.step - - print 'step used: '+str(step) - - fa = '' # sequence grabed from file and ceared - with fafile as f: - # load sequence from file remove first line and all white chars - for r in f.readlines()[1:]: - fa = fa+r.replace("\r", "").replace("\n", "") - - with output as o: - coe = len(fa) # end of fa position - i = 0 - - while i + split_range <= coe: - # while curent position + length of frag is less or equal postion of the last char in file. - o.write('> frag ' + str(i + 1) + ' : ' + str(i + split_range) + '\n' + str(fa[i:i + split_range]) + '\n') - i = i + step - - -def extract_names(args): - fafile = args.fafile - output = args.output - # sequence title line begining - pat = re.compile('^>') - if output is None: - print 'no output defined results will be print on stdout\n' - with fafile as f: - for r in f.readlines(): - if pat.match(r): - print r - else: - # proceed fafile and save title lines in output file - with fafile as f, output as o: - for r in f.readlines(): - if pat.match(r): - o.write(r) - -def make_file_name(r, suffix): - if len(suffix) > 0: - name = re.sub('[>\*\\\?\<\/]', '', r.strip()) - return name+'.'+suffix - else: - return re.sub('[>\*\\\?\<\/]', '', r.strip()) - - -def extract_contigs(args): - # default all extracted contigs in one file - # with flag multifile save each contig to separate file - fafile = args.fafile - elist = args.list - log = args.log - log_content = '\nfatools extractContigs\tstarted:\t'+str(datetime.datetime.now())+'\t' - - # counters: extracted contigs and list items - excounter = lcounter = 0 - log_not_found = '' - - print 'extracting contigs' - # output to multi files - if(args.multifile): - output = args.output - with elist as cntgs, fafile as f: - content = f.read() - for r in cntgs: - #print r - lcounter = lcounter + 1 - # check if list item is with '>' important to create pattern. - - m = re.search(make_pattern(r), content) - - if m: - excounter = excounter + 1 - with open(output+'/'+make_file_name(r,'fa'), 'w') as o: - o.write(m.group(1)) - else: - # log_content = log_content + 'contig not found: ' + r - log_not_found = 'contig not found: ' + r - # output to single file - else: - output = args.output - with elist as cntgs, fafile as f, open(output, 'w') as o: - content = f.read() - for r in cntgs: - lcounter = lcounter + 1 - # check if list item is with '>' important to create pattern. - - m = re.search(make_pattern(r), content) - - if m: - excounter = excounter + 1 - o.write(m.group(1)) - else: - # log_content = log_content + 'contig not found: ' + r - log_not_found = 'contig not found: ' + r - - if(log): - log_content = log_content + 'stoped:\t'+str(datetime.datetime.now())+'\n' - if args.operator: - log_content = log_content + 'operator:\t'+args.operator+'\n' - log_content = log_content + '='*15+'\nlist items:\t'+str(lcounter)+'\nextracted contigs:\t'+str(excounter)+'\n' - if log_not_found: - log_content = log_content + '\nContigs not found:\n'+'='*15+'\n'+log_not_found - make_log(log_content, log) - else: - print 'list items: '+str(lcounter)+'; extracted contigs: '+str(excounter) - if log_not_found: - '\nContigs not found:\n============================================\n'+log_not_found - - -def remove_contigs(args): - fafile = args.fafile - rlist = args.list - output = args.output - log = args.log - # counters for listitems and removed contigs - lcounter = rem_counter = 0 - with elist as cntgs, fafile as f, open(output, 'w') as o: - content = f.read() - for r in cntgs: - lcounter = lcounter + 1 - - if re.match(make_pattern, content): - rem_counter = rem_counter + 1 - content = re.sub(make_pattern, '>', content) - #rstrip removes last > left after removing last contig - o.write(content.rstrip('>')) - if(log): - make_log('fatool - remContigs:\n list items:\t'+str(lcounter)+'\ncontings rmoved:\t'+str(rem_counter), log) - else: - print 'list items:\t'+str(lcounter)+'\ncontings rmoved:\t'+str(rem_counter) - -def join(args): - with args.fafile as f: - content = f.read() - for r in args.files: - with r as j: - content = content.rstrip() + '\n' + r.read() - with args.output as o: - o.write(content) - - - - -def split_contigs(args): - with args.fafile as f: - content = f.read() - nc = content.split('>') - for r in nc[1:]: - #print ofile - with open(args.outputDir+'/'+make_file_name(r.split('\n', 1)[0],'fa'), 'w') as o: - o.write('>'+r) - - - -def statistics(args): - return 1 - - -def validate(args): - pattern = re.compile('[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') - #dna - #amino - not_valid = 0 - missmatches = {} - with args.fafile as f: - content = f.read() - if not re.search('^>', content): - print 'Invalid fa file no ">" at begining' - exit(0) - - nc = content.split('>') - nv_list = {} - m = None - log_info = '' - # detailed flag show more info - if(args.detailed): - for r in nc[1:]: # first elem is empty - # removing first line of sequence it contains name of contig - nr = re.sub('^>.*\n','','>'+r) - m = pattern.finditer(nr) - if m: - not_valid = 1 - for i in m: - log_info += 'Contig:\t'+r.split('\n', 1)[0]+'\tposition:\t'+str(i.start())+'\tvalue:\t'+str(i.group())+'\n' - #nv_list = - #break - else: - for r in nc[1:]: # first elem is empty - nr = re.sub('^>.*\n','','>'+r) - if pattern.search(nr): - not_valid = 1 - break - if not_valid == 0: - print 'File is valid fa file' - else: - print 'Invalid fa file' - if log_info: - print log_info - -def reverse(args): - with args.fafile as f, args.output as o: - content = f.read() - nc = content.split('>') - for r in nc[1:]: # first elem is empty - # removing first line - nr = re.sub('^>.*\n','','>'+r) - # removing new lines to output with 80 chars per line - nr = re.sub('\n', '', nr) - rev = nr[::-1] - rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) - # creating 80 chars lines - rev = re.sub("(.{80})",'\\1\n', rev, 0) - o.write('>rev_'+r.split('\n', 1)[0]+'\n'+rev) - - -if __name__ == '__main__': - exit(main()) diff --git a/fatool/__init__.py b/fatool/__init__.py new file mode 100644 index 0000000..22fb0c7 --- /dev/null +++ b/fatool/__init__.py @@ -0,0 +1,3 @@ +from .sequence import Sequence +from .fa import Fa +import fuzzy \ No newline at end of file diff --git a/fatool/fa.py b/fatool/fa.py new file mode 100644 index 0000000..827df1b --- /dev/null +++ b/fatool/fa.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + + +import re +import math +from fatool import Sequence + +class Fa(object): + def __init__(self, contigs_list, name): + #print contigs_list + # do poprawki + self.name = name + self.contigs = [] + self.contigs_idx = {} + for r in contigs_list: + if not isinstance(r, Sequence): + raise TypeError('Wrong param supplied Sequence was expected') + if not r.name in self.contigs_idx: + if len(self.contigs) > 0: + self.contigs.append(r) + else: + self.contigs = [r] + + self.contigs_idx[r.name] = len(self.contigs) - 1 + else: + raise NameError('Sequence name already exists: '+r.name) + # self.stats{'A':0,'C':0,'T':0,'G':0,'N':0, 'L':0, } + @staticmethod + def load_from_file(file): + if isinstance(file, str): + with open(file, 'r') as f: + contigs = Fa.load_content(f.read()) + name = file + else: + name = file.name + with file as f: + contigs = Fa.load_content(f.read() ) + + + return Fa(contigs, name) + + @staticmethod + def load_content(content): + #print content + nc = content.split('>') + contigs_list = [] + for r in nc[1:]: + contigs_list.append(Sequence('>'+r.split('\n', 1)[0], re.sub('^>.*\n', '', '>'+r.rstrip()))) + return contigs_list + + def write(self, fafile): + if isinstance(fafile, str): + with open(fafile, 'w') as f: + f.write(str(self)) + else: + with fafile as f: + f.write(str(self)) + + def write_multiple_files(self, dir): + dir = dir.rstrip('/') + dir = dir.rstrip('\\') + if len(dir) > 0: + dir = dir+'/' + for r in self.contigs: + with open(dir+r.name+'.fa', 'w') as w: + w.write(str(r)) + + def add_contigs(self, contig_list, owrite=0): + for r in contig_list: + self.add_contig(r, owrite) + + + def add_contig(self, contig, owrite = 0): + if not isinstance(contig, Sequence): + raise TypeError('Wrong param supplied contig was expected') + if contig.name in self.contigs_idx: + if owrite == 1: + #rem old item and add new name + del self.contigs[self.contigs_idx[contig.name]] + self.contigs.append(contig) + for a, r in enumerate(self.contigs): + #print 'cnt '+str(r) + self.contigs_idx[r.name] = a + else: + self.contigs.append(contig) + self.contigs_idx[contig.name] = len(self.contigs) - 1 + + def show_names(self): + return sorted(self.contigs_idx, key=self.contigs_idx.get) + + + def extract(self, contigs_name_list): + new_contig_list = [] + for r in contigs_name_list: + if r in self.contigs_idx: + new_contig_list.append(self.contigs[self.contigs_idx[r]]) + return Fa(new_contig_list, 'extr_'+self.name) + + def remove(self, contigs_name_list): + new_contig_list = [] + for r in self.contigs: + if not r.name in contigs_name_list: + new_contig_list.append(r) + return Fa(new_contig_list, 'rem_'+self.name) + + def validate(self): + ''' + ''' + + def nl_statistics(self, g, percent): + ''' + Counts statistics of N50, L50, N75 etc. + ''' + ncount = -1 # index & number of contigs with +1 + nsum = 0 + stop = math.floor(self.stats['L']*(percent/100.00)) + while nsum < stop: + ncount += 1 + nsum += g[ncount] + + self.stats['N'+str(percent)] = g[ncount] + self.stats['L'+str(percent)] = ncount + 1 + + def bp_stats(self, length): + self.stats['totalc'] += 1 + if length > 50000: + self.stats['nbp50000'] += 1 # number of contigs with length + self.stats['lbp50000'] += length # total length of contigs with min. len + elif length > 25000: + self.stats['nbp25000'] += 1 + self.stats['lbp25000'] += length + elif length > 10000: + self.stats['nbp10000'] += 1 + self.stats['lbp10000'] += length + elif length > 5000: + self.stats['nbp5000'] += 1 + self.stats['lbp5000'] += length + elif length > 1000: + self.stats['nbp1000'] += 1 + self.stats['lbp1000'] += length + + def statistics(self): + self.stats = { + 'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0, 'L': 0, + 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, + 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, + 'totalc':0 + } + nstat_list = [] + bp_stats = [] + for r in self.contigs: + temp = r.statistics() + self.stats['A'] += temp['A'] + self.stats['C'] += temp['C'] + self.stats['T'] += temp['T'] + self.stats['G'] += temp['G'] + self.stats['N'] += temp['N'] + self.stats['L'] += temp['L'] + nstat_list.append(temp['L']) + self.bp_stats(temp['L']) + + self.stats['longest'] = max(nstat_list) + nstat_list.sort() + nstat_list.reverse() + + self.nl_statistics(nstat_list, 50) + self.nl_statistics(nstat_list, 75) + self.nl_statistics(nstat_list, 90) + + #print self.stats + + return self.stats + + def sort(self, mono): + contig_list = [] + temp = {} # dict to store name:len(contig) + for r in self.contigs: + temp[r.name] = len(r) + + if mono == -1: + for r in sorted(temp, key=temp.get)[::-1]: + contig_list.append(self.contigs[self.contigs_idx[r]]) + else: + for r in sorted(temp, key=temp.get): + contig_list.append(self.contigs[self.contigs_idx[r]]) + + return Fa(contig_list, 'sorted_'+self.name) + + def reverse(): + cl = [] + for r in self.contigs: + cl.append(r.reverse) + return Fa(cl, 'rev_'+self.name) + + def join(self, fa_list, owrite = 0): + for fa in fa_list: + if not isinstance(fa, Fa): + raise TypeError('Wrong param supplied Fa was expected') + self.add_contigs(fa.contigs, owrite) + + def count_contigs(self): + return len(self.contigs) + + def __str__(self): + return_string = '' + for r in self.contigs: + return_string += str(r) + return return_string diff --git a/fatool/fuzzy.py b/fatool/fuzzy.py new file mode 100644 index 0000000..2177397 --- /dev/null +++ b/fatool/fuzzy.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +#import math + +def find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0): + i = hs_start_pos # start iterate from start position + start = hs_start_pos # start of founded region at begining start of search + mmatch_count = 0 # missmatch counter + needle_len = len(needle) + j = 0 # needle iterator + while i < len(hstack): + if hstack[i] != needle[j]: + mmatch_count += 1 + #print mmatch_count + #print 'j = '+str(j) + if mmatch_count > missmatch_level: + # if missmatch level oversized back to strat + 1 and start again + i -= j + # needle iterator restart (-1) because it will be increased in a moment + j = -1 + # new start = start + 1 + start = i+1 + #print 'start = '+str(start) + # reset mmatch_count + mmatch_count = 0 + i += 1 + j += 1 + # if needle iterator = len of needle match found return it. + if j >= needle_len: + return (start,i,mmatch_count) + +def find_all_aprox_matches(needle, hstack, missmatch_level, hs_start_pos): + ret_list = [] # list of matches to return + i = hs_start_pos # start iteration from start position + needle_len = len(needle) + while i+needle_len <= len(hstack): + r = find_aprox_match_iter(needle, hstack, missmatch_level, i) + # match found append to list strat new look in start + 1 position + if r: + ret_list.append(r) + i = r[0]+1 + # match not found - no more maches in hstack + else: + #print 'not found' + break + return ret_list + +# return string from between two aproximated motifs +def find_motif_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0): + start = 0 + stop = 0 + #print 'startm: '+start_motif+'\tstop_motif: '+stop_motif + start = find_aprox_match_iter(start_motif, hstack, missmatch_level, hs_start_pos = 0) + stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) + #print start,stop + if start and stop: + return hstack[start[1]:stop[0]] + +def find_all_motifs_in_aprox_range(start_motif, stop_motif, hstack, missmatch_level, hs_start_pos = 0, len_min = 0, len_max = float('inf')): + i = hs_start_pos + start = 0 + stop = 0 + ret_list = [] + print 'hstack in fuzzy' + print hstack + while i <= len(hstack): + start = find_aprox_match_iter(start_motif, hstack, missmatch_level, i) + stop = find_aprox_match_iter(stop_motif, hstack, missmatch_level, start[1]) + #print start,stop + if start and stop: + #print 'start + stop found' + if stop[0] - start[1] > len_min and stop[0] - start[1] < len_max: + #print 'match valid' + ret_list.append(hstack[start[1]:stop[0]]) + i = start[0]+1 + #print i + else: + break + return ret_list + +def find_motif(needle, hstack, missmatch_level, hs_start_pos = 0): + r = 0 + r = find_aprox_match_iter(needle, hstack, missmatch_level, hs_start_pos = 0) + if r: + return hstack[r[0]:r[1]] + +def find_all_motifs(needle, hstack, missmatch_level, hs_start_pos = 0): + #print 'fuzzy.find_all_motifs' + #print needle + #print hstack + #print missmatch_level + #print hs_start_pos + i = hs_start_pos + ret_list = [] + while i <= len(hstack): + r = find_aprox_match_iter(needle, hstack, missmatch_level, i ) + #print r + if r: + #print 'founded: ',r + ret_list.append(hstack[r[0]:r[1]]) + #ret_list = [hstack[r[0]:r[1]]] + i = r[0]+1 + else: + break + #print ret_list + return ret_list \ No newline at end of file diff --git a/fatool/sequence.py b/fatool/sequence.py new file mode 100644 index 0000000..4e21fa6 --- /dev/null +++ b/fatool/sequence.py @@ -0,0 +1,375 @@ +# -*- coding: utf-8 -*- + +from string import maketrans +from collections import Counter +import fuzzy +import re + + +class Sequence(object): + def __init__(self, name, seq): + if Sequence.validate_name_string(name): + self.name = name.lstrip('>') + else: + raise NameError('Sequence name have to start with ">"') + self.seq = seq + #self.quality = quality + + # def is_valid(self): + + # def validate_name(self): + + + @staticmethod + def validate_name_string(nstr): + if re.search('^>', nstr): + return 1 + + def validate_seq(self): + ''' + validates general seqence not specified for DNA or others. + ''' + # pattern to find not allowed chars. + pattern = re.compile('[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + if pattern.search(self.seq): + if re.search('(\d+)', self.seq): + seq_array = self.seq.split('\n') + new_array = [] # array to store new sequence + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " ") of sequence () + i = 0 + while i < end_of_seq_array: + if not re.search('(\d+)', new_array[i][0]): + return 7 # line doesn't starts with digit + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + return 0 # bad line length + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + return 0 # not last elem of last line + else: + return 0 # not last line + else: + return 0 # block not eq 10 + if pattern.search(r): + return 0 + i += 1 + else: + return 0 # digit is not first char + # return pattern.search(self.seq) but nan error code returned before + return 1 + return 1 # valid + + @staticmethod + def generic_validate(seq, domain): + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) #'[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]' + # if sequence contains illegal chars + if pattern.search(seq): + # if digits it can be ok if format like (60 xxxxxxxxxx xxx...) + if re.search('(\d+)', seq): + # to check that we have to transform array + seq_array = seq.split('\n') + new_array = [] # array to store new sequence as array of arrays + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + if not re.search('(\d+)', new_array[i][0]): + return 7 # line doesn't starts with digit + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + return 0 # bad line length + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + return 0 # not last elem of last line + else: + return 0 # not last line + else: + return 0 # block not eq 10 + if pattern.search(r): + return 0 + i += 1 + else: + return 0 # digit is not first char + # return pattern.search(seq) but nan error code returned before + return 1 + return 1 # valid + + # def validate_dna_seq(self): + + # def validate_other_seq(self): + + @staticmethod + def detailed_validate_generic(seq, domain): + not_valid = 0 + missmatches = {} + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) + # find not allowed chars in sequence + m = pattern.finditer(seq) + log_info = [] + # if not allowed chars found + if m: + # it may be 60 xxxxxxxxxx xxx.... format + if re.search('(\d+)', seq): + seq_array = seq.split('\n') + new_array = [] # array to store new sequence after cleaning and transformation + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + end_of_seq_array = len(seq_array) + # if min. two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + # digit on begining of line was not found - error + if not re.search('(\d+)', new_array[i][0]): + log_info.append('line '+str(i+1)+": line doesn't starts with digit") # line doesn't starts with digit + # check if line length = expected line length last line can be shorter + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + #return 0 # bad line length + log_info.append('line '+str(i+1)+': bad line length') + #chcek all blocks if are eq 10 (last can be shorter) + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last elem of last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains more then 10 chars') # block gt 10 + # if block contains illegal chars now after transtrmation it should contain only legal chars. + if pattern.search(r): + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains illegal chars') + i += 1 + else: + # in this case it is not seq like "10 xxxxx xxxxx" + for mitem in m: + log_info.append('Position:\t'+str(mitem.start())+'\tvalue:\t'+str(mitem.group())) + # none of not allowed chars were found sequence OK + return log_info + # def detailed_validate_dna_seq(self): + + # def detailed_validate_other_seq(self): + + def cut(self, length, step): + ''' + cutting contig into smaller parts accordigly to supplied params + length of contig (number of chars) + step offset between current and next start + ''' + self.normalize() + i = 0 + contig_end = len(self.seq) # last position of contig + contig_list = [] # contig list returning by function + while i+length <= contig_end: + contig_list.append(Sequence('>'+self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) + i = i+step + return contig_list + + def reverse(self): + ''' + creates reversed sequence + ''' + self.normalize() + nr = re.sub('\n', '', self.seq) + rev = nr[::-1] + rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + # creating 80 chars lines + #rev = re.sub("(.{80})", '\\1\n', rev, 0) + return Sequence('>rev_'+self.name, rev) + + + def normalize(self): + self.seq = re.sub(' ', '', self.seq) + self.seq = re.sub('^\d', '', self.seq, re.M) + self.seq = re.sub('\n', '', self.seq) + + def statistics(self): + ''' + returns simple statistics for contig + ''' + self.normalize() + r = {} + c = Counter(self.seq) + r['A'] = c['A']+c['a'] + r['C'] = c['C']+c['c'] + r['G'] = c['G']+c['g'] + r['T'] = c['T']+c['t'] + r['N'] = c['N']+c['n'] + r['L'] = len(self.seq) + return r + + #def getRange(self, start, stop): + # return self.seq[start:stop] + + def translate_dna2rna(self): + nc = self.seq.translate(maketrans('ACTGactg', 'UGACugac')) + return Sequence('>rna_'+self.name, nc) + + def translate_rna2dna(self): + nc = self.seq.translate(maketrans('UGACugac', 'ACTGactg')) + return Sequence('>dna_'+self.name, nc) + + # ctrl f1 frame 1 forward, r1 frame 1 revers, fall torward all frames, rall reverse all frames, all in this way? + # supply dict of translation or its constant? + @staticmethod + def translate2protein_in_range_generic(seq, start, stop, tdict): + p = '' + p_stop = '' + # search results in distribution to frames + frame1 = [] + frame2 = [] + frame3 = [] + + # creating pattern to find start codons + for r in start: + p += r+'|' + p = '('+p.rstrip('|')+')' + + # creating pattern to find stop codons + for r in stop: + p_stop += r+'|' + p_stop = '('+p.rstrip('|')+')' + + # match for start contigs + m = re.finditer(p, seq) + + # there will be stored latest string position for each frame + frame_iterator[0,0,0] + + stop_pos = len(seq) # where to stop searching if no stopcodon found + + # using each found start codon + for r in m: + # if start is lower then last used position skip it. + if frame_iterator[r.start()%3] <= r.start(): + # set i for start position of current start contig + i = r.start() + ret = '' + while i+3 <= stop: + ret += Sequence.translate(seq[i:i+3], tdict) + if re.match(p_stop, seq[i:i+3]): + i = i+3 + break + else: + i = i+3 + + frame_iterator[r.start()%3] = i + if r.start()%3 == 0: + frame1.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 1: + frame2.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 2: + frame3.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + + return [frame1, frame2, frame3] + + @staticmethod + def translate2protein_generic(seq, tdict): + # +5 to secure all frames + f1 = '' + f2 = '' + f3 = '' + while i+5 < len(seq): + f1 += Sequence.translate(seq[i:i+3], tdict) + f2 += Sequence.translate(seq[i+1:i+4], tdict) + f3 += Sequence.translate(seq[i+2:i+5], tdict) + + return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,)] + + def translate2protein(self, tdict): + tdict = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + f = Sequence.translate2protein_generic(self.seq, tdict) + r = Sequence.translate2protein_generic(self.reverse().seq, tdict) + return {'fwd':f, 'rev':r} + + @staticmethod + def translate(contig, tdict): + if codon in tdict: + return tdict[codon] + else: + return '|'+codon+'|' + + def find_aprox_motif(self, motif, missmatch_level): + self.normalize() + return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0) + + def find_aprox_primers(self, start, stop, missmatch_level = 0, len_min = 50, len_max = 10000): + #start 5'->3' + # add missmatch_level condition if 50%> + rev = stop[::-1] + new_stop = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + r_list = [] + self.normalize() + #print '\nAfter normailzation' + #print self.seq + + res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + + rev = start[::-1] + new_start = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + #print 'new_seq in sequence\n' + #print new_seq.seq + res = fuzzy.find_all_motifs_in_aprox_range(new_start, stop, self.seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + print 'Sequence.find_aprox_primers', + for s in r_list: + print s+'\n' + return r_list + + def __str__(self): + ''' + creates nicely outputed string + ''' + return '>'+self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' + + + def __len__(self): + return len(self.seq) + + def __cmp__(self, other): + if self.seq == other.seq: + return 0 + + def __eq__(self, other): + return self.seq == other.seq \ No newline at end of file diff --git a/fatool/tests/__init__.py b/fatool/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fatool/tests/test_fa.py b/fatool/tests/test_fa.py new file mode 100644 index 0000000..b25e1f1 --- /dev/null +++ b/fatool/tests/test_fa.py @@ -0,0 +1,193 @@ +import unittest +import sys +from fatool import * +import os + + + + +class TestFa(unittest.TestCase): + + def setUp(self): + with open('test.fa', 'w') as f: + f.write('>name3\nCTNACtacgatNNNNNNN\n>name4\nCTNAC\n>name5\nNNNNNACTGNNNN\n>name\nACTGactg\n>name7\nNNNACTGN\n>name8\nCTNACtacgatNNNNNNN\n>name2\nNNNNNNNNNACTGNNNN\n>name6\nCTNACtatNNN\n') + + with open('f2.fa', 'w') as f: + f.write('') + pass + + def test_setUpFa(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + self.assertEqual('test-fa', f.name) + self.assertEqual({'name':0, 'name2':1, 'name3':2}, f.contigs_idx) + cl.append('something') + with self.assertRaises(TypeError): + Fa(cl, 'name4') + + def test_str(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f = Fa(cl, 'test-fa') + self.assertEqual('>name\nACTGactg\n>name2\nCCCTAGACTG\n>name3\nCTNNNNNNACtacgat\n', str(f)) + + def test_add_contig(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + f.add_contig(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + self.assertEqual(cl, f.contigs) + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa') ) + self.assertEqual(cl, f.contigs) + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'ACTGaaaaaaa')] + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) + self.assertEqual(cl, f.contigs) + + def test_add_contigs(self): + cl = [Sequence('>name', 'ACTGactg')] + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f.add_contigs([Sequence('>name2', 'CCCTAGACTG'), Sequence('>name3', 'CTNNNNNNACtacgat')]) + self.assertEqual(cl, f.contigs) + f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')]) + self.assertEqual(cl, f.contigs) + f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 1) + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl, f.contigs) + #self.assertEqual(cl, f.contigs) + + def test_show_names(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual(['name','name2','name3'], f.show_names()) + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) + self.assertEqual(['name','name3','name2'], f.show_names()) + f.add_contig(Sequence('>name7', 'ACTGaaaaaaa'), 1) + self.assertEqual(['name','name3','name2','name7'], f.show_names()) + + def test_extract(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + cl2 = [Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl2, f.extract(['name2', 'name3']).contigs) + self.assertEqual('extr_test-fa', f.extract(['name2', 'name3']).name) + self.assertEqual(cl2, f.extract(['name2', 'name3', 'name321']).contigs) + + + def test_remove(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3','name234']).contigs) + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) + + def test_statistics(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + stat = { + 'A': 7, 'C': 8, 'T': 7, 'G': 4, 'N': 22, 'L': 48, + 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, + 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, + 'totalc':4, 'N50':17, 'L50':2, 'N75':8, 'L75':3, 'N90':8, 'L90':3, + 'longest':18 + } + + self.assertEqual(stat, f.statistics()) + + def test_sort(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + cl = [Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), Sequence('>name4', 'CTNAC')] + #for r in f.sort(1).contigs: + # print r + #for r in cl.reverse(): + # print r + self.assertEqual(cl, f.sort(-1).contigs) + cl = [Sequence('>name4', 'CTNAC'), Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl, f.sort(1).contigs) + + def test_join(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + cl2 = [Sequence('>name', 'NNNNNNNN'), Sequence('>name5', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtacgatNNNNNNN')] + f2 = Fa(cl2, 'test2-fa') + f.join([f2]) + cl.append(Sequence('>name5', 'NNNNNNNNNACTGNNNN')) + cl.append(Sequence('>name6', 'CTNACtacgatNNNNNNN')) + self.assertEqual(cl, f.contigs) + cl = [ + Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), + Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN') + ] + f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN')], 'fa3') + f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN')], 'fa4') + f.join([f2,f3,f4]) + self.assertEqual(cl, f.contigs) + f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name', 'AnnnnnCTGactg')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name4', 'annaCTNAC'), Sequence('>name', 'AaaCTnnaGactg')], 'fa3') + f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name3', 'CTNaaaACtacgatNNNNNNN'), Sequence('>name', 'AnnnCTGactg')], 'fa4') + f.join([f2,f3,f4]) + self.assertEqual(cl, f.contigs) + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + f = Fa([Sequence('>name', 'NNN'), Sequence('>name2', 'ACTGNNNN'), Sequence('>name3', 'NNNNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNNN'), Sequence('>name', 'ACTGactg')], 'fa3') + f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'),Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') ], 'fa4') + f.join([f2,f3,f4], 1) + + self.assertEqual(cl, f.contigs) + + def test_load_from_file(self): + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + with open('test.fa') as f: + fob = Fa.load_from_file(f) + + self.assertEqual('test.fa', fob.name) + self.assertEqual(cl, fob.contigs) + f2 = Fa.load_from_file('test.fa') + self.assertEqual('test.fa', f2.name) + self.assertEqual(cl, f2.contigs) + + + def test_write(self): + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + f = Fa(cl, 'fa1') + f.write('f2.fa') + with open('test.fa') as f1, open('f2.fa') as f2: + f1_content = f1.read() + f2_content = f2.read() + self.assertEqual(f1_content, f2_content) + + + def tearDown(self): + os.remove('f2.fa') + os.remove('test.fa') + pass + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/fatool/tests/test_sequence.py b/fatool/tests/test_sequence.py new file mode 100644 index 0000000..a9ad4ec --- /dev/null +++ b/fatool/tests/test_sequence.py @@ -0,0 +1,211 @@ +import unittest +import sys +from fatool import Sequence + + + +class TestSequence(unittest.TestCase): + def setUp(self): + pass + + def test_setUpSequence(self): + c = Sequence('>name', 'ACTGactg') + self.assertTrue( isinstance(c, Sequence) ) + self.assertEqual(c.name, 'name') + self.assertEqual(c.seq, 'ACTGactg') + + def test_contig_str(self): + c = Sequence('>name', 'ACTGactg') + self.assertEqual(str(c), '>name\nACTGactg\n') + + def test_validate_name_string(self): + c = Sequence('>name', 'ACTGactg') + self.assertEqual(c.validate_name_string('>name'), 1) + + + def test_generic_validation(self): + self.assertEqual(Sequence.generic_validate('ACTGactg', '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 1) + self.assertEqual(Sequence.generic_validate('ACTGactgee', '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + seq = ' 1 ACTGactgAC aaatttccca ACTGactgaa aaatttccca\n 41 ACTGactgAA aaatttccca ACTGactgtt aaatttccca\n 81 ACTGactgGG aaatttccca ACTGactgcc aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 1) + # not allowed symbol e + seq = ' 1 ACTGactgAC aaatttccca ACTGactgee aaatttccca\n 41 ACTGactgAA aaatttccca ACTGactgtt aaatttccca\n 81 ACTGactgGG aaatttccca ACTGactgcc aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + # not allowed symbol e + len < + seq = ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 51 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + # not allowed symbol e + len > + seq = ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 31 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + # len > + seq = ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 31 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + # len < + seq = ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 51 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + # last len > + seq = ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 51 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaattaaattaaatt' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + # last bad symbol + seq = ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 41 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatte' + self.assertEqual(Sequence.generic_validate(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), 0) + + + def test_detailed_validate_generic(self): + self.assertEqual(Sequence.detailed_validate_generic('ACTGactg', '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), []) + seq = ' 1 ACTGactgAC aaatttccca ACTGactgaa aaatttccca\n 41 ACTGactgAA aaatttccca ACTGactgtt aaatttccca\n 81 ACTGactgGG aaatttccca ACTGactgcc aaatt' + self.assertEqual(Sequence.detailed_validate_generic(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), []) + seq = ' 1 ACTGactgAC aaatttccca ACTGactgee aaatttccca\n 41 ACTGactgAA aaatttccca ACTGactgtt aaatttccca\n 81 ACTGactgGG aaatttccca ACTGactgcc aaatt' + self.assertEqual(Sequence.detailed_validate_generic(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), ['line 1: block 3 contains illegal chars']) + seq = ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 51 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt' + message = [ + 'line 1: bad line length', + 'line 1: block 1 contains illegal chars', + 'line 1: block 3 contains illegal chars', + 'line 2: bad line length', + 'line 2: block 1 contains illegal chars', + 'line 2: block 3 contains illegal chars', + 'line 3: block 1 contains illegal chars', + 'line 3: block 3 contains illegal chars', + ] + self.assertEqual(Sequence.detailed_validate_generic(seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]'), message) + + #def test_translate2protein_in_range_generic(self): + + def translate2protein_generic(self): + tdict = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + + test = 'ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATGCCCCACGGTCACTATCACATTATTGAAAACGAGGGGATCAGCATTGGCGGAATTACCGACAGTTTAATCCCCACCCTTCCCTCACATTGGACTGGCTATATTACCGTTAACGATGTGGATCAAGTGGCTATCAGTGCTAAAAAACTCGGCGGTGACATTCTGTTTGGCCCTGAAGACATTCCAGAGGTGGGCCGTTTTTGTTGGATAAAAGACCCACAGGGCGCCATTATTGCGGCCATTAGCTATTTAAAACGTTGATGTAA' + + f1 = ('','MESAFNTAGALSWHELTTNNTEEAMRFYAEIFGWHFKTVKMPHGHYHIIENEGISIGGITDSLIPTLPSHWTGYITVNDVDQVAISAKKLGGDILFGPEDIPEVGRFCWIKDPQGAIIAAISYLKR*C','AA') + f2 = ('A','WNRLLILQGR*VGMNSQPIIPKRPCASMLRFLAGTLKPSKCPTVTITLLKTRGSALAELPTV*SPPFPHIGLAILPLTMWIKWLSVLKNSAVTFCLALKTFQRWAVFVG*KTHRAPLLRPLAI*NVDV','A') + f3 = ('AT','GIGF*YCRGVKLA*THNQ*YRRGHALLC*DFWLAL*NRQNAPRSLSHY*KRGDQHWRNYRQFNPHPSLTLDWLYYR*RCGSSGYQC*KTRR*HSVWP*RHSRGGPFLLDKRPTGRHYCGH*LFKTLM*','') + + self.assertEqual([f1,f2,f3], Sequence.translate2protein_generic(test, tdict)) + + + def test_translate2protein(self): + pass + + def test_validate_seq(self): + c = Sequence('>name', 'ACTGactg') + self.assertEqual(c.validate_seq(), 1) + c = Sequence('>name', 'ACTGactgee') + self.assertEqual(c.validate_seq(), 0) + c = Sequence('>name', ' 1 ACTGactgAC aaatttccca ACTGactgaa aaatttccca\n 41 ACTGactgAA aaatttccca ACTGactgtt aaatttccca\n 81 ACTGactgGG aaatttccca ACTGactgcc aaatt') + self.assertEqual(c.validate_seq(), 1) + + # not allowed symbol e + c = Sequence('>name', ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 41 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt') + self.assertEqual(c.validate_seq(), 0) + + # not allowed symbol e + len < + c = Sequence('>name', ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 51 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt') + self.assertEqual(c.validate_seq(), 0) + + # not allowed symbol e + len > + c = Sequence('>name', ' 1 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 31 ACTGactgee aaatttccca ACTGactgee aaatttccca\n 81 ACTGactgee aaatttccca ACTGactgee aaatt') + self.assertEqual(c.validate_seq(), 0) + + # len > + c = Sequence('>name', ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 31 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatt') + self.assertEqual(c.validate_seq(), 0) + + # len < + c = Sequence('>name', ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 51 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatt') + self.assertEqual(c.validate_seq(), 0) + + # last len > + c = Sequence('>name', ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 51 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaattaaattaaatt') + self.assertEqual(c.validate_seq(), 0) + + # last bad symbol + c = Sequence('>name', ' 1 ACTGactgTT aaatttcc ACTGactgAa aaatttccca\n 41 ACTGactgcT aaatttccca ACTGactgCT aaatttccca\n 81 ACTGactgAg aaatttccca ACTGactggg aaatte') + self.assertEqual(c.validate_seq(), 0) + + def test_cmp(self): + c = Sequence('>name', ' 1 ACTG') + o = Sequence('>name2', ' 1 ACTG') + self.assertTrue(c.seq == o.seq) + + def test_cut(self): + c = Sequence('>name', ' 1 ACTG') + c.normalize() + cl = [Sequence('>name_frag_1:1', 'A'),Sequence('>name_frag_2:2', 'C'),Sequence('>name_frag_3:3', 'T'),Sequence('>name_frag_4:4', 'G')] + rcl = c.cut(1,1) + self.assertEqual(str(cl[0]), str(rcl[0])) + self.assertEqual(str(cl[1]), str(rcl[1])) + #self.assertEqual(cl[0], rcl[0]) + #self.assertEqual(cl, rcl) + + def test_revers(self): + c = Sequence('>name', ' 1 ACTG') + r = Sequence('>rev_name', 'CAGT') + self.assertEqual(str(r), str(c.reverse())) + + def test_normalise(self): + c = Sequence('>name', ' 1 ACTG') + self.assertEqual('>name\n 1 ACTG\n', str(c) ) + c.normalize() + self.assertEqual('>name\nACTG\n', str(c) ) + + def test_statistics(self): + c = Sequence('>name', ' 1 ACTG') + #r = {'A':1, 'C':1, 'T':1, 'G':1, 'N':0 'L':9} + self.assertEqual({'A':1, 'C':1, 'T':1, 'G':1, 'N':0, 'L':4}, c.statistics()) + c = Sequence('>name', ' 1 ACTG NNNNNNNN') + self.assertEqual({'A':1, 'C':1, 'T':1, 'G':1, 'N':8, 'L':12}, c.statistics()) + c = Sequence('>name', ' 1 ACTG NNNNNNNN\naaanan') + self.assertEqual({'A':5, 'C':1, 'T':1, 'G':1, 'N':10, 'L':18}, c.statistics()) + + def test_find_aprox_motif(self): + test = 'ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGC' + c = Sequence('>test', test) + #print c.find_aprox_motif('TGGAATCGGCT',1) + self.assertEqual(['TGGAATCGGCT'], c.find_aprox_motif('TGGAATCGGCT',1)) + self.assertEqual(['TGGAATCGGCT'], c.find_aprox_motif('TAGAATCGGCT', 1)) + self.assertEqual([], c.find_aprox_motif('TAGAATCGGCT', 0)) + self.assertEqual(['TGGAATCGGCT'], c.find_aprox_motif('TGGAATCGGCT',0)) + + + def test_find_aprox_primers(self): + test = 'ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATGCCCCACGGTCACTATCACATTATTGAAAACGAGGGGATCAGCATTGGCGGAATTACCGACAGTTTAATCCCCACCCTTCCCTCACATTGGACTGGCTATATTACCGTTAACGATGTGGATCAAGTGGCTATCAGTGCTAAAAAACTCGGCGGTGACATTCTGTTTGGCCCTGAAGACATTCCAGAGGTGGGCCGTTTTTGTTGGATAAAAGACCCACAGGGCGCCATTATTGCGGCCATTAGCTATTTAAAACGTTGATGTAA' + c = Sequence('>test', test) + #print '\n===========reverse==============\n' + #print c.reverse() + #res = c.find_aprox_primers('TTT', 'GGG', 0,0,10000) + #REV 'TTT'<->'CCC' + #NOR 'AAA'<->'GGG' + #print '==== res ====' + #for r in res: + # print r+'\n' + tl = [ + 'TAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATG', + 'AATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATG', + 'TTGGCTGGCACTTTAAAACCGTCAAAATG', + 'TGGCTGGCACTTTAAAACCGTCAAAATG', + 'GGCTGGCACTTTAAAACCGTCAAAATG', + 'AAAACCGTCAAAATG', + 'AAT', + 'GG', + 'TTGTTGGATAAAAGA', + 'TGTTGGATAAAAGA', + 'GTTGGATAAAAGA' + ] + + self.assertEqual(tl, c.find_aprox_primers('TTT', 'GGG', 0,0,10000)) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..10346e2 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup + +setup(name='fatool', + version='0.2.1', + description='tools for handling fasta files', + #url='http://github.com/storborg/funniest', + author='Blazej Marciniak', + author_email='blazejmarciniak@gmail.com', + license='Apache 2.0', + packages=['fatool'], + install_requires=[ + ], + scripts=['bin/cmdfatool.py'], + zip_safe=False) \ No newline at end of file