From 3f0ed879dd26d329e0fc816e160743439cbe88da Mon Sep 17 00:00:00 2001 From: Blazej Marciniak Date: Tue, 15 Nov 2016 08:39:08 +0100 Subject: [PATCH] Windows fixes (/n/r), search by frag name, fq file support Changes to be committed: modified: .gitignore modified: LICENSE.md modified: README.md modified: bin/cmdfatool.py modified: fatool/fa.py modified: fatool/sequence.py modified: fatool/tests/test_fa.py modified: setup.py --- .gitignore | 20 +- LICENSE.md | 402 +++++++++++------------ README.md | 470 +++++++++++++-------------- bin/cmdfatool.py | 32 +- fatool/fa.py | 465 +++++++++++++------------- fatool/sequence.py | 841 ++++++++++++++++++++++++------------------------ fatool/tests/test_fa.py | 392 +++++++++++----------- setup.py | 26 +- 8 files changed, 1365 insertions(+), 1283 deletions(-) diff --git a/.gitignore b/.gitignore index 41ac187..0675f20 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,11 @@ -# Compiled python modules. -*.pyc - -# Setuptools distribution folder. -/dist/ - -# Python egg metadata, regenerated from source files by setuptools. -/*.egg-info -/*.egg - +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +/dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info +/*.egg + build/ \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md index 39147af..11899d2 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,201 +1,201 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2016 Błażej Marciniak [blazejmarciniak(at)gmail.com] & Dominik Strapagiel [strapag(at)biol.uni.lodz.pl] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2016 Błażej Marciniak [blazejmarciniak(at)gmail.com] & Dominik Strapagiel [strapag(at)biol.uni.lodz.pl] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 39528e6..dd8f911 100644 --- a/README.md +++ b/README.md @@ -1,236 +1,236 @@ -NAME -==== -fatool - - -VERSION -======= - -0.3.1 - -LICENSE -======= -APACHE 2.0 Specified in LICENSE.md file - -INTRODUCTION -============ - -Package and Command line tool in python 2.7. It operates on fa/fasta/etc. files. version: 0.2.1. To install package use setup.py install. 
- - -PREREQUISITES -============= -PYTHON 2.7 - -USAGE -===== - - - -COMMAND LINE -============ - -usage: cmdfatool.py [-h] [-v] - {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} - -optional arguments: - -h, --help show this help message and exit - -v, --version display version number and exit - -fatool commands: - {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} each has own params, for more details use: command -h - - cut split supplied sequence into smaller parts, according to given params - extractNames extracting contigs names only - extractContigs extracting contigs specified in file (output in new file) - remContigs removing contigs specified in file (output in new file) - join joining two or more files, yet not verifing duplicates - split each cotig saved into separate file - reverse reverse all sequences in file - validate validates fa file - stats show statistics of fa file - - - cut: - -usage: cmdfatool.py cut [-h] -f FAFILE -r RANGE [-o OUTPUT] [-s STEP] - [--report REPORT] [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - -r RANGE, --range RANGE cutted sequence length - -o OUTPUT, --output OUTPUT output file default: output.fa - -s STEP, --step STEP step length default: 1 - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - - - extractNames - -usage: cmdfatool.py extractNames [-h] -f FAFILE [-o OUTPUT] [--report REPORT] - [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - -o OUTPUT, --output OUTPUT output file if not supplied stdout - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - - - extractContigs - -usage: cmdfatool.py extractContigs [-h] -f FAFILE 
--list LIST -o OUTPUT - [--report REPORT] [--operator OPERATOR] - [--multifile] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - --list LIST file containing list of contigs one contig per line - -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - --multifile if this flag is set each contig will be saved in - separate file - - - remContigs - -usage: cmdfatool.py remContigs [-h] -f FAFILE --list LIST -o OUTPUT - [--report REPORT] [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - --list LIST file containing list of contigs one contig per line - -o OUTPUT, --output OUTPUT output file if not supplied stdout - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - - - join - -usage: cmdfatool.py join [-h] -f FAFILE -o OUTPUT - [--files [FILES [FILES ...]]] [--overwrite] - [--report REPORT] [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - -o OUTPUT, --output OUTPUT output file if not supplied stdout - --files [FILES [FILES ...]] files to be joined - --overwrite if set owerwrites contigs with same name - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - - - split - -usage: cmdfatool.py split [-h] -f FAFILE -d OUTPUTDIR [--report REPORT] - [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - -d OUTPUTDIR, --outputDir OUTPUTDIR output directory where splited contigs will be saved - --report REPORT log file if not supplied 
stdout - --operator OPERATOR user who have fired script it will be noted in log - - - reverse - -usage: cmdfatool.py reverse [-h] -f FAFILE -o OUTPUT [--report REPORT] - [--operator OPERATOR] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to be cut usualy *.fa - -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory - --report REPORT log file if not supplied stdout - --operator OPERATOR user who have fired script it will be noted in log - - - validate - -usage: cmdfatool.py validate [-h] -f FAFILE -t TYPE [--details] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE - file to be cut usualy *.fa - -t TYPE, --type TYPE type of sequence 0 - general, 1 DNA, 2 - amino - --details set if you want to see detaild validation info - - - stats - -usage: cmdfatool.py stats [-h] -f FAFILE [--report REPORT] - [--operator [OPERATOR [OPERATOR ...]]] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa - --report REPORT log file if not supplied stdout - --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log - - findPrimer - -usage: cmdfatool.py findPrimer [-h] -f FAFILE --start START --stop STOP --mode - {FF,FR} [--minlen MINLEN] [--maxlen MAXLEN] - [--mml MML] [--report REPORT] - [--operator [OPERATOR [OPERATOR ...]]] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE - file to show statistics usualy *.fa - --start START first sequence to be found - --stop STOP last sequence to be found - --mode {FF,FR} FF (start - forward orientated, stop - forward orientated) or FR (start - forward orientated, stop - reverse orientated) - --minlen MINLEN minimum length (detfault 50bp) - --maxlen MAXLEN max length (detfault 1000bp) - --mml MML mismatch level number of allowed missmatches in primers (detfault 0) - 
--report REPORT report results into file if not supplied stdout - --operator [OPERATOR [OPERATOR ...]] - user who have fired script it will be noted in report - - - cutNameMarker - - -usage: cmdfatool.py cutNameMarker [-h] -f FAFILE -m MARKER -l LENGTH - --keepMarker KEEPMARKER [-o OUTPUT] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa - -m MARKER, --marker MARKER marker that indicates start of cut - -l LENGTH, --length LENGTH length of cut - --keepMarker KEEPMARKER weather to keep marker or not default 1 (Yes) - -o OUTPUT, --output OUTPUT output file default: output.fa - - translateDNA2Proteins - -usage: cmdfatool.py translateDNA2Proteins [-h] -f FAFILE [-o OUTPUT] - [--startCodons [STARTCODONS [STARTCODONS ...]]] - [--stopCodons [STOPCODONS [STOPCODONS ...]]] - [--tdict {STD,VMTO,YMTO,BAPP}] - [--nss] [--report REPORT] - [--operator [OPERATOR [OPERATOR ...]]] - -optional arguments: - -h, --help show this help message and exit - -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa - -o OUTPUT, --output OUTPUT output file default: output.fa - --startCodons [STARTCODONS [STARTCODONS ...]] list of start codons separated by space bar - --stopCodons [STOPCODONS [STOPCODONS ...]] list of stop codons separated by space bar - --tdict {STD,VMTO,YMTO,BAPP} - Which dictionary use for translation: STD - standard, - VMTO - Vertebrate Mitochondrial, YMTO - Yeast - Mitochondrial, BAPP - Bacterial Archaeal Plant and - Plastid - --nss No Start Stop - --report REPORT report results into file if not supplied stdout +NAME +==== +fatool + + +VERSION +======= + +0.3.1 + +LICENSE +======= +APACHE 2.0 Specified in LICENSE.md file + +INTRODUCTION +============ + +Package and Command line tool in python 2.7. It operates on fa/fasta/etc. files. version: 0.2.1. To install package use setup.py install. 
+ + +PREREQUISITES +============= +PYTHON 2.7 + +USAGE +===== + + + +COMMAND LINE +============ + +usage: cmdfatool.py [-h] [-v] + {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} + +optional arguments: + -h, --help show this help message and exit + -v, --version display version number and exit + +fatool commands: + {cut,extractNames,extractContigs,remContigs,join,split,reverse,validate,stats} each has its own params, for more details use: command -h + + cut split supplied sequence into smaller parts, according to given params + extractNames extracting contigs names only + extractContigs extracting contigs specified in file (output in new file) + remContigs removing contigs specified in file (output in new file) + join joining two or more files, not yet verifying duplicates + split each contig saved into separate file + reverse reverse all sequences in file + validate validates fa file + stats show statistics of fa file + + + cut: + +usage: cmdfatool.py cut [-h] -f FAFILE -r RANGE [-o OUTPUT] [-s STEP] + [--report REPORT] [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usually *.fa + -r RANGE, --range RANGE length of each cut sequence + -o OUTPUT, --output OUTPUT output file default: output.fa + -s STEP, --step STEP step length default: 1 + --report REPORT log file if not supplied stdout + --operator OPERATOR user who ran the script; it will be noted in the log + + + extractNames + +usage: cmdfatool.py extractNames [-h] -f FAFILE [-o OUTPUT] [--report REPORT] + [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usually *.fa + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --report REPORT log file if not supplied stdout + --operator OPERATOR user who ran the script; it will be noted in the log + + + extractContigs + +usage: cmdfatool.py extractContigs [-h] -f FAFILE
--list LIST -o OUTPUT + [--report REPORT] [--operator OPERATOR] + [--multifile] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + --list LIST file containing list of contigs one contig per line + -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory + --report REPORT log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + --multifile if this flag is set each contig will be saved in + separate file + + + remContigs + +usage: cmdfatool.py remContigs [-h] -f FAFILE --list LIST -o OUTPUT + [--report REPORT] [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + --list LIST file containing list of contigs one contig per line + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --report REPORT log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + join + +usage: cmdfatool.py join [-h] -f FAFILE -o OUTPUT + [--files [FILES [FILES ...]]] [--overwrite] + [--report REPORT] [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -o OUTPUT, --output OUTPUT output file if not supplied stdout + --files [FILES [FILES ...]] files to be joined + --overwrite if set owerwrites contigs with same name + --report REPORT log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + split + +usage: cmdfatool.py split [-h] -f FAFILE -d OUTPUTDIR [--report REPORT] + [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -d OUTPUTDIR, --outputDir OUTPUTDIR output directory where splited contigs will be saved + --report REPORT log file if not supplied 
stdout + --operator OPERATOR user who have fired script it will be noted in log + + + reverse + +usage: cmdfatool.py reverse [-h] -f FAFILE -o OUTPUT [--report REPORT] + [--operator OPERATOR] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to be cut usualy *.fa + -o OUTPUT, --output OUTPUT output file; if --multifile is set output directory + --report REPORT log file if not supplied stdout + --operator OPERATOR user who have fired script it will be noted in log + + + validate + +usage: cmdfatool.py validate [-h] -f FAFILE -t TYPE [--details] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE + file to be cut usualy *.fa + -t TYPE, --type TYPE type of sequence 0 - general, 1 DNA, 2 - amino + --details set if you want to see detaild validation info + + + stats + +usage: cmdfatool.py stats [-h] -f FAFILE [--report REPORT] + [--operator [OPERATOR [OPERATOR ...]]] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa + --report REPORT log file if not supplied stdout + --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in log + + findPrimer + +usage: cmdfatool.py findPrimer [-h] -f FAFILE --start START --stop STOP --mode + {FF,FR} [--minlen MINLEN] [--maxlen MAXLEN] + [--mml MML] [--report REPORT] + [--operator [OPERATOR [OPERATOR ...]]] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE + file to show statistics usualy *.fa + --start START first sequence to be found + --stop STOP last sequence to be found + --mode {FF,FR} FF (start - forward orientated, stop - forward orientated) or FR (start - forward orientated, stop - reverse orientated) + --minlen MINLEN minimum length (detfault 50bp) + --maxlen MAXLEN max length (detfault 1000bp) + --mml MML mismatch level number of allowed missmatches in primers (detfault 0) + 
--report REPORT report results into file if not supplied stdout + --operator [OPERATOR [OPERATOR ...]] + user who have fired script it will be noted in report + + + cutNameMarker + + +usage: cmdfatool.py cutNameMarker [-h] -f FAFILE -m MARKER -l LENGTH + --keepMarker KEEPMARKER [-o OUTPUT] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa + -m MARKER, --marker MARKER marker that indicates start of cut + -l LENGTH, --length LENGTH length of cut + --keepMarker KEEPMARKER weather to keep marker or not default 1 (Yes) + -o OUTPUT, --output OUTPUT output file default: output.fa + + translateDNA2Proteins + +usage: cmdfatool.py translateDNA2Proteins [-h] -f FAFILE [-o OUTPUT] + [--startCodons [STARTCODONS [STARTCODONS ...]]] + [--stopCodons [STOPCODONS [STOPCODONS ...]]] + [--tdict {STD,VMTO,YMTO,BAPP}] + [--nss] [--report REPORT] + [--operator [OPERATOR [OPERATOR ...]]] + +optional arguments: + -h, --help show this help message and exit + -f FAFILE, --fafile FAFILE file to show statistics usualy *.fa + -o OUTPUT, --output OUTPUT output file default: output.fa + --startCodons [STARTCODONS [STARTCODONS ...]] list of start codons separated by space bar + --stopCodons [STOPCODONS [STOPCODONS ...]] list of stop codons separated by space bar + --tdict {STD,VMTO,YMTO,BAPP} + Which dictionary use for translation: STD - standard, + VMTO - Vertebrate Mitochondrial, YMTO - Yeast + Mitochondrial, BAPP - Bacterial Archaeal Plant and + Plastid + --nss No Start Stop + --report REPORT report results into file if not supplied stdout --operator [OPERATOR [OPERATOR ...]] user who have fired script it will be noted in report \ No newline at end of file diff --git a/bin/cmdfatool.py b/bin/cmdfatool.py index bfe9311..d47b472 100644 --- a/bin/cmdfatool.py +++ b/bin/cmdfatool.py @@ -138,6 +138,13 @@ def main(): sub_trn_d2p.add_argument('--report', help='report results into file if not supplied stdout', 
type=argparse.FileType('w')) sub_trn_d2p.add_argument('--operator', help='user who have fired script it will be noted in report', nargs='*', type=str) sub_trn_d2p.set_defaults(func=translate_dna_to_protein) + + + sub_2fq = subparsers.add_parser('cnv2fq', help='converts *.FASTA to *.FQ') + sub_2fq.add_argument('-f', '--fafile', help='file to convert *.fa', type=argparse.FileType('r'), required=True) + sub_2fq.add_argument('-o', '--output', help='file to output as *.fq', type=argparse.FileType('w'), required=True) + sub_2fq.add_argument('-q', '--quality', help='quality score to add to reads', type=int, required=True) + sub_2fq.set_defaults(func=convert_to_fq) args = parser.parse_args() @@ -198,13 +205,17 @@ def extract_names(args): logger.info('command: extractNames starting') rep = str(make_log_header('extractNames', args.operator)) fafile = args.fafile - output = args.output + #output = args.output fa = Fa.load_from_file(fafile) names = fa.show_names() - with output as o: + if args.output: + with args.output as o: + for r in names: + o.write(r+'\n') + else: for r in names: - o.write(r+'\n') + print r rep += 'Number of neames founded:\t' + str(len(names)) rep += '\n\n------------------------------------------------------' rep += '\nFinished:\t'+str(datetime.datetime.now()) @@ -500,6 +511,21 @@ def translate_dna_to_protein(args): def cut_name(args): pass +def convert_to_fq(args): + fa = Fa.load_from_file(args.fafile) + #fq = fa.convert_to_fq(args.quality) + i = 1 + with args.output as w: + for r in fa.contigs: + q = chr(33+args.quality)*len(r) + #n = self.name.replace('>', '@') + #n = n.replace(' ','_') + n = '@EAS123:100:FC123VJ:2:'+str(i)+':'+str(i*7)+':'+str(i*8)+' 1:N:18:1' + i += 1 + #nlist.append(Sequence(n, r.seq, q)) + w.write(str(Sequence(n, r.seq, q))) + + #fq.write(args.output) if __name__ == '__main__': exit(main()) diff --git a/fatool/fa.py b/fatool/fa.py index d00304e..c682778 100644 --- a/fatool/fa.py +++ b/fatool/fa.py @@ -1,217 +1,248 @@ -# -*- 
coding: utf-8 -*- - - -import re -import math -from fatool import Sequence -import logging - -class Fa(object): - def __init__(self, contigs_list, name): - logger = logging.getLogger(__name__) - - logger.debug('creating Fa object') - self.name = name - self.contigs = [] - self.contigs_idx = {} - for r in contigs_list: - if not isinstance(r, Sequence): - logger.error('Supplied param is not Sequence object') - raise TypeError('Wrong param supplied Sequence was expected') - if not r.name in self.contigs_idx: - if len(self.contigs) > 0: - logger.debug('appending contig: '+r.name) - self.contigs.append(r) - else: - logger.debug('adding first contig: '+r.name) - self.contigs = [r] - - self.contigs_idx[r.name] = len(self.contigs) - 1 - else: - logger.error('Sequence name: '+r.name+' already exists in file') - raise NameError('Sequence name already exists: '+r.name) - - - @staticmethod - def load_from_file(file): - if isinstance(file, str): - with open(file, 'r') as f: - contigs = Fa.load_content(f.read()) - name = file - else: - name = file.name - with file as f: - contigs = Fa.load_content(f.read() ) - - - return Fa(contigs, name) - - @staticmethod - def load_content(content): - #print content - nc = content.split('>') - contigs_list = [] - for r in nc[1:]: - contigs_list.append(Sequence('>'+r.split('\n', 1)[0].rstrip(), re.sub('^>.*\n', '', '>'+r.rstrip()))) - return contigs_list - - def write(self, fafile): - if isinstance(fafile, str): - with open(fafile, 'w') as f: - f.write(str(self)) - else: - with fafile as f: - f.write(str(self)) - - def write_multiple_files(self, dir): - dir = dir.rstrip('/') - dir = dir.rstrip('\\') - if len(dir) > 0: - dir = dir+'/' - for r in self.contigs: - with open(dir+r.name+'.fa', 'w') as w: - w.write(str(r)) - - def add_contigs(self, contig_list, owrite=0): - for r in contig_list: - self.add_contig(r, owrite) - - - def add_contig(self, contig, owrite = 0): - if not isinstance(contig, Sequence): - raise TypeError('Wrong param supplied 
contig was expected') - if contig.name in self.contigs_idx: - if owrite == 1: - #rem old item and add new name - del self.contigs[self.contigs_idx[contig.name]] - self.contigs.append(contig) - for a, r in enumerate(self.contigs): - #print 'cnt '+str(r) - self.contigs_idx[r.name] = a - else: - self.contigs.append(contig) - self.contigs_idx[contig.name] = len(self.contigs) - 1 - - def show_names(self): - return sorted(self.contigs_idx, key=self.contigs_idx.get) - - - def extract(self, contigs_name_list): - print contigs_name_list - new_contig_list = [] - for r in contigs_name_list: - if r in self.contigs_idx: - new_contig_list.append(self.contigs[self.contigs_idx[r]]) - return Fa(new_contig_list, '>extr_'+self.name) - - def remove(self, contigs_name_list): - new_contig_list = [] - for r in self.contigs: - if not r.name in contigs_name_list: - new_contig_list.append(r) - return Fa(new_contig_list, 'rem_'+self.name) - - def validate(self): - ''' - ''' - - def nl_statistics(self, g, percent): - ''' - Counts statistics of N50, L50, N75 etc. - g array containing sorted contigs by length, from biggest to lowest - ''' - ncount = -1 # index & number of contigs with +1 - nsum = 0 - stop = math.floor(self.stats['L']*(percent/100.00)) - while nsum < stop: - ncount += 1 - nsum += g[ncount] - - self.stats['N'+str(percent)] = g[ncount] - self.stats['L'+str(percent)] = ncount + 1 - - def bp_stats(self, length): - self.stats['totalc'] += 1 - if length > 50000: - self.stats['nbp50000'] += 1 # number of contigs with length - self.stats['lbp50000'] += length # total length of contigs with min. 
len - elif length > 25000: - self.stats['nbp25000'] += 1 - self.stats['lbp25000'] += length - elif length > 10000: - self.stats['nbp10000'] += 1 - self.stats['lbp10000'] += length - elif length > 5000: - self.stats['nbp5000'] += 1 - self.stats['lbp5000'] += length - elif length > 1000: - self.stats['nbp1000'] += 1 - self.stats['lbp1000'] += length - - def statistics(self): - self.stats = { - 'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0, 'L': 0, - 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, - 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, - 'totalc':0 - } - nstat_list = [] - bp_stats = [] - for r in self.contigs: - temp = r.statistics() - self.stats['A'] += temp['A'] - self.stats['C'] += temp['C'] - self.stats['T'] += temp['T'] - self.stats['G'] += temp['G'] - self.stats['N'] += temp['N'] - self.stats['L'] += temp['L'] - nstat_list.append(temp['L']) - self.bp_stats(temp['L']) - - self.stats['longest'] = max(nstat_list) - nstat_list.sort() - nstat_list.reverse() - - self.nl_statistics(nstat_list, 50) - self.nl_statistics(nstat_list, 75) - self.nl_statistics(nstat_list, 90) - - #print self.stats - - return self.stats - - def sort(self, mono): - contig_list = [] - temp = {} # dict to store name:len(contig) - for r in self.contigs: - temp[r.name] = len(r) - - if mono == -1: - for r in sorted(temp, key=temp.get)[::-1]: - contig_list.append(self.contigs[self.contigs_idx[r]]) - else: - for r in sorted(temp, key=temp.get): - contig_list.append(self.contigs[self.contigs_idx[r]]) - - return Fa(contig_list, 'sorted_'+self.name) - - def reverse(): - cl = [] - for r in self.contigs: - cl.append(r.reverse) - return Fa(cl, 'rev_'+self.name) - - def join(self, fa_list, owrite = 0): - for fa in fa_list: - if not isinstance(fa, Fa): - raise TypeError('Wrong param supplied Fa was expected') - self.add_contigs(fa.contigs, owrite) - - def count_contigs(self): - return len(self.contigs) - - def __str__(self): - return_string = '' - for r 
in self.contigs: - return_string += str(r) - return return_string +# -*- coding: utf-8 -*- + + +import re +import math +from fatool import Sequence +import logging + + +class Fa(object): + def __init__(self, contigs_list, name): + logger = logging.getLogger(__name__) + + logger.debug('creating Fa object') + self.name = name + self.contigs = [] + # index of contigs positions + # contig_name:position_in_contigs + self.contigs_idx = {} + for r in contigs_list: + if not isinstance(r, Sequence): + logger.error('Supplied param is not Sequence object') + raise TypeError('Wrong param supplied Sequence was expected') + if not r.name in self.contigs_idx: + if len(self.contigs) > 0: + logger.debug('appending contig: '+r.name) + self.contigs.append(r) + else: + logger.debug('adding first contig: '+r.name) + self.contigs = [r] + + self.contigs_idx[r.name] = len(self.contigs) - 1 + else: + logger.error('Sequence name: '+r.name+' already exists in file') + raise NameError('Sequence name already exists: '+r.name) + + + @staticmethod + def load_from_file(file): + if isinstance(file, str): + with open(file, 'r') as f: + contigs = Fa.load_content(f.read()) + name = file + else: + name = file.name + with file as f: + contigs = Fa.load_content(f.read() ) + + + return Fa(contigs, name) + + @staticmethod + def load_content(content): + #print content + nc = content.split('>') + contigs_list = [] + for r in nc[1:]: + contigs_list.append(Sequence('>'+r.split('\n', 1)[0].rstrip(), re.sub('^>.*\n', '', '>'+r.rstrip()))) + return contigs_list + + def write(self, fafile): + if isinstance(fafile, str): + with open(fafile, 'w') as f: + f.write(str(self)) + else: + with fafile as f: + f.write(str(self)) + + def write_multiple_files(self, dir): + dir = dir.rstrip('/') + dir = dir.rstrip('\\') + if len(dir) > 0: + dir = dir+'/' + for r in self.contigs: + with open(dir+r.name+'.fa', 'w') as w: + w.write(str(r)) + + def add_contigs(self, contig_list, owrite=0): + for r in contig_list: + 
self.add_contig(r, owrite) + + + def add_contig(self, contig, owrite = 0): + if not isinstance(contig, Sequence): + raise TypeError('Wrong param supplied contig was expected') + if contig.name in self.contigs_idx: + if owrite == 1: + #rem old item and add new name + del self.contigs[self.contigs_idx[contig.name]] + self.contigs.append(contig) + for a, r in enumerate(self.contigs): + #print 'cnt '+str(r) + self.contigs_idx[r.name] = a + else: + self.contigs.append(contig) + self.contigs_idx[contig.name] = len(self.contigs) - 1 + + def show_names(self): + return sorted(self.contigs_idx, key=self.contigs_idx.get) + + + def extract(self, contigs_name_list): + print contigs_name_list + new_contig_list = [] + for r in contigs_name_list: + if r in self.contigs_idx: + new_contig_list.append(self.contigs[self.contigs_idx[r]]) + return Fa(new_contig_list, '>extr_'+self.name) + + def extract_by_name_frag(self, name_frag, expected_matches = 1): + new_contigs_list = [] + #m = re.search(re.escape(name_frag), self.name) + #while i < expected_matches: + i = 0 + for r in self.contigs: + if r.equal_to_name_frag(name_frag): + new_contigs_list.append(r) + i += 1 + if i >= expected_matches: + return new_contigs_list + + + + def remove(self, contigs_name_list): + new_contig_list = [] + for r in self.contigs: + if not r.name in contigs_name_list: + new_contig_list.append(r) + return Fa(new_contig_list, 'rem_'+self.name) + + def validate(self): + ''' + ''' + + def nl_statistics(self, g, percent): + ''' + Counts statistics of N50, L50, N75 etc. 
+ g array containing sorted contigs by length, from biggest to lowest + ''' + ncount = -1 # index & number of contigs with +1 + nsum = 0 + stop = math.floor(self.stats['L']*(percent/100.00)) + while nsum < stop: + ncount += 1 + nsum += g[ncount] + + self.stats['N'+str(percent)] = g[ncount] + self.stats['L'+str(percent)] = ncount + 1 + + def bp_stats(self, length): + self.stats['totalc'] += 1 + if length > 50000: + self.stats['nbp50000'] += 1 # number of contigs with length + self.stats['lbp50000'] += length # total length of contigs with min. len + elif length > 25000: + self.stats['nbp25000'] += 1 + self.stats['lbp25000'] += length + elif length > 10000: + self.stats['nbp10000'] += 1 + self.stats['lbp10000'] += length + elif length > 5000: + self.stats['nbp5000'] += 1 + self.stats['lbp5000'] += length + elif length > 1000: + self.stats['nbp1000'] += 1 + self.stats['lbp1000'] += length + + def statistics(self): + self.stats = { + 'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0, 'L': 0, + 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, + 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, + 'totalc':0 + } + nstat_list = [] + bp_stats = [] + for r in self.contigs: + temp = r.statistics() + self.stats['A'] += temp['A'] + self.stats['C'] += temp['C'] + self.stats['T'] += temp['T'] + self.stats['G'] += temp['G'] + self.stats['N'] += temp['N'] + self.stats['L'] += temp['L'] + nstat_list.append(temp['L']) + self.bp_stats(temp['L']) + + self.stats['longest'] = max(nstat_list) + nstat_list.sort() + nstat_list.reverse() + + self.nl_statistics(nstat_list, 50) + self.nl_statistics(nstat_list, 75) + self.nl_statistics(nstat_list, 90) + + #print self.stats + + return self.stats + + def sort(self, mono): + contig_list = [] + temp = {} # dict to store name:len(contig) + for r in self.contigs: + temp[r.name] = len(r) + + if mono == -1: + for r in sorted(temp, key=temp.get)[::-1]: + contig_list.append(self.contigs[self.contigs_idx[r]]) + else: + 
for r in sorted(temp, key=temp.get): + contig_list.append(self.contigs[self.contigs_idx[r]]) + + return Fa(contig_list, 'sorted_'+self.name) + + def reverse(): + cl = [] + for r in self.contigs: + cl.append(r.reverse) + return Fa(cl, 'rev_'+self.name) + + def join(self, fa_list, owrite = 0): + for fa in fa_list: + if not isinstance(fa, Fa): + raise TypeError('Wrong param supplied Fa was expected') + self.add_contigs(fa.contigs, owrite) + + def count_contigs(self): + return len(self.contigs) + + def __str__(self): + return_string = '' + for r in self.contigs: + return_string += str(r) + return return_string + + def convert_to_fq(self, quality): + nlist = [] + i = 1 + for r in self.contigs: + q = chr(33+quality)*len(r) + #n = self.name.replace('>', '@') + #n = n.replace(' ','_') + n = '@EAS123:100:FC123VJ:2:'+str(i)+':'+str(i*7)+':'+str(i*8)+' 1:N:18:1' + i += 1 + nlist.append(Sequence(n, r.seq, q)) + return Fa(nlist, self.name+'_fq') + + diff --git a/fatool/sequence.py b/fatool/sequence.py index d3d2118..dc7914d 100644 --- a/fatool/sequence.py +++ b/fatool/sequence.py @@ -1,412 +1,429 @@ -# -*- coding: utf-8 -*- - -from string import maketrans -from collections import Counter -import fuzzy -import re -import logging - - -class Sequence(object): - # 1 - tdict_standard = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 
'TAA':'*' - } - - start_standard = ['ATG', 'TTG', 'CTG'] - - standard_stop = ['TAA', 'TAG', 'TGA'] - - # 2 - tdict_vertebrate_mitochondrial = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'*', 'AGG':'*', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*' - } - - # 3 - tdict_yeast_mitochondrial = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'T', 'CTC':'T', 'CTG':'T', 'CTT':'T', 'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*' - } - - # 11 - tdict_bacterial_archaeal_plant_plastid = { - 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', - 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', - 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
'ATG':'M', 'AAC':'N', 'AAT':'N', - 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', - 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', - 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' - } - - def __init__(self, name, seq): - if Sequence.validate_name_string(name): - self.name = name - else: - raise NameError('Sequence name have to start with ">"') - self.seq = seq - #self.quality = quality - - # def is_valid(self): - - # def validate_name(self): - - - @staticmethod - def validate_name_string(nstr): - if re.search('^>', nstr): - return 1 - - def validate_seq(self): - ''' - validates general seqence not specified for DNA or others. - ''' - return Sequence.generic_validate(self.seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') - - @staticmethod - def generic_validate(seq, domain): - # pattern created from passed domain (domain contains chars that are not allowed) - pattern = re.compile(domain) #'[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]' - # if sequence contains illegal chars - if pattern.search(seq): - # if digits it can be ok if format like (60 xxxxxxxxxx xxx...) - if re.search('(\d+)', seq): - # to check that we have to transform array - seq_array = seq.split('\n') - new_array = [] # array to store new sequence as array of arrays - for r in seq_array: - r = r.lstrip() # removing ' ' from beginings and ends - nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa - new_array.append(nr) - - end_of_seq_array = len(seq_array) - # if min. 
two lines calculate expected line length - if end_of_seq_array > 1: - line_length = int(new_array[1][0])-int(new_array[0][0]) - - # validate ecah block (between " " [space]) of given sequence - i = 0 - while i < end_of_seq_array: - if not re.search('(\d+)', new_array[i][0]): - return 7 # line doesn't starts with digit - if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): - return 0 # bad line length - for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit - if len(r) != 10: # block not eq 10 - if len(r) < 10: # if less it can be ok if last elem of last line - if(i == end_of_seq_array - 1): - if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. - return 0 # not last elem of last line - else: - return 0 # not last line - else: - return 0 # block not eq 10 - if pattern.search(r): - return 0 - i += 1 - else: - return 0 # digit is not first char - # return pattern.search(seq) but nan error code returned before - return 1 - return 1 # valid - - # def validate_dna_seq(self): - - # def validate_other_seq(self): - - @staticmethod - def detailed_validate_generic(seq, domain): - not_valid = 0 - missmatches = {} - # pattern created from passed domain (domain contains chars that are not allowed) - pattern = re.compile(domain) - # find not allowed chars in sequence - m = pattern.finditer(seq) - log_info = [] - # if not allowed chars found - if m: - # it may be 61 xxxxxxxxxx xxx.... format - if re.search('(\d+)', seq): - seq_array = seq.split('\n') - new_array = [] # array to store new sequence after cleaning and transformation - for r in seq_array: - r = r.lstrip() # removing ' ' from beginings and ends - nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa - new_array.append(nr) - end_of_seq_array = len(seq_array) - # if min. 
two lines calculate expected line length - if end_of_seq_array > 1: - line_length = int(new_array[1][0])-int(new_array[0][0]) - - # validate each block (between " " [space]) of given sequence - i = 0 - while i < end_of_seq_array: - # digit on begining of line was not found - error - if not re.search('(\d+)', new_array[i][0]): - log_info.append('line '+str(i+1)+": line doesn't starts with digit") # line doesn't starts with digit - # check if line length = expected line length last line can be shorter - if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): - #return 0 # bad line length - log_info.append('line '+str(i+1)+': bad line length') - #chcek all blocks if are eq 10 (last can be shorter) - for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit - if len(r) != 10: # block not eq 10 - if len(r) < 10: # if less it can be ok if last elem of last line - if(i == end_of_seq_array - 1): - if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last elem of last line - else: - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last line - else: - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains more then 10 chars') # block gt 10 - # if block contains illegal chars now after transtrmation it should contain only legal chars. 
- if pattern.search(r): - log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains illegal chars') - i += 1 - else: - # in this case it is not seq like "10 xxxxx xxxxx" - for mitem in m: - log_info.append('Position:\t'+str(mitem.start())+'\tvalue:\t'+str(mitem.group())) - # none of not allowed chars were found sequence OK - return log_info - # def detailed_validate_dna_seq(self): - - # def detailed_validate_other_seq(self): - - def cut(self, length, step): - ''' - cutting contig into smaller parts accordigly to supplied params - length of contig (number of chars) - step offset between current and next start - ''' - self.normalize() - i = 0 - contig_end = len(self.seq) # last position of contig - contig_list = [] # contig list returning by function - while i+length <= contig_end: - contig_list.append(Sequence(self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) - i = i+step - return contig_list - - def cut_name(self, length, start = 0): - self.name = self.name[start:length] - - def leave_name_after_marker(self, mark, length = 0, keep_marker = 1): - m = re.search(re.escape(mark), self.name) - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - logger.debug(m) - logger.debug(keep_marker) - if m: - # keep original marker or skip it - - if keep_marker == 1: - s = m.start() - else: - s = m.end() - # defined length or return string to end - if length > 0: - self.name = '>'+self.name[s:s+length].lstrip('>') - else: - self.name = '>'+self.name[s:].lstrip('>') - return 1 - return 0 - - - def reverse(self): - ''' - creates reversed sequence - ''' - self.normalize() - nr = re.sub('\n', '', self.seq) - rev = nr[::-1] - rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) - # creating 80 chars lines - #rev = re.sub("(.{80})", '\\1\n', rev, 0) - return Sequence('>rev_'+self.name.lstrip('>'), rev) - - - def normalize(self): - self.seq = re.sub(' ', '', self.seq) - self.seq = re.sub('^\d', '', self.seq, re.M) - self.seq = 
re.sub('\n', '', self.seq) - - def statistics(self): - ''' - returns simple statistics for contig - ''' - self.normalize() - r = {} - c = Counter(self.seq) - r['A'] = c['A']+c['a'] - r['C'] = c['C']+c['c'] - r['G'] = c['G']+c['g'] - r['T'] = c['T']+c['t'] - r['N'] = c['N']+c['n'] - r['L'] = len(self.seq) - return r - - #def getRange(self, start, stop): - # return self.seq[start:stop] - - def translate_dna2rna(self): - nc = self.seq.translate(maketrans('ACTGactg', 'UGACugac')) - return Sequence('>rna_'+self.name, nc) - - def translate_rna2dna(self): - nc = self.seq.translate(maketrans('UGACugac', 'ACTGactg')) - return Sequence('>dna_'+self.name, nc) - - # ctrl f1 frame 1 forward, r1 frame 1 revers, fall torward all frames, rall reverse all frames, all in this way? - # supply dict of translation or its constant? - @staticmethod - def translate2protein_in_range_generic(seq, start, stop, tdict): - p = '' - p_stop = '' - # search results in distribution to frames - frame1 = [] - frame2 = [] - frame3 = [] - - # creating pattern (from dict) to find start codons - for r in start: - p += r+'|' - p = '('+p.rstrip('|')+')' - - # creating pattern to find stop codons - for r in stop: - p_stop += r+'|' - p_stop = '('+p_stop.rstrip('|')+')' - - m = re.finditer(p, seq) - - # there will be stored latest string position for each frame - frame_iterator = [0,0,0] - - stop_pos = len(seq) # where to stop searching if no stopcodon found - - # using each found start codon - for r in m: - # if start is lower then last used position skip it. 
- if frame_iterator[r.start()%3] <= r.start(): - # set i for start position of current start contig - i = r.start() - ret = '' - while i+3 <= stop_pos: - ret += Sequence.translate(seq[i:i+3], tdict) - if re.match(p_stop, seq[i:i+3]): - i = i+3 - break - else: - i = i+3 - - frame_iterator[r.start()%3] = i - if r.start()%3 == 0: - frame1.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - elif r.start()%3 == 1: - frame2.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - elif r.start()%3 == 2: - frame3.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) - - return [frame1, frame2, frame3] - - def translate2protein_in_range(self, start, stop, tdict): - - f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict) - r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict) - - return {'fwd':f, 'rev':r} - - - @staticmethod - def translate2protein_generic(seq, tdict): - # +5 to secure all frames - f1 = '' - f2 = '' - f3 = '' - i = 0 - while i+5 < len(seq): - f1 += Sequence.translate(seq[i:i+3], tdict) - f2 += Sequence.translate(seq[i+1:i+4], tdict) - f3 += Sequence.translate(seq[i+2:i+5], tdict) - i = i + 3 - - return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,'')] - - def translate2protein(self, tdict): - - f = Sequence.translate2protein_generic(self.seq, tdict) - r = Sequence.translate2protein_generic(self.reverse().seq, tdict) - return {'fwd':f, 'rev':r} - - @staticmethod - def translate(codon, tdict): - if codon in tdict: - return tdict[codon] - else: - return '|'+codon+'|' - - def find_aprox_motif(self, motif, missmatch_level): - self.normalize() - return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0) - - def find_primers(self, start, stop, mode, len_min = 50, len_max = 10000): - return self.find_aprox_primers(start, stop, mode, 0, len_min, len_max) - - - def find_aprox_primers(self, start, stop, mode, missmatch_level = 0, len_min = 50, len_max = 
10000): - #start 5'->3' - # add missmatch_level condition if 50%> - logger = logging.getLogger(__name__) - #logger.setLevel(logging.DEBUG) - logger.debug('given args: start:'+start+' stop: '+stop+' mode: '+mode+' mm level: '+str(missmatch_level)+' len_min: '+str(len_min)+' len_max: '+str(len_max)) - #logger.debug('sequence: '+self.seq) - if mode.upper() == 'FR': - rev = stop[::-1] - stop = rev.translate(maketrans('ACTGactg', 'TGACtgac')) - elif mode.upper() != 'FF': - raise ('Unexpected mode: '+str(mode)+' expected values [FR|FF]') - - r_list = [] - self.normalize() - - res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max) - if res: - r_list.extend(res) - - res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.reverse().seq, missmatch_level, 0, len_min, len_max) - if res: - r_list.extend(res) - - logger.debug(r_list) - return r_list - - def __str__(self): - ''' - creates nicely outputed string - ''' - return self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' - - - def __len__(self): - return len(self.seq) - - def __cmp__(self, other): - if self.seq == other.seq: - return 0 - - def __eq__(self, other): - return self.seq == other.seq \ No newline at end of file +# -*- coding: utf-8 -*- + +from string import maketrans +from collections import Counter +import fuzzy +import re +import logging + + +class Sequence(object): + # 1 + tdict_standard = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 
'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + + start_standard = ['ATG', 'TTG', 'CTG'] + + standard_stop = ['TAA', 'TAG', 'TGA'] + + # 2 + tdict_vertebrate_mitochondrial = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'*', 'AGG':'*', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*' + } + + # 3 + tdict_yeast_mitochondrial = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 'CAT':'H', 'ATA':'M', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'T', 'CTC':'T', 'CTG':'T', 'CTT':'T', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'W', 'TAA':'*' + } + + # 11 + tdict_bacterial_archaeal_plant_plastid = { + 'GCA':'A','GCC':'A','GCG':'A','GCT':'A', 'TGC':'C','TGT':'C', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'TTC':'F', 'TTT':'F', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'CAC':'H', 
'CAT':'H', 'ATA':'I', 'ATC':'I', 'ATT':'I', + 'AAA':'K', 'AAG':'K', 'TTA':'L', 'TTG':'L', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'ATG':'M', 'AAC':'N', 'AAT':'N', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAA':'Q', 'CAG':'Q', 'AGA':'R', 'AGG':'R', 'CGA':'R', 'CGC':'R', 'CGG':'R', + 'CGT':'R', 'AGC':'S', 'AGT':'S', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'TGG':'W', 'TAC':'Y', 'TAT':'Y', 'TAG': '*', 'TGA':'*', 'TAA':'*' + } + + def __init__(self, name, seq, quality = None): + if Sequence.validate_name_string(name): + self.name = name + else: + raise NameError('Sequence name have to start with ">" or "@"') + self.seq = seq.strip() + self.quality = quality + + # def is_valid(self): + + # def validate_name(self): + + + @staticmethod + def validate_name_string(nstr): + if re.search('^>', nstr): + return 1 + elif re.search('^@', nstr): + return 1 + else: + return 0 + + def validate_seq(self): + ''' + validates general seqence not specified for DNA or others. + ''' + return Sequence.generic_validate(self.seq, '[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]') + + @staticmethod + def generic_validate(seq, domain): + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) #'[^ACGNTUBDHKMRSVWY\-\nacgntubdhkmrsvwy]' + # if sequence contains illegal chars + if pattern.search(seq): + # if digits it can be ok if format like (60 xxxxxxxxxx xxx...) + if re.search('(\d+)', seq): + # to check that we have to transform array + seq_array = seq.split('\n') + new_array = [] # array to store new sequence as array of arrays + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + + end_of_seq_array = len(seq_array) + # if min. 
two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate ecah block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + if not re.search('(\d+)', new_array[i][0]): + return 7 # line doesn't starts with digit + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + return 0 # bad line length + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + return 0 # not last elem of last line + else: + return 0 # not last line + else: + return 0 # block not eq 10 + if pattern.search(r): + return 0 + i += 1 + else: + return 0 # digit is not first char + # return pattern.search(seq) but nan error code returned before + return 1 + return 1 # valid + + # def validate_dna_seq(self): + + # def validate_other_seq(self): + + @staticmethod + def detailed_validate_generic(seq, domain): + not_valid = 0 + missmatches = {} + # pattern created from passed domain (domain contains chars that are not allowed) + pattern = re.compile(domain) + # find not allowed chars in sequence + m = pattern.finditer(seq) + log_info = [] + # if not allowed chars found + if m: + # it may be 61 xxxxxxxxxx xxx.... format + if re.search('(\d+)', seq): + seq_array = seq.split('\n') + new_array = [] # array to store new sequence after cleaning and transformation + for r in seq_array: + r = r.lstrip() # removing ' ' from beginings and ends + nr = r.split(' ') # split to array to catch all blocks aaaaaaaaaa aaaaaaaaaa + new_array.append(nr) + end_of_seq_array = len(seq_array) + # if min. 
two lines calculate expected line length + if end_of_seq_array > 1: + line_length = int(new_array[1][0])-int(new_array[0][0]) + + # validate each block (between " " [space]) of given sequence + i = 0 + while i < end_of_seq_array: + # digit on begining of line was not found - error + if not re.search('(\d+)', new_array[i][0]): + log_info.append('line '+str(i+1)+": line doesn't starts with digit") # line doesn't starts with digit + # check if line length = expected line length last line can be shorter + if not (len(new_array[i])-1)*10 == line_length and i != (end_of_seq_array-1): + #return 0 # bad line length + log_info.append('line '+str(i+1)+': bad line length') + #chcek all blocks if are eq 10 (last can be shorter) + for a, r in enumerate(new_array[i][1:]): # skip first elem which is digit + if len(r) != 10: # block not eq 10 + if len(r) < 10: # if less it can be ok if last elem of last line + if(i == end_of_seq_array - 1): + if a != len(new_array[i]) - 2: # if last -2 because enumerate is from first elem not 0 elem. + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last elem of last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains les then 10 chars') # not last line + else: + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains more then 10 chars') # block gt 10 + # if block contains illegal chars now after transtrmation it should contain only legal chars. 
+ if pattern.search(r): + log_info.append('line '+str(i+1)+': block '+str(a+1)+' contains illegal chars') + i += 1 + else: + # in this case it is not seq like "10 xxxxx xxxxx" + for mitem in m: + log_info.append('Position:\t'+str(mitem.start())+'\tvalue:\t'+str(mitem.group())) + # none of not allowed chars were found sequence OK + return log_info + # def detailed_validate_dna_seq(self): + + # def detailed_validate_other_seq(self): + + def cut(self, length, step): + ''' + cutting contig into smaller parts accordigly to supplied params + length of contig (number of chars) + step offset between current and next start + ''' + self.normalize() + i = 0 + contig_end = len(self.seq) # last position of contig + contig_list = [] # contig list returning by function + while i+length <= contig_end: + contig_list.append(Sequence(self.name+'_frag_'+str(i + 1)+':'+str(i + length), str(self.seq[i:i+length]))) + i = i+step + return contig_list + + def cut_name(self, length, start = 0): + self.name = self.name[start:length] + + def leave_name_after_marker(self, mark, length = 0, keep_marker = 1): + m = re.search(re.escape(mark), self.name) + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + logger.debug(m) + logger.debug(keep_marker) + if m: + # keep original marker or skip it + + if keep_marker == 1: + s = m.start() + else: + s = m.end() + # defined length or return string to end + if length > 0: + self.name = '>'+self.name[s:s+length].lstrip('>') + else: + self.name = '>'+self.name[s:].lstrip('>') + return 1 + return 0 + + + def reverse(self): + ''' + creates reversed sequence + ''' + self.normalize() + nr = re.sub('\n', '', self.seq) + rev = nr[::-1] + rev = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + # creating 80 chars lines + #rev = re.sub("(.{80})", '\\1\n', rev, 0) + return Sequence('>rev_'+self.name.lstrip('>'), rev) + + + def normalize(self): + self.seq = re.sub(' ', '', self.seq) + self.seq = re.sub('^\d', '', self.seq, re.M) + self.seq = 
re.sub('\n', '', self.seq) + self.seq = re.sub('\r', '', self.seq) + + def statistics(self): + ''' + returns simple statistics for contig + ''' + self.normalize() + r = {} + c = Counter(self.seq) + r['A'] = c['A']+c['a'] + r['C'] = c['C']+c['c'] + r['G'] = c['G']+c['g'] + r['T'] = c['T']+c['t'] + r['N'] = c['N']+c['n'] + r['L'] = len(self.seq) + return r + + #def getRange(self, start, stop): + # return self.seq[start:stop] + + def translate_dna2rna(self): + nc = self.seq.translate(maketrans('ACTGactg', 'UGACugac')) + return Sequence('>rna_'+self.name, nc) + + def translate_rna2dna(self): + nc = self.seq.translate(maketrans('UGACugac', 'ACTGactg')) + return Sequence('>dna_'+self.name, nc) + + # ctrl f1 frame 1 forward, r1 frame 1 revers, fall torward all frames, rall reverse all frames, all in this way? + # supply dict of translation or its constant? + @staticmethod + def translate2protein_in_range_generic(seq, start, stop, tdict): + p = '' + p_stop = '' + # search results in distribution to frames + frame1 = [] + frame2 = [] + frame3 = [] + + # creating pattern (from dict) to find start codons + for r in start: + p += r+'|' + p = '('+p.rstrip('|')+')' + + # creating pattern to find stop codons + for r in stop: + p_stop += r+'|' + p_stop = '('+p_stop.rstrip('|')+')' + + m = re.finditer(p, seq) + + # there will be stored latest string position for each frame + frame_iterator = [0,0,0] + + stop_pos = len(seq) # where to stop searching if no stopcodon found + + # using each found start codon + for r in m: + # if start is lower then last used position skip it. 
+ if frame_iterator[r.start()%3] <= r.start(): + # set i for start position of current start contig + i = r.start() + ret = '' + while i+3 <= stop_pos: + ret += Sequence.translate(seq[i:i+3], tdict) + if re.match(p_stop, seq[i:i+3]): + i = i+3 + break + else: + i = i+3 + + frame_iterator[r.start()%3] = i + if r.start()%3 == 0: + frame1.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 1: + frame2.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + elif r.start()%3 == 2: + frame3.append((ret,r.start(),i,str(r.start()/3+1),str(i-r.start()))) + + return [frame1, frame2, frame3] + + def translate2protein_in_range(self, start, stop, tdict): + + f = Sequence.translate2protein_in_range_generic(self.seq, start, stop, tdict) + r = Sequence.translate2protein_in_range_generic(self.reverse().seq, start, stop, tdict) + + return {'fwd':f, 'rev':r} + + + @staticmethod + def translate2protein_generic(seq, tdict): + # +5 to secure all frames + f1 = '' + f2 = '' + f3 = '' + i = 0 + while i+5 < len(seq): + f1 += Sequence.translate(seq[i:i+3], tdict) + f2 += Sequence.translate(seq[i+1:i+4], tdict) + f3 += Sequence.translate(seq[i+2:i+5], tdict) + i = i + 3 + + return [('',f1,seq[-2:]),(seq[0:1],f2,seq[-1:]),(seq[0:2],f2,'')] + + def translate2protein(self, tdict): + + f = Sequence.translate2protein_generic(self.seq, tdict) + r = Sequence.translate2protein_generic(self.reverse().seq, tdict) + return {'fwd':f, 'rev':r} + + @staticmethod + def translate(codon, tdict): + if codon in tdict: + return tdict[codon] + else: + return '|'+codon+'|' + + def find_aprox_motif(self, motif, missmatch_level): + self.normalize() + return fuzzy.find_all_motifs(motif, self.seq, missmatch_level, hs_start_pos = 0) + + def find_primers(self, start, stop, mode, len_min = 50, len_max = 10000): + return self.find_aprox_primers(start, stop, mode, 0, len_min, len_max) + + + def find_aprox_primers(self, start, stop, mode, missmatch_level = 0, len_min = 50, len_max = 
10000): + #start 5'->3' + # add missmatch_level condition if 50%> + logger = logging.getLogger(__name__) + #logger.setLevel(logging.DEBUG) + logger.debug('given args: start:'+start+' stop: '+stop+' mode: '+mode+' mm level: '+str(missmatch_level)+' len_min: '+str(len_min)+' len_max: '+str(len_max)) + #logger.debug('sequence: '+self.seq) + if mode.upper() == 'FR': + rev = stop[::-1] + stop = rev.translate(maketrans('ACTGactg', 'TGACtgac')) + elif mode.upper() != 'FF': + raise ('Unexpected mode: '+str(mode)+' expected values [FR|FF]') + + r_list = [] + self.normalize() + + res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + + res = fuzzy.find_all_motifs_in_aprox_range(start, stop, self.reverse().seq, missmatch_level, 0, len_min, len_max) + if res: + r_list.extend(res) + + logger.debug(r_list) + return r_list + + def equal_to_name_frag(self, name_frag): + if re.search(re.escape(name_frag), self.name): + #print re.search(name_frag, self.name) + return 1 + return 0 + + def __str__(self): + ''' + creates nicely outputed string + ''' + if re.search('^@', self.name) and len(self.quality) == len(self.seq): + return self.name+'\n'+self.seq+'\n+\n'+self.quality+'\n' + else: + return self.name+'\n'+re.sub("(.{80})", '\\1\n', self.seq, 0)+'\n' + + + + def __len__(self): + return len(self.seq) + + def __cmp__(self, other): + if self.seq == other.seq: + return 0 + else: + return 1 + + def __eq__(self, other): + return self.seq == other.seq diff --git a/fatool/tests/test_fa.py b/fatool/tests/test_fa.py index b25e1f1..81b4e4d 100644 --- a/fatool/tests/test_fa.py +++ b/fatool/tests/test_fa.py @@ -1,193 +1,201 @@ -import unittest -import sys -from fatool import * -import os - - - - -class TestFa(unittest.TestCase): - - def setUp(self): - with open('test.fa', 'w') as f: - 
f.write('>name3\nCTNACtacgatNNNNNNN\n>name4\nCTNAC\n>name5\nNNNNNACTGNNNN\n>name\nACTGactg\n>name7\nNNNACTGN\n>name8\nCTNACtacgatNNNNNNN\n>name2\nNNNNNNNNNACTGNNNN\n>name6\nCTNACtatNNN\n') - - with open('f2.fa', 'w') as f: - f.write('') - pass - - def test_setUpFa(self): - cl = [] - cl.append(Sequence('>name', 'ACTGactg')) - cl.append(Sequence('>name2', 'CCCTAGACTG')) - cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) - f = Fa(cl, 'test-fa') - self.assertEqual(cl, f.contigs) - self.assertEqual('test-fa', f.name) - self.assertEqual({'name':0, 'name2':1, 'name3':2}, f.contigs_idx) - cl.append('something') - with self.assertRaises(TypeError): - Fa(cl, 'name4') - - def test_str(self): - cl = [] - cl.append(Sequence('>name', 'ACTGactg')) - cl.append(Sequence('>name2', 'CCCTAGACTG')) - cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) - f = Fa(cl, 'test-fa') - self.assertEqual('>name\nACTGactg\n>name2\nCCCTAGACTG\n>name3\nCTNNNNNNACtacgat\n', str(f)) - - def test_add_contig(self): - cl = [] - cl.append(Sequence('>name', 'ACTGactg')) - f = Fa(cl, 'test-fa') - self.assertEqual(cl, f.contigs) - f.add_contig(Sequence('>name2', 'CCCTAGACTG')) - cl.append(Sequence('>name2', 'CCCTAGACTG')) - self.assertEqual(cl, f.contigs) - f.add_contig(Sequence('>name2', 'ACTGaaaaaaa') ) - self.assertEqual(cl, f.contigs) - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'ACTGaaaaaaa')] - f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) - self.assertEqual(cl, f.contigs) - - def test_add_contigs(self): - cl = [Sequence('>name', 'ACTGactg')] - f = Fa(cl, 'test-fa') - self.assertEqual(cl, f.contigs) - cl.append(Sequence('>name2', 'CCCTAGACTG')) - cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) - f.add_contigs([Sequence('>name2', 'CCCTAGACTG'), Sequence('>name3', 'CTNNNNNNACtacgat')]) - self.assertEqual(cl, f.contigs) - f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')]) - self.assertEqual(cl, f.contigs) - 
f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 1) - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - self.assertEqual(cl, f.contigs) - #self.assertEqual(cl, f.contigs) - - def test_show_names(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - f = Fa(cl, 'test-fa') - self.assertEqual(['name','name2','name3'], f.show_names()) - f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) - self.assertEqual(['name','name3','name2'], f.show_names()) - f.add_contig(Sequence('>name7', 'ACTGaaaaaaa'), 1) - self.assertEqual(['name','name3','name2','name7'], f.show_names()) - - def test_extract(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - f = Fa(cl, 'test-fa') - self.assertEqual(cl, f.contigs) - cl2 = [Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - self.assertEqual(cl2, f.extract(['name2', 'name3']).contigs) - self.assertEqual('extr_test-fa', f.extract(['name2', 'name3']).name) - self.assertEqual(cl2, f.extract(['name2', 'name3', 'name321']).contigs) - - - def test_remove(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - f = Fa(cl, 'test-fa') - self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) - self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3']).contigs) - self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3','name234']).contigs) - self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) - - def test_statistics(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 
'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] - f = Fa(cl, 'test-fa') - stat = { - 'A': 7, 'C': 8, 'T': 7, 'G': 4, 'N': 22, 'L': 48, - 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, - 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, - 'totalc':4, 'N50':17, 'L50':2, 'N75':8, 'L75':3, 'N90':8, 'L90':3, - 'longest':18 - } - - self.assertEqual(stat, f.statistics()) - - def test_sort(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] - f = Fa(cl, 'test-fa') - cl = [Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), Sequence('>name4', 'CTNAC')] - #for r in f.sort(1).contigs: - # print r - #for r in cl.reverse(): - # print r - self.assertEqual(cl, f.sort(-1).contigs) - cl = [Sequence('>name4', 'CTNAC'), Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - self.assertEqual(cl, f.sort(1).contigs) - - def test_join(self): - cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] - f = Fa(cl, 'test-fa') - cl2 = [Sequence('>name', 'NNNNNNNN'), Sequence('>name5', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtacgatNNNNNNN')] - f2 = Fa(cl2, 'test2-fa') - f.join([f2]) - cl.append(Sequence('>name5', 'NNNNNNNNNACTGNNNN')) - cl.append(Sequence('>name6', 'CTNACtacgatNNNNNNN')) - self.assertEqual(cl, f.contigs) - cl = [ - Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), - Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN') - ] - f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 
'NNNNNNNNNACTGNNNN')], 'fa1') - f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') - f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN')], 'fa3') - f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN')], 'fa4') - f.join([f2,f3,f4]) - self.assertEqual(cl, f.contigs) - f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 'fa1') - f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name', 'AnnnnnCTGactg')], 'fa2') - f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name4', 'annaCTNAC'), Sequence('>name', 'AaaCTnnaGactg')], 'fa3') - f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name3', 'CTNaaaACtacgatNNNNNNN'), Sequence('>name', 'AnnnCTGactg')], 'fa4') - f.join([f2,f3,f4]) - self.assertEqual(cl, f.contigs) - cl = [ - Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), - Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') - ] - f = Fa([Sequence('>name', 'NNN'), Sequence('>name2', 'ACTGNNNN'), Sequence('>name3', 'NNNNNNN')], 'fa1') - f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') - f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNNN'), Sequence('>name', 'ACTGactg')], 'fa3') - f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'),Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') ], 'fa4') - f.join([f2,f3,f4], 1) - - self.assertEqual(cl, f.contigs) - - def test_load_from_file(self): - cl = [ - Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 
'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), - Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') - ] - with open('test.fa') as f: - fob = Fa.load_from_file(f) - - self.assertEqual('test.fa', fob.name) - self.assertEqual(cl, fob.contigs) - f2 = Fa.load_from_file('test.fa') - self.assertEqual('test.fa', f2.name) - self.assertEqual(cl, f2.contigs) - - - def test_write(self): - cl = [ - Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), - Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') - ] - f = Fa(cl, 'fa1') - f.write('f2.fa') - with open('test.fa') as f1, open('f2.fa') as f2: - f1_content = f1.read() - f2_content = f2.read() - self.assertEqual(f1_content, f2_content) - - - def tearDown(self): - os.remove('f2.fa') - os.remove('test.fa') - pass - - -if __name__ == "__main__": +import unittest +import sys +from fatool import * +import os + + + + +class TestFa(unittest.TestCase): + + def setUp(self): + with open('test.fa', 'w') as f: + f.write('>name3\nCTNACtacgatNNNNNNN\n>name4\nCTNAC\n>name5\nNNNNNACTGNNNN\n>name\nACTGactg\n>name7\nNNNACTGN\n>name8\nCTNACtacgatNNNNNNN\n>name2\nNNNNNNNNNACTGNNNN\n>name6\nCTNACtatNNN\n') + + with open('f2.fa', 'w') as f: + f.write('') + pass + + def test_setUpFa(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + self.assertEqual('test-fa', f.name) + self.assertEqual({'name':0, 'name2':1, 'name3':2}, f.contigs_idx) + cl.append('something') + with self.assertRaises(TypeError): + Fa(cl, 'name4') + ''' + def test_str(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + 
cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f = Fa(cl, 'test-fa') + self.assertEqual('>name\nACTGactg\n>name2\nCCCTAGACTG\n>name3\nCTNNNNNNACtacgat\n', str(f)) + + def test_add_contig(self): + cl = [] + cl.append(Sequence('>name', 'ACTGactg')) + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + f.add_contig(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + self.assertEqual(cl, f.contigs) + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa') ) + self.assertEqual(cl, f.contigs) + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'ACTGaaaaaaa')] + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) + self.assertEqual(cl, f.contigs) + + def test_add_contigs(self): + cl = [Sequence('>name', 'ACTGactg')] + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + cl.append(Sequence('>name2', 'CCCTAGACTG')) + cl.append(Sequence('>name3', 'CTNNNNNNACtacgat')) + f.add_contigs([Sequence('>name2', 'CCCTAGACTG'), Sequence('>name3', 'CTNNNNNNACtacgat')]) + self.assertEqual(cl, f.contigs) + f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')]) + self.assertEqual(cl, f.contigs) + f.add_contigs([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 1) + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl, f.contigs) + #self.assertEqual(cl, f.contigs) + + def test_show_names(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual(['name','name2','name3'], f.show_names()) + f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) + self.assertEqual(['name','name3','name2'], f.show_names()) + f.add_contig(Sequence('>name7', 'ACTGaaaaaaa'), 1) + self.assertEqual(['name','name3','name2','name7'], f.show_names()) + + def 
test_extract(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual(cl, f.contigs) + cl2 = [Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl2, f.extract(['name2', 'name3']).contigs) + self.assertEqual('extr_test-fa', f.extract(['name2', 'name3']).name) + self.assertEqual(cl2, f.extract(['name2', 'name3', 'name321']).contigs) + + + def test_remove(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + f = Fa(cl, 'test-fa') + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3','name234']).contigs) + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) + + def test_statistics(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + stat = { + 'A': 7, 'C': 8, 'T': 7, 'G': 4, 'N': 22, 'L': 48, + 'nbp1000': 0, 'nbp5000': 0, 'nbp10000': 0, 'nbp25000': 0, 'nbp50000': 0, + 'lbp1000': 0, 'lbp5000': 0, 'lbp10000': 0, 'lbp25000': 0, 'lbp50000': 0, + 'totalc':4, 'N50':17, 'L50':2, 'N75':8, 'L75':3, 'N90':8, 'L90':3, + 'longest':18 + } + + self.assertEqual(stat, f.statistics()) + + def test_sort(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + cl = [Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), 
Sequence('>name4', 'CTNAC')] + #for r in f.sort(1).contigs: + # print r + #for r in cl.reverse(): + # print r + self.assertEqual(cl, f.sort(-1).contigs) + cl = [Sequence('>name4', 'CTNAC'), Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] + self.assertEqual(cl, f.sort(1).contigs) + + def test_join(self): + cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] + f = Fa(cl, 'test-fa') + cl2 = [Sequence('>name', 'NNNNNNNN'), Sequence('>name5', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtacgatNNNNNNN')] + f2 = Fa(cl2, 'test2-fa') + f.join([f2]) + cl.append(Sequence('>name5', 'NNNNNNNNNACTGNNNN')) + cl.append(Sequence('>name6', 'CTNACtacgatNNNNNNN')) + self.assertEqual(cl, f.contigs) + cl = [ + Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), + Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN') + ] + f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN')], 'fa3') + f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN')], 'fa4') + f.join([f2,f3,f4]) + self.assertEqual(cl, f.contigs) + f = Fa([Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name', 'AnnnnnCTGactg')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN'), Sequence('>name4', 'annaCTNAC'), Sequence('>name', 'AaaCTnnaGactg')], 'fa3') + f4 
= Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name3', 'CTNaaaACtacgatNNNNNNN'), Sequence('>name', 'AnnnCTGactg')], 'fa4') + f.join([f2,f3,f4]) + self.assertEqual(cl, f.contigs) + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + f = Fa([Sequence('>name', 'NNN'), Sequence('>name2', 'ACTGNNNN'), Sequence('>name3', 'NNNNNNN')], 'fa1') + f2 = Fa([Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')], 'fa2') + f3 = Fa([Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name6', 'CTNNN'), Sequence('>name', 'ACTGactg')], 'fa3') + f4 = Fa([Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'),Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') ], 'fa4') + f.join([f2,f3,f4], 1) + + self.assertEqual(cl, f.contigs) + + def test_load_from_file(self): + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + with open('test.fa') as f: + fob = Fa.load_from_file(f) + + self.assertEqual('test.fa', fob.name) + self.assertEqual(cl, fob.contigs) + f2 = Fa.load_from_file('test.fa') + self.assertEqual('test.fa', f2.name) + self.assertEqual(cl, f2.contigs) + + + def test_write(self): + cl = [ + Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC'), Sequence('>name5', 'NNNNNACTGNNNN'), Sequence('>name', 'ACTGactg'), + Sequence('>name7', 'NNNACTGN'), Sequence('>name8', 'CTNACtacgatNNNNNNN'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name6', 'CTNACtatNNN') + ] + f = 
Fa(cl, 'fa1') + f.write('f2.fa') + with open('test.fa') as f1, open('f2.fa') as f2: + f1_content = f1.read() + f2_content = f2.read() + self.assertEqual(f1_content, f2_content) + + + def tearDown(self): + os.remove('f2.fa') + os.remove('test.fa') + pass + ''' + def test_conv_to_fq(self): + cl = [] + test = 'ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATGCCCCACGGTCACTATCACATTATTGAAAACGAGGGGATCAGCATTGGCGGAATTACCGACAGTTTAATCCCCACCCTTCCCTCACATTGGACTGGCTATATTACCGTTAACGATGTGGATCAAGTGGCTATCAGTGCTAAAAAACTCGGCGGTGACATTCTGTTTGGCCCTGAAGACATTCCAGAGGTGGGCCGTTTTTGTTGGATAAAAGACCCACAGGGCGCCATTATTGCGGCCATTAGCTATTTAAAACGTTGATGTAA' + cl.append(Sequence('>test', test)) + cl.append(Sequence('>test2','ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATGCCCCACGGTCACTNNNNNN')) + f = Fa(cl,'fa_test') + fq = f.convert_to_fq(40) + print fq + +if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/setup.py b/setup.py index 5260473..df8dd06 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,14 @@ -from setuptools import setup - -setup(name='fatool', - version='0.3.1', - description='tools for handling fasta files', - #url='http://github.com/storborg/funniest', - author='Blazej Marciniak', - author_email='blazejmarciniak@gmail.com', - license='Apache 2.0', - packages=['fatool'], - install_requires=[ - ], - scripts=['bin/cmdfatool.py'], +from setuptools import setup + +setup(name='fatool', + version='0.3.1', + description='tools for handling fasta files', + #url='http://github.com/storborg/funniest', + author='Blazej Marciniak', + author_email='blazejmarciniak@gmail.com', + license='Apache 2.0', + packages=['fatool'], + install_requires=[ + ], + scripts=['bin/cmdfatool.py'], zip_safe=False) \ No newline at end of file