From 8d73b4acfcaebf57ab6004ba150cfebca03bf78e Mon Sep 17 00:00:00 2001 From: Blazej Marciniak Date: Sun, 29 Jan 2017 11:10:42 +0100 Subject: [PATCH] cutting by length added modified: fatool/fa.py modified: fatool/tests/test_fa.py --- fatool/fa.py | 17 ++++++++++++----- fatool/tests/test_fa.py | 31 +++++++++++++++++-------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/fatool/fa.py b/fatool/fa.py index c682778..1e54c34 100644 --- a/fatool/fa.py +++ b/fatool/fa.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -import re +import re #gex as re import math from fatool import Sequence import logging @@ -51,11 +51,10 @@ def load_from_file(file): @staticmethod def load_content(content): - #print content - nc = content.split('>') + ncs = re.findall(re.compile('(?=(^>[\S\s]+?)(^>|\Z))',re.M), content) contigs_list = [] - for r in nc[1:]: - contigs_list.append(Sequence('>'+r.split('\n', 1)[0].rstrip(), re.sub('^>.*\n', '', '>'+r.rstrip()))) + for r in ncs: + contigs_list.append(Sequence(r[0].split('\n', 1)[0].rstrip(), re.sub('^>.*\n', '', r[0].rstrip()))) return contigs_list def write(self, fafile): @@ -127,6 +126,14 @@ def remove(self, contigs_name_list): if not r.name in contigs_name_list: new_contig_list.append(r) return Fa(new_contig_list, 'rem_'+self.name) + + def cut_min_len(self, min_len): + nc = [] + for r in self.contigs: + if len(r) > min_len: + nc.append(r) + return Fa(nc,'cutof_'+str(min_len)+self.name) + def validate(self): ''' diff --git a/fatool/tests/test_fa.py b/fatool/tests/test_fa.py index 81b4e4d..a9e60c4 100644 --- a/fatool/tests/test_fa.py +++ b/fatool/tests/test_fa.py @@ -24,11 +24,11 @@ def test_setUpFa(self): f = Fa(cl, 'test-fa') self.assertEqual(cl, f.contigs) self.assertEqual('test-fa', f.name) - self.assertEqual({'name':0, 'name2':1, 'name3':2}, f.contigs_idx) + self.assertEqual({'>name':0, '>name2':1, '>name3':2}, f.contigs_idx) cl.append('something') with self.assertRaises(TypeError): Fa(cl, 'name4') - ''' + def test_str(self): cl = [] cl.append(Sequence('>name', 'ACTGactg')) @@ -69,29 +69,32 @@ def test_add_contigs(self): def test_show_names(self): cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] f = Fa(cl, 'test-fa') - self.assertEqual(['name','name2','name3'], f.show_names()) + self.assertEqual(['>name','>name2','>name3'], f.show_names()) f.add_contig(Sequence('>name2', 'ACTGaaaaaaa'), 1) - self.assertEqual(['name','name3','name2'], f.show_names()) + self.assertEqual(['>name','>name3','>name2'], f.show_names()) f.add_contig(Sequence('>name7', 'ACTGaaaaaaa'), 1) - self.assertEqual(['name','name3','name2','name7'], f.show_names()) + self.assertEqual(['>name','>name3','>name2','>name7'], f.show_names()) def test_extract(self): cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] f = Fa(cl, 'test-fa') self.assertEqual(cl, f.contigs) cl2 = [Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] - self.assertEqual(cl2, f.extract(['name2', 'name3']).contigs) - self.assertEqual('extr_test-fa', f.extract(['name2', 'name3']).name) - self.assertEqual(cl2, f.extract(['name2', 'name3', 'name321']).contigs) + self.assertEqual(cl2, f.extract(['>name2', '>name3']).contigs) + print 'printing contigs' + for c in f.extract(['name2', 'name3']).contigs: + print c + self.assertEqual('>extr_test-fa', f.extract(['>name2', '>name3']).name) + self.assertEqual(cl2, f.extract(['>name2', '>name3', '>name321']).contigs) def test_remove(self): cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')] f = Fa(cl, 'test-fa') - self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) - self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3']).contigs) - self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['name2','name3','name234']).contigs) - self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['name']).contigs) + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['>name']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['>name2','>name3']).contigs) + self.assertEqual([Sequence('>name', 'ACTGactg')], f.remove(['>name2','>name3','>name234']).contigs) + self.assertEqual([Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN')], f.remove(['>name']).contigs) def test_statistics(self): cl = [Sequence('>name', 'ACTGactg'), Sequence('>name2', 'NNNNNNNNNACTGNNNN'), Sequence('>name3', 'CTNACtacgatNNNNNNN'), Sequence('>name4', 'CTNAC')] @@ -187,7 +190,7 @@ def tearDown(self): os.remove('f2.fa') os.remove('test.fa') pass - ''' + def test_conv_to_fq(self): cl = [] test = 'ATGGAATCGGCTTTTAATACTGCAGGGGCGTTAAGTTGGCATGAACTCACAACCAATAATACCGAAGAGGCCATGCGCTTCTATGCTGAGATTTTTGGCTGGCACTTTAAAACCGTCAAAATGCCCCACGGTCACTATCACATTATTGAAAACGAGGGGATCAGCATTGGCGGAATTACCGACAGTTTAATCCCCACCCTTCCCTCACATTGGACTGGCTATATTACCGTTAACGATGTGGATCAAGTGGCTATCAGTGCTAAAAAACTCGGCGGTGACATTCTGTTTGGCCCTGAAGACATTCCAGAGGTGGGCCGTTTTTGTTGGATAAAAGACCCACAGGGCGCCATTATTGCGGCCATTAGCTATTTAAAACGTTGATGTAA' @@ -198,4 +201,4 @@ def test_conv_to_fq(self): print fq if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()