From e38fd48aaf41f94eaec39a639a7f6c5db62c22e8 Mon Sep 17 00:00:00 2001 From: Naohisa Goto Date: Tue, 9 Sep 2008 20:22:30 +0900 Subject: [PATCH] Bug fix: GFF2 attributes parser misunderstands semicolons * Now, the priority of '"' (double quote) is greater than ';' (semicolon). To keep compatibility with older BioRuby, the '\;' can still be used anywhere in attributes. * Some tests are added. --- lib/bio/db/gff.rb | 31 +++++++++++++++++++--- test/unit/bio/db/test_gff.rb | 51 ++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/lib/bio/db/gff.rb b/lib/bio/db/gff.rb index e3a43ecfd..24e1fab1b 100644 --- a/lib/bio/db/gff.rb +++ b/lib/bio/db/gff.rb @@ -121,9 +121,34 @@ def initialize(str) def parse_attributes(attributes) hash = Hash.new - scanner = StringScanner.new(attributes) - while scanner.scan(/(.*[^\\])\;/) or scanner.scan(/(.+)/) - key, value = scanner[1].split(' ', 2) + + ary = attributes.split(/\;/) + while x = ary.shift + i0 = 0 + while pos = x.index('"', i0) + i0 = pos + 1 + while true + if pos = x.index('"', i0) then + i0 = pos + 1 + break unless x[pos - 1, 1] == "\\" + else + if y = ary.shift + x.concat ';' + x.concat y + else + break + end + end + end + end #while pos = ... + + # To keep compatibility, the '\;' is concatenated again. + if x[-1, 1] == "\\" and ary[0] then + x.concat ';' + x.concat ary.shift + end + + key, value = x.split(' ', 2) key.strip! value.strip! if value hash[key] = value diff --git a/test/unit/bio/db/test_gff.rb b/test/unit/bio/db/test_gff.rb index c12662998..7863db7ff 100644 --- a/test/unit/bio/db/test_gff.rb +++ b/test/unit/bio/db/test_gff.rb @@ -119,6 +119,57 @@ def test_add_seqname end # class TestGFFRecordConstruct + + class TestGFF2ComplexAttributes < Test::Unit::TestCase + + # The test string is provided by Tomoaki NISHIYAMA. 
+ def test_attributes_case1 + str = "LG_I\tJGI\tCDS\t11052\t11064\t.\t-\t0\tname \"grail3.0116000101\"; proteinId 639579; exonNumber 3\n" + + attributes = { + "name" => "\"grail3.0116000101\"", + "proteinId" => "639579", + "exonNumber" => "3" + } + + assert_equal(attributes, + Bio::GFF::GFF2::Record.new(str).attributes) + end + + # The test string is provided by Tomoaki NISHIYAMA and modified. + def test_attributes_case2 + str = "LG_I\tJGI\tCDS\t11052\t11064\t.\t-\t0\tname \"grail3.0116000101\"; proteinId 639579; exonNumber 3; Note \"Semicolons ; and \;, and quote \\\" can be OK\"; Comment \"This is the \\\"comment\\\"\"\n" + + attributes = { + "name" => "\"grail3.0116000101\"", + "proteinId" => "639579", + "exonNumber" => "3", + "Note" => "\"Semicolons ; and \;, and quote \\\" can be OK\"", + "Comment" => "\"This is the \\\"comment\\\"\"" + } + + assert_equal(attributes, + Bio::GFF::GFF2::Record.new(str).attributes) + end + + def test_attributes_compatibility_backslash_semicolon + str =< '"CEN1"', + 'Note' => '"Chromosome I Centromere"', + 'Semicolon' => 'a "b;c" d "e;f;g" h', + 'Illegal' => 'a\;b c d', + 'Comment' => '"a ; b"' + } + assert_equal(attributes, + Bio::GFF::GFF2::Record.new(str).attributes) + end + + end #class TestGFF2ComplexAttributes + class TestGFF3 < Test::Unit::TestCase def setup @data =<