Skip to content

Commit

Permalink
Bug fix: GFF2 attributes parser misunderstands semicolons
Browse files Browse the repository at this point in the history
 * Now, '"' (double quote) takes priority over ';' (semicolon),
   so semicolons inside a quoted value no longer split the attribute.
   To keep compatibility with older BioRuby versions,
   '\;' can still be used anywhere in attributes.
 * Some tests are added.
  • Loading branch information
ngoto committed Sep 9, 2008
1 parent 14fc7dd commit e38fd48
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 3 deletions.
31 changes: 28 additions & 3 deletions lib/bio/db/gff.rb
Expand Up @@ -121,9 +121,34 @@ def initialize(str)

def parse_attributes(attributes)
hash = Hash.new
scanner = StringScanner.new(attributes)
while scanner.scan(/(.*[^\\])\;/) or scanner.scan(/(.+)/)
key, value = scanner[1].split(' ', 2)

ary = attributes.split(/\;/)
while x = ary.shift
i0 = 0
while pos = x.index('"', i0)
i0 = pos + 1
while true
if pos = x.index('"', i0) then
i0 = pos + 1
break unless x[pos - 1, 1] == "\\"
else
if y = ary.shift
x.concat ';'
x.concat y
else
break
end
end
end
end #while pos = ...

# To keep compatibility, the '\;' is concatenated again.
if x[-1, 1] == "\\" and ary[0] then
x.concat ';'
x.concat ary.shift
end

key, value = x.split(' ', 2)
key.strip!
value.strip! if value
hash[key] = value
Expand Down
51 changes: 51 additions & 0 deletions test/unit/bio/db/test_gff.rb
Expand Up @@ -119,6 +119,57 @@ def test_add_seqname

end # class TestGFFRecordConstruct


class TestGFF2ComplexAttributes < Test::Unit::TestCase

  # Plain case: one quoted value followed by two unquoted values, separated
  # by semicolons. The quoted value is expected back with its quotes intact.
  # The sample record was provided by Tomoaki NISHIYAMA.
  def test_attributes_case1
    line = "LG_I\tJGI\tCDS\t11052\t11064\t.\t-\t0\tname \"grail3.0116000101\"; proteinId 639579; exonNumber 3\n"

    expected = {
      "name"       => "\"grail3.0116000101\"",
      "proteinId"  => "639579",
      "exonNumber" => "3"
    }

    assert_equal(expected, gff2_attributes(line))
  end

  # Semicolons and backslash-escaped double quotes inside a quoted value
  # must not terminate the attribute.
  # The sample record was provided by Tomoaki NISHIYAMA and then modified.
  #
  # NOTE(review): inside a double-quoted Ruby literal, "\;" evaluates to a
  # bare ";" (no backslash), so the Note value below contains two plain
  # semicolons rather than an escaped one. Confirm whether "\\;" was
  # intended.
  def test_attributes_case2
    line = "LG_I\tJGI\tCDS\t11052\t11064\t.\t-\t0\tname \"grail3.0116000101\"; proteinId 639579; exonNumber 3; Note \"Semicolons ; and \;, and quote \\\" can be OK\"; Comment \"This is the \\\"comment\\\"\"\n"

    expected = {
      "name"       => "\"grail3.0116000101\"",
      "proteinId"  => "639579",
      "exonNumber" => "3",
      "Note"       => "\"Semicolons ; and \;, and quote \\\" can be OK\"",
      "Comment"    => "\"This is the \\\"comment\\\"\""
    }

    assert_equal(expected, gff2_attributes(line))
  end

  # Backward compatibility: a backslash-escaped semicolon outside of quotes
  # (the "Illegal" attribute) is still kept inside the value, alongside
  # values that mix quoted and unquoted parts.
  #
  # NOTE(review): the other tests in this class separate GFF columns with
  # "\t"; confirm that the column separators in the heredoc line below are
  # tabs in the real file.
  def test_attributes_compatibility_backslash_semicolon
    line = <<END_OF_DATA
I sgd gene 151453 151591 . + . Gene "CEN1" ; Note "Chromosome I Centromere"; Semicolon a "b;c" d "e;f;g" h; Illegal a\\;b c d; Comment "a ; b"
END_OF_DATA

    expected = {
      'Gene'      => '"CEN1"',
      'Note'      => '"Chromosome I Centromere"',
      'Semicolon' => 'a "b;c" d "e;f;g" h',
      'Illegal'   => 'a\;b c d',
      'Comment'   => '"a ; b"'
    }

    assert_equal(expected, gff2_attributes(line))
  end

  private

  # Builds a GFF2 record from a single line of text and returns the
  # attributes it exposes.
  def gff2_attributes(line)
    Bio::GFF::GFF2::Record.new(line).attributes
  end

end #class TestGFF2ComplexAttributes

class TestGFF3 < Test::Unit::TestCase
def setup
@data =<<END_OF_DATA
Expand Down

0 comments on commit e38fd48

Please sign in to comment.