Skip to content

Commit

Permalink
Got the parser a bit better
Browse files Browse the repository at this point in the history
  • Loading branch information
tommorris committed Oct 22, 2008
1 parent b43fbbd commit d9a8311
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 106 deletions.
88 changes: 44 additions & 44 deletions lib/rena/rdfxmlparser.rb
Expand Up @@ -16,16 +16,18 @@ def initialize(xml_str, uri = nil)
"http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID",
"http://www.w3.org/1999/02/22-rdf-syntax-ns#about",
"http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"]
@uri = Addressable::URI.parse(uri) unless uri.nil?
@uri = Addressable::URI.parse(uri).to_s unless uri.nil?
@graph = Rena::Graph.new
@xml = LibXML::XML::Parser.string(xml_str).parse
@id_mapping = Hash.new
root = @xml.root
if is_rdf_root?(root)
parse_descriptions(root)
else
root.each {|n|
if is_rdf_root?(n)
parse_descriptions(n)
debugger
end
}
end
Expand Down Expand Up @@ -57,7 +59,8 @@ def parse_subject(el)
fail_check(el)

if el.attributes.get_attribute_ns(SYNTAX_BASE, "about")
return URIRef.new(el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value)
debugger if el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value =~ /artist$/
return URIRef.new(base_helper(el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value, el.base).to_s)
elsif el.attributes.get_attribute_ns(SYNTAX_BASE, "ID")
id = el.attributes.get_attribute_ns(SYNTAX_BASE, "ID")
if id_check?(id.value)
Expand All @@ -71,15 +74,9 @@ def parse_subject(el)
return BNode.new
end
subject = nil
element.attributes.each_attribute do |att|
uri = att.namespace + att.name
el.attributes.each_attribute do |att|
uri = url_helper(att.namespace + att.name).to_s
value = att.to_s
if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"
raise AboutEachException, "Failed as per RDFMS-AboutEach-Error001.rdf test from 2004 test suite"
end
if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"
raise AboutEachException, "Failed as per RDFMS-AboutEach-Error002.rdf test from 2004 test suite"
end
if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"
raise
if name =~ /^[a-zA-Z_][a-zA-Z0-9]*$/
Expand All @@ -89,23 +86,11 @@ def parse_subject(el)
end
end

if uri == resourceuri #specified resource
element_uri = Addressable::URI.parse(value)
if (element_uri.relative?)
# we have an element with a relative URI
if (element.base?)
# the element has a base URI, use that to build the URI
value = "##{value}" if (value[0..0].to_s != "#")
value = "#{element.base}#{value}"
elsif (!@uri.nil?)
# we can use the document URI to build the URI for the element
value = @uri + element_uri
end
end
subject = URIRef.new(value)
if uri == SYNTAX_BASE + "#resource" || uri == SYNTAX_BASE + "#about" #specified resource
subject = URIRef.new(base_helper(value, el.base))
end

if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID" #BNode with ID
if uri.to_s == SYNTAX_BASE + "#nodeID" #BNode with ID
# we have a BNode with an identifier. First, we need to do syntax checking.
if value =~ /^[a-zA-Z_][a-zA-Z0-9]*$/
# now we check to see if the graph has the value
Expand Down Expand Up @@ -133,11 +118,21 @@ def id_check?(id)
!(!(id =~ /^[a-zA-Z_]\w*$/))
end

# Builds the object node for an element that carries an rdf:resource attribute.
#
# Looks up the "resource" attribute in the SYNTAX_BASE namespace on +el+.
# When present, its value is passed through base_helper together with the
# element's base and wrapped in a URIRef. Returns nil when the attribute is
# absent.
def parse_object_atts(el)
  resource_att = el.attributes.get_attribute_ns(SYNTAX_BASE, "resource")
  return nil unless resource_att

  resolved = base_helper(resource_att.value, el.base).to_s
  URIRef.new(resolved)
end

def parse_descriptions (node, subject = nil)
node.each_element { |el|
node.each_element { |el|
fail_check(el)
# detect a subject
subject = parse_subject(el) #if subject.nil?
if @id_mapping[node.hash]
subject = @id_mapping[node.id]
elsif subject.nil?
subject = parse_subject(el)
end

# find a class
unless el.name == "Description" && el.namespace_node.href == SYNTAX_BASE
Expand All @@ -151,13 +146,11 @@ def parse_descriptions (node, subject = nil)

el.each_element {|child|
predicate = url_helper(child.name, child.namespace_node.href, child.base)
if predicate.to_s == "http://example.org/property3"
#debugger
end
object = child.content
#debugger
if el.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID")
@graph.add_triple(subject, predicate, forge_bnode_from_string(child.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID").value))
elsif child.attributes.get_attribute_ns(SYNTAX_BASE, "resource")
@graph.add_triple(subject, predicate, URIRef.new(base_helper(child.attributes.get_attribute_ns(SYNTAX_BASE, "resource").value, child.base).to_s))
end
child.each {|contents|
if contents.text? and contents.content.strip.length != 0
Expand Down Expand Up @@ -187,15 +180,9 @@ def parse_descriptions (node, subject = nil)
#when "Collection";
end
else
@id_mapping[cel.hash] = object
@graph.add_triple(subject, predicate, object)
# if cel.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID") && !cel.children.delete_if{|i| !i.element? }.empty?
# debugger
# parse_descriptions(cel.parent)
# debugger
# object = forge_bnode_from_string(cel.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID").value)
# else
parse_descriptions(cel.parent, object)
# end
end
}

Expand Down Expand Up @@ -258,18 +245,31 @@ def smells_like_xml?(str)
!(!(str =~ /xmlns/))
end

# Resolves a URI reference to its string form.
#
# uri  - the URI reference to resolve (anything Addressable::URI.parse accepts).
# base - optional base URI of the enclosing element; takes precedence over
#        the document URI stored in @uri.
#
# An absolute +uri+ is returned unchanged. A relative +uri+ is joined onto
# +base+ when one is given, otherwise onto @uri when that is set, otherwise
# it is returned as-is.
#
# Returns a String.
def base_helper(uri, base = nil)
  uri = Addressable::URI.parse(uri)
  return uri.to_s unless uri.relative?

  # The relative reference must be joined onto its context. Previously the
  # +base+ branch replaced the reference with the base itself, discarding it.
  context = base.nil? ? @uri : base
  uri = Addressable::URI.parse(context) + uri unless context.nil?
  uri.to_s
end

def url_helper(name, ns, base = nil)
if ns != "" and !ns.nil?
a = Addressable::URI.parse(ns) + Addressable::URI.parse(name)
if ns.to_s.split("")[-1] == "#"
a = Addressable::URI.parse(ns) + Addressable::URI.parse("#" + name)
else
a = Addressable::URI.parse(ns) + Addressable::URI.parse(name)
end
else
a = Addressable::URI.parse(name)
end
if a.relative?
if !base.nil?
a = Addressable::URI.parse(base) + a
elsif !@uri.nil?
a = @uri + a
end
a = base_helper(a.to_s, base)
end

return URIRef.new(a.to_s)
Expand Down
8 changes: 6 additions & 2 deletions lib/rena/triple.rb
Expand Up @@ -48,6 +48,8 @@ def is_type?
protected

def self.coerce_subject(subject)
# TODO: do something intelligent with an Addressable:URI or URI

case subject
when URIRef, BNode
subject
Expand All @@ -58,7 +60,7 @@ def self.coerce_subject(subject)
BNode.new(subject)
end
else
raise InvalidSubject, "Subject is not of a known class"
raise InvalidSubject, "Subject is not of a known class (#{subject.class}: #{subject.inspect})"
end
end

Expand All @@ -76,6 +78,8 @@ def self.coerce_predicate(uri_or_string)
end

def self.coerce_object(object)
# TODO: do something intelligent with an Addressable:URI or URI

case object
when String, Integer, Float
Literal.untyped(object)
Expand All @@ -84,7 +88,7 @@ def self.coerce_object(object)
when URIRef, BNode, Literal
object
else
raise InvalidObject, "#{object.inspect} is not a valid object"
raise InvalidObject, "#{object.class}: #{object.inspect} is not a valid object"
end
end
end
Expand Down
133 changes: 73 additions & 60 deletions spec/parser_spec.rb
Expand Up @@ -33,7 +33,11 @@
EOF

graph = RdfXmlParser.new(sampledoc)
graph.graph.size.should == 9
graph.graph.size.should == 10
# print graph.graph.to_ntriples
# TODO: add datatype parsing
# TODO: make sure the BNode forging is done correctly - an internal element->nodeID mapping
# TODO: proper test
end

it "should raise an error if rdf:aboutEach is used, as per the negative parser test rdfms-abouteach-error001 (rdf:aboutEach attribute)" do
Expand Down Expand Up @@ -255,43 +259,42 @@
end.should_not raise_error
end

# it "should pass rdfms-syntax-incomplete/test003.rdf" do
# sampledoc = <<-EOF;
# <?xml version="1.0"?>
#
# <!--
# Copyright World Wide Web Consortium, (Massachusetts Institute of
# Technology, Institut National de Recherche en Informatique et en
# Automatique, Keio University).
#
# All Rights Reserved.
#
# Please see the full Copyright clause at
# <http://www.w3.org/Consortium/Legal/copyright-software.html>
#
# -->
# <!--
#
# On an rdf:Description or typed node rdf:nodeID behaves
# similarly to an rdf:about.
# $Id: test003.rdf,v 1.2 2003/07/24 15:51:06 jcarroll Exp $
#
# -->
#
# <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# xmlns:eg="http://example.org/">
#
# <!-- In this example the rdf:nodeID is redundant. -->
# <rdf:Description rdf:nodeID="a" eg:property1="value" />
#
# </rdf:RDF>
# EOF
#
# lambda do
# graph = RdfXmlParser.new(sampledoc)
# end.should_not raise_error
# end
#
# Spec for the rdfms-syntax-incomplete/test003.rdf document embedded below:
# an rdf:Description with rdf:nodeID="a" and one property attribute. The
# first statement in the parsed graph must have a subject that stringifies
# to "a".
it "should pass rdfms-syntax-incomplete/test003.rdf" do
sampledoc = <<-EOF;
<?xml version="1.0"?>
<!--
Copyright World Wide Web Consortium, (Massachusetts Institute of
Technology, Institut National de Recherche en Informatique et en
Automatique, Keio University).
All Rights Reserved.
Please see the full Copyright clause at
<http://www.w3.org/Consortium/Legal/copyright-software.html>
-->
<!--
On an rdf:Description or typed node rdf:nodeID behaves
similarly to an rdf:about.
$Id: test003.rdf,v 1.2 2003/07/24 15:51:06 jcarroll Exp $
-->
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eg="http://example.org/">
<!-- In this example the rdf:nodeID is redundant. -->
<rdf:Description rdf:nodeID="a" eg:property1="value" />
</rdf:RDF>
EOF

# Despite its name, `graph` holds the RdfXmlParser; the parsed graph is `graph.graph`.
graph = RdfXmlParser.new(sampledoc)
graph.graph[0].subject.to_s.should == "a"
end

# # when we have decent Unicode support, add http://www.w3.org/2000/10/rdf-tests/rdfcore/rdfms-rdf-id/error005.rdf
#
# it "should support reification" do
Expand All @@ -310,27 +313,37 @@
graph = RdfXmlParser.new(sampledoc)
end.should raise_error
end
# describe "parsing rdf files" do
# def test_file(filepath, uri = nil)
# n3_string = File.read(filepath)
# parser = RdfXmlParser.new(n3_string, uri)
# ntriples = parser.graph.to_ntriples
# ntriples.gsub!(/_:bn\d+/, '_:node1')
# ntriples = ntriples.split("\n").sort
#
# nt_string = File.read(filepath.sub('.rdf', '.nt'))
# nt_string = nt_string.split("\n").sort
#
# ntriples.should == nt_string
# end
#
# before(:all) do
# @rdf_dir = File.join(File.dirname(__FILE__), '..', 'test', 'rdf_tests')
# end

describe "parsing rdf files" do
# Parses the RDF/XML file at +filepath+ and asserts that its N-Triples
# serialisation matches the sibling file with ".rdf" replaced by ".nt".
#
# filepath - path to the RDF/XML fixture.
# uri      - optional document URI handed to RdfXmlParser.
#
# Both sides are compared as sorted, newline-joined lines, so statement order
# does not matter. A mismatch is reported through the `should ==` expectation
# only; nothing is written to disk and no external program is launched, so
# the helper behaves the same on every machine.
def test_file(filepath, uri = nil)
  rdf_xml = File.read(filepath)
  parser = RdfXmlParser.new(rdf_xml, uri)

  # Collapse every `_:bn<digits>` label to `_:node1` before comparing
  # (presumably because generated blank-node labels are not stable).
  actual = parser.graph.to_ntriples.gsub(/_:bn\d+/, '_:node1')
  actual = actual.split("\n").sort.join("\n")

  expected = File.read(filepath.sub('.rdf', '.nt'))
  expected = expected.split("\n").sort.join("\n")

  actual.should == expected
end

# Sets @rdf_dir to the "test/rdf_tests" directory one level above this spec
# file's directory; the examples below build their fixture paths from it.
before(:all) do
@rdf_dir = File.join(File.dirname(__FILE__), '..', 'test', 'rdf_tests')
end

# it "should parse Coldplay's BBC Music profile" do
# gid = 'cc197bad-dc9c-440d-a5b5-d52ba2e14234'
# file = File.join(@rdf_dir, "#{gid}.rdf")
# test_file(file, "http://www.bbc.co.uk/music/artists/#{gid}")
# end
# Runs test_file on the fixture named after this artist id, passing the
# matching bbc.co.uk artist URL as the document URI.
it "should parse Coldplay's BBC Music profile" do
  artist_id = 'cc197bad-dc9c-440d-a5b5-d52ba2e14234'
  fixture = File.join(@rdf_dir, "#{artist_id}.rdf")
  test_file(fixture, "http://www.bbc.co.uk/music/artists/#{artist_id}")
end
end
end

0 comments on commit d9a8311

Please sign in to comment.