Got the parser a bit better

tommorris · Oct 22, 2008 · d9a8311 · d9a8311
1 parent b43fbbd
commit d9a8311
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 106 deletions.
diff --git a/lib/rena/rdfxmlparser.rb b/lib/rena/rdfxmlparser.rb
@@ -16,16 +16,18 @@ def initialize(xml_str, uri = nil)
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#about",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"]
-      @uri = Addressable::URI.parse(uri) unless uri.nil?
+      @uri = Addressable::URI.parse(uri).to_s unless uri.nil?
       @graph = Rena::Graph.new
       @xml = LibXML::XML::Parser.string(xml_str).parse
+      @id_mapping = Hash.new
       root = @xml.root
       if is_rdf_root?(root)
         parse_descriptions(root)
       else
         root.each {|n|
           if is_rdf_root?(n)
             parse_descriptions(n)
+            debugger
           end
         }
       end
@@ -57,7 +59,8 @@ def parse_subject(el)
       fail_check(el)
 
       if el.attributes.get_attribute_ns(SYNTAX_BASE, "about")
-        return URIRef.new(el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value)
+        debugger if el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value =~ /artist$/
+        return URIRef.new(base_helper(el.attributes.get_attribute_ns(SYNTAX_BASE, "about").value, el.base).to_s)
       elsif el.attributes.get_attribute_ns(SYNTAX_BASE, "ID")
         id = el.attributes.get_attribute_ns(SYNTAX_BASE, "ID")
         if id_check?(id.value)
@@ -71,15 +74,9 @@ def parse_subject(el)
         return BNode.new
       end
       subject = nil
-      element.attributes.each_attribute do |att|
-        uri = att.namespace + att.name
+      el.attributes.each_attribute do |att|
+        uri = url_helper(att.namespace + att.name).to_s
         value = att.to_s
-        if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"
-          raise AboutEachException, "Failed as per RDFMS-AboutEach-Error001.rdf test from 2004 test suite"
-        end
-        if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"
-          raise AboutEachException, "Failed as per RDFMS-AboutEach-Error002.rdf test from 2004 test suite"
-        end
         if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"
           raise
           if name =~ /^[a-zA-Z_][a-zA-Z0-9]*$/
@@ -89,23 +86,11 @@ def parse_subject(el)
           end
         end
 
-        if uri == resourceuri #specified resource
-          element_uri = Addressable::URI.parse(value)
-          if (element_uri.relative?)
-            # we have an element with a relative URI
-            if (element.base?)
-              # the element has a base URI, use that to build the URI
-              value = "##{value}" if (value[0..0].to_s != "#")
-              value = "#{element.base}#{value}"
-            elsif (!@uri.nil?)
-              # we can use the document URI to build the URI for the element
-              value = @uri + element_uri
-            end
-          end
-          subject = URIRef.new(value)
+        if uri == SYNTAX_BASE + "#resource" || uri == SYNTAX_BASE + "#about" #specified resource
+          subject = URIRef.new(base_helper(value, el.base))
         end
 
-        if uri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID" #BNode with ID
+        if uri.to_s == SYNTAX_BASE + "#nodeID" #BNode with ID
           # we have a BNode with an identifier. First, we need to do syntax checking.
           if value =~ /^[a-zA-Z_][a-zA-Z0-9]*$/
             # now we check to see if the graph has the value
@@ -133,11 +118,21 @@ def id_check?(id)
       !(!(id =~ /^[a-zA-Z_]\w*$/))
     end
 
+    def parse_object_atts (el)
+      if el.attributes.get_attribute_ns(SYNTAX_BASE, "resource")
+        return URIRef.new(base_helper(el.attributes.get_attribute_ns(SYNTAX_BASE, "resource").value, el.base).to_s)
+      end
+    end
+
     def parse_descriptions (node, subject = nil)
-      node.each_element { |el|        
+      node.each_element { |el|
         fail_check(el)
         # detect a subject
-        subject = parse_subject(el) #if subject.nil?
+        if @id_mapping[node.hash]
+          subject = @id_mapping[node.id]
+        elsif subject.nil?
+          subject = parse_subject(el)
+        end
 
         # find a class
         unless el.name == "Description" && el.namespace_node.href == SYNTAX_BASE
@@ -151,13 +146,11 @@ def parse_descriptions (node, subject = nil)
 
         el.each_element {|child|
           predicate = url_helper(child.name, child.namespace_node.href, child.base)
-          if predicate.to_s == "http://example.org/property3"
-            #debugger
-          end
           object = child.content
-          #debugger
           if el.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID")
             @graph.add_triple(subject, predicate, forge_bnode_from_string(child.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID").value))
+          elsif child.attributes.get_attribute_ns(SYNTAX_BASE, "resource")
+            @graph.add_triple(subject, predicate, URIRef.new(base_helper(child.attributes.get_attribute_ns(SYNTAX_BASE, "resource").value, child.base).to_s))
           end
             child.each {|contents|
               if contents.text? and contents.content.strip.length != 0
@@ -187,15 +180,9 @@ def parse_descriptions (node, subject = nil)
                 #when "Collection";
                 end
               else
+                @id_mapping[cel.hash] = object
                 @graph.add_triple(subject, predicate, object)
-                # if cel.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID") && !cel.children.delete_if{|i| !i.element? }.empty?
-                  # debugger
-                  # parse_descriptions(cel.parent)
-                  # debugger
-                  # object = forge_bnode_from_string(cel.attributes.get_attribute_ns(SYNTAX_BASE, "nodeID").value)
-                # else
                 parse_descriptions(cel.parent, object)
-                # end
               end
             }
 
@@ -258,18 +245,31 @@ def smells_like_xml?(str)
       !(!(str =~ /xmlns/))
     end
 
+    def base_helper(uri, base = nil)
+      uri = Addressable::URI.parse(uri)
+      if uri.relative?
+        if !base.nil?
+          uri = Addressable::URI.parse(base)
+        elsif !@uri.nil?
+          uri = Addressable::URI.parse(@uri) + uri
+        end
+      end
+      #debugger if @uri.to_s =~ /bbc\.co\.uk/      
+      return uri.to_s
+    end
+
     def url_helper(name, ns, base = nil)
       if ns != "" and !ns.nil?
-        a = Addressable::URI.parse(ns) + Addressable::URI.parse(name)
+        if ns.to_s.split("")[-1] == "#"
+          a = Addressable::URI.parse(ns) + Addressable::URI.parse("#" + name)
+        else
+          a = Addressable::URI.parse(ns) + Addressable::URI.parse(name)
+        end
       else
         a = Addressable::URI.parse(name)
       end
       if a.relative?
-        if !base.nil?
-          a = Addressable::URI.parse(base) + a
-        elsif !@uri.nil?
-          a = @uri + a
-        end
+        a = base_helper(a.to_s, base)
       end
 
       return URIRef.new(a.to_s)

diff --git a/lib/rena/triple.rb b/lib/rena/triple.rb
@@ -48,6 +48,8 @@ def is_type?
     protected
 
     def self.coerce_subject(subject)
+      # TODO: do something intelligent with an Addressable::URI or URI
+
       case subject
       when URIRef, BNode
         subject
@@ -58,7 +60,7 @@ def self.coerce_subject(subject)
           BNode.new(subject)
         end
       else
-        raise InvalidSubject, "Subject is not of a known class"
+        raise InvalidSubject, "Subject is not of a known class (#{subject.class}: #{subject.inspect})"
       end
     end
 
@@ -76,6 +78,8 @@ def self.coerce_predicate(uri_or_string)
     end
 
     def self.coerce_object(object)
+      # TODO: do something intelligent with an Addressable::URI or URI
+
       case object
       when String, Integer, Float
         Literal.untyped(object)
@@ -84,7 +88,7 @@ def self.coerce_object(object)
       when URIRef, BNode, Literal
         object
       else
-        raise InvalidObject, "#{object.inspect} is not a valid object"
+        raise InvalidObject, "#{object.class}: #{object.inspect} is not a valid object"
       end
     end
   end

diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb
@@ -33,7 +33,11 @@
     EOF
 
     graph = RdfXmlParser.new(sampledoc)
-    graph.graph.size.should == 9
+    graph.graph.size.should == 10
+    # print graph.graph.to_ntriples
+    # TODO: add datatype parsing
+    # TODO: make sure the BNode forging is done correctly - an internal element->nodeID mapping
+    # TODO: proper test
   end
 
   it "should raise an error if rdf:aboutEach is used, as per the negative parser test rdfms-abouteach-error001 (rdf:aboutEach attribute)" do
@@ -255,43 +259,42 @@
     end.should_not raise_error
   end
 
-  # it "should pass rdfms-syntax-incomplete/test003.rdf" do
-  #   sampledoc = <<-EOF;
-  #   <?xml version="1.0"?>
-  # 
-  #   <!--
-  #     Copyright World Wide Web Consortium, (Massachusetts Institute of
-  #     Technology, Institut National de Recherche en Informatique et en
-  #     Automatique, Keio University).
-  # 
-  #     All Rights Reserved.
-  # 
-  #     Please see the full Copyright clause at
-  #     <http://www.w3.org/Consortium/Legal/copyright-software.html>
-  # 
-  #   -->
-  #   <!--
-  # 
-  #     On an rdf:Description or typed node rdf:nodeID behaves
-  #     similarly to an rdf:about.
-  #     $Id: test003.rdf,v 1.2 2003/07/24 15:51:06 jcarroll Exp $
-  # 
-  #   -->
-  # 
-  #   <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-  #            xmlns:eg="http://example.org/">
-  # 
-  #    <!-- In this example the rdf:nodeID is redundant. -->
-  #    <rdf:Description rdf:nodeID="a" eg:property1="value" />
-  # 
-  #   </rdf:RDF>
-  #   EOF
-  #   
-  #   lambda do
-  #     graph = RdfXmlParser.new(sampledoc)
-  #   end.should_not raise_error
-  # end
-  # 
+  it "should pass rdfms-syntax-incomplete/test003.rdf" do
+    sampledoc = <<-EOF;
+<?xml version="1.0"?>
+  
+    <!--
+      Copyright World Wide Web Consortium, (Massachusetts Institute of
+      Technology, Institut National de Recherche en Informatique et en
+      Automatique, Keio University).
+  
+      All Rights Reserved.
+  
+      Please see the full Copyright clause at
+      <http://www.w3.org/Consortium/Legal/copyright-software.html>
+  
+    -->
+    <!--
+  
+      On an rdf:Description or typed node rdf:nodeID behaves
+      similarly to an rdf:about.
+      $Id: test003.rdf,v 1.2 2003/07/24 15:51:06 jcarroll Exp $
+  
+    -->
+  
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns:eg="http://example.org/">
+  
+     <!-- In this example the rdf:nodeID is redundant. -->
+     <rdf:Description rdf:nodeID="a" eg:property1="value" />
+  
+    </rdf:RDF>
+    EOF
+
+    graph = RdfXmlParser.new(sampledoc)
+    graph.graph[0].subject.to_s.should == "a"
+  end
+
   # # when we have decent Unicode support, add http://www.w3.org/2000/10/rdf-tests/rdfcore/rdfms-rdf-id/error005.rdf
   # 
   # it "should support reification" do
@@ -310,27 +313,37 @@
       graph = RdfXmlParser.new(sampledoc)
     end.should raise_error
   end
-#  describe "parsing rdf files" do
-#    def test_file(filepath, uri = nil)
-#      n3_string = File.read(filepath)
-#      parser = RdfXmlParser.new(n3_string, uri)
-#      ntriples = parser.graph.to_ntriples
-#      ntriples.gsub!(/_:bn\d+/, '_:node1')
-#      ntriples = ntriples.split("\n").sort
-#      
-#      nt_string = File.read(filepath.sub('.rdf', '.nt'))
-#      nt_string = nt_string.split("\n").sort
-#      
-#      ntriples.should == nt_string
-#    end
-#    
-#    before(:all) do
-#      @rdf_dir = File.join(File.dirname(__FILE__), '..', 'test', 'rdf_tests')
-#    end
+
+ describe "parsing rdf files" do
+   def test_file(filepath, uri = nil)
+     n3_string = File.read(filepath)
+     parser = RdfXmlParser.new(n3_string, uri)
+     ntriples = parser.graph.to_ntriples
+     ntriples.gsub!(/_:bn\d+/, '_:node1')
+     ntriples = ntriples.split("\n").sort.join("\n")
+
+     nt_string = File.read(filepath.sub('.rdf', '.nt'))
+     nt_string = nt_string.split("\n").sort.join("\n")
+
+     if ntriples != nt_string
+       File.open("/Users/tommorris/tmp/expected.txt", 'w') {|f| f.write(nt_string) }
+       File.open("/Users/tommorris/tmp/got.txt", 'w') {|f| f.write(ntriples) }
+       # `cwm --ntriples /Users/tommorris/tmp/expected.txt > /Users/tommorris/tmp/expected.txt`
+       #        `cwm --ntriples /Users/tommorris/tmp/got.txt > /Users/tommorris/tmp/got.txt`
+       `diff ~/tmp/expected.txt ~/tmp/got.txt > ~/tmp/diff.txt`
+       `open ~/tmp/diff.txt`
+     end
+     ntriples.should == nt_string
+   end
+
+   before(:all) do
+     @rdf_dir = File.join(File.dirname(__FILE__), '..', 'test', 'rdf_tests')
+   end
 
-    # it "should parse Coldplay's BBC Music profile" do
-    #   gid = 'cc197bad-dc9c-440d-a5b5-d52ba2e14234'
-    #   file = File.join(@rdf_dir, "#{gid}.rdf")
-    #   test_file(file, "http://www.bbc.co.uk/music/artists/#{gid}")
-    # end 
+    it "should parse Coldplay's BBC Music profile" do
+      gid = 'cc197bad-dc9c-440d-a5b5-d52ba2e14234'
+      file = File.join(@rdf_dir, "#{gid}.rdf")
+      test_file(file, "http://www.bbc.co.uk/music/artists/#{gid}")
+    end 
+  end
 end