public
Description: Yet Another Planet Refactoring
Homepage: http://intertwingly.net/blog/2007/12/19/Yet-Another-Planet-Refactoring
Clone URL: git://github.com/rubys/mars.git
mars / planet / xmlparser.rb
100644 169 lines (136 sloc) 4.283 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
require 'rexml/document'
require 'html5/liberalxmlparser'
 
module Planet
  # Parses XML text into a REXML::Document using the first XML parser that
  # can be loaded (expat, then libxml2, then REXML itself), falling back to
  # the liberal HTML5::XMLParser when the strict parse raises.
  #
  # The returned document carries an extra +bozo+ accessor: +false+ when the
  # strict parser accepted the input, +true+ when the liberal fallback ran.
  module XmlParser
    # Pick the parser backend once, at load time.  @@parser is kept as a
    # class variable so any existing reference to it keeps working.
    begin
      require 'xml/parser' # http://www.yoshidam.net/xmlparser_en.txt
      @@parser = :expat
    rescue LoadError
      begin
        require 'xml/libxml' # http://libxml.rubyforge.org/
        @@parser = :libxml2
      rescue LoadError
        @@parser = :rexml
      end
    end

    # Parse +source+ and return a REXML::Document.
    #
    # source:: a String, or any object responding to +read+ (it is read in
    #          full before parsing).
    #
    # Returns the document with +bozo+ set as described on the module.
    # SystemExit, SignalException and NoMemoryError raised while parsing are
    # re-raised instead of being treated as a parse failure.
    def XmlParser.parse(source)
      source = source.read if source.respond_to? :read

      begin
        doc =
          case @@parser
          when :expat
            # fast, compliant, but not always installed
            XmlParser.expat source
          when :libxml2
            # also fast, compliant, but not always installed
            XmlParser.libxml2 source
          else
            # fairly fast, fairly compliant, always available
            REXML::Document.new source
          end
        bozo = false
      rescue SystemExit, SignalException, NoMemoryError
        # A shutdown request, an interrupt or memory exhaustion is not a
        # malformed document; let it propagate.
        raise
      rescue Exception
        # Deliberately broad: the libxml2 callbacks below raise bare
        # Exception instances for parser errors.
        #
        # If everything is being bozo'd, enable this to see why.
        # print "PARSE ERROR: #{$!}\n #{$!.backtrace.join("\n ")}\n"

        # last ditch attempt: use a liberal XML parser
        parser = HTML5::XMLParser.new
        doc = REXML::Document.new
        parser.parse_fragment(source).each do |node|
          begin
            doc << node
          rescue StandardError
            # best effort: skip any node the document refuses to accept
          end
        end
        bozo = true
      end

      # augment the document with feed parser attributes
      class << doc
        attr_accessor :bozo
      end
      doc.bozo = bozo

      doc
    end

    # Parse +source+ (a String) with the expat binding and build the
    # equivalent REXML::Document from the event stream.  Errors raised by
    # the parser are not rescued here.
    def XmlParser.expat(source)
      parser = XML::Parser.new
      class << parser
        # enable additional events
        attr_accessor :startDoctypeDecl
        attr_accessor :comment
      end

      doc = REXML::Document.new
      node = doc # current insertion point; moves down/up with start/end tags

      parser.parse(source) do |type, name, data|
        case type
        when XML::Parser::START_ELEM
          # name = element name ; data = hash of attributes
          node = node.add_element(name)
          # distinct block parameter names so the outer +name+ is not shadowed
          data.each { |key, value| node.add_attribute(key, value) }

        when XML::Parser::END_ELEM
          # name = element name ; data = nil
          node = node.parent

        when XML::Parser::CDATA
          # name = nil ; data = string
          node.add_text(data)

        when XML::Parser::COMMENT
          # name = nil ; data = string
          REXML::Comment.new(data, node)

        when XML::Parser::START_DOCTYPE_DECL
          # name = notation name ; data = [URL base, system ID, public ID]
          # NOTE(review): 'SYSTEM' is chosen when data[2] is present, which
          # looks inverted if data[2] really is the public ID -- confirm the
          # event payload against the xmlparser documentation before changing.
          REXML::DocType.new([name, data[2] ? 'SYSTEM' : 'PUBLIC',
            data[1].inspect, data[0].inspect], node)
        end
      end

      parser.done

      doc
    end

    if @@parser == :libxml2
      unless XML::SaxParser.const_defined?(:Callbacks)
        # shim to upgrade libxml 0.3.8.4 to the 0.5.2.0 interface
        class XML::SaxParser
          module Callbacks
          end

          # Register every +on_*+ method of +callback+ as the block for the
          # parser method of the same name.
          def callbacks=(callback)
            callback.methods.grep(/^on_/).each do |method|
              send(method) { |*args| callback.send method, *args }
            end
          end
        end
      end

      # SAX handler that mirrors libxml2 events into a REXML tree rooted at
      # the node given to the constructor.
      class Callbacks

        include XML::SaxParser::Callbacks

        # node:: the REXML node under which parsed content is added.
        def initialize(node)
          @node = node
        end

        # Open a child element and make it the current insertion point.
        def on_start_element(name, attrs)
          @node = @node.add_element(name)
          attrs.each { |key, value| @node.add_attribute(key, value) }
        end

        # Close the current element by moving back to its parent.
        def on_end_element(name)
          @node = @node.parent
        end

        def on_characters(chars)
          @node.add_text(chars)
        end

        # CDATA content is added as ordinary text.
        def on_cdata_block(cdata)
          @node.add_text(cdata)
        end

        def on_comment(data)
          REXML::Comment.new(data, @node)
        end

        # Parser errors abort the parse by raising; XmlParser.parse rescues
        # this and switches to the liberal parser.
        def on_parser_error(message)
          raise Exception, message
        end

        def on_parser_fatal_error(message)
          raise Exception, message
        end

        # Record the DOCTYPE.  A declaration without a public identifier is
        # written as SYSTEM, and missing literals are left out instead of
        # being rendered as the text "nil".
        def on_external_subset(name, externalId, systemId)
          system_literal = systemId && systemId.inspect
          declaration =
            if externalId
              [name, 'PUBLIC', externalId.inspect, system_literal]
            else
              [name, 'SYSTEM', system_literal]
            end
          REXML::DocType.new(declaration, @node)
        end
      end
    end

    # Parse +source+ (a String) with the libxml2 SAX parser and return the
    # REXML::Document built by Callbacks.
    def XmlParser.libxml2(source)
      parser = XML::SaxParser.new

      doc = REXML::Document.new

      parser.string = source
      parser.callbacks = XmlParser::Callbacks.new(doc)
      parser.parse

      doc
    end
  end
end