#
# Simple Xml Tools library.
# http://tba
#
#
# --------------------------------------------------------
# License
# --------------------------------------------------------
#
# Copyright (c) 2007 Craig J M Turner
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
#
# --------------------------------------------------------
# Overview
# --------------------------------------------------------
# This aims to eventually be a hard-way character-at-a-time XML
# parser, editor and formatter. The reason this is nice is
# because it's all contained in a small, tight file which
# doesn't depend on particular xml libraries from recent
# versions of python. This is useful to me at the time of
# writing.
#
# This sort of parser should normally be written using some sort
# of parse grammar, but the hard work of a hand-crafted
# parsing algorithm is now done. Also, this library misses everything
# but the basic XML features.
#
#
# --------------------------------------------------------
# Notable features at the time of writing
# --------------------------------------------------------
#
# 1. Function create_dt_root_from_xml_string accepts an XML
# string and generates a DtRoot object from it. This is
# actually the gem in this library, but its current
# implementation is limiting.
#
# 2. Class EventParser can be extended and allows SAX-like
# events to be run against a DtRoot object. (It is not
# really that much like SAX, because the entire data
# structure has to be held in memory for it to work.
# Still, for anything but massive documents this is
# fine.)
#
# 3. DtRoot.as_xml allows you to output xml (unformatted)
#
# 4. DtRoot.as_debug_text allows you to see the structure
# of an XML document
#
# 5. create_formatted_xml, a slightly hacky but working
# function that uses an EventParser to print out a DtRoot
# object.
#
# 6. text_from_path is a crude mechanism that's a bit like
# xpath.
#
#
# --------------------------------------------------------
# Future
# --------------------------------------------------------
# create_dt_root_from_xml_string needs to be rewritten so
# that {1} the events that fire as each character is
# encountered are delegated to an object, so that it can be
# used in its current form (building a tree under a DtRoot)
# but also for other things such as handling events
# SAX-style directly; {2} it can accept a string or some
# sort of stream.
#
import re
import sys
import traceback
class DtObject(object):
    """Common base for every node kind in the document tree.

    Each subclass overrides all four type predicates.  The versions
    defined here only raise, so a subclass that forgets one is noticed
    the first time the predicate is asked.
    """

    def is_root(self):
        """Tell whether this node is the document root."""
        raise Exception('please implement me')

    def is_tag(self):
        """Tell whether this node is a tag (element)."""
        raise Exception('please implement me')

    def is_text(self):
        """Tell whether this node is a run of text."""
        raise Exception('please implement me')

    def is_attr(self):
        """Tell whether this node is an attribute."""
        raise Exception('please implement me')
class DtRoot(DtObject):
    """Root of a parsed document: an ordered list of top-level nodes."""

    def __init__(self):
        # Top-level nodes, kept in the order they were added.
        self.children = []

    def add_child(self, child):
        """Append ``child`` after the nodes already held."""
        self.children.append(child)

    def as_xml(self):
        """Return the unformatted XML of every child, concatenated."""
        return ''.join([node.as_xml() for node in self.children])

    def as_debug_text(self, indent=0):
        """Return a multi-line structural dump of the tree.

        ``indent`` is the number of spaces placed before the root line;
        the same value is handed on to each child.
        """
        lines = ["%sroot element" % (' ' * indent)]
        lines.extend([node.as_debug_text(indent) for node in self.children])
        return '\n'.join(lines)

    def is_root(self):
        return True

    def is_tag(self):
        return False

    def is_text(self):
        return False

    def is_attr(self):
        return False
class DtTag(DtObject):
    """An element: a name, a dict of attributes and an ordered child list."""

    def __init__(self):
        self.attributes = {}
        self.children = []
        self.name = ''

    def append_to_name(self, c):
        """Extend the tag name by ``c`` (the parser feeds it one character at a time)."""
        self.name += c

    def add_attribute(self, attr):
        """Record ``attr.value`` under ``attr.name``, replacing any earlier value."""
        self.attributes[attr.name] = attr.value

    def add_child(self, child):
        """Append ``child`` after the existing children."""
        self.children.append(child)

    def _attribute_suffix(self):
        """Return the attribute text that follows the tag name.

        The result is '' when there are no attributes; otherwise it is a
        leading space followed by space-separated ``key="value"`` pairs.
        """
        pairs = ['%s="%s"' % (key, self.attributes[key])
                 for key in self.attributes.keys()]
        if not pairs:
            return ''
        return ' ' + ' '.join(pairs)

    def as_xml(self):
        """Return this element and its subtree as unformatted XML.

        A childless element is written in self-closing form
        (``<name />``); anything else gets an open/close pair.
        """
        attr_text = self._attribute_suffix()
        if not self.children:
            return ''.join(["<%s" % self.name, attr_text, " />"])
        inner = ''.join([node.as_xml() for node in self.children])
        return ''.join(['<', self.name, attr_text, '>',
                        inner,
                        '</%s>' % self.name])

    def as_debug_text(self, indent=0):
        """Return a structural dump of this element, one line per node.

        This element is shown two spaces deeper than ``indent`` and that
        deeper value is passed on to every child.
        """
        depth = indent + 2
        header = '%stag with name "%s" and %s attributes' % (
            ' ' * depth, self.name, len(self.attributes))
        lines = [header]
        for node in self.children:
            lines.append(node.as_debug_text(depth))
        return '\n'.join(lines)

    def is_root(self):
        return False

    def is_tag(self):
        return True

    def is_text(self):
        return False

    def is_attr(self):
        return False
class DtAttribute(DtObject):
    """A single ``name="value"`` attribute."""

    def __init__(self, name, value):
        self.name = name
        self.value = value

    def render(self):
        """Return the attribute as ``name="value"`` text.

        The value is inserted verbatim; no escaping is applied.
        """
        # Read the instance fields: the bare names ``name`` and ``value``
        # are not defined in this scope and raised NameError.
        return '%s="%s"' % (self.name, self.value)

    def is_root(self):
        return False

    def is_tag(self):
        return False

    def is_text(self):
        return False

    def is_attr(self):
        return True
class DtText(DtObject):
    """A run of character data found between tags."""

    def __init__(self):
        self.text = ''

    def append_to_text(self, c):
        """Extend the stored text by ``c``."""
        self.text += c

    def as_xml(self):
        """Return the stored text exactly as collected."""
        return self.text

    def as_debug_text(self, indent=0):
        """Return a one-line description of this text node.

        The line is indented two spaces deeper than ``indent`` and shows
        the text with surrounding whitespace stripped.
        """
        return "%sText: [%s]" % (' ' * (indent + 2), self.text.strip())

    def is_root(self):
        return False

    def is_tag(self):
        return False

    def is_text(self):
        return True

    def is_attr(self):
        return False
class EventParser(object):
    """Walks an already-parsed tree and fires SAX-like callbacks.

    The whole walk happens inside the constructor.  Subclasses override
    the callbacks they care about and must initialise any state of their
    own *before* calling this ``__init__``.
    """

    def __init__(self, dt_root):
        """Traverse ``dt_root.children`` depth-first, firing callbacks.

        Raises Exception when ``dt_root`` is None, or when a node answers
        False to both ``is_tag()`` and ``is_text()``.
        """
        if dt_root is None:
            raise Exception('Supplied xml document is null.')

        def visit(node):
            # Tags bracket their children with start/end events; text
            # nodes produce a single event; anything else is an error.
            if node.is_tag():
                self.start_element(node.name, node.attributes)
                for sub_node in node.children:
                    visit(sub_node)
                self.end_element(node.name)
            elif node.is_text():
                self.text(node)
            else:
                details = (str(node), type(node))
                raise Exception("Unknown child, %s, type:%s" % details)

        self.start_document()
        for top_level in dt_root.children:
            visit(top_level)
        self.end_document()

    def start_document(self):
        """Called once, before any node is visited."""
        pass

    def end_document(self):
        """Called once, after every node has been visited."""
        pass

    def start_element(self, name, attributes):
        """Called on entering a tag, with its name and attribute dict."""
        pass

    def end_element(self, name):
        """Called on leaving a tag, after all of its children."""
        pass

    def text(self, dt_text):
        """Called for each text node; ``dt_text`` is the node itself."""
        pass

    def get_output(self):
        """Return the result of the walk; subclasses that produce one override this."""
        raise Exception("Not implemented.")
class DebugEventParser(EventParser):
    """EventParser that prints each event as it fires.

    Constructing one walks the supplied tree and writes a line to
    standard output per event.  It doubles as an example of how to
    extend EventParser by replacing the callbacks.
    """

    def __init__(self, dt_root):
        super(DebugEventParser, self).__init__(dt_root)

    def start_document(self):
        print("start_document")

    def end_document(self):
        print("end_document")

    def start_element(self, name, attributes):
        print("start_element(%s, %s)" % (name, str(attributes)))

    def end_element(self, name):
        print("end_element(%s)" % (name,))

    def text(self, text):
        """Print the characters of a text event.

        EventParser passes the text *node*; printing it directly showed
        only the object's default repr.  The node's ``text`` attribute is
        printed instead, falling back to the argument itself when it has
        no such attribute (for example a plain string).
        """
        print("text %s" % (getattr(text, 'text', text),))

    def error(self, pos, c, stack, mode_str, msg=''):
        """Print diagnostics for a problem at character ``c`` and raise.

        ``pos`` is the offset up to which the source text is echoed.
        Nothing in this class assigns ``original_xml``, so the echo is
        skipped when the attribute is absent instead of failing with
        AttributeError.  ``stack`` is accepted but not used.

        Always raises Exception, carrying ``msg``.
        """
        print("Problem at '%s'." % (c,))
        original_xml = getattr(self, 'original_xml', None)
        if original_xml is not None:
            print(original_xml[0:pos])
        print(mode_str)
        raise Exception(msg)
def create_formatted_xml(dt_root):
    """Return an indented, one-item-per-line rendering of a parsed document.

    Every top-level tag of ``dt_root`` is formatted separately and the
    results are joined with a newline.  Nested content is indented four
    spaces per level, text is stripped and whitespace-only text is
    dropped, and a tag with no content is written in self-closing form.

    Returns the string 'empty document' when ``dt_root`` has no children
    or none of them is a tag.  (The no-children case used to print that
    text and return None; both cases now return it.)

    NOTE(review): each top-level tag is handed to EventParser as though
    it were the document root, so only its descendants are visited and
    the top-level tag's own markup is never written.  That behaviour is
    kept as found; confirm whether it is intended.
    """
    # Nothing to format; checked before any formatter machinery is built.
    if len(dt_root.children) == 0:
        return 'empty document'

    def open_tag_pieces(indent, name, dict_attr, closer):
        # Build '<name key="value" ...' followed by ``closer``.  Each
        # attribute is followed by a space, so a tag with attributes
        # ends in '" >' or '" />'.
        pieces = [' ' * indent, '<', name]
        if len(dict_attr) > 0:
            pieces.append(' ')
            for key in dict_attr.keys():
                pieces.extend([key, '="', dict_attr[key], '" '])
        pieces.append(closer)
        return pieces

    class Formatter(EventParser):
        # Hack - note that this is receiving dt_tag rather than dt_root.
        # EventParser only needs a ``children`` attribute, so it works.
        def __init__(self, dt_tag):
            self.sb = []
            self.indent = 0
            # One flag per open scope: has its opening tag been written
            # yet?  The bottom entry stands for dt_tag itself, whose
            # markup is never written, so it starts out as True.
            self.stack__does_current_tag_have_content = [True]
            self.stack__tag = []
            super(Formatter, self).__init__(dt_tag)

        def start_document(self):
            pass

        def end_document(self):
            pass

        def _encountered_content(self):
            # The enclosing tag turns out to have content: write its
            # opening tag now (it was deferred in case the tag was
            # empty) and indent whatever follows.
            if not self.stack__does_current_tag_have_content[-1]:
                name, dict_attr = self.stack__tag[-1]
                self.sb.extend(
                    open_tag_pieces(self.indent, name, dict_attr, '>\n'))
                self.stack__does_current_tag_have_content[-1] = True
                self.indent = self.indent + 4

        def start_element(self, name, attributes):
            self._encountered_content()
            self.stack__tag.append((name, attributes))
            self.stack__does_current_tag_have_content.append(False)

        def end_element(self, name):
            name, dict_attr = self.stack__tag[-1]
            if not self.stack__does_current_tag_have_content[-1]:
                # Never opened, so it had no content: self-closing form.
                self.sb.extend(
                    open_tag_pieces(self.indent, name, dict_attr, '/>\n'))
            else:
                self.indent = self.indent - 4
                self.sb.extend([' ' * self.indent, '</', name, '>\n'])
            self.stack__does_current_tag_have_content.pop()
            self.stack__tag.pop()

        def text(self, dt_text):
            t = dt_text.text.strip()
            if len(t) > 0:
                self._encountered_content()
                self.stack__does_current_tag_have_content[-1] = True
                self.sb.extend([' ' * self.indent, t, '\n'])

        def get_output(self):
            return ''.join(self.sb)

    formatted = [Formatter(item).get_output()
                 for item in dt_root.children
                 if item.is_tag()]
    if not formatted:
        return 'empty document'
    return '\n'.join(formatted)
def text_from_path(dt_root, path):
    """Return the text found directly inside elements matching ``path``.

    ``path`` is a list of tag names such as ['html', 'head', 'title'].
    A match may begin at any depth: the first name is compared against
    every element met while no match is in progress, and each following
    name must be a direct child of the previous one.

    Returns a list of strings, one per non-blank text node that is an
    immediate child of a fully matched element, in document order and
    with surrounding whitespace stripped.  Text inside deeper
    descendants of the match is not included.  An empty ``path`` yields
    an empty list.

    Raises Exception when ``path`` is not a list.
    """
    if not isinstance(path, list):
        # Wrap in a one-tuple so that a tuple argument is shown in the
        # message rather than being unpacked by the % operator.
        msg = "Path supplied to text_from_path is invalid, %s." % (path,)
        raise Exception(msg)

    class Finder(EventParser):
        def __init__(self, dt_root, path):
            self.dt_root = dt_root
            self.path = path
            self.output = []
            #
            # One entry per element opened since a match on path[0]
            # began: True while that element continues the path, False
            # once the walk has strayed from it.  Empty while no match
            # is in progress.
            self.match_tracking = []
            super(Finder, self).__init__(dt_root)

        def start_element(self, name, attributes):
            depth = len(self.match_tracking)
            if depth == 0 and len(self.path) > 0:
                # Not inside a candidate match: only an element named
                # like the first path entry starts tracking.
                if self.path[0] == name:
                    self.match_tracking.append(True)
                return
            # Inside a candidate: still on the path only if there is
            # path left, the parent was on it, and this name matches the
            # next entry.  Everything else is tracked as False so that
            # descendants of a mismatch (or of a full match) are skipped.
            on_path = (depth < len(self.path)
                       and self.match_tracking[-1]
                       and self.path[depth] == name)
            self.match_tracking.append(on_path)

        def end_element(self, name):
            if len(self.match_tracking) > 0:
                self.match_tracking.pop()

        def text(self, dt_text):
            fully_matched = (len(self.path) > 0
                             and len(self.match_tracking) == len(self.path)
                             and self.match_tracking[-1])
            if fully_matched:
                t = dt_text.text.strip()
                if len(t) > 0:
                    self.output.append(t)

        def get_output(self):
            "Returns a list of String"
            return self.output[:]

    finder = Finder(dt_root, path)
    return finder.get_output()
def create_dt_root_from_xml_string(data):
    """Parse XML text one character at a time and return a DtRoot tree.

    Only a basic subset of XML is understood: open, close and
    self-closing tags, double-quoted attributes, and text.  Tag and
    attribute names start with a letter and continue with letters,
    digits, ':', '_' or '-'.  Any other construct (comments,
    declarations, single-quoted attribute values, ...) is rejected as an
    unexpected character.

    Space, tab, carriage return and line feed are all accepted wherever
    a space is allowed inside a tag.

    On a malformed document the current mode, the input consumed so far,
    the offending character and a traceback are printed, and the
    exception is re-raised.  A closing tag whose name differs from the
    open tag only prints a warning and closes the open tag anyway; a
    closing tag with no open tag at all raises Exception.

    NOTE(review): text appearing before the first tag is collected in a
    text block that is never attached to the root, so it is dropped,
    while text after the last tag is kept.  Left as found.
    """
    # Character classes, compiled once per call rather than once per
    # input character.
    name_start = re.compile('[a-zA-Z]')
    name_rest = re.compile('[a-zA-Z0-9:_-]')
    # The characters XML treats as whitespace.
    xml_space = ' \t\r\n'

    #
    # The initial stack has a root element and an
    # empty text block on top of it.
    doc_root = DtRoot()
    stack = [doc_root]

    def peek():
        return stack[-1]

    def pop():
        return stack.pop()

    def push(item):
        stack.append(item)

    push(DtText())

    def open_text_block():
        # After every '>' a new text node is attached to the current
        # parent and becomes the collection target in DEFAULT mode.
        text_ob = DtText()
        peek().add_child(text_ob)
        push(text_ob)

    #
    # Attribute buffers
    attr_name_chars = []
    attr_value_chars = []

    def complete_attr():
        attr = DtAttribute(''.join(attr_name_chars),
                           ''.join(attr_value_chars))
        peek().add_attribute(attr)
        del attr_name_chars[:]
        del attr_value_chars[:]
        return attr

    #
    # Close tag buffer
    close_tag_name_chars = []

    def complete_close_tag_name():
        closing_name = ''.join(close_tag_name_chars)
        if len(stack) <= 1:
            # Only the root is left: there is nothing to close, and the
            # root has no name to compare against.
            raise Exception(
                "Closing tag [%s] has no matching open tag." % closing_name)
        open_name = peek().name
        if closing_name != open_name:
            # xxx need decent handling
            print("Name mismatch! [%s] is open, closing [%s]."
                  % (open_name, closing_name))
        pop()
        del close_tag_name_chars[:]

    modemap = {0: 'DEFAULT'
               # '<'               1  TAG_BEGINNING
               # ELSE              -
               , 1: 'TAG_BEGINNING'
               # '/'               4  CLOSE_TAG_NAME_INITIAL
               # [a-zA-Z]          3  OPEN_TAG_NAME_REST
               , 3: 'OPEN_TAG_NAME_REST'
               # [a-zA-Z0-9:_-]    -
               # whitespace        6  OPEN_TAG_GAP
               # '/'               7  SELF_CLOSING_TAG_ENDING
               # '>'               0  DEFAULT
               , 4: 'CLOSE_TAG_NAME_INITIAL'
               # [a-zA-Z]          5  CLOSE_TAG_NAME_REST
               , 5: 'CLOSE_TAG_NAME_REST'
               # [a-zA-Z0-9_:-]    -
               # '>'               0  DEFAULT
               , 6: 'OPEN_TAG_GAP'
               # whitespace        -
               # [a-zA-Z]         10  ATTR_NAME_REST
               # '>'               0  DEFAULT
               # '/'               7  SELF_CLOSING_TAG_ENDING
               , 7: 'SELF_CLOSING_TAG_ENDING'
               # whitespace        -
               # '>'               0  DEFAULT
               , 10: 'ATTR_NAME_REST'
               # [a-zA-Z0-9_:-]    -
               # '='              11  ATTR_BETWEEN_NAME_AND_VALUE
               , 11: 'ATTR_BETWEEN_NAME_AND_VALUE'
               # '"'              13  ATTR_VALUE
               , 13: 'ATTR_VALUE'
               # '"'               6  OPEN_TAG_GAP
               # anything else     -
               }

    class Mode(object):
        # Mutable holder so the nested process() can switch state.
        def __init__(self):
            self.value = modemap[0]

        def to(self, idx):
            self.value = modemap[idx]

    def unexpected(car, data_pos, mode):
        m = "%s %s %s %s" % (data_pos, car, stack, mode.value)
        raise Exception(m)

    def process(car, data_pos, mode):
        state = mode.value
        if state == 'DEFAULT':
            if car == '<':
                pop()  # get rid of the text block
                mode.to(1)
            else:
                peek().append_to_text(car)
        elif state == 'TAG_BEGINNING':
            if car == '/':
                mode.to(4)
            elif name_start.match(car):
                newtag = DtTag()
                peek().add_child(newtag)
                push(newtag)
                newtag.append_to_name(car)
                mode.to(3)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'OPEN_TAG_NAME_REST':
            if name_rest.match(car):
                peek().append_to_name(car)
            elif car in xml_space:
                mode.to(6)
            elif car == '>':
                mode.to(0)
                open_text_block()
            elif car == '/':
                mode.to(7)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'CLOSE_TAG_NAME_INITIAL':
            if name_start.match(car):
                close_tag_name_chars.append(car)
                mode.to(5)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'CLOSE_TAG_NAME_REST':
            if name_rest.match(car):
                close_tag_name_chars.append(car)
            elif car == '>':
                complete_close_tag_name()
                open_text_block()
                mode.to(0)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'OPEN_TAG_GAP':
            if car in xml_space:
                pass
            elif name_start.match(car):
                attr_name_chars.append(car)
                mode.to(10)
            elif car == '>':
                open_text_block()
                mode.to(0)
            elif car == '/':
                mode.to(7)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'SELF_CLOSING_TAG_ENDING':
            if car in xml_space:
                pass
            elif car == '>':
                pop()  # the self-closed tag is complete
                open_text_block()
                mode.to(0)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'ATTR_NAME_REST':
            if name_rest.match(car):
                attr_name_chars.append(car)
            elif car == '=':
                mode.to(11)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'ATTR_BETWEEN_NAME_AND_VALUE':
            if car == '"':
                mode.to(13)
            else:
                unexpected(car, data_pos, mode)
        elif state == 'ATTR_VALUE':
            if car == '"':
                complete_attr()
                mode.to(6)
            else:
                attr_value_chars.append(car)
        else:
            # fatal_alg_error is declared with a leading placeholder
            # parameter ahead of the message, so None fills that slot.
            fatal_alg_error(None, "Unknown mode, [%s]." % state)

    #
    # Trigger the parse.
    mode = Mode()
    for data_pos, car in enumerate(data):
        try:
            process(car, data_pos, mode)
        except Exception:
            print("mode %s" % mode.value)
            print(data[0:data_pos])
            print(car)
            traceback.print_exc()
            # Bare raise keeps the original traceback intact.
            raise
    return doc_root
def fatal_alg_error(self, msg=None):
    """Report an unrecoverable internal error and terminate the process.

    This is a module-level function, so the leading ``self`` parameter
    is only a placeholder.  It is kept so that two-argument calls still
    work, while a call that passes just the message (which used to fail
    with TypeError) is now accepted too: when ``msg`` is omitted the
    first argument is taken to be the message.

    Prints ``FATAL: <message>`` and exits with status 1.
    """
    if msg is None:
        msg = self
    print("FATAL: %s" % (msg,))
    sys.exit(1)
if __name__ == '__main__':
    # Small self-demonstration: parse a sample document, then exercise
    # each of the output helpers in turn.
    data = """
<html>
<head><title name="something" /></head>
<body>
<p>a</p>
<br />
q
<br/>
p
<h1>b</h1>
c
<hr width="80%" />
d
<p first="1" second="2">
before
<b>efg</b>
after
</p>
</body>
</html>
"""
    dt_root = create_dt_root_from_xml_string(data)

    # Round-trip back to unformatted XML.
    print('test as_xml')
    print('--')
    print(dt_root.as_xml())
    print('')

    # Constructing the parser performs the walk and prints the events.
    print('test base EventParser')
    print('--')
    parser = DebugEventParser(dt_root)
    print('')

    # Indented rendering.
    print('test FormatXml')
    print('--')
    print(create_formatted_xml(dt_root))
    print('')
    print('')

    # Path-based text lookup.
    print('test text_from_path')
    print(text_from_path(dt_root, ['html', 'body']))
    print('')