-
Notifications
You must be signed in to change notification settings - Fork 16
/
xmlutil.py
65 lines (56 loc) · 1.86 KB
/
xmlutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from lxml import etree
import re
# Matches an XML declaration at the very start of a string that carries an
# ``encoding`` pseudo-attribute; group 1 captures the declared encoding name.
#
# The pattern is a raw string so that ``\?`` is a regex escape rather than an
# invalid string escape.  ``[^>]`` is used instead of ``.`` so the match can
# never run past the ``?>`` that closes the declaration: with a greedy ``.*``
# a single-line document containing a later processing instruction would be
# matched (and, when used with ``sub``, stripped) up to the *last* ``?>``.
encoding_rx = re.compile(
    r'^<\?xml\s[^>]*?encoding=["\']([^"\'>]+)["\'][^>]*\?>'
)


def detect_encoding(s):
    """Return the encoding named in the XML declaration that starts ``s``.

    Args:
        s: XML text as a ``str``.  ``encoding_rx`` is a text pattern, so a
            ``bytes`` argument makes ``re`` raise ``TypeError``.

    Returns:
        The declared encoding name exactly as written (for example
        ``"utf-8"``), or ``None`` when ``s`` does not begin with an XML
        declaration that has an ``encoding`` pseudo-attribute.
    """
    m = encoding_rx.match(s)
    return None if m is None else m.group(1)
def fromstring(s):
    """Parse ``s`` with ``etree.fromstring``, working around encoding declarations.

    The attempts are made in this order:

    1. ``etree.fromstring(s)`` exactly as given.
    2. If that raised ``ValueError`` and ``s`` is a ``str`` whose XML
       declaration names an encoding (see ``detect_encoding``): encode ``s``
       with that encoding and parse the resulting bytes.
    3. Remove the declaration matched by ``encoding_rx``, strip surrounding
       whitespace, and parse what is left.

    Only ``ValueError`` (and, in step 2, ``LookupError`` from an unknown
    codec name) triggers a fallback; any other exception propagates
    immediately.

    Raises:
        ValueError: the error from step 1, re-raised unchanged, when ``s`` is
            not a ``str`` (no fallback is attempted in that case).
        Exception: whatever ``etree.fromstring`` raises in step 3.
    """
    # Attempt 1: hand the input straight to the parser.
    try:
        return etree.fromstring(s)
    except ValueError as first_failure:
        # Keep a reference; the name bound by ``as`` is cleared when the
        # handler exits.
        direct_error = first_failure

    # The fallbacks below only make sense for text input.
    if not isinstance(s, str):
        raise direct_error

    # Attempt 2: turn the text into bytes using the encoding it declares.
    declared = detect_encoding(s)
    if declared is not None:
        try:
            return etree.fromstring(s.encode(declared))
        except (LookupError, ValueError):
            # LookupError: the declared encoding is not a known codec.
            # ValueError: encoding or parsing with it failed.
            # Either way, fall through to the last resort.
            pass

    # Attempt 3: drop the declaration and let any remaining error surface.
    return etree.fromstring(encoding_rx.sub("", s).strip())
def xp_first_text(element, xpath, default=None):
    """Return the ``.text`` of the first result of ``element.xpath(xpath)``.

    Args:
        element: object exposing an ``xpath(expression)`` method.
        xpath: expression passed through to ``element.xpath``.
        default: value returned when the query produces no results.

    Returns:
        ``.text`` of the first result, or ``default`` when there are no
        results.  A first result whose ``.text`` is ``None`` yields ``None``,
        not ``default``.
    """
    matches = element.xpath(xpath)
    if len(matches) == 0:
        return default
    return matches[0].text
def xp_texts(element, xpath):
    """Collect the ``.text`` of every result of ``element.xpath(xpath)``.

    Results whose ``.text`` is ``None`` are skipped; empty strings are kept.
    The returned list preserves the order of the query results.
    """
    texts = []
    for node in element.xpath(xpath):
        value = node.text
        if value is not None:
            texts.append(value)
    return texts
def objectify(element):
    """Convert the children of ``element`` into a nested ``dict``.

    Each direct child contributes one entry keyed by its ``tag``: a child
    that has children of its own is converted recursively, otherwise the
    value is the child's ``.text`` (which may be ``None``).  When several
    siblings share a tag, the last one wins because later assignments
    overwrite earlier ones.  The text and attributes of ``element`` itself
    are not included.

    The element is iterated directly and measured with ``len()`` instead of
    calling the deprecated ``getchildren()``, so any element type that
    supports the ElementTree iteration protocol is accepted.

    Args:
        element: an ElementTree-style element.

    Returns:
        A ``dict`` mapping child tags to text or nested dicts; empty when
        ``element`` has no children.
    """
    obj = {}
    for child in element:
        # FIXME: attributes are not handled yet (idea: store each one under
        # an "@" + attribute-name key).
        if len(child) > 0:
            obj[child.tag] = objectify(child)
        else:
            obj[child.tag] = child.text
    return obj