-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhtree.py
709 lines (559 loc) · 19.8 KB
/
htree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
# -*- coding: utf-8 -*-
#
# HTMLTree
#
# An HTML Node Tree toolkit.
#
# --------------------------------------------------------------------
#
# Copyright (c) 2015 by Waylan Limberg. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of HTMLTree nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY WAYLAN LIMBERG ''AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO HTMLTree
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# --------------------------------------------------------------------
#
# Parts inspired by ElementTree.
#
# The ElementTree toolkit is Copyright (c) 1999-2007 by Fredrik Lundh
#
# --------------------------------------------------------------------
from __future__ import unicode_literals
import sys
try:
from html import entities
except ImportError:
import htmlentitydefs as entities
__version__ = '0.0.1'
__all__ = [
'Element',
'Comment',
'Text',
'RawText',
'is_node',
'is_element',
'is_text',
'is_raw_text',
'is_comment',
'to_string',
'to_bytes'
]
if sys.version_info[0] == 3: # pragma: no cover
text_type = str
else: # pragma: no cover
text_type = unicode # noqa
# --------------------------------------------------------------------
# Helpers
HTML_EMPTY = set([
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta' 'param'
])
HTML_BLOCK = set([
'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote',
'pre', 'table', 'dl', 'ol', 'ul', 'script', 'noscript', 'form',
'fieldset', 'iframe', 'math', 'hr', 'style', 'li', 'dt', 'dd',
'thead', 'tbody', 'tr', 'th', 'td', 'section', 'footer',
'header', 'group', 'figure', 'figcaption', 'aside', 'article',
'canvas', 'output', 'progress', 'video', 'nav'
])
def is_node(node):
"""
Returns True is object is a node.
"""
return isinstance(node, Node)
def is_element(node):
"""
Returns True if object is an element node.
"""
return isinstance(node, Element)
def is_text(node, strict=False):
"""
Returns True if object is a Text node.
If `strict` is True, the object must not be a subclass.
"""
if strict:
return isinstance(node, Text) and node.__class__ is Text
else:
return isinstance(node, Text)
def is_raw_text(node):
"""
Returns True if object is a RawText node.
"""
return isinstance(node, RawText)
def is_comment(node):
"""
Returns True if object is a Comment node.
"""
return isinstance(node, Comment)
def is_entity(node):
"""
Returns True if object is an Entity node:
"""
return isinstance(node, Entity)
# --------------------------------------------------------------------
# Nodes
class Node(object):
"""
Base class for nodes.
All nodes inherit from this class. Do not use this class directly.
"""
parent = None
def __repr__(self):
return '<{0}() at {1:#x}>'.format(self.__class__.__name__, id(self))
def next_sibling(self):
"""
Return the next (one) sibling of this node.
Returns None if this node has no parent or is the last sibling.
"""
if self.parent is None:
return None
try:
i = self.parent.index(self)
return self.parent[i+1]
except (ValueError, IndexError):
return None
def next_siblings(self):
"""
Return an iterator of all siblings which follow this node.
"""
if self.parent is None:
return []
i = self.parent.index(self)
return self.parent[i+1:]
def previous_sibling(self):
"""
Return the previous (one) sibling of this node.
Returns None if this node has no parent or is the first sibling.
"""
if self.parent is None:
return None
try:
i = self.parent.index(self)
# Prevent returning parent[-1] by checking for i > 0
return self.parent[i-1] if i > 0 else None
except (ValueError, IndexError):
return None
def previous_siblings(self):
"""
Return an iterator of all siblings which precede this node.
"""
if self.parent is None:
return []
i = self.parent.index(self)
return self.parent[:i]
def iter_ancestors(self):
"""
Return a tree iterator of all ancestors in document order.
"""
if self.parent is not None:
yield self.parent
for p in self.parent.iter_ancestors():
yield p
def to_string(self, format='html'):
"""
Return a serialized unicode string of a node and its children.
`format` may be one of "html" or "xhtml".
"""
data = []
write = data.append
_serialize_node(write, self, format)
return "".join(data)
def to_bytes(self, format='html', encoding='utf-8'):
"""
Return a serialized byte string of a node and its children.
`format` may be one of "html" or "xhtml".
`encoding` defaults to utf-8.
"""
return self.to_string(format).encode(encoding, "xmlcharrefreplace")
class BaseTextNode(Node, text_type):
"""
Base text node. A node which holds textlike content.
Do not use this class directly. Use the various subclasses instead.
"""
def __repr__(self):
truncated = '{0}...'.format(self[:6]) if len(self) > 9 else self
return '<{0}("{1}") at {2:#x}>'.format(self.__class__.__name__, truncated, id(self))
class Comment(BaseTextNode):
"""
Comment node.
Contains the text of an HTML Comment.
"""
class Text(BaseTextNode):
"""
Text node.
Contains the text of a text node.
"""
class RawText(Text):
"""
RawText node.
Contains the text of a raw text node, which the serializer serializes
as raw text. Be warned that no escaping of any kind is done to raw text.
"""
class Entity(BaseTextNode):
"""
Entity Node.
Contains a single HTML Entity. Accepts a single Unicode character, a
Unicode code point, or an HTML entity name. Renders as an HTML5 entity.
"""
def __new__(cls, obj):
original = obj
if isinstance(obj, text_type) and len(obj) == 1:
# Convert Unicode char to code point
obj = ord(obj)
if obj in entities.codepoint2name:
# Get name of code point
obj = entities.codepoint2name[obj]
# Ensure name ends with semicolon
name = obj if obj.endswith(';') else obj + ';'
# PY2 does not include semicolon in entitydef names so we also check obj
if name in entities.entitydefs or obj in entities.entitydefs:
# Unicode objects are imuttable, so return a new object
return super(Entity, cls).__new__(cls, '&'+name)
else:
raise TypeError('{0} is not a valid HTML Entity.'.format(repr(original)))
class Element(Node):
"""
An HTML Element Node
An element's length is the number of children (including text nodes).
The element tag, and attributes must be unicode strings.
When a child is added, that child's `parent` attribute is assigned
as a reference to the parent instance. When a child is removed,
the child's `parent` attribute is set to `None`.
`tag` is the element name. All additional keyword arguments are element
attributes. If tag is `None`, only its children will be serialized.
All text is contained in child Text or RawText nodes. The content of
RawText nodes will not be escaped when serialized. Therefore, use RawText
nodes to hold the content of "script" and "style" elements.
Text and RawText nodes cannot contain any children. Neither can any
Element nodes with tag names listed in HTML_EMPTY.
"""
tag = None
"""The element's name."""
attrib = None
"""Dictionary of the element's attributes."""
def __init__(self, tag=None, **attrib):
self.tag = tag
self.attrib = attrib
self._children = []
def __repr__(self):
return '<{0}("{1}") at {2:#x}>'.format(self.__class__.__name__, self.tag, id(self))
def copy(self):
"""
Return a shallow copy of current element.
Subelements will be shared with the original tree.
The copied element will be detatched from the tree and have no parent.
"""
node = self.__class__(self.tag, **self.attrib)
node[:] = self
node.parent = None
return node
def __len__(self):
return len(self._children)
def __getitem__(self, index):
return self._children[index]
def __setitem__(self, index, node):
self._assert_can_contain_children()
if isinstance(index, slice):
for n in node:
self._assert_is_node(n)
n.parent = self
else:
self._assert_is_node(node)
node.parent = self
self._children[index] = node
def __delitem__(self, index):
if hasattr(self._children[index], 'parent'):
self._children[index].parent = None
del self._children[index]
def __iter__(self):
return iter(self._children)
def __contains__(self, node):
return node in self._children
def _assert_is_node(self, node):
if not is_node(node):
raise TypeError('expected a Node, not {0}'.format(type(node).__name__))
def _assert_can_contain_children(self):
if self.tag is not None and self.tag.lower() in HTML_EMPTY:
raise TypeError(
'{0} is an "empty" HTML element and cannot accept any children'.format(repr(self))
)
def index(self, node):
"""
Return the index of the given child node.
"""
return self._children.index(node)
def append(self, node):
"""
Add child node to the end of this node's children.
"""
self._assert_can_contain_children()
self._assert_is_node(node)
node.parent = self
self._children.append(node)
def extend(self, nodes):
"""
Append child nodes from a sequence to end of this node's children.
"""
self._assert_can_contain_children()
for node in nodes:
self._assert_is_node(node)
node.parent = self
self._children.extend(nodes)
def insert(self, index, node):
"""
Insert child node at index.
"""
self._assert_can_contain_children()
self._assert_is_node(node)
node.parent = self
self._children.insert(index, node)
def remove(self, node):
"""
Remove matching child node.
ValueError is raised if a matching node could not be found.
"""
if hasattr(node, 'parent'):
node.parent = None
self._children.remove(node)
def clear(self):
"""
Reset Node. Remove all children and clear all attributes.
"""
self.attrib.clear()
# Detach parent from each child
for child in self._children:
self.remove(child)
def get(self, key, default=None):
"""
Get attribute of node or default.
"""
return self.attrib.get(key, default)
def set(self, key, value):
"""
Set attribute of node.
"""
self.attrib[key] = value
def keys(self):
"""
Get list of attribute names.
"""
return self.attrib.keys()
def items(self):
"""
Get element attributes as a list of (name, value) pairs.
"""
return self.attrib.items()
def add_class(self, value):
"""
Add a class name to the `class` attribute.
"""
value = ' '.join([self.get('class', ''), value]).strip()
self.set('class', value)
def remove_class(self, value):
"""
Remove a class name from the `class` attribute.
"""
classes = self.get('class', '').split()
if value in classes:
classes.remove(value)
self.set('class', ' '.join(classes).strip())
def iter_decendents(self, tags=None):
"""
Return a tree iterator of this node and all decedents in document order.
`tags` is a sequence of tag names of nodes which will be returned.
If `tags` is empty (the default), all nodes will be returned.
"""
tags = tags or []
if len(tags) == 0 or self.tag in tags:
yield self
for c in self._children:
for gc in c.iter_decendents(tags):
yield gc
def iter_text(self, entities=True, raw=False):
"""
Return a tree iterator of all decedent text nodes in document order.
Set `entities` to `False to exclude Entity nodes.
Set `raw` to `True` to include RawText nodes.
"""
for child in self:
if is_raw_text(child) and not raw:
continue
if is_text(child) or (is_entity(child) and entities):
yield child
elif is_element(child):
for gc in child.iter_text():
yield gc
# --------------------------------------------------------------------
# Serialization
def _raise_serialization_error(text):
raise TypeError(
'cannot serialize {0} (type {1})'.format(repr(text), type(text).__name__)
)
def _newline_required(node, start=False):
tag = node.tag.lower()
if start:
# This is a start tag
if tag in HTML_BLOCK and tag not in HTML_EMPTY:
if tag in ['p', 'P']:
return False
if len(node) < 1:
return False
return True
return False
else:
# This is an end tag
if tag in HTML_BLOCK:
return True
if tag == 'br':
return True
if tag == 'img' and (node.parent is None or node.parent.tag not in ['p', 'P']):
return True
return False
def _escape_cdata(text):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if '&' in text:
text = text.replace('&', '&')
if '<' in text:
text = text.replace('<', '<')
if '>' in text:
text = text.replace('>', '>')
return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _escape_attrib(text):
# escape attribute value
try:
if '&' in text:
text = text.replace('&', '&')
if '<' in text:
text = text.replace('<', '<')
if '>' in text:
text = text.replace('>', '>')
if '"' in text:
text = text.replace('"', '"')
return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _serialize_node(write, node, format):
if is_comment(node):
write('<!-- {0} -->'.format(_escape_cdata(node)))
elif is_raw_text(node) or is_entity(node):
write(node)
elif is_text(node):
write(_escape_cdata(node))
elif is_node(node):
tag = node.tag
if tag is None:
for n in node:
_serialize_node(write, n, format)
else:
write('<{0}'.format(tag))
attribs = sorted(node.items()) # lexical order
for k, v in attribs:
v = _escape_attrib(v)
if k == v and format == 'html':
# handle boolean attributes
write(' {0}'.format(v))
else:
write(' {0}="{1}"'.format(k, v))
if format == 'xhtml' and tag.lower() in HTML_EMPTY:
write(' />')
else:
write('>')
if _newline_required(node, start=True):
write('\n')
if tag.lower() not in HTML_EMPTY:
for n in node:
_serialize_node(write, n, format)
write('</{0}>'.format(tag))
if _newline_required(node):
write('\n')
else:
_raise_serialization_error(node)
# --------------------------------------------------------------------
# Parser
class TreeBuilderError(Exception):
pass
class TreeBuilder(object):
"""
Generic node structure builder.
Converts a sequance of `TreeBuilder.start`, `TreeBuilder.data` and
`TreeBuilder.end` method calls to a well-formed node structure.
Use this class to build a node structure using an HTML parser or
to convert from another HTML like format.
Note that a single root element is required. If a document fragment
is being built, then you may need to create a root Element with a
tag of `None` first. All other nodes can then be created as children
of that root (None) Element.
Empty tags must be explicitly closed. A call to `TreeBuilder.end` must
be made immediatly after the call to `TreeBuilder.start`. Otherwise,
an attempt will be made to append children to an empty tag, which will
generate an error.
"""
def __init__(self):
self._nodes = [] # node stack
self._last = None # Last node
def close(self):
"""
Returns the toplevel node.
"""
if len(self._nodes) != 0:
raise TreeBuilderError('Missing end tags.')
if self._last is None:
raise TreeBuilderError('Missing toplevel element.')
return self._last
def start(self, tag, **attrs):
"""
Open a new Element Node.
"""
self._last = elem = Element(tag, **attrs)
if self._nodes:
self._nodes[-1].append(elem)
self._nodes.append(elem)
def end(self, tag):
"""
Close the current Element node.
"""
if self._nodes and self._nodes[-1].tag == tag:
self._last = self._nodes.pop()
elif self._nodes:
expected = self._nodes[-1].tag
raise TreeBuilderError('End tag mismatch (expected {0}, got {1})'.format(expected, tag))
else:
raise TreeBuilderError('No nodes to close.')
def data(self, data, node_type=None):
"""
Add a non-element node to the current Element node.
`node_type` should be the class of the desired node type. Defaults to `Text`.
"""
node_type = node_type or Text
node = node_type(data)
if self._nodes:
self._nodes[-1].append(node)
else:
raise TreeBuilderError('Missing toplevel element.')