-
Notifications
You must be signed in to change notification settings - Fork 15
/
html.py
128 lines (106 loc) · 4.35 KB
/
html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# coding: utf-8
from io import StringIO
import re
from lxml.etree import Comment
from lxml.html import tostring, _looks_like_full_html_unicode, parse
try:
from django.utils.encoding import force_text
except ImportError: # For Django < 1.4.2
from django.utils.encoding import force_unicode as force_text
from .models import Term
from .settings import (
TERMS_IGNORED_TAGS, TERMS_IGNORED_CLASSES, TERMS_IGNORED_IDS,
TERMS_REPLACE_FIRST_ONLY, TERMS_ENABLED)
if TERMS_REPLACE_FIRST_ONLY:
    def del_other_occurrences(key, replace_dict, variants_dict):
        """Remove every variant of *key* from *replace_dict*, in place.

        Does nothing when *key* itself is not (or no longer) present in
        *replace_dict*.  Otherwise each entry of ``variants_dict[key]`` is
        lower-cased and dropped from *replace_dict*.
        """
        if key not in replace_dict:
            return
        for variant in variants_dict[key]:
            # Two variants may differ only by case and therefore share one
            # lower-cased key; the second removal is then silently a no-op.
            replace_dict.pop(variant.lower(), None)
else:
    def del_other_occurrences(*args, **kwargs):
        """Do nothing: accepts any arguments and leaves them untouched."""
        pass
def get_translate_function(replace_dict, variants_dict):
    """Build the substitution callback used with the terms regexp.

    The returned ``translate(match)`` expects a match object exposing the
    named groups ``before`` and ``name`` and returns the text that should
    stand in for the match.

    *replace_dict* maps a lower-cased name to a ``(template, case_sensitive)``
    pair, where ``template`` is a ``%``-format string receiving the matched
    name.  *variants_dict* maps the same lower-cased name to the collection
    of accepted spellings.
    """
    def translate(match):
        prefix, matched = match.group('before', 'name')
        lookup = matched.lower()

        # Unknown (or already consumed) name: give the text back unchanged.
        if lookup not in replace_dict:
            return prefix + matched

        template, case_sensitive = replace_dict[lookup]

        # A case-sensitive term only matches one of its exact spellings.
        if case_sensitive and matched not in variants_dict[lookup]:
            return prefix + matched

        substituted = template % matched
        # NOTE(review): this call is assumed to belong to the "replacement
        # applied" branch only -- confirm against the upstream file.
        del_other_occurrences(lookup, replace_dict, variants_dict)
        return prefix + substituted

    return translate
def is_valid_node(node):
    """Tell whether *node* may be searched for terms.

    A node is rejected when it is a comment, when its tag is listed in
    ``TERMS_IGNORED_TAGS``, when its ``id`` attribute is listed in
    ``TERMS_IGNORED_IDS``, or when any of its CSS classes is listed in
    ``TERMS_IGNORED_CLASSES``.
    """
    tag = node.tag
    if tag is Comment:
        return False
    if tag in TERMS_IGNORED_TAGS:
        return False
    if node.get('id') in TERMS_IGNORED_IDS:
        return False

    # The class attribute is a whitespace-separated list; a missing
    # attribute yields an empty set, which never intersects anything.
    css_classes = frozenset(node.get('class', '').split())
    return not css_classes.intersection(TERMS_IGNORED_CLASSES)
def get_text(node):
    """Return the text directly owned by *node*, with ``&`` escaped.

    The result is the node's own leading text followed by the tail text of
    each direct child, in document order.  Text *inside* the children is not
    included.  Missing (``None``) text or tail counts as an empty string.

    ``&`` is returned as ``&amp;`` so that the text is comparable with the
    serialized markup on which ``replace_terms`` later applies the same
    regexp (serialization writes ``&`` as ``&amp;``).  The previous
    ``replace('&', '&')`` was a no-op and left the two forms out of sync.
    """
    pieces = [node.text or '']
    pieces.extend(subnode.tail or '' for subnode in node.getchildren())
    return ''.join(pieces).replace('&', '&amp;')
def get_interesting_contents(parent_node, replace_regexp):
    """Yield the nodes under *parent_node* whose own text matches.

    Walks the tree depth-first, parent before children.  A node is yielded
    when ``is_valid_node`` accepts it and *replace_regexp* finds a match in
    the text returned by ``get_text``.  The subtree below a node rejected by
    ``is_valid_node`` is skipped entirely.
    """
    if not is_valid_node(parent_node):
        return

    own_text = get_text(parent_node)
    if own_text and replace_regexp.search(own_text):
        yield parent_node

    for child in parent_node.getchildren():
        for descendant in get_interesting_contents(child, replace_regexp):
            yield descendant
# Matches markup that, apart from surrounding whitespace, starts with an
# opening ``<p...>`` tag and ends with ``</p>``; ``.`` also spans newlines.
PARAGRAPH_RE = re.compile(r'^\s*<p[^>]*>.*</p>\s*$', re.DOTALL)
if TERMS_ENABLED:
    def replace_terms(original_html):
        """Return *original_html* with the known terms substituted.

        The input is coerced to text and parsed with ``lxml.html.parse``.
        Every node reported by ``get_interesting_contents`` is serialized,
        run through the terms regexp with the callback built by
        ``get_translate_function``, re-parsed, and swapped into the tree.

        When the input does not look like a full HTML document, only the
        contents of the first child of the parsed root are returned (and, if
        the parser added a lone ``<p>`` that the input did not have, that
        paragraph is unwrapped as well).  Otherwise the whole serialized
        tree is returned.

        Empty input is returned as coerced text; input with no matching
        node is returned as the untouched *original_html*.
        """
        html = force_text(original_html)
        if not html:
            return html

        tree = parse(StringIO(html))
        root_node = tree.getroot()

        is_fragment = not _looks_like_full_html_unicode(html)
        unwrap_paragraph = False
        if is_fragment:
            # Work below the wrapper elements the parser put around the
            # fragment.
            root_node = root_node.getchildren()[0]
            wrapped = root_node.getchildren()
            unwrap_paragraph = (
                len(wrapped) == 1
                and wrapped[0].tag == 'p'
                and PARAGRAPH_RE.match(html) is None)

        variants_dict = Term.objects.variants_dict()
        replace_dict = Term.objects.replace_dict()
        replace_regexp = Term.objects.replace_regexp()
        translate = get_translate_function(replace_dict, variants_dict)

        # Materialize the generator first: the loop below mutates the tree
        # that the generator walks.
        matching_nodes = list(
            get_interesting_contents(root_node, replace_regexp))
        if not matching_nodes:
            return original_html

        for old_node in matching_nodes:
            markup = tostring(old_node, encoding='unicode')
            translated = replace_regexp.sub(translate, markup)
            replacement = parse(
                StringIO(translated)).getroot().getchildren()[0]
            if old_node.tag != 'body':
                # Step one level further down to reach the re-parsed
                # counterpart of a non-body node.
                replacement = replacement.getchildren()[0]
            old_node.getparent().replace(old_node, replacement)

        if not is_fragment:
            return tostring(tree, encoding='unicode')

        if unwrap_paragraph:
            root_node = root_node.getchildren()[0]
        serialized_children = [
            tostring(child, encoding='unicode')
            for child in root_node.getchildren()]
        return (root_node.text or '') + ''.join(serialized_children)
else:
    def replace_terms(html):
        """Return *html* unchanged (terms replacement is disabled)."""
        return html