# page_parser.py
import re
from logging import error

from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit

from url_helpers import absolute_url

__all__ = [
    'Unparseable',
    'parse',
    'get_title',
    'get_body',
    'ascii',
]


def debug(s):
    # no-op placeholder; swap in a real logger to trace the cleanup steps
    pass


class Unparseable(ValueError):
    """Raised when every available parse strategy has failed."""


def parse(raw_content, base_href=None, notify=lambda x: None):
    """Try each parse strategy in turn, returning the first soup that works."""
    for parse_method in _parse_methods():
        try:
            return parse_method(raw_content, base_href)
        except HTMLParseError, e:
            notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
            continue
    raise Unparseable()
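
# A minimal usage sketch (`html` and the URL are illustrative; fetch the raw
# markup however you like):
#
#   soup = parse(html, base_href='http://example.com/article')
#   title = get_title(soup)
#   body = get_body(soup)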


def get_title(soup):
    # soup.title.string can be None, which unicode() would render as u'None'
    title = unicode(getattr(soup.title, 'string', None) or '')
    if not title:
        return None
    return normalize_spaces(title)


def get_body(soup):
    # drop scripts, stylesheets and style blocks before serialising
    for elem in soup.findAll(['script', 'link', 'style']):
        elem.extract()
    raw_html = unicode(soup.body or soup)
    cleaned = clean_attributes(raw_html)
    try:
        # re-parse to check that attribute stripping didn't corrupt the markup
        BeautifulSoup(cleaned)
        return cleaned
    except HTMLParseError:
        error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html


def ascii(s):
    # lossy downgrade: any byte that isn't ascii is silently dropped
    return s.decode('ascii', 'ignore')
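
# e.g. ascii('caf\xc3\xa9') -> u'caf' (the non-ascii bytes are discarded)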


class Replacement(object):
    """A named regex substitution that can be applied to a string."""

    def __init__(self, desc, regex, replacement):
        self.desc = desc
        self.regex = regex
        self.replacement = replacement

    def apply(self, content):
        # # useful for debugging:
        # try:
        #     print self.desc + ':' + str(self.regex.findall(content))
        # except RuntimeError: pass
        return self.regex.sub(self.replacement, content)
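
# An illustrative Replacement (not part of the ruleset below) that would
# collapse runs of <br> tags into a single one:
#
#   collapse_brs = Replacement('collapsed breaks',
#       regex=re.compile(r'(<br */?>\s*){2,}', re.IGNORECASE),
#       replacement='<br/>')
#   content = collapse_brs.apply(content)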


def beautiful_soup(content, base_href):
    """Parse content and, if a base URL is given, absolutise links and images."""
    soup = BeautifulSoup(content)
    if base_href:
        _fix_references(soup, base_href)
    return soup


def _make_absolute_links(soup, base_href):
    for link in soup.findAll('a', attrs={'href': True}):
        link['href'] = absolute_url(link['href'], base_href)


def _make_absolute_images(soup, base_href):
    for img in soup.findAll('img', attrs={'src': True}):
        img['src'] = absolute_url(img['src'], base_href)


def _fix_references(soup, base_href):
    _make_absolute_links(soup, base_href)
    _make_absolute_images(soup, base_href)


# a bunch of regexes to hack around lousy html
dodgy_regexes = (
    Replacement('javascript',
        regex=re.compile(r'<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
        replacement=''),

    Replacement('double double-quoted attributes',
        regex=re.compile(r'(="[^"]+")"+'),
        replacement=r'\1'),

    Replacement('unclosed tags',
        regex=re.compile(r'(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
        replacement=r'\1>\2'),

    Replacement('unclosed (numerical) attribute values',
        regex=re.compile(r'(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
        replacement=r'\1"\2'),
)
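
# For example, the 'double double-quoted attributes' rule rewrites the
# (illustrative) broken markup
#   <a href="http://example.com/"">link</a>
# into
#   <a href="http://example.com/">link</a>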

# helpers for parsing

def normalize_spaces(s):
    """Replace any run of whitespace characters with a single space."""
    return ' '.join(s.split())
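
# e.g. normalize_spaces(u' a\tb\n c ') -> u'a b c'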


def _remove_crufty_html(content):
    # run every workaround in dodgy_regexes over the raw markup
    for replacement in dodgy_regexes:
        content = replacement.apply(content)
    return content


def _parse_methods():
    def unicode_cleansed(content, base_href):
        content = UnicodeDammit(content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    def ascii_cleansed(content, base_href):
        content = ascii(content)
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    # strategies in decreasing order of fidelity: raw markup first, then a
    # unicode-normalised cleanup, then a lossy ascii-only cleanup
    return (
        beautiful_soup,
        unicode_cleansed,
        ascii_cleansed)


# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
    "([^>]*)"  # postfix
    ">"  # end
    , re.I)


def clean_attributes(html):
    # the regex removes only one bad attribute per match, so loop until
    # no offending attributes remain
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html
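
# e.g. clean_attributes('<p style="color: red" align="left">hi</p>')
# returns '<p align="left">hi</p>' (input is illustrative)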