forked from buriy/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoding.py
65 lines (55 loc) · 2 KB
/
encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
try:
import cchardet
except ImportError:
import chardet
import sys
# Patterns that locate a charset declaration in raw markup.  They are bytes
# patterns because ``get_encoding`` inspects the page *before* it is decoded.
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

# Matches a run of tags together with the whitespace around them.
_RE_TAGS = re.compile(br'(\s*</?[^>]*>)+\s*')

# Lower-cased charset name -> codec name to use in its place.
CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


def fix_charset(encoding):
    """Return the codec name to use for a declared or detected charset.

    Overrides the encoding when the declared or detected charset is a
    subset of a larger charset.  Created because of issues with Chinese
    websites.

    :param encoding: charset name (``str``); matched case-insensitively.
    :returns: the replacement from ``CHARSETS`` when one is listed,
        otherwise the lower-cased input.
    """
    encoding = encoding.lower()
    return CHARSETS.get(encoding, encoding)


def _load_detector():
    """Return the charset detection module, preferring ``cchardet``.

    Resolved here rather than through a module-level name so the fallback
    works whichever of the two packages is installed.

    :raises ImportError: when neither ``cchardet`` nor ``chardet`` exists.
    """
    try:
        import cchardet as detector
    except ImportError:
        import chardet as detector
    return detector


def get_encoding(page):
    """Guess the name of the codec that decodes *page*.

    Encodings declared in the markup are tried first, in this order: every
    ``RE_CHARSET`` match, then every ``RE_PRAGMA`` match, then the XML
    prolog (``RE_XML``).  The first one that decodes the whole page wins.
    If none does, the tags are stripped and the remaining text is handed to
    the charset detector.

    :param page: the undecoded document, as ``bytes``.
    :returns: a codec name (``str``) already passed through
        ``fix_charset``; ``'utf-8'`` when fewer than 10 bytes of text
        remain after stripping tags or when the detector reports no
        encoding.
    """
    # Regex for XML and HTML Meta charset declaration
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        # The match is bytes but .decode() only accepts a `str` codec name.
        # Decode blindly with ascii because no one should ever use
        # non-ascii characters in the name of an encoding.
        encoding = fix_charset(declared_encoding.decode("ascii", "replace"))
        try:
            page.decode(encoding)
        except (LookupError, ValueError):
            # LookupError: the declared name is not a known codec.
            # ValueError: covers UnicodeDecodeError (the page is not valid
            # in that codec) and malformed codec names.
            continue
        # It worked!
        return encoding

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = _RE_TAGS.sub(b' ', page).strip()
    if len(text) < 10:
        return 'utf-8'  # can't guess
    res = _load_detector().detect(text)
    return fix_charset(res["encoding"] or "utf-8")