Skip to content

Commit

Permalink
use utf-8 throughout htmldocck
Browse files: browse the repository at this point in the history
This commit improves compatibility with Python 3, which already uses
Unicode throughout.

It also fixes a subtle incompatibility stemming from the use of
`entitydefs`, which contains replacement text _encoded in latin-1_ for
HTML entities. When using Python 3, this would cause `0xa0` to be
incorrectly added to the element tree.

This meant that there was a rustdoc test that would pass under Python 2
but fail under Python 3, due to an incorrect regex match against the
non-breaking space character. This commit makes that failure reproduce in
both versions, and also fixes the affected test.
  • Loading branch information
euclio committed Jan 6, 2019
1 parent 6861426 commit 6fefcee
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 19 deletions.
46 changes: 28 additions & 18 deletions src/etc/htmldocck.py
@@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

r"""
htmldocck.py is a custom checker script for Rustdoc HTML outputs.
Expand Down Expand Up @@ -98,7 +101,10 @@
"""

from __future__ import print_function
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import sys
import os.path
import re
Expand All @@ -110,14 +116,10 @@
from HTMLParser import HTMLParser
from xml.etree import cElementTree as ET

# ⇤/⇥ are not in HTML 4 but are in HTML 5
try:
from html.entities import entitydefs
from html.entities import name2codepoint
except ImportError:
from htmlentitydefs import entitydefs
entitydefs['larrb'] = u'\u21e4'
entitydefs['rarrb'] = u'\u21e5'
entitydefs['nbsp'] = ' '
from htmlentitydefs import name2codepoint

# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
Expand Down Expand Up @@ -157,11 +159,11 @@ def handle_data(self, data):
self.__builder.data(data)

def handle_entityref(self, name):
self.__builder.data(entitydefs[name])
self.__builder.data(unichr(name2codepoint[name]))

def handle_charref(self, name):
code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
self.__builder.data(unichr(code).encode('utf-8'))
self.__builder.data(unichr(code))

def close(self):
HTMLParser.close(self)
Expand Down Expand Up @@ -210,11 +212,11 @@ def concat_multi_lines(f):
(?<=(?<!\S)@)(?P<negated>!?)
(?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
(?P<args>.*)$
''', re.X)
''', re.X | re.UNICODE)


def get_commands(template):
with open(template, 'rU') as f:
with io.open(template, encoding='utf-8') as f:
for lineno, line in concat_multi_lines(f):
m = LINE_PATTERN.search(line)
if not m:
Expand All @@ -226,7 +228,10 @@ def get_commands(template):
if args and not args[:1].isspace():
print_err(lineno, line, 'Invalid template syntax')
continue
args = shlex.split(args)
try:
args = shlex.split(args)
except UnicodeEncodeError:
args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)


Expand Down Expand Up @@ -280,7 +285,7 @@ def get_file(self, path):
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
raise FailedCheck('File does not exist {!r}'.format(path))

with open(abspath) as f:
with io.open(abspath, encoding='utf-8') as f:
data = f.read()
self.files[path] = data
return data
Expand All @@ -294,9 +299,9 @@ def get_tree(self, path):
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
raise FailedCheck('File does not exist {!r}'.format(path))

with open(abspath) as f:
with io.open(abspath, encoding='utf-8') as f:
try:
tree = ET.parse(f, CustomHTMLParser())
tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
except Exception as e:
raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
self.trees[path] = tree
Expand All @@ -313,7 +318,7 @@ def check_string(data, pat, regexp):
if not pat:
return True # special case a presence testing
elif regexp:
return re.search(pat, data) is not None
return re.search(pat, data, flags=re.UNICODE) is not None
else:
data = ' '.join(data.split())
pat = ' '.join(pat.split())
Expand Down Expand Up @@ -350,7 +355,7 @@ def check_tree_text(tree, path, pat, regexp):
break
except Exception as e:
print('Failed to get path "{}"'.format(path))
raise e
raise
return ret


Expand All @@ -359,7 +364,12 @@ def get_tree_count(tree, path):
return len(tree.findall(path))

def stderr(*args):
print(*args, file=sys.stderr)
if sys.version_info.major < 3:
file = codecs.getwriter('utf-8')(sys.stderr)
else:
file = sys.stderr

print(*args, file=file)

def print_err(lineno, context, err, message=None):
global ERR_COUNT
Expand Down
2 changes: 1 addition & 1 deletion src/test/rustdoc/issue-32374.rs
Expand Up @@ -10,7 +10,7 @@
// 'Deprecated since 1.0.0: text'
// @has - '<code>test</code>&nbsp;<a href="http://issue_url/32374">#32374</a>'
// @matches issue_32374/struct.T.html '//*[@class="stab unstable"]' \
// '🔬 This is a nightly-only experimental API. \(test #32374\)$'
// '🔬 This is a nightly-only experimental API. \(test\s#32374\)$'
/// Docs
#[rustc_deprecated(since = "1.0.0", reason = "text")]
#[unstable(feature = "test", issue = "32374")]
Expand Down

0 comments on commit 6fefcee

Please sign in to comment.