# Patch summary (descriptive preamble; text before the first `diff --git`
# header is not part of any hunk).
#
# Files touched:
#   * bleach/__init__.py
#   * bleach/tests/test_basics.py
#
# bleach/__init__.py
#   - Adds `import logging` and a module-level logger,
#     `log = logging.getLogger('bleach')`.
#   - Replaces two direct `force_unicode(_serialize(...))` returns with calls
#     to the new `render(tree, source)` helper: one call site that also
#     `.strip()`s the result, and one that returns `render(forest, text)`
#     (apparently the end of `linkify` -- confirm against the full file).
#   - Adds `render(tree, source)`:
#       1. tries `force_unicode(_serialize(tree))`;
#       2. on any `Exception`, logs 'HTML: <exc> ::: <source>' via
#          `log.error` and tries `force_unicode(tree.to_xml())`;
#       3. if that also raises, logs 'XML: <exc> ::: <source>' and
#          returns u''.
#     `source` is used only in the log messages.
#
# bleach/tests/test_basics.py
#   - Imports `html5lib` and `render`.
#   - Adds `test_xml_render`, which asserts that rendering the parse of an
#     empty fragment yields ''.
#
# NOTE(review): the added code uses `except Exception, e`, which is
#   Python 2-only syntax.
# NOTE(review): the `log.error` calls format their message eagerly with `%`
#   rather than passing arguments to the logger.
# NOTE(review): `test_xml_render` is named for the XML fallback, but it is
#   not visible here whether an empty fragment ever reaches that branch --
#   confirm the test exercises what its name says.
# NOTE(review): the line breaks of this patch appear to have been collapsed
#   onto a single line; it presumably cannot be applied with `git apply` /
#   `patch` until the original line structure is restored.
diff --git a/bleach/__init__.py b/bleach/__init__.py index 3e7fe411..7d43a37d 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,3 +1,4 @@ +import logging import re import html5lib @@ -5,6 +6,8 @@ from sanitizer import BleachSanitizer from encoding import force_unicode +log = logging.getLogger('bleach') + ALLOWED_TAGS = [ 'a', 'abbr', @@ -72,7 +75,7 @@ class s(BleachSanitizer): parser = html5lib.HTMLParser(tokenizer=s) - return force_unicode(_serialize(parser.parseFragment(string))).strip() + return render(parser.parseFragment(string), string).strip() def linkify(self, text, nofollow=True): """Convert URL-like strings in an HTML fragment to links. @@ -141,7 +144,7 @@ def link_repl(match): linkify_nodes(forest) - return force_unicode(_serialize(forest)) + return render(forest, text) def filter_url(self, url): """Applied to the href attribute of an autolinked URL""" @@ -152,6 +155,19 @@ def filter_text(self, url): return url +def render(tree, source): + """Try rendering as HTML, then XML, then give up.""" + try: + return force_unicode(_serialize(tree)) + except Exception, e: + log.error('HTML: %r ::: %r' % (e, source)) + try: + return force_unicode(tree.to_xml()) + except Exception, e: + log.error('XML: %r ::: %r' % (e, source)) + return u'' + + def _serialize(domtree): walker = html5lib.treewalkers.getTreeWalker('simpletree') stream = walker(domtree) diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py index a329c23a..e4d50a46 100644 --- a/bleach/tests/test_basics.py +++ b/bleach/tests/test_basics.py @@ -1,6 +1,8 @@ +import html5lib + from nose.tools import eq_ -from bleach import Bleach +from bleach import Bleach, render b = Bleach() @@ -84,3 +86,8 @@ def test_serializer(): def test_no_href_links(): s = u'x' eq_(s, b.linkify(s, nofollow=False)) + + +def test_xml_render(): + parser = html5lib.HTMLParser() + eq_(render(parser.parseFragment(''), 'src'), '')