From bfe6164a01e19e1b8e032bcfb6cdb278c1acb188 Mon Sep 17 00:00:00 2001 From: malcolm smith Date: Tue, 21 Apr 2015 22:10:25 -0400 Subject: [PATCH] support funky Microsoft Word XHTML unicode escapes --- html2text/__init__.py | 5 +++-- html2text/utils.py | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1f3e803fb7..61bd602973 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -14,6 +14,7 @@ from html2text import config from html2text.utils import ( + wide_unichr, name2cp, unifiable_n, google_text_emphasis, @@ -703,7 +704,7 @@ def charref(self, name): return unifiable_n[c] else: try: - return unichr(c) + return wide_unichr(c) except NameError: # Python3 return chr(c) @@ -720,7 +721,7 @@ def entityref(self, c): return config.UNIFIABLE[c] else: try: - return unichr(name2cp(c)) + return wide_unichr(name2cp(c)) except NameError: # Python3 return chr(name2cp(c)) diff --git a/html2text/utils.py b/html2text/utils.py index 0946930f4d..bc9fb48f45 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,8 +1,17 @@ import sys from html2text import config +import struct from html2text.compat import htmlentitydefs +# Based on http://stackoverflow.com/questions/7105874/valueerror-unichr-arg-not-in-range0x10000-narrow-python-build-please-hel +def wide_unichr(i): + try: + return unichr(i) + except ValueError: + return struct.pack('i', i).decode('utf-32') + + def name2cp(k): if k == 'apos':