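"""Convert SGML character entities into Unicode.

The conversion function was taken from a publicly posted snippet (the
original link is not preserved in this file). Thanks agazso!
"""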
import re
import htmlentitydefs
def convert(s):
    """Replace SGML/HTML character entities in ``s`` with Unicode characters.

    Two forms are recognised:

    * decimal numeric references such as ``&#233;``
    * named entities such as ``&eacute;`` or ``&amp;`` that appear in the
      standard HTML entity table (``name2codepoint``)

    Anything else that merely looks like an entity (an unknown name, or a
    number outside the Unicode range) is left in the string untouched.

    The string is decoded in a single left-to-right pass, so text produced
    by one replacement is never decoded a second time: ``&amp;lt;`` becomes
    ``&lt;`` rather than ``<``.

    Note: adapted from a third-party snippet; the source link is missing
    from this file.

    Args:
        s: Text that may contain character entities.

    Returns:
        The text with every recognised entity replaced.
    """
    # Resolved locally, with fallbacks, so the function works on both
    # Python 3 (html.entities / chr) and Python 2 (htmlentitydefs / unichr).
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def _decode(match):
        """Return the replacement text for one entity-shaped match."""
        digits, name = match.groups()
        if digits is not None:
            try:
                return to_char(int(digits))
            except (ValueError, OverflowError):
                # Code point outside the range the interpreter supports:
                # keep the original text instead of failing.
                return match.group(0)
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity name: leave it exactly as written.
            return match.group(0)
        return to_char(codepoint)

    # One alternation covers both "&#123;" and "&name;" so each entity is
    # visited exactly once.
    return re.sub(r"&(?:#(\d+)|(\w+));", _decode, s)
