Skip to content

Commit

Permalink
allow unicode in filterWiki fnc (was: only utf8)
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed May 20, 2011
1 parent 796f6b5 commit d6974e9
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
5 changes: 3 additions & 2 deletions src/gensim/corpora/wikicorpus.py
Expand Up @@ -68,11 +68,12 @@

def filterWiki(raw):
"""
Filter out wiki mark-up from utf8 string `raw`, leaving only text.
Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
or utf-8 encoded string.
"""
# parsing of the wiki markup is not perfect, but sufficient for our purposes
# contributions to improving this code are welcome :)
text = utils.decode_htmlentities(unicode(raw, 'utf8', 'ignore'))
text = utils.decode_htmlentities(utils.toUnicode(raw, 'utf8', errors='ignore'))
text = utils.decode_htmlentities(text) # ' ' --> '\xa0'
text = re.sub(RE_P2, "", text) # remove the last list (=languages)
# the wiki markup is recursive (markup inside markup etc)
Expand Down
11 changes: 9 additions & 2 deletions src/gensim/utils.py
Expand Up @@ -87,10 +87,17 @@ def tokenize(text, lowercase=False, deacc=False, errors="strict", toLower=False,
yield match.group()


def toUtf8(text, errors='strict'):
    """
    Return `text` as a utf-8 encoded bytestring.

    `text` is either a unicode object or a bytestring that is expected to
    already be utf-8 encoded. `errors` is forwarded to the decoding step for
    bytestring input ('strict', 'ignore', 'replace'); with the default
    'strict', invalid utf-8 input raises UnicodeDecodeError.
    """
    if isinstance(text, unicode):
        return text.encode('utf8')
    # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
    return unicode(text, 'utf8', errors=errors).encode('utf8')


def toUnicode(text, encoding='utf8', errors='strict'):
    """
    Return `text` as a unicode object.

    Unicode input is returned unchanged. Bytestring input is decoded using
    `encoding` (utf-8 by default); `errors` selects the decoding error
    policy ('strict', 'ignore', 'replace'). With the default 'strict',
    undecodable input raises UnicodeDecodeError.
    """
    if isinstance(text, unicode):
        return text
    # honour the caller-supplied encoding instead of always assuming utf8
    return unicode(text, encoding, errors=errors)


class SaveLoad(object):
Expand Down

0 comments on commit d6974e9

Please sign in to comment.