Skip to content

Commit

Permalink
ignore repeated zero-width characters
Browse files (browse the repository at this point in the history)
  • Loading branch information
normalhuman committed Mar 10, 2016
1 parent 0b246c4 commit cfb8109
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion findspam.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -41,7 +41,7 @@ def has_repeating_characters(s, site):
s = regex.sub('http[^"]*', "", s) # remove URLs for this check
if s is None or len(s) == 0:
return False, ""
- matches = regex.compile("([^\\s_.,?!=~*/0-9-])(\\1{10,})", regex.UNICODE).findall(s)
+ matches = regex.compile(u"([^\\s_\u200b\u200c.,?!=~*/0-9-])(\\1{10,})", regex.UNICODE).findall(s)
matches = ["".join(match) for match in matches]
match = "".join(matches)
if (100 * len(match) / len(s)) >= 20:
Expand Down

0 comments on commit cfb8109

Please sign in to comment.