Skip to content

Commit

Permalink
ignore 0-9 for mostly non-latin check
Browse files Browse the repository at this point in the history
  • Loading branch information
normalhuman committed Mar 24, 2016
1 parent c9ebd10 commit 0a8859f
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def non_english_link(s, site): # non-english link in short answer


# Flag text whose word characters are mostly outside the Latin alphabet.
# Returns a (flagged, reason) tuple; reason is "" when the text is not flagged.
# NOTE(review): this is a diff view whose +/- markers and indentation were lost
# in extraction, so each changed statement appears twice: the pre-commit line
# first, with the post-commit line directly after it.
def mostly_non_latin(s, site): # `site` is not referenced in the lines shown here
# Pre-commit: strip every non-word character (Unicode-aware via (?u)).
word_chars = regex.sub(r"(?u)\W", "", s)
# Post-commit: also strip the digits 0-9 so numbers do not count as word characters.
word_chars = regex.sub(r"(?u)[\W0-9]", "", s)
# Remove everything this pattern's \w matches. The pattern carries no (?u) flag,
# so presumably \w is ASCII-only here and what remains is the non-Latin
# characters -- TODO confirm for the Python version and string type in use.
non_latin_chars = regex.sub(r"\w", "", word_chars)
# Flag when more than 40% of the word characters are left over.
if (len(non_latin_chars) > 0.4 * len(word_chars)):
# Pre-commit reason string.
return True, u"Mostly non-Latin alphabet."
# Post-commit reason string, reporting both counts.
return True, u"Text contains {} non-Latin characters out of {}".format(len(non_latin_chars), len(word_chars))
return False, ""


Expand Down

0 comments on commit 0a8859f

Please sign in to comment.