Skip to content
Permalink
Browse files

Adds new character_utilization_ratio function to check if a single

character is overused in a post.

Initially only active on judaism.stackexchange.com
  • Loading branch information
AWegnerGitHub committed Jun 7, 2017
1 parent d045ea1 commit 3b62ae06bb94d151c39aef701921e128fd607e90
Showing with 31 additions and 2 deletions.
  1. +30 −1 findspam.py
  2. +1 −1 test/test_spamhandling.py
@@ -11,9 +11,11 @@
from urllib.parse import urlparse
from helpers import all_matches_unique, log
from itertools import chain
from collections import Counter

# NOTE(review): similarity cut-offs; the code consuming them is not visible here —
# presumably ratios in [0, 1] used by the username/answer similarity checks. Confirm.
SIMILAR_THRESHOLD = 0.95
SIMILAR_ANSWER_THRESHOLD = 0.7
# Maximum share of the text that one character may occupy: character_utilization_ratio()
# reports a match when the most frequent character's count / total length exceeds this.
CHARACTER_USE_RATIO = 0.42
# Matches a whole message of the form "Domain <text> didn't <anything>!" and captures
# the text between "Domain " and " didn't".
EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
# Pre-compiled form of EXCEPTION_RE so the pattern is built once at import time.
RE_COMPILE = regex.compile(EXCEPTION_RE)
COMMON_MALFORMED_PROTOCOLS = [
@@ -343,6 +345,26 @@ def username_similar_website(s, site, *args):
return False, ""


# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
def character_utilization_ratio(s, site, *args):
    """Report whether a single character dominates the text ``s``.

    The most frequent character's share of ``s`` (its count divided by
    ``len(s)``) is compared against the module-level ``CHARACTER_USE_RATIO``.

    :param s: The text to inspect.
    :param site: Unused here; accepted so the signature matches the other
        check functions in this module.
    :param args: Unused extra positional arguments.
    :return: ``(True, reason)`` when the share is strictly greater than
        ``CHARACTER_USE_RATIO``, where ``reason`` names the offending
        character; otherwise ``(False, "")``.
    """
    # Nothing to measure in empty (or missing) text; also avoids indexing an
    # empty most_common() result below.
    if not s:
        return False, ""

    # most_common(1) yields the (character, count) pair with the highest count.
    highest_char, highest_count = Counter(s).most_common(1)[0]
    highest_ratio = highest_count / float(len(s))

    if highest_ratio > CHARACTER_USE_RATIO:
        # Fill in the placeholder so the reason actually names the character.
        return True, "The `{}` character appears in a high percentage of the post".format(highest_char)
    return False, ""


# noinspection PyMissingTypeHints
def perform_similarity_checks(post, name):
"""
@@ -988,7 +1010,14 @@ class FindSpam:
{'method': similar_answer, 'all': True, 'sites': ["codegolf.stackexchange.com"],
'reason': "answer similar to existing answer on post", 'whole_post': True,
'title': False, 'body': False, 'username': False, 'stripcodeblocks': False,
'max_rep': 50, 'max_score': 0}
'max_rep': 50, 'max_score': 0},

# A single character is utilized in a high percentage of the post
{'method': character_utilization_ratio, 'all': False, 'sites': ["judaism.stackexchange.com"],
'reason': "single character over used in post",
'title': False, 'body': True, 'username': False, 'stripcodeblocks': False, 'body_summary': True,
'max_rep': 20, 'max_score': 0}

]

@staticmethod
@@ -44,7 +44,7 @@
('Mostly Non-latin', '冰冰冰test冰冰冰冰冰冰冰冰冰冰冰冰 test 冰冰冰冰', '', '', True),
('Pattern Matching product name - 2 words', """<p>vxl male enhancement</p>""", '', '', True),
('Pattern Matching product name - 3 words', """<p>Extends Monster Male Enhancement And Male Penile Enhancement</p>""", '', '', True),

('A Title', """<p>E x t e n d s M o n s t e r Male E n h a n c e m e n t And M a l e P e n i l e E n h a n c e m e n t</p>""", '', 'judaism.stackexchange.com', True),
])
def test_check_if_spam(title, body, username, site, match):
# We can't check blacklists/whitelists in tests, so these are set to their default values

0 comments on commit 3b62ae0

Please sign in to comment.
You can’t perform that action at this time.