Skip to content

Commit

Permalink
Merge pull request #1295 from quartata/factory_optimize --autopull
Browse files Browse the repository at this point in the history
Optimize BlacklistFactory
  • Loading branch information
Undo1 committed Nov 29, 2017
2 parents 6f4b3b0 + 79fc39d commit 9633a5d
Showing 1 changed file with 37 additions and 26 deletions.
63 changes: 37 additions & 26 deletions findspam.py
Expand Up @@ -561,40 +561,51 @@ def mevaqesh_troll(s, *args):
return False, ""


def blacklist_factory(wb=[], nwb=[], **kwargs):
pattern = r'(?is){}'.format("|".join(chain(
(r"(?P<wb{}>\b{}\b)".format(i, keyword) for i, keyword in enumerate(wb)),
(r"(?P<nwb{}>{})".format(i, keyword) for i, keyword in enumerate(nwb))
)))
class BlacklistFactory:
def __init__(self, wb=[], nwb=[], **kwargs):
self.patterns = {}
self.info = regex._Info(0, chr, kwargs)
parts = []

compiled_pattern = regex.compile(pattern, **kwargs)
del pattern
for i, keyword in enumerate(wb):
self.add_pattern(parts, keyword, i, wb=True)

def f(s, *args):
for i, keyword in enumerate(nwb):
self.add_pattern(parts, keyword, i, wb=False)

self.compiled_regex = regex.compile("(?is)" + "|".join(parts), **kwargs)

def add_pattern(self, parts, pattern, line, wb=False):
wb_description = "wb" if wb else "nwb"
self.patterns[self.info.group_count + len(self.patterns)] = (pattern, line, wb_description)

while True:
try:
regex._regex_core._parse_pattern(regex._Source(pattern), self.info)
except regex._regex_core._UnscopedFlagSet:
continue
break

if wb:
parts.append("(\\b{}\\b)".format(pattern))
else:
parts.append("({})".format(pattern))

def __call__(self, s, *args):
why = []

for match in compiled_pattern.finditer(s):
group = match.lastgroup
for match in self.compiled_regex.finditer(s):
pattern, line, wb = self.patterns[match.lastindex - 1]
start, end = match.span()

if group[:3] == "nwb":
i = int(group[3:])
why.append("Position {}-{}: {} (nwb pattern `{}` on L{})"
.format(start, end, match.group(), nwb[i], i + 1))
elif group[:2] == "wb":
i = int(group[2:])
why.append("Position {}-{}: {} (wb pattern `{}` on L{})"
.format(start, end, match.group(), wb[i], i + 1))
else:
why.append("Position {}-{}: {} (can't retrieve matching pattern)".format(start, end, match.group()))
why.append("Position {}-{}: {} ({} pattern `{}` on L{})"
.format(start + 1, end + 1, match.group(), wb, pattern, line + 1))

if len(why) > 0:
return True, ", ".join(why)
else:
return False, ""

return f


load_blacklists()

Expand Down Expand Up @@ -732,11 +743,11 @@ class FindSpam:
#
# Category: Bad keywords
# The big list of bad keywords, for titles and posts
{'method': blacklist_factory(wb=GlobalVars.bad_keywords, nwb=bad_keywords_nwb, city=city_list),
{'method': BlacklistFactory(wb=GlobalVars.bad_keywords, nwb=bad_keywords_nwb, city=city_list),
'all': True, 'sites': [], 'reason': "bad keyword in {}", 'title': True, 'body': True, 'username': True,
'stripcodeblocks': False, 'body_summary': True, 'max_rep': 4, 'max_score': 1},
# The small list of *potentially* bad keywords, for titles and posts
{'method': blacklist_factory(wb=list(GlobalVars.watched_keywords)),
{'method': BlacklistFactory(wb=list(GlobalVars.watched_keywords)),
'reason': 'potentially bad keyword in {}',
'all': True, 'sites': [], 'title': True, 'body': True, 'username': True,
'stripcodeblocks': False, 'body_summary': True, 'max_rep': 30, 'max_score': 1},
Expand Down Expand Up @@ -865,7 +876,7 @@ class FindSpam:
#
# Category: Suspicious links
# Blacklisted sites
{'method': blacklist_factory(nwb=GlobalVars.blacklisted_websites), 'all': True,
{'method': BlacklistFactory(nwb=GlobalVars.blacklisted_websites), 'all': True,
'sites': [], 'reason': "blacklisted website in {}", 'title': True, 'body': True, 'username': False,
'stripcodeblocks': False, 'body_summary': True, 'max_rep': 50, 'max_score': 5},
# Suspicious sites
Expand Down Expand Up @@ -1112,7 +1123,7 @@ class FindSpam:
#
# Category: other
# Blacklisted usernames
{'method': blacklist_factory(nwb=GlobalVars.blacklisted_usernames), 'all': True, 'sites': [],
{'method': BlacklistFactory(nwb=GlobalVars.blacklisted_usernames), 'all': True, 'sites': [],
'reason': "blacklisted username", 'title': False, 'body': False, 'username': True, 'stripcodeblocks': False,
'body_summary': False, 'max_rep': 1, 'max_score': 0},
{'regex': u"(?i)^jeff$", 'all': False, 'sites': ["parenting.stackexchange.com"],
Expand Down

0 comments on commit 9633a5d

Please sign in to comment.