diff --git a/findspam.py b/findspam.py index e56daaa240..a7b4096fb2 100644 --- a/findspam.py +++ b/findspam.py @@ -561,40 +561,51 @@ def mevaqesh_troll(s, *args): return False, "" -def blacklist_factory(wb=[], nwb=[], **kwargs): - pattern = r'(?is){}'.format("|".join(chain( - (r"(?P\b{}\b)".format(i, keyword) for i, keyword in enumerate(wb)), - (r"(?P{})".format(i, keyword) for i, keyword in enumerate(nwb)) - ))) +class BlacklistFactory: + def __init__(self, wb=[], nwb=[], **kwargs): + self.patterns = {} + self.info = regex._Info(0, chr, kwargs) + parts = [] - compiled_pattern = regex.compile(pattern, **kwargs) - del pattern + for i, keyword in enumerate(wb): + self.add_pattern(parts, keyword, i, wb=True) - def f(s, *args): + for i, keyword in enumerate(nwb): + self.add_pattern(parts, keyword, i, wb=False) + + self.compiled_regex = regex.compile("(?is)" + "|".join(parts), **kwargs) + + def add_pattern(self, parts, pattern, line, wb=False): + wb_description = "wb" if wb else "nwb" + self.patterns[self.info.group_count + len(self.patterns)] = (pattern, line, wb_description) + + while True: + try: + regex._regex_core._parse_pattern(regex._Source(pattern), self.info) + except regex._regex_core._UnscopedFlagSet: + continue + break + + if wb: + parts.append("(\\b{}\\b)".format(pattern)) + else: + parts.append("({})".format(pattern)) + + def __call__(self, s, *args): why = [] - for match in compiled_pattern.finditer(s): - group = match.lastgroup + for match in self.compiled_regex.finditer(s): + pattern, line, wb = self.patterns[match.lastindex - 1] start, end = match.span() - if group[:3] == "nwb": - i = int(group[3:]) - why.append("Position {}-{}: {} (nwb pattern `{}` on L{})" - .format(start, end, match.group(), nwb[i], i + 1)) - elif group[:2] == "wb": - i = int(group[2:]) - why.append("Position {}-{}: {} (wb pattern `{}` on L{})" - .format(start, end, match.group(), wb[i], i + 1)) - else: - why.append("Position {}-{}: {} (can't retrieve matching pattern)".format(start, end, match.group())) + why.append("Position {}-{}: {} ({} pattern `{}` on L{})" + .format(start + 1, end + 1, match.group(), wb, pattern, line + 1)) if len(why) > 0: return True, ", ".join(why) else: return False, "" - return f - load_blacklists() @@ -732,11 +743,11 @@ class FindSpam: # # Category: Bad keywords # The big list of bad keywords, for titles and posts - {'method': blacklist_factory(wb=GlobalVars.bad_keywords, nwb=bad_keywords_nwb, city=city_list), + {'method': BlacklistFactory(wb=GlobalVars.bad_keywords, nwb=bad_keywords_nwb, city=city_list), 'all': True, 'sites': [], 'reason': "bad keyword in {}", 'title': True, 'body': True, 'username': True, 'stripcodeblocks': False, 'body_summary': True, 'max_rep': 4, 'max_score': 1}, # The small list of *potentially* bad keywords, for titles and posts - {'method': blacklist_factory(wb=list(GlobalVars.watched_keywords)), + {'method': BlacklistFactory(wb=list(GlobalVars.watched_keywords)), 'reason': 'potentially bad keyword in {}', 'all': True, 'sites': [], 'title': True, 'body': True, 'username': True, 'stripcodeblocks': False, 'body_summary': True, 'max_rep': 30, 'max_score': 1}, @@ -865,7 +876,7 @@ class FindSpam: # # Category: Suspicious links # Blacklisted sites - {'method': blacklist_factory(nwb=GlobalVars.blacklisted_websites), 'all': True, + {'method': BlacklistFactory(nwb=GlobalVars.blacklisted_websites), 'all': True, 'sites': [], 'reason': "blacklisted website in {}", 'title': True, 'body': True, 'username': False, 'stripcodeblocks': False, 'body_summary': True, 'max_rep': 50, 'max_score': 5}, # Suspicious sites @@ -1112,7 +1123,7 @@ class FindSpam: # # Category: other # Blacklisted usernames - {'method': blacklist_factory(nwb=GlobalVars.blacklisted_usernames), 'all': True, 'sites': [], + {'method': BlacklistFactory(nwb=GlobalVars.blacklisted_usernames), 'all': True, 'sites': [], 'reason': "blacklisted username", 'title': False, 'body': False, 'username': True, 'stripcodeblocks': False, 'body_summary': False, 'max_rep': 1, 'max_score': 0}, {'regex': u"(?i)^jeff$", 'all': False, 'sites': ["parenting.stackexchange.com"],