From 9cf85d2d663b9341c1b8cfb5b085830186a91b2e Mon Sep 17 00:00:00 2001 From: ArtOfCode Date: Sat, 25 Mar 2017 20:24:56 +0000 Subject: [PATCH 01/14] +Eric and Henders to CHQ --- globalvars.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/globalvars.py b/globalvars.py index a1c9e2fa6b..175e6f0690 100644 --- a/globalvars.py +++ b/globalvars.py @@ -122,7 +122,9 @@ class GlobalVars: "248139", # FelixSFD "156721", # D-side "167070", # quartata - "172450" # Hovercraft Full Of Eels + "172450", # Hovercraft Full Of Eels + "56200", # Eric Leschinski + "211021" # Henders ], meta_tavern_room_id: [ "315433", # Normal Human From f0d03e7b94e60a4758f3d49ecfe23412ffdd00d8 Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 16:43:24 -0400 Subject: [PATCH 02/14] DEBUG: Remove 'pass' calls for answer and post handling --- bodyfetcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index 0aa8c4956a..690bc9b9c0 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -333,10 +333,10 @@ def make_api_call_for_site(self, site): handle_spam(answer_, reasons=reason, why=why) - except: - pass - except: - pass + except Exception as e: + raise e + except Exception as e: + raise e end_time = time.time() GlobalVars.posts_scan_stats_lock.acquire() From 87a2afad25304589069222d03b37082ae40c8eb9 Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 16:49:39 -0400 Subject: [PATCH 03/14] Better handle the for loop to prevent key errors --- bodyfetcher.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index 690bc9b9c0..bb3476424e 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -320,21 +320,24 @@ def make_api_call_for_site(self, site): pass try: - for answer in post["answers"]: - num_scanned += 1 - answer["IsAnswer"] = True # Necesssary for Post object - answer["title"] = "" # Necessary for proper Post object creation - answer["site"] = site # Necessary for proper Post object creation - answer_ = Post(api_response=answer, parent=post_) - - is_spam, reason, why = check_if_spam(answer_) - if is_spam: - try: - handle_spam(answer_, - reasons=reason, - why=why) - except Exception as e: - raise e + if "answers" not in post: + pass + else: + for answer in post["answers"]: + num_scanned += 1 + answer["IsAnswer"] = True # Necesssary for Post object + answer["title"] = "" # Necessary for proper Post object creation + answer["site"] = site # Necessary for proper Post object creation + answer_ = Post(api_response=answer, parent=post_) + + is_spam, reason, why = check_if_spam(answer_) + if is_spam: + try: + handle_spam(answer_, + reasons=reason, + why=why) + except Exception as e: + raise e except Exception as e: raise e From 619decf12d278445688ce56c42be1952896297b2 Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 17:12:57 -0400 Subject: [PATCH 04/14] Fix some issues in Post class and spamhandling calls to Post properties. --autopull --- classes/Post.py | 2 +- spamhandling.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/classes/Post.py b/classes/Post.py index 1c62d1589b..148b79493f 100644 --- a/classes/Post.py +++ b/classes/Post.py @@ -194,7 +194,7 @@ def post_site(self): @property def post_url(self): - return unicode(self._post_url) + return unicode(self._post_site) @property def title(self): diff --git a/spamhandling.py b/spamhandling.py index c58fe3f102..f40cbaba3d 100644 --- a/spamhandling.py +++ b/spamhandling.py @@ -86,8 +86,8 @@ def handle_spam(post, reasons, why): datahandling.add_auto_ignored_post((post.post_id, post.post_site, datetime.now())) if why is not None and why != "": datahandling.add_why(post.post_site, post.post_id, why) - if post.is_answer and post.question_id is not None: - datahandling.add_post_site_id_link((post.post_id, post.post_site, "answer"), post.question_id) + if post.is_answer and post.post_id is not None and post.post_id is not "": + datahandling.add_post_site_id_link((post.post_id, post.post_site, "answer"), post.post_id) try: post.title = parsing.escape_special_chars_in_title(post.title) sanitized_title = regex.sub('(https?://|\n)', '', post.title) From 0338e732a44cf027cd4e3482e01b50156a1649df Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 17:28:29 -0400 Subject: [PATCH 05/14] Answer titles need to be equal to parent title for chat message --- bodyfetcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index bb3476424e..50f5fb6cd6 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -326,7 +326,7 @@ def make_api_call_for_site(self, site): for answer in post["answers"]: num_scanned += 1 answer["IsAnswer"] = True # Necesssary for Post object - answer["title"] = "" # Necessary for proper Post object creation + answer["title"] = post_.title # Necessary for proper Post object creation answer["site"] = site # Necessary for proper Post object creation answer_ = Post(api_response=answer, parent=post_) From 52bae8b0b4735b788cb437b9cdec342293ce955b Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 21:02:26 -0400 Subject: [PATCH 06/14] Post URL property went away in Post object --- classes/Post.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/classes/Post.py b/classes/Post.py index 148b79493f..d365407a12 100644 --- a/classes/Post.py +++ b/classes/Post.py @@ -12,6 +12,7 @@ class Post: _post_id = "" _post_score = 0 _post_site = "" + _post_url = "" _title = "" _user_name = "" _user_url = "" @@ -194,7 +195,10 @@ def post_site(self): @property def post_url(self): - return unicode(self._post_site) + try: + return unicode(self._post_url) + except: + return "NoLink" @property def title(self): From b39e9c5feb9705b44cb3a4f9428a59c1b366d401 Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 22:47:03 -0400 Subject: [PATCH 07/14] We should probably *not* be shadowing preexisting commands... Also a bunch of IDE suppressions for other issues. --- chatcommands.py | 4 ++-- classes/Post.py | 1 + findspam.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/chatcommands.py b/chatcommands.py index bc2ebbb4f7..b9fbe6a964 100644 --- a/chatcommands.py +++ b/chatcommands.py @@ -30,14 +30,14 @@ # noinspection PyMissingTypeHints -def check_permissions(function): +def check_permissions(function_): # noinspection PyMissingTypeHints def run_command(ev_room, ev_user_id, wrap2, *args, **kwargs): if datahandling.is_privileged(ev_room, ev_user_id, wrap2): kwargs['ev_room'] = ev_room kwargs['ev_user_id'] = ev_user_id kwargs['wrap2'] = wrap2 - return function(*args, **kwargs) + return function_(*args, **kwargs) else: return Response(command_status=False, message="You are not a privileged user. Please see [the privileges wiki page](" + diff --git a/classes/Post.py b/classes/Post.py index d365407a12..03a02768f1 100644 --- a/classes/Post.py +++ b/classes/Post.py @@ -193,6 +193,7 @@ def post_score(self): def post_site(self): return unicode(self._post_site) + # noinspection PyBroadException @property def post_url(self): try: diff --git a/findspam.py b/findspam.py index f0f11c79a7..1f9ef45db8 100644 --- a/findspam.py +++ b/findspam.py @@ -30,7 +30,7 @@ r"""*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?""", regex.UNICODE) -# noinspection PyUnusedLocal,PyMissingTypeHints +# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker def has_repeated_words(s, site, *args): words = regex.split(r"[\s.,;!/\()\[\]+_-]", s) words = [word for word in words if word != ""] @@ -83,7 +83,7 @@ def link_at_end(s, site, *args): # link at end of question, on selected sites return False, "" -# noinspection PyUnusedLocal,PyMissingTypeHints +# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker def non_english_link(s, site, *args): # non-english link in short answer if len(s) < 600: links = regex.compile(ur'nofollow(?: noreferrer)?">([^<]*)(?=)', regex.UNICODE).findall(s) @@ -96,7 +96,7 @@ def non_english_link(s, site, *args): # non-english link in short answer return False, "" -# noinspection PyUnusedLocal,PyMissingTypeHints +# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker def mostly_non_latin(s, site, *args): # majority of post is in non-Latin, non-Cyrillic characters word_chars = regex.sub(r'(?u)[\W0-9]|http\S*', "", s) non_latin_chars = regex.sub(r"(?u)\p{script=Latin}|\p{script=Cyrillic}", "", word_chars) @@ -301,7 +301,7 @@ def has_eltima(s, site, *args): return False, "" -# noinspection PyUnusedLocal,PyMissingTypeHints +# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker def username_similar_website(s, site, *args): username = args[0] sim_result = perform_similarity_checks(s, username) From 76784a42f88944bfc8d6ce12cf3e6a614fe64a79 Mon Sep 17 00:00:00 2001 From: Thomas Ward Date: Sat, 25 Mar 2017 23:23:14 -0400 Subject: [PATCH 08/14] Revert debug 'raise' commands, put the 'pass' back into except blocks. --- bodyfetcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index 50f5fb6cd6..725610926b 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -336,10 +336,10 @@ def make_api_call_for_site(self, site): handle_spam(answer_, reasons=reason, why=why) - except Exception as e: - raise e - except Exception as e: - raise e + except: + pass + except: + pass end_time = time.time() GlobalVars.posts_scan_stats_lock.acquire() From fca25a39324dfdba832216cc29d607a19a6fc2d5 Mon Sep 17 00:00:00 2001 From: SmokeDetector Date: Sun, 26 Mar 2017 06:07:21 +0000 Subject: [PATCH 09/14] Auto blacklist of fiverr\.com by Ashish Ahuja --autopull --- blacklisted_websites.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/blacklisted_websites.txt b/blacklisted_websites.txt index fcc4dd1339..22f6c1a5fe 100644 --- a/blacklisted_websites.txt +++ b/blacklisted_websites.txt @@ -916,3 +916,4 @@ www\.pokemonomegarubyandalphasapphiredownload\.com alphagenixsweden\.com click2trial\.com examfreeresult\.in +fiverr\.com From f4752b67bfb0d4f2be0834b3f77bfc257eda484d Mon Sep 17 00:00:00 2001 From: angussidney Date: Sun, 26 Mar 2017 17:40:14 +1100 Subject: [PATCH 10/14] +hypertone to pattern product name --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 1f9ef45db8..7b9abf4837 100644 --- a/findspam.py +++ b/findspam.py @@ -190,7 +190,7 @@ def pattern_product_name(s, site, *args): "Junivive", "Apexatropin", "Gain", "Allure", "Nuvella", "Trimgenix", "Satin", "Prodroxatone", "Elite", "Force", "Exceptional", "Enhance(ment)?", "Nitro", "Max", "Boost", "E?xtreme", "Grow", "Deep", "Male", "Pro", "Advanced", "Monster", "Divine", "Royale", "Angele", "Trinity", "Andro", - "Pure", "Skin", "Sea", "Muscle", "Ascend", "Youth", + "Pure", "Skin", "Sea", "Muscle", "Ascend", "Youth", "Hyper(tone)?", "Serum", "Supplement", "Fuel", "Cream"] if site != "math.stackexchange.com" and site != "mathoverflow.net": keywords += ["E?X[tl\d]?", "Alpha", "Prime", "Formula"] From 6f0a2c8f5836f411bc8ed9e0ea129a8831f150fc Mon Sep 17 00:00:00 2001 From: ArtOfCode Date: Sun, 26 Mar 2017 13:05:20 +0100 Subject: [PATCH 11/14] Replace pass calls with error logs --- bodyfetcher.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index 725610926b..b849ea3428 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -316,8 +316,8 @@ def make_api_call_for_site(self, site): handle_spam(post=post_, reasons=reason, why=why) - except: - pass + except e: + log('error', "Exception in handle_spam:", e) try: if "answers" not in post: @@ -336,10 +336,10 @@ def make_api_call_for_site(self, site): handle_spam(answer_, reasons=reason, why=why) - except: - pass - except: - pass + except e: + log('error', "Exception in handle_spam:", e) + except e: + log('error', "Exception handling answers:", e) end_time = time.time() GlobalVars.posts_scan_stats_lock.acquire() From 6fb042fae48cf3ebd97ef8f63b0682133cfcf75f Mon Sep 17 00:00:00 2001 From: ArtOfCode Date: Sun, 26 Mar 2017 13:09:00 +0100 Subject: [PATCH 12/14] Apparently I forgot how to Python --- bodyfetcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bodyfetcher.py b/bodyfetcher.py index b849ea3428..0360a81392 100644 --- a/bodyfetcher.py +++ b/bodyfetcher.py @@ -316,7 +316,7 @@ def make_api_call_for_site(self, site): handle_spam(post=post_, reasons=reason, why=why) - except e: + except Exception as e: log('error', "Exception in handle_spam:", e) try: @@ -336,9 +336,9 @@ def make_api_call_for_site(self, site): handle_spam(answer_, reasons=reason, why=why) - except e: + except Exception as e: log('error', "Exception in handle_spam:", e) - except e: + except Exception as e: log('error', "Exception handling answers:", e) end_time = time.time() From 3721aded9cf1c5f6e22710cd5b3ddd5b3828bce4 Mon Sep 17 00:00:00 2001 From: quartata Date: Sun, 26 Mar 2017 10:15:08 -0700 Subject: [PATCH 13/14] Lowered threshold for similar_answer --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 7b9abf4837..1b07a4099a 100644 --- a/findspam.py +++ b/findspam.py @@ -397,7 +397,7 @@ def similar_answer(post): sanitized_answer = strip_urls_and_tags(other_answer.body) ratio = similar_ratio(sanitized_body, sanitized_answer) - if ratio >= SIMILAR_THRESHOLD: + if ratio >= 0.7: return False, False, True, \ u"Answer similar to answer {}, ratio {}".format(other_answer.post_id, ratio) From a58e38e5497dd43c1fe9ba37bf96b0bb7f6855e1 Mon Sep 17 00:00:00 2001 From: quartata Date: Sun, 26 Mar 2017 12:19:17 -0700 Subject: [PATCH 14/14] Update findspam.py --- findspam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 1b07a4099a..04821ad227 100644 --- a/findspam.py +++ b/findspam.py @@ -10,6 +10,7 @@ from helpers import all_matches_unique, log SIMILAR_THRESHOLD = 0.95 +SIMILAR_ANSWER_THRESHOLD = 0.7 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) COMMON_MALFORMED_PROTOCOLS = [ @@ -397,7 +398,7 @@ def similar_answer(post): sanitized_answer = strip_urls_and_tags(other_answer.body) ratio = similar_ratio(sanitized_body, sanitized_answer) - if ratio >= 0.7: + if ratio >= SIMILAR_ANSWER_THRESHOLD: return False, False, True, \ u"Answer similar to answer {}, ratio {}".format(other_answer.post_id, ratio)