Perform checks on username

Charcoal-SE · Oct 28, 2014 · 73cf36b
1 parent ad23327
commit 73cf36b
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 15 deletions.
diff --git a/findspam.py b/findspam.py
@@ -5,29 +5,37 @@
 class FindSpam:
     rules = [
      {'regex': u"(?i)(baba(ji)?|nike|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|me2.do|black magic|bam2u|Neuro3X|Xtreme Antler)|ಌ", 'all': True,
-        'sites': [], 'reason': "Bad keyword detected"},
+        'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
      {'regex': u"(?i)(weight loss|muscles? build(ing)?|muscles? grow(th)?)", 'all': True,
-        'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword detected"},
+        'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
      {'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}", 'all': True,
-        'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers'},
+        'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'username': False},
      {'regex': u"(?i)(nigg?(a|er)|asshole|crap|fag|fuck(ing?)?|shit|whore)s?", 'all': True,
-        'sites': [], 'reason': "Offensive title detected",'insensitive':True},
-     {'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title"},
-     {'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True, 'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title"}
+        'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'username': False},
+     {'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
+     {'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
+        'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'username': False}
     ]
 
     @staticmethod
-    def testpost(title, site):
+    def testpost(title, user_name, site):
         result = [];
         for rule in FindSpam.rules:
             if rule['all'] != (site in rule['sites']):
-                matched = re.compile(rule['regex'], re.UNICODE).findall(title)
-                if matched:
+                matched_title = re.compile(rule['regex'], re.UNICODE).findall(title)
+                matched_username = re.compile(rule['regex'], re.UNICODE).findall(user_name)
+                if matched_title and rule['title']:
                     try:
-                        if getattr(FindSpam, "%s" % rule['validation_method'])(matched):
+                        if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                             result.append(rule['reason'])
                     except KeyError:                # There is no special logic for this rule
-                        result.append(rule['reason'])
+                        result.append(rule['reason'].replace("{}", "title"))
+                if matched_username and rule['username']:
+                    try:
+                        if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
+                            result.append(rule['reason'])
+                    except KeyError:                # There is no special logic for this rule
+                        result.append(rule['reason'].replace("{}", "username"))                 
         return result
 
     @staticmethod

diff --git a/ws.py b/ws.py
@@ -100,6 +100,7 @@ def is_false_positive(post_id, site_name):
 def checkifspam(data):
     d=json.loads(json.loads(data)["data"])
     s= d["titleEncodedFancy"]
+    poster = d["owner"]
     print time.strftime("%Y-%m-%d %H:%M:%S"),parser.unescape(s).encode("ascii",errors="replace")
     quality_score = bayesian_score(s)
     print quality_score
@@ -108,7 +109,7 @@ def checkifspam(data):
     site = d["siteBaseHostAddress"]
     site=site.encode("ascii",errors="replace")
     sys.stdout.flush()
-    test=FindSpam.testpost(s,site) 
+    test=FindSpam.testpost(s,poster,site) 
     if (0<len(test)):
         post_id = d["id"]
         if(has_already_been_posted(site, post_id, s) or is_false_positive(post_id, site)):
@@ -125,7 +126,7 @@ def checkifspam(data):
     return False
 
 def fetch_post_id_and_site_from_msg_content(content):
-    search_regex = r"^\[ \[SmokeDetector\]\(https:\/\/github.com\/Charcoal-SE\/SmokeDetector\) \] [\w ]+: \[.+]\(http:\/\/[\w.]+\/questions\/(\d+)\/.+\) on `([\w.]+)`$"
+    search_regex = r"^\[ \[SmokeDetector\]\(https:\/\/github.com\/Charcoal-SE\/SmokeDetector\) \] [\w ]+: \[.+]\(http:\/\/[\w.]+\/questions\/(\d+)\/.+\) by `.+` on `([\w.]+)`$"
     m = re.compile(search_regex).search(content)
     if m is None:
         return None
@@ -161,9 +162,10 @@ def handlespam(data):
     try:
         d=json.loads(json.loads(data)["data"])
         title = d["titleEncodedFancy"]
-        reason=", ".join(FindSpam.testpost(title,d["siteBaseHostAddress"]))
+        poster = d["owner"]
+        reason=", ".join(FindSpam.testpost(title,poster,d["siteBaseHostAddress"]))
         titleToPost = parser.unescape(re.sub(r"([_*\\`\[\]])", r"\\\1", title)).strip()
-        s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) on `%s`" % (reason,titleToPost,d["url"],d["siteBaseHostAddress"])
+        s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) by `%s` on `%s`" % (reason,titleToPost,d["url"],poster,d["siteBaseHostAddress"])
         print parser.unescape(s).encode('ascii',errors='replace')
         if time.time() >= blockedTime:
             room.send_message(s)