Merge branch 'bodies'

Charcoal-SE · Jan 11, 2015 · 21f2066 · 21f2066
2 parents 25695ad + 8a67d58
commit 21f2066
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 9 deletions.
diff --git a/bodyfetcher.py b/bodyfetcher.py
@@ -0,0 +1,61 @@
+import json
+import requests
+from spamhandling import *
+from chatcommunicate import *
+from datahandling import *
+from parsing import get_user_from_url
+
+class BodyFetcher:
+    """Queue post ids per site and scan the fetched question bodies for spam."""
+    queue = {}  # site host -> list of pending post ids; class-level, so shared by all instances
+
+    # Minimum number of queued ids before one of these sites is fetched.
+    specialCases = {"stackoverflow.com" : 5, "serverfault.com" : 5, "superuser.com" : 5}
+
+    def addToQueue(self, post):
+        """Queue the id from `post` (JSON text whose "data" field is itself JSON), then check the queue."""
+        d=json.loads(json.loads(post)["data"])
+        self.queue.setdefault(d["siteBaseHostAddress"], []).append(d["id"])
+
+        print self.queue
+        self.checkQueue()
+
+    def checkQueue(self):
+        """Fetch at most one site: a special case that met its quota, else the first ordinary site."""
+        # Work on a snapshot of the keys: fetching pops entries and addToQueue may run on other threads.
+        pending = list(self.queue)
+        for site in pending:
+            if site in self.specialCases:
+                if len(self.queue.get(site, ())) >= self.specialCases[site]:
+                    print "site " + site + " met special case quota, fetching..."
+                    self.makeApiCallForSite(site)
+                    return
+
+        # if we don't have any sites with their queue filled, take the first one without a special case
+        for site in pending:
+            if site not in self.specialCases:
+                self.makeApiCallForSite(site)
+                return
+
+    def makeApiCallForSite(self, site):
+        """Pop the ids queued for `site`, fetch those questions and report bodies FindSpam flags."""
+        posts = self.queue.pop(site, None)
+        if not posts:  # nothing queued for this site (e.g. another caller already took it)
+            return
+        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts)  + "?site=" + site + "&filter=!-Kh)95tdb6R0joni_wabz(1g(16eESDja&key=IAkbitmze4B8KpacUfLqkw(("
+        response = requests.get(url).json()
+
+        for post in response.get("items", []):  # an API error response carries no "items"
+            result = FindSpam.testbody(post["body"],site)
+            if result != []:
+                try:
+                    reason = ", ".join(result)
+                    s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) by [%s](%s) on `%s`" % \
+                      (reason,post["title"].strip(), post["link"],post["owner"]["display_name"].strip(), post["owner"]["link"], site)
+                    print GlobalVars.parser.unescape(s).encode('ascii',errors='replace')
+                    if time.time() >= GlobalVars.blockedTime:
+                        GlobalVars.charcoal_hq.send_message(s)
+                        GlobalVars.tavern_on_the_meta.send_message(s)
+                        # NOTE: relaying to GlobalVars.specialrooms is not enabled for body reports.
+                except Exception as e:  # best effort: one bad post must not stop the batch
+                    print "NOP: %r" % (e,)
diff --git a/findspam.py b/findspam.py
@@ -4,20 +4,21 @@
 
 class FindSpam:
     rules = [
+     # Keys read by testbody: 'regex', 'all'/'sites' (site filter), 'body' (apply to post bodies), 'reason', optional 'validation_method'.
      {'regex': u"(?i)\\b(baba(ji)?|nike|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|LifeForce|swtor2credits|me2.do|black magic|bam2u|Neuro(3X|flexyn)|Nutra|TesteroneXL|Bowtrol|Slim ?Genix|Cleanse EFX|Babyliss ?Pro|Forskolin|Blackline Elite|TestCore Pro|Xtreme Antler|Maxx Test 3000|Cheap Wigs?|(Improve )?Brain Power|aging skin|acne( prone)? skin|(skin )?eye serum|skin (serum|eye)|fake (passports?|driver'?s? licen[cs]e|ID cards?)|bagprada)\\b|ಌ|(support|service|helpline)( phone)? number|1[ -]?866[ -]?978[ -]?6819", 'all': True,
-        'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
-     {'regex': u"(?i)\\b(weight (loo?s[es]|reduction)|antiag(e)?ing lotion|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
-        'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
+        'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'body': True, 'username': True},
+     {'regex': u"(?i)\\b(weight (loo?s[es]|reduction)|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
+        'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'body': False, 'username': True},
      {'regex': u"(?i)^(?:(?=.*?\\b(?:online|hd)\\b)(?=.*?(?:free|full|unlimited)).*?movies?\\b|(?=.*?\\b(?:acai|kisn)\\b)(?=.*?care).*products?\\b|(?=.*?packer).*mover)", 'all': True,
-        'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'username': True},
+        'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'body': False, 'username': True},
      {'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}|\\d[ -]?\\d{3}[ -]?\\d{3}[ -]?\\d{4}", 'all': True,
-        'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'username': False},
+        'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'body': False, 'username': False},
      {'regex': u"(?i)\\b(nigg(a|er)|asshole|crap|fag|fuck(ing?)?|shit|whore)s?\\b", 'all': True,
-        'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'username': False},
+        'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'body': True, 'username': False},
      {'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
-     {'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'username': False},
+     {'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'body': False, 'username': False},
      {'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
-        'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'username': False}
+        'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'body': False, 'username': False}
     ]
 
     @staticmethod
@@ -38,7 +39,21 @@ def testpost(title, user_name, site):
                         if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                             result.append(rule['reason'])
                     except KeyError:                # There is no special logic for this rule
-                        result.append(rule['reason'].replace("{}", "username"))                 
+                        result.append(rule['reason'].replace("{}", "username"))
+        return result
+
+    @staticmethod
+    def testbody(body,site):
+        """Return the reasons of the body-enabled rules in FindSpam.rules that match `body` on `site`."""
+        result = []
+        for rule in FindSpam.rules:
+            # A rule with no 'body' key is treated as not applying to bodies (no KeyError).
+            if rule.get('body', False) and rule['all'] != (site in rule['sites']):
+                matched_body = re.compile(rule['regex'], re.UNICODE).findall(body)
+                validator = rule.get('validation_method')  # optional name of a FindSpam method
+                # Without a validator any match counts; with one, it must accept the matches.
+                if matched_body and (validator is None or getattr(FindSpam, validator)(matched_body)):
+                    result.append(rule['reason'].replace("{}", "body"))
         return result
 
     @staticmethod

diff --git a/ws.py b/ws.py
@@ -5,6 +5,7 @@
 from ChatExchange.chatexchange.client import *
 import traceback
 from spamhandling import *
+from bodyfetcher import *
 from chatcommunicate import *
 
 # !! Important! Be careful when adding code before this point.
@@ -28,6 +29,7 @@
 load_files()
 filter_auto_ignored_posts()
 
+b=BodyFetcher()  # shared instance; posts that checkifspam does not flag are queued on it for a body check
 GlobalVars.wrap.login(username, password)
 GlobalVars.wrapm.login(username, password)
 GlobalVars.s = "[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] SmokeDetector started at [rev " + GlobalVars.commit_with_author + "](https://github.com/Charcoal-SE/SmokeDetector/commit/"+ GlobalVars.commit +") (hosted by Undo)"
@@ -66,6 +68,8 @@ def restart_automatically(time_in_seconds):
         if a is not None and a != "":
             if checkifspam(a):
                 threading.Thread(target=handlespam,args=(a,)).start()
+            else:
+                threading.Thread(target=b.addToQueue,args=(a,)).start()
     except Exception, e:
         now = datetime.utcnow()
         delta = now - UtcDate.startup_utc_date