Skip to content

Commit

Permalink
Merge branch 'bodies'
Browse files Browse the repository at this point in the history
  • Loading branch information
Undo1 committed Jan 11, 2015
2 parents 25695ad + 8a67d58 commit 21f2066
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 9 deletions.
61 changes: 61 additions & 0 deletions bodyfetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import json
import requests
from spamhandling import *
from chatcommunicate import *
from datahandling import *
from parsing import get_user_from_url

class BodyFetcher:
    """Collect post ids per site and fetch their bodies in batches for spam checks.

    Posts are queued by site host name.  Sites listed in ``specialCases`` are
    only fetched once their queue has reached the configured size; any other
    site is fetched as soon as no special-case site is ready.
    """

    # Site host -> number of queued posts required before that site is fetched.
    specialCases = {"stackoverflow.com" : 5, "serverfault.com" : 5, "superuser.com" : 5}

    def __init__(self):
        # Site host -> list of queued post ids.  Kept per instance so that
        # separate fetchers never share (and corrupt) each other's queue.
        self.queue = {}

    def addToQueue(self, post):
        """Queue the post described by *post* and trigger a queue check.

        *post* is a JSON string whose ``"data"`` member is itself a
        JSON-encoded string carrying ``"siteBaseHostAddress"`` and ``"id"``.
        """
        d = json.loads(json.loads(post)["data"])
        sitebase = d["siteBaseHostAddress"]
        postid = d["id"]
        self.queue.setdefault(sitebase, []).append(postid)

        print(self.queue)
        self.checkQueue()

    def checkQueue(self):
        """Fetch at most one site: a special-case site that met its quota,
        otherwise the first queued site that has no special case."""
        # Iterate over a snapshot: makeApiCallForSite removes entries from the
        # queue, and addToQueue may be running on other threads.
        queued = list(self.queue.items())

        for site, post_ids in queued:
            quota = self.specialCases.get(site)
            if quota is not None and len(post_ids) >= quota:
                print("site " + site + " met special case quota, fetching...")
                self.makeApiCallForSite(site)
                return

        # if we don't have any sites with their queue filled, take the first one without a special case
        for site, _ in queued:
            if site not in self.specialCases:
                self.makeApiCallForSite(site)
                return

    def makeApiCallForSite(self, site):
        """Fetch every queued post of *site* from the API and report flagged bodies.

        The site's queue entry is removed before the request is made.  Each
        returned post body is run through ``FindSpam.testbody``; posts with at
        least one reason are printed and, unless reporting is currently
        blocked, sent to the chat rooms.
        """
        # Another thread may already have taken this site's queue.
        posts = self.queue.pop(site, None)
        if not posts:
            return

        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!-Kh)95tdb6R0joni_wabz(1g(16eESDja&key=IAkbitmze4B8KpacUfLqkw(("
        response = requests.get(url).json()

        items = response.get("items")
        if items is None:
            # Error responses carry no "items" member; nothing to scan.
            print("no items in API response for " + site + ": " + repr(response))
            return

        for post in items:
            result = FindSpam.testbody(post["body"], site)
            if not result:
                continue
            try:
                reason = ", ".join(result)
                s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) by [%s](%s) on `%s`" % \
                    (reason,post["title"].strip(), post["link"],post["owner"]["display_name"].strip(), post["owner"]["link"], site)
                print(GlobalVars.parser.unescape(s).encode('ascii',errors='replace'))
                if time.time() >= GlobalVars.blockedTime:
                    GlobalVars.charcoal_hq.send_message(s)
                    GlobalVars.tavern_on_the_meta.send_message(s)
                    for specialroom in GlobalVars.specialrooms:
                        sites = specialroom["sites"]
                        if site in sites and reason not in specialroom["unwantedReasons"]:
                            # Posting to special rooms is disabled in this revision.
                            # specialroom["room"].send_message(s)
                            pass
            except Exception as e:
                # Best effort: one unreportable post must not stop the batch,
                # but say what went wrong instead of hiding it.
                print("failed to report post: " + repr(e))
33 changes: 24 additions & 9 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,21 @@

class FindSpam:
rules = [
<<<<<<< HEAD
{'regex': u"(?i)\\b(baba(ji)?|nike|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|LifeForce|swtor2credits|me2.do|black magic|bam2u|Neuro(3X|flexyn)|Nutra|TesteroneXL|Bowtrol|Slim ?Genix|Cleanse EFX|Babyliss ?Pro|Forskolin|Blackline Elite|TestCore Pro|Xtreme Antler|Maxx Test 3000|Cheap Wigs?|(Improve )?Brain Power|aging skin|acne( prone)? skin|(skin )?eye serum|skin (serum|eye)|fake (passports?|driver'?s? licen[cs]e|ID cards?)|bagprada)\\b|ಌ|(support|service|helpline)( phone)? number|1[ -]?866[ -]?978[ -]?6819", 'all': True,
'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
{'regex': u"(?i)\\b(weight (loo?s[es]|reduction)|antiag(e)?ing lotion|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'body': True, 'username': True},
{'regex': u"(?i)\\b(weight (loo?s[es]|reduction)|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"(?i)^(?:(?=.*?\\b(?:online|hd)\\b)(?=.*?(?:free|full|unlimited)).*?movies?\\b|(?=.*?\\b(?:acai|kisn)\\b)(?=.*?care).*products?\\b|(?=.*?packer).*mover)", 'all': True,
'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'username': True},
'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}|\\d[ -]?\\d{3}[ -]?\\d{3}[ -]?\\d{4}", 'all': True,
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'username': False},
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'body': False, 'username': False},
{'regex': u"(?i)\\b(nigg(a|er)|asshole|crap|fag|fuck(ing?)?|shit|whore)s?\\b", 'all': True,
'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'username': False},
'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'body': True, 'username': False},
{'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
{'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'username': False},
{'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'body': False, 'username': False},
{'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'username': False}
'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'body': False, 'username': False}
]

@staticmethod
Expand All @@ -38,7 +39,21 @@ def testpost(title, user_name, site):
if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
result.append(rule['reason'])
except KeyError: # There is no special logic for this rule
result.append(rule['reason'].replace("{}", "username"))
result.append(rule['reason'].replace("{}", "username"))
return result

@staticmethod
def testbody(body,site):
result = [];
for rule in FindSpam.rules:
if rule['all'] != (site in rule['sites']):
matched_body = re.compile(rule['regex'], re.UNICODE).findall(body)
if matched_body and rule['body']:
try:
if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
result.append(rule['reason'].replace("{}", "body"))
except KeyError: # There is no special logic for this rule
result.append(rule['reason'].replace("{}", "body"))
return result

@staticmethod
Expand Down
4 changes: 4 additions & 0 deletions ws.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ChatExchange.chatexchange.client import *
import traceback
from spamhandling import *
from bodyfetcher import *
from chatcommunicate import *

# !! Important! Be careful when adding code before this point.
Expand All @@ -28,6 +29,7 @@
load_files()
filter_auto_ignored_posts()

b=BodyFetcher()
GlobalVars.wrap.login(username, password)
GlobalVars.wrapm.login(username, password)
GlobalVars.s = "[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] SmokeDetector started at [rev " + GlobalVars.commit_with_author + "](https://github.com/Charcoal-SE/SmokeDetector/commit/"+ GlobalVars.commit +") (hosted by Undo)"
Expand Down Expand Up @@ -66,6 +68,8 @@ def restart_automatically(time_in_seconds):
if a is not None and a != "":
if checkifspam(a):
threading.Thread(target=handlespam,args=(a,)).start()
else:
threading.Thread(target=b.addToQueue,args=(a,)).start()
except Exception, e:
now = datetime.utcnow()
delta = now - UtcDate.startup_utc_date
Expand Down

0 comments on commit 21f2066

Please sign in to comment.