Skip to content

Commit

Permalink
Perform checks on username
Browse files Browse the repository at this point in the history
  • Loading branch information
thomas-daniels committed Oct 28, 2014
1 parent ad23327 commit 73cf36b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 15 deletions.
30 changes: 19 additions & 11 deletions findspam.py
Expand Up @@ -5,29 +5,37 @@
class FindSpam:
rules = [
{'regex': u"(?i)(baba(ji)?|nike|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|me2.do|black magic|bam2u|Neuro3X|Xtreme Antler)|ಌ", 'all': True,
'sites': [], 'reason': "Bad keyword detected"},
'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
{'regex': u"(?i)(weight loss|muscles? build(ing)?|muscles? grow(th)?)", 'all': True,
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword detected"},
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'username': True},
{'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}", 'all': True,
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers'},
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'checkphonenumbers', 'title': True, 'username': False},
{'regex': u"(?i)(nigg?(a|er)|asshole|crap|fag|fuck(ing?)?|shit|whore)s?", 'all': True,
'sites': [], 'reason': "Offensive title detected",'insensitive':True},
{'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title"},
{'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True, 'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title"}
'sites': [], 'reason': "Offensive {} detected",'insensitive':True, 'title': True, 'username': False},
{'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
{'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'username': False}
]

@staticmethod
def testpost(title, user_name, site):
    """Return the list of reasons for which a post looks like spam.

    Every rule in ``FindSpam.rules`` that applies to ``site`` is run
    against the post title and/or the poster's user name, depending on
    the rule's ``'title'`` and ``'username'`` flags.

    Args:
        title: the post title to scan.
        user_name: the poster's display name to scan; ``None`` is
            treated as "nothing to scan".
        site: host name of the site the post is on; compared against
            each rule's ``'sites'`` list.

    Returns:
        A list of reason strings, one per (rule, field) hit.  A ``{}``
        placeholder in a rule's reason is replaced by ``"title"`` or
        ``"username"``.  The list is empty when nothing matched.
    """
    result = []
    for rule in FindSpam.rules:
        # 'all': True  -> rule applies everywhere except the listed sites.
        # 'all': False -> rule applies only on the listed sites.
        if rule['all'] == (site in rule['sites']):
            continue

        # Compile once per rule instead of once per scanned field.
        pattern = re.compile(rule['regex'], re.UNICODE)

        # Look the validator up explicitly rather than catching KeyError
        # around the call: a KeyError raised *inside* a validator must not
        # be mistaken for "this rule has no validator".
        validator_name = rule.get('validation_method')

        # Rules without the flags behave as before usernames were
        # supported: title is checked, username is not.
        fields = (
            ("title", title, rule.get('title', True)),
            ("username", user_name, rule.get('username', False)),
        )
        for label, text, enabled in fields:
            if not enabled or text is None:
                continue
            matched = pattern.findall(text)
            if not matched:
                continue
            if validator_name is not None:
                # Optional rule-specific check (named by the rule) that can
                # veto a regex hit.
                if not getattr(FindSpam, validator_name)(matched):
                    continue
            # Substitute the placeholder on every path so a validated rule
            # never leaks a raw "{}" into the report.
            result.append(rule['reason'].replace("{}", label))
    return result

@staticmethod
Expand Down
10 changes: 6 additions & 4 deletions ws.py
Expand Up @@ -100,6 +100,7 @@ def is_false_positive(post_id, site_name):
def checkifspam(data):
d=json.loads(json.loads(data)["data"])
s= d["titleEncodedFancy"]
poster = d["owner"]
print time.strftime("%Y-%m-%d %H:%M:%S"),parser.unescape(s).encode("ascii",errors="replace")
quality_score = bayesian_score(s)
print quality_score
Expand All @@ -108,7 +109,7 @@ def checkifspam(data):
site = d["siteBaseHostAddress"]
site=site.encode("ascii",errors="replace")
sys.stdout.flush()
test=FindSpam.testpost(s,site)
test=FindSpam.testpost(s,poster,site)
if (0<len(test)):
post_id = d["id"]
if(has_already_been_posted(site, post_id, s) or is_false_positive(post_id, site)):
Expand All @@ -125,7 +126,7 @@ def checkifspam(data):
return False

def fetch_post_id_and_site_from_msg_content(content):
search_regex = r"^\[ \[SmokeDetector\]\(https:\/\/github.com\/Charcoal-SE\/SmokeDetector\) \] [\w ]+: \[.+]\(http:\/\/[\w.]+\/questions\/(\d+)\/.+\) on `([\w.]+)`$"
search_regex = r"^\[ \[SmokeDetector\]\(https:\/\/github.com\/Charcoal-SE\/SmokeDetector\) \] [\w ]+: \[.+]\(http:\/\/[\w.]+\/questions\/(\d+)\/.+\) by `.+` on `([\w.]+)`$"
m = re.compile(search_regex).search(content)
if m is None:
return None
Expand Down Expand Up @@ -161,9 +162,10 @@ def handlespam(data):
try:
d=json.loads(json.loads(data)["data"])
title = d["titleEncodedFancy"]
reason=", ".join(FindSpam.testpost(title,d["siteBaseHostAddress"]))
poster = d["owner"]
reason=", ".join(FindSpam.testpost(title,poster,d["siteBaseHostAddress"]))
titleToPost = parser.unescape(re.sub(r"([_*\\`\[\]])", r"\\\1", title)).strip()
s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) on `%s`" % (reason,titleToPost,d["url"],d["siteBaseHostAddress"])
s="[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) by `%s` on `%s`" % (reason,titleToPost,d["url"],poster,d["siteBaseHostAddress"])
print parser.unescape(s).encode('ascii',errors='replace')
if time.time() >= blockedTime:
room.send_message(s)
Expand Down

0 comments on commit 73cf36b

Please sign in to comment.