Skip to content

Commit 2bb583e

Browse files
committed
This implements detection of usernames similar to a linked website; adjusts method calls to accept *args
Refs #450, #538
1 parent 4453953 commit 2bb583e

File tree

3 files changed

+143
-20
lines changed

3 files changed

+143
-20
lines changed

findspam.py

Lines changed: 135 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,36 @@
11
# -*- coding: utf-8 -*-
import regex
import phonenumbers
from difflib import SequenceMatcher
import tld
from tld.utils import update_tld_names
from urlparse import urlparse
# Refresh the locally cached list of known TLDs so tld.get_tld() recognises
# recently added domains.
update_tld_names()


# Minimum SequenceMatcher ratio at which a username counts as "similar" to a
# linked domain.
SIMILAR_THRESHOLD = 0.95
# Matches the message of tld's TldDomainNotFound exception; group 1 captures
# the offending portion of the URL.
EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
RE_COMPILE = regex.compile(EXCEPTION_RE)
# (malformed, corrected) protocol pairs that spammers commonly use,
# presumably to evade naive URL detection.
COMMON_MALFORMED_PROTOCOLS = [
    ('httl://', 'http://'),
]

# Flee before the ugly URL validator regex!
# We are using this, instead of a nice library like BeautifulSoup, because spammers are
# stupid and don't always know how to actually *link* their web site. BeautifulSoup misses
# those plain text URLs.
# https://gist.github.com/dperini/729294#gistcomment-1296121
URL_REGEX = regex.compile(
    r"""((?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)"""
    r"""(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2}))"""
    r"""(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"""
    r"""(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"""
    r"""|(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-?)"""
    r"""*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?""", regex.UNICODE)
430

531

632
# noinspection PyUnusedLocal
7-
def has_repeated_words(s, site):
33+
def has_repeated_words(s, site, *args):
834
words = regex.split(r"[\s.,;!/\()\[\]+_-]", s)
935
words = [word for word in words if word != ""]
1036
streak = 0
@@ -21,7 +47,7 @@ def has_repeated_words(s, site):
2147

2248

2349
# noinspection PyUnusedLocal
24-
def has_few_characters(s, site):
50+
def has_few_characters(s, site, *args):
2551
s = regex.sub("</?p>", "", s).rstrip() # remove HTML paragraph tags from posts
2652
uniques = len(set(list(s)))
2753
if (len(s) >= 30 and uniques <= 6) or (len(s) >= 100 and uniques <= 15): # reduce if false reports appear
@@ -33,7 +59,7 @@ def has_few_characters(s, site):
3359

3460

3561
# noinspection PyUnusedLocal
36-
def has_repeating_characters(s, site):
62+
def has_repeating_characters(s, site, *args):
3763
s = regex.sub('http[^"]*', "", s) # remove URLs for this check
3864
if s is None or len(s) == 0 or len(s) >= 300 or regex.compile("<pre>|<code>").search(s):
3965
return False, ""
@@ -45,7 +71,7 @@ def has_repeating_characters(s, site):
4571

4672

4773
# noinspection PyUnusedLocal
48-
def link_at_end(s, site): # link at end of question, on selected sites
74+
def link_at_end(s, site, *args): # link at end of question, on selected sites
4975
s = regex.sub("</strong>|</em>|</p>", "", s)
5076
match = regex.compile(ur"(?i)https?://(?:[.A-Za-z0-9-]*/?[.A-Za-z0-9-]*/?|plus\.google\.com/"
5177
ur"[\w/]*|www\.pinterest\.com/pin/[\d/]*)</a>\s*$").search(s)
@@ -57,7 +83,7 @@ def link_at_end(s, site): # link at end of question, on selected sites
5783

5884

5985
# noinspection PyUnusedLocal
60-
def non_english_link(s, site): # non-english link in short answer
86+
def non_english_link(s, site, *args): # non-english link in short answer
6187
if len(s) < 600:
6288
links = regex.compile(ur'nofollow(?: noreferrer)?">([^<]*)(?=</a>)', regex.UNICODE).findall(s)
6389
for link_text in links:
@@ -69,7 +95,7 @@ def non_english_link(s, site): # non-english link in short answer
6995
return False, ""
7096

7197

72-
def mostly_non_latin(s, site): # majority of post is in non-Latin, non-Cyrillic characters
98+
def mostly_non_latin(s, site, *args): # majority of post is in non-Latin, non-Cyrillic characters
7399
if regex.compile("<pre>|<code>").search(s) and site == "stackoverflow.com": # Avoid false positives on SO
74100
return False, ""
75101
word_chars = regex.sub(r'(?u)[\W0-9]|http\S*', "", s)
@@ -80,7 +106,7 @@ def mostly_non_latin(s, site): # majority of post is in non-Latin, non-Cyrilli
80106

81107

82108
# noinspection PyUnusedLocal
83-
def has_phone_number(s, site):
109+
def has_phone_number(s, site, *args):
84110
if regex.compile(ur"(?i)\b(address(es)?|run[- ]?time|error|value|server|hostname|timestamp|warning|code|"
85111
ur"(sp)?exception|version|chrome|1234567)\b", regex.UNICODE).search(s):
86112
return False, "" # not a phone number
@@ -106,7 +132,7 @@ def has_phone_number(s, site):
106132
return False, ""
107133

108134

109-
def has_customer_service(s, site): # flexible detection of customer service in titles
135+
def has_customer_service(s, site, *args): # flexible detection of customer service in titles
110136
s = s[0:300].lower() # if applied to body, the beginning should be enough: otherwise many false positives
111137
s = regex.sub(r"[^A-Za-z0-9\s]", "", s) # deobfuscate
112138
phrase = regex.compile(r"(tech(nical)? support)|((support|service|contact|help(line)?) (telephone|phone|"
@@ -126,7 +152,7 @@ def has_customer_service(s, site): # flexible detection of customer service in
126152

127153

128154
# noinspection PyUnusedLocal
129-
def has_health(s, site): # flexible detection of health spam in titles
155+
def has_health(s, site, *args): # flexible detection of health spam in titles
130156
s = s[0:200] # if applied to body, the beginning should be enough: otherwise many false positives
131157
capitalized = len(regex.compile(r"\b[A-Z][a-z]").findall(s)) >= 5 # words beginning with uppercase letter
132158
organ = regex.compile(r"(?i)\b(colon|skin|muscle|bicep|fac(e|ial)|eye|brain|IQ|mind|head|hair|peni(s|le)|"
@@ -157,7 +183,7 @@ def has_health(s, site): # flexible detection of health spam in titles
157183
return False, ""
158184

159185

160-
def keyword_email(s, site): # a keyword and an email in the same post
186+
def keyword_email(s, site, *args): # a keyword and an email in the same post
161187
if regex.compile("<pre>|<code>").search(s) and site == "stackoverflow.com": # Avoid false positives on SO
162188
return False, ""
163189
keyword = regex.compile(ur"(?i)\b(training|we (will )?(offer|develop|provide)|sell|invest(or|ing|ment)|credit|"
@@ -178,7 +204,7 @@ def keyword_email(s, site): # a keyword and an email in the same post
178204

179205

180206
# noinspection PyUnusedLocal
181-
def keyword_link(s, site): # thanking keyword and a link in the same short answer
207+
def keyword_link(s, site, *args): # thanking keyword and a link in the same short answer
182208
if len(s) > 400:
183209
return False, ""
184210
link = regex.compile(ur'(?i)<a href="https?://\S+').search(s)
@@ -200,7 +226,7 @@ def keyword_link(s, site): # thanking keyword and a link in the same short ans
200226

201227

202228
# noinspection PyUnusedLocal
203-
def bad_link_text(s, site): # suspicious text of a hyperlink
229+
def bad_link_text(s, site, *args): # suspicious text of a hyperlink
204230
s = regex.sub("</?strong>|</?em>", "", s) # remove font tags
205231
keywords = regex.compile(ur"(?isu)^(buy|cheap) |live[ -]?stream|(^| )make (money|\$)|(^| )(porno?|(whole)?sale|"
206232
ur"coins|replica|luxury|essays?|in \L<city>)($| )|(^| )\L<city>.*(service|escort|"
@@ -224,7 +250,7 @@ def bad_link_text(s, site): # suspicious text of a hyperlink
224250

225251

226252
# noinspection PyUnusedLocal
227-
def is_offensive_post(s, site):
253+
def is_offensive_post(s, site, *args):
228254
if s is None or len(s) == 0:
229255
return False, ""
230256

@@ -246,13 +272,99 @@ def is_offensive_post(s, site):
246272

247273

248274
# noinspection PyUnusedLocal
249-
def has_eltima(s, site):
275+
def has_eltima(s, site, *args):
250276
reg = regex.compile(ur"(?is)\beltima")
251277
if reg.search(s) and len(s) <= 750:
252278
return True, u"Bad keyword *eltima* and body length under 750 chars"
253279
return False, ""
254280

255281

282+
# noinspection PyUnusedLocal
def username_similar_website(s, site, *args):
    """Flag posts whose author's username closely matches a linked website.

    :param s: Post body (HTML) to scan for links
    :param site: Site the post was made on (unused; kept for the common
                 rule-method signature)
    :param args: Extra positional data passed by the caller; args[0] is the
                 poster's username
    :return: (True, reason) when the best link/username similarity ratio is
             at least SIMILAR_THRESHOLD, otherwise (False, "")
    """
    username = args[0]
    sim_result = perform_similarity_checks(s, username)
    if sim_result >= SIMILAR_THRESHOLD:
        return True, u"Username similar to website"
    return False, ""
292+
293+
294+
def perform_similarity_checks(post, name):
    """
    Perform 4 similarity tests between links in the post and the user name.

    For each link's domain, the name is compared: as-is, with spaces stripped
    from the name, with hyphens stripped from both, and with both hyphens and
    spaces stripped from both.

    :param post: Text of the post
    :param name: Username to compare against
    :return: Float ratio of the highest similarity found (0 if no links)
    """
    # Fix stupid spammer tricks (e.g. "httl://" in place of "http://")
    for malformed, corrected in COMMON_MALFORMED_PROTOCOLS:
        post = post.replace(malformed, corrected)

    # Find links in the post
    found_links = regex.findall(URL_REGEX, post)

    links = []
    for found in found_links:
        # Drop a single trailing punctuation character the regex may pick up
        if found[-1].isalnum():
            links.append(found)
        else:
            links.append(found[:-1])

    # Track the best ratio seen over ALL links; the original code overwrote
    # t1..t4 each iteration, so a multi-link post below the threshold only
    # reported the last link's similarity.
    best = 0
    for link in set(links):
        domain = get_domain(link)
        # Straight comparison
        t1 = similar_ratio(domain, name)
        # Strip all spaces from the name
        t2 = similar_ratio(domain, name.replace(" ", ""))
        # Strip all hyphens
        t3 = similar_ratio(domain.replace("-", ""), name.replace("-", ""))
        # Strip both hyphens and spaces
        t4 = similar_ratio(domain.replace("-", "").replace(" ", ""),
                           name.replace("-", "").replace(" ", ""))
        best = max(best, t1, t2, t3, t4)
        # Already exceeded the threshold? End now; no need to check more links.
        if best >= SIMILAR_THRESHOLD:
            break
    return best
339+
340+
341+
def similar_ratio(a, b):
    """Return the case-insensitive SequenceMatcher similarity of *a* and *b*."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
343+
344+
345+
def get_domain(s):
    """Extract the bare domain name (no TLD, no subdomain) from a URL-ish string.

    Falls back through two levels of error handling because spammer links are
    frequently malformed or use TLDs that the ``tld`` library doesn't know.
    """
    try:
        extract = tld.get_tld(s, fix_protocol=True, as_object=True, )
        domain = extract.domain
    except tld.exceptions.TldDomainNotFound as e:
        # Pull the offending portion out of the exception message via
        # EXCEPTION_RE ("Domain (...) didn't ...").
        # NOTE(review): e.message is Python-2-only, and .match() returning
        # None here would raise AttributeError — assumes tld's message always
        # matches EXCEPTION_RE; confirm against the installed tld version.
        invalid_tld = RE_COMPILE.match(e.message).group(1)
        # Attempt to replace the invalid protocol
        s1 = s.replace(invalid_tld, 'http', 1)
        try:
            extract = tld.get_tld(s1, fix_protocol=True, as_object=True, )
            domain = extract.domain
        except tld.exceptions.TldDomainNotFound as e:
            # Assume bad TLD and try one last fall back, just strip the
            # trailing TLD and leading subdomain.
            parsed_uri = urlparse(s)
            if len(parsed_uri.path.split(".")) >= 3:
                # e.g. "www.example.bogus" -> "example"
                domain = parsed_uri.path.split(".")[1]
            else:
                domain = parsed_uri.path.split(".")[0]
    return domain
366+
367+
256368
# noinspection PyClassHasNoInit
257369
class FindSpam:
258370
with open("bad_keywords.txt", "r") as f:
@@ -723,7 +835,12 @@ class FindSpam:
723835
'body_summary': False, 'max_rep': 1, 'max_score': 0},
724836
{'regex': u"(?i)^jeff$", 'all': False, 'sites': ["parenting.stackexchange.com"],
725837
'reason': "blacklisted username", 'title': False, 'body': False, 'username': True,
726-
'stripcodeblocks': False, 'body_summary': False, 'max_rep': 1, 'max_score': 0}
838+
'stripcodeblocks': False, 'body_summary': False, 'max_rep': 1, 'max_score': 0},
839+
840+
# User name similar to link
841+
{'method': username_similar_website, 'all': True, 'sites': [], 'reason': "username similar to website {}",
842+
'title': False, 'body': True, 'username': False, 'stripcodeblocks': False, 'body_summary': True,
843+
'max_rep': 50, 'max_score': 0},
727844
]
728845

729846
@staticmethod
@@ -767,15 +884,15 @@ def test_post(title, body, user_name, site, is_answer, body_is_summary, user_rep
767884
matched_body = compiled_regex.findall(body_to_check)
768885
else:
769886
assert 'method' in rule
770-
matched_title, why_title = rule['method'](title, site)
887+
matched_title, why_title = rule['method'](title, site, user_name)
771888
if matched_title and rule['title']:
772889
why["title"].append(u"Title - {}".format(why_title))
773-
matched_username, why_username = rule['method'](user_name, site)
890+
matched_username, why_username = rule['method'](user_name, site, user_name)
774891
if matched_username and rule['username']:
775892
why["username"].append(u"Username - {}".format(why_username))
776893
if (not body_is_summary or rule['body_summary']) and (not is_answer or check_if_answer) and \
777894
(is_answer or check_if_question):
778-
matched_body, why_body = rule['method'](body_to_check, site)
895+
matched_body, why_body = rule['method'](body_to_check, site, user_name)
779896
if matched_body and rule['body']:
780897
why["body"].append(u"Post - {}".format(why_body))
781898
if matched_title and rule['title']:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ pep8-naming
1111
regex==2015.11.22
1212
termcolor
1313
sh
14+
tld

test/test_regexes.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,19 @@
7070
('Title here', '<img src="http://example.com/11111111111.jpg" alt="my image">', '', 'stackoverflow.com', False, False),
7171
('Title here', '<img src="http://example.com/11111111111111.jpg" alt="my image" />', '', 'stackoverflow.com', False, False),
7272
('Title here', '<a href="http://example.com/11111111111111.html">page</a>', '', 'stackoverflow.com', False, False),
73-
('Error: 2147467259', '', '', 'stackoverflow.com', False, False)
73+
('Error: 2147467259', '', '', 'stackoverflow.com', False, False),
74+
('Max limit on number of concurrent ajax request', """<p>Php java script boring yaaarrr <a href="http://www.price-buy.com/" rel="nofollow noreferrer">Price-Buy.com</a> </p>""", 'Price Buy', 'stackoverflow.com', True, True),
75+
('Proof of onward travel in Japan?', """<p>The best solution to overcome the problem of your travel<a href="https://i.stack.imgur.com/eS6WQ.jpg" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/eS6WQ.jpg" alt="enter image description here"></a></p>
76+
77+
<p>httl://bestonwardticket.com</p>""", 'Best onward Ticket', 'travel.stackexchange.com', True, True),
78+
('Max limit on number of concurrent ajax request', """<p>Php java script boring yaaarrr <a href="http://www.price-buy.com/" rel="nofollow noreferrer">Price-Buy.com</a> </p>""", 'Totally Unrelated Username', 'stackoverflow.com', True, False),
7479
])
7580
def test_regexes(title, body, username, site, body_is_summary, match):
7681
# If we want to test answers separately, this should be changed
7782
is_answer = False
7883
result = FindSpam.test_post(title, body, username, site, is_answer, body_is_summary, 1, 0)[0]
7984
print title
80-
print result
85+
print "Result:", result
8186
isspam = False
8287
if len(result) > 0:
8388
isspam = True

0 commit comments

Comments
 (0)