11# -*- coding: utf-8 -*-
22import regex
33import phonenumbers
4+ from difflib import SequenceMatcher
5+ import tld
6+ from tld .utils import update_tld_names
7+ from urlparse import urlparse
8+ update_tld_names ()
9+
10+
11+ SIMILAR_THRESHOLD = 0.95
12+ EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
13+ RE_COMPILE = regex .compile (EXCEPTION_RE )
14+ COMMON_MALFORMED_PROTOCOLS = [
15+ ('httl://' , 'http://' ),
16+ ]
17+
18+ # Flee before the ugly URL validator regex!
19+ # We are using this, instead of a nice library like BeautifulSoup, because spammers are
20+ # stupid and don't always know how to actually *link* their web site. BeautifulSoup misses
21+ # those plain text URLs.
22+ # https://gist.github.com/dperini/729294#gistcomment-1296121
23+ URL_REGEX = regex .compile (
24+ r"""((?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)"""
25+ r"""(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2}))"""
26+ r"""(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"""
27+ r"""(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"""
28+ r"""|(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-?)"""
29+ r"""*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?""" , regex .UNICODE )
430
531
632# noinspection PyUnusedLocal
7- def has_repeated_words (s , site ):
33+ def has_repeated_words (s , site , * args ):
834 words = regex .split (r"[\s.,;!/\()\[\]+_-]" , s )
935 words = [word for word in words if word != "" ]
1036 streak = 0
@@ -21,7 +47,7 @@ def has_repeated_words(s, site):
2147
2248
2349# noinspection PyUnusedLocal
24- def has_few_characters (s , site ):
50+ def has_few_characters (s , site , * args ):
2551 s = regex .sub ("</?p>" , "" , s ).rstrip () # remove HTML paragraph tags from posts
2652 uniques = len (set (list (s )))
2753 if (len (s ) >= 30 and uniques <= 6 ) or (len (s ) >= 100 and uniques <= 15 ): # reduce if false reports appear
@@ -33,7 +59,7 @@ def has_few_characters(s, site):
3359
3460
3561# noinspection PyUnusedLocal
36- def has_repeating_characters (s , site ):
62+ def has_repeating_characters (s , site , * args ):
3763 s = regex .sub ('http[^"]*' , "" , s ) # remove URLs for this check
3864 if s is None or len (s ) == 0 or len (s ) >= 300 or regex .compile ("<pre>|<code>" ).search (s ):
3965 return False , ""
@@ -45,7 +71,7 @@ def has_repeating_characters(s, site):
4571
4672
4773# noinspection PyUnusedLocal
48- def link_at_end (s , site ): # link at end of question, on selected sites
74+ def link_at_end (s , site , * args ): # link at end of question, on selected sites
4975 s = regex .sub ("</strong>|</em>|</p>" , "" , s )
5076 match = regex .compile (ur"(?i)https?://(?:[.A-Za-z0-9-]*/?[.A-Za-z0-9-]*/?|plus\.google\.com/"
5177 ur"[\w/]*|www\.pinterest\.com/pin/[\d/]*)</a>\s*$" ).search (s )
@@ -57,7 +83,7 @@ def link_at_end(s, site): # link at end of question, on selected sites
5783
5884
5985# noinspection PyUnusedLocal
60- def non_english_link (s , site ): # non-english link in short answer
86+ def non_english_link (s , site , * args ): # non-english link in short answer
6187 if len (s ) < 600 :
6288 links = regex .compile (ur'nofollow(?: noreferrer)?">([^<]*)(?=</a>)' , regex .UNICODE ).findall (s )
6389 for link_text in links :
@@ -69,7 +95,7 @@ def non_english_link(s, site): # non-english link in short answer
6995 return False , ""
7096
7197
72- def mostly_non_latin (s , site ): # majority of post is in non-Latin, non-Cyrillic characters
98+ def mostly_non_latin (s , site , * args ): # majority of post is in non-Latin, non-Cyrillic characters
7399 if regex .compile ("<pre>|<code>" ).search (s ) and site == "stackoverflow.com" : # Avoid false positives on SO
74100 return False , ""
75101 word_chars = regex .sub (r'(?u)[\W0-9]|http\S*' , "" , s )
@@ -80,7 +106,7 @@ def mostly_non_latin(s, site): # majority of post is in non-Latin, non-Cyrilli
80106
81107
82108# noinspection PyUnusedLocal
83- def has_phone_number (s , site ):
109+ def has_phone_number (s , site , * args ):
84110 if regex .compile (ur"(?i)\b(address(es)?|run[- ]?time|error|value|server|hostname|timestamp|warning|code|"
85111 ur"(sp)?exception|version|chrome|1234567)\b" , regex .UNICODE ).search (s ):
86112 return False , "" # not a phone number
@@ -106,7 +132,7 @@ def has_phone_number(s, site):
106132 return False , ""
107133
108134
109- def has_customer_service (s , site ): # flexible detection of customer service in titles
135+ def has_customer_service (s , site , * args ): # flexible detection of customer service in titles
110136 s = s [0 :300 ].lower () # if applied to body, the beginning should be enough: otherwise many false positives
111137 s = regex .sub (r"[^A-Za-z0-9\s]" , "" , s ) # deobfuscate
112138 phrase = regex .compile (r"(tech(nical)? support)|((support|service|contact|help(line)?) (telephone|phone|"
@@ -126,7 +152,7 @@ def has_customer_service(s, site): # flexible detection of customer service in
126152
127153
128154# noinspection PyUnusedLocal
129- def has_health (s , site ): # flexible detection of health spam in titles
155+ def has_health (s , site , * args ): # flexible detection of health spam in titles
130156 s = s [0 :200 ] # if applied to body, the beginning should be enough: otherwise many false positives
131157 capitalized = len (regex .compile (r"\b[A-Z][a-z]" ).findall (s )) >= 5 # words beginning with uppercase letter
132158 organ = regex .compile (r"(?i)\b(colon|skin|muscle|bicep|fac(e|ial)|eye|brain|IQ|mind|head|hair|peni(s|le)|"
@@ -157,7 +183,7 @@ def has_health(s, site): # flexible detection of health spam in titles
157183 return False , ""
158184
159185
160- def keyword_email (s , site ): # a keyword and an email in the same post
186+ def keyword_email (s , site , * args ): # a keyword and an email in the same post
161187 if regex .compile ("<pre>|<code>" ).search (s ) and site == "stackoverflow.com" : # Avoid false positives on SO
162188 return False , ""
163189 keyword = regex .compile (ur"(?i)\b(training|we (will )?(offer|develop|provide)|sell|invest(or|ing|ment)|credit|"
@@ -178,7 +204,7 @@ def keyword_email(s, site): # a keyword and an email in the same post
178204
179205
180206# noinspection PyUnusedLocal
181- def keyword_link (s , site ): # thanking keyword and a link in the same short answer
207+ def keyword_link (s , site , * args ): # thanking keyword and a link in the same short answer
182208 if len (s ) > 400 :
183209 return False , ""
184210 link = regex .compile (ur'(?i)<a href="https?://\S+' ).search (s )
@@ -200,7 +226,7 @@ def keyword_link(s, site): # thanking keyword and a link in the same short ans
200226
201227
202228# noinspection PyUnusedLocal
203- def bad_link_text (s , site ): # suspicious text of a hyperlink
229+ def bad_link_text (s , site , * args ): # suspicious text of a hyperlink
204230 s = regex .sub ("</?strong>|</?em>" , "" , s ) # remove font tags
205231 keywords = regex .compile (ur"(?isu)^(buy|cheap) |live[ -]?stream|(^| )make (money|\$)|(^| )(porno?|(whole)?sale|"
206232 ur"coins|replica|luxury|essays?|in \L<city>)($| )|(^| )\L<city>.*(service|escort|"
@@ -224,7 +250,7 @@ def bad_link_text(s, site): # suspicious text of a hyperlink
224250
225251
226252# noinspection PyUnusedLocal
227- def is_offensive_post (s , site ):
253+ def is_offensive_post (s , site , * args ):
228254 if s is None or len (s ) == 0 :
229255 return False , ""
230256
@@ -246,13 +272,99 @@ def is_offensive_post(s, site):
246272
247273
248274# noinspection PyUnusedLocal
249- def has_eltima (s , site ):
275+ def has_eltima (s , site , * args ):
250276 reg = regex .compile (ur"(?is)\beltima" )
251277 if reg .search (s ) and len (s ) <= 750 :
252278 return True , u"Bad keyword *eltima* and body length under 750 chars"
253279 return False , ""
254280
255281
# noinspection PyUnusedLocal
def username_similar_website(s, site, *args):
    """
    Check whether the poster's username closely matches a website linked in the post.

    :param s: body text of the post, scanned for links
    :param site: site name (unused; kept for the detector-method interface)
    :param args: args[0] is expected to be the poster's username
    :return: (True, reason) when the best link/username similarity ratio reaches
             SIMILAR_THRESHOLD, otherwise (False, "")
    """
    # Defensive: some call paths may not supply a username; previously this
    # raised IndexError. Debug prints removed from the production path.
    if not args:
        return False, ""
    username = args[0]
    if perform_similarity_checks(s, username) >= SIMILAR_THRESHOLD:
        return True, u"Username similar to website"
    return False, ""
292+
293+
def perform_similarity_checks(post, name):
    """
    Run four similarity tests between each link in the post and the username.

    The four tests compare each link's domain against the name verbatim, with
    spaces stripped, with hyphens stripped, and with both stripped.

    :param post: Text of the post
    :param name: Username to compare against
    :return: Float ratio of the best similarity found (0 when the post has no links)
    """
    # Fix stupid spammer tricks (e.g. "httl://" for "http://") before extraction
    for malformed, corrected in COMMON_MALFORMED_PROTOCOLS:
        post = post.replace(malformed, corrected)

    # Find links in the post, dropping one trailing non-alphanumeric character
    # (closing punctuation) when present. A set deduplicates repeated links.
    links = set()
    for link in regex.findall(URL_REGEX, post):
        if link and not link[-1].isalnum():
            link = link[:-1]
        links.add(link)

    # Hoist the name variants out of the loop; they are link-invariant.
    name_nospace = name.replace(" ", "")
    name_nohyphen = name.replace("-", "")
    name_bare = name_nohyphen.replace(" ", "")

    # Track the best ratio seen over ALL links. (The previous version
    # returned the ratios of only the last link examined when no link
    # crossed the threshold.)
    best = 0
    for link in links:
        domain = get_domain(link)
        domain_nohyphen = domain.replace("-", "")
        best = max(
            best,
            similar_ratio(domain, name),                                  # straight comparison
            similar_ratio(domain, name_nospace),                          # spaces stripped
            similar_ratio(domain_nohyphen, name_nohyphen),                # hyphens stripped
            similar_ratio(domain_nohyphen.replace(" ", ""), name_bare),   # both stripped
        )
        # Already past the threshold? No need to check further links.
        if best >= SIMILAR_THRESHOLD:
            break
    return best
339+
340+
def similar_ratio(a, b):
    """Return the case-insensitive similarity ratio between two strings (0.0-1.0)."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
343+
344+
def get_domain(s):
    """
    Extract the bare domain label (no TLD, no subdomain) from a URL-like string.

    Falls back progressively when the tld library cannot parse the input:
    first by replacing the invalid TLD reported in the exception message with a
    plain protocol and retrying, then by naively splitting the parsed path on
    dots.

    :param s: URL (or URL-like string) to parse
    :return: the domain label as a string
    """
    try:
        extract = tld.get_tld(s, fix_protocol=True, as_object=True)
        return extract.domain
    except tld.exceptions.TldDomainNotFound as e:
        # Guard against exception messages that don't match EXCEPTION_RE;
        # previously a failed match raised AttributeError on .group(1).
        invalid = RE_COMPILE.match(e.message)
        if invalid:
            # Retry with the invalid TLD portion swapped for a plain protocol
            retry_url = s.replace(invalid.group(1), 'http', 1)
            try:
                extract = tld.get_tld(retry_url, fix_protocol=True, as_object=True)
                return extract.domain
            except tld.exceptions.TldDomainNotFound:
                pass
        # Last resort: assume a bad TLD and split the path manually, taking the
        # middle label when a subdomain appears to be present.
        labels = urlparse(s).path.split(".")
        if len(labels) >= 3:
            return labels[1]
        return labels[0]
366+
367+
256368# noinspection PyClassHasNoInit
257369class FindSpam :
258370 with open ("bad_keywords.txt" , "r" ) as f :
@@ -723,7 +835,12 @@ class FindSpam:
723835 'body_summary' : False , 'max_rep' : 1 , 'max_score' : 0 },
724836 {'regex' : u"(?i)^jeff$" , 'all' : False , 'sites' : ["parenting.stackexchange.com" ],
725837 'reason' : "blacklisted username" , 'title' : False , 'body' : False , 'username' : True ,
726- 'stripcodeblocks' : False , 'body_summary' : False , 'max_rep' : 1 , 'max_score' : 0 }
838+ 'stripcodeblocks' : False , 'body_summary' : False , 'max_rep' : 1 , 'max_score' : 0 },
839+
840+ # User name similar to link
841+ {'method' : username_similar_website , 'all' : True , 'sites' : [], 'reason' : "username similar to website {}" ,
842+ 'title' : False , 'body' : True , 'username' : False , 'stripcodeblocks' : False , 'body_summary' : True ,
843+ 'max_rep' : 50 , 'max_score' : 0 },
727844 ]
728845
729846 @staticmethod
@@ -767,15 +884,15 @@ def test_post(title, body, user_name, site, is_answer, body_is_summary, user_rep
767884 matched_body = compiled_regex .findall (body_to_check )
768885 else :
769886 assert 'method' in rule
770- matched_title , why_title = rule ['method' ](title , site )
887+ matched_title , why_title = rule ['method' ](title , site , user_name )
771888 if matched_title and rule ['title' ]:
772889 why ["title" ].append (u"Title - {}" .format (why_title ))
773- matched_username , why_username = rule ['method' ](user_name , site )
890+ matched_username , why_username = rule ['method' ](user_name , site , user_name )
774891 if matched_username and rule ['username' ]:
775892 why ["username" ].append (u"Username - {}" .format (why_username ))
776893 if (not body_is_summary or rule ['body_summary' ]) and (not is_answer or check_if_answer ) and \
777894 (is_answer or check_if_question ):
778- matched_body , why_body = rule ['method' ](body_to_check , site )
895+ matched_body , why_body = rule ['method' ](body_to_check , site , user_name )
779896 if matched_body and rule ['body' ]:
780897 why ["body" ].append (u"Post - {}" .format (why_body ))
781898 if matched_title and rule ['title' ]:
0 commit comments