Skip to content

Commit

Permalink
Use spaces, not word boundaries, for separation in link text
Browse files Browse the repository at this point in the history
This keeps such matches from being confused with URL-as-link text.
  • Loading branch information
normalhuman committed Aug 22, 2016
1 parent a1f3388 commit 80cc094
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ def bad_link_text(s, site): # suspicious text of a hyperlink
s = regex.sub("</?strong>|</?em>", "", s) # remove font tags
keywords = regex.compile(ur"(?isu)^(buy|cheap)\b|live[ -]?stream|\bmake (money|\$)| \$[\d,]{3}|\b(porno?|(whole)?sale|coins|replica|luxury|essays?|in \L<city>)\b|\b\L<city>.*(service|escort|call girl)|(best|make|full|hd|software|cell|data)[\w ]{1,20}(online|service|company|repair|recovery)|\bwriting service", city=FindSpam.city_list)
links = regex.compile(ur'(?<=nofollow">)[^<]*(?=</a>)', regex.UNICODE).findall(s)
business = regex.compile(r"(?i)\b(airlines?|AVG|BT|netflix|dell|Delta|epson|facebook|gmail|google|hotmail|hp|lexmark|mcafee|microsoft|norton|out[l1]ook|quickbooks|sage|windows?|yahoo)\b")
support = regex.compile(r"(?i)\b(customer|help|care|helpline|reservation|phone|recovery|service|support|contact|tech|technical|telephone|number)\b")
business = regex.compile(r"(?i)(^| )(airlines?|AVG|BT|netflix|dell|Delta|epson|facebook|gmail|google|hotmail|hp|lexmark|mcafee|microsoft|norton|out[l1]ook|quickbooks|sage|windows?|yahoo)($| )")
support = regex.compile(r"(?i)(^| )(customer|help|care|helpline|reservation|phone|recovery|service|support|contact|tech|technical|telephone|number)($| )")
for link_text in links:
keywords_match = keywords.search(link_text)
if keywords_match:
Expand Down

0 comments on commit 80cc094

Please sign in to comment.