Skip to content

Commit

Permalink
findspam.py: obfuscated_word(): new rule "obfuscated word"
Browse files Browse the repository at this point in the history
Simple 1337 decoder for a small set of troll / phone support scam keywords

test/test_findspam.py: test case for the above
  • Loading branch information
tripleee committed Nov 24, 2021
1 parent 8c1d2ef commit 4bbdf99
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 0 deletions.
70 changes: 70 additions & 0 deletions findspam.py
Expand Up @@ -8,6 +8,7 @@
from itertools import chain
from collections import Counter
from datetime import datetime
from string import punctuation
import time
import os
import os.path as path
Expand Down Expand Up @@ -2413,6 +2414,75 @@ def religion_troll(s, site):
r'\bICQ[ :]{0,5}\d{9}\b|\bwh?atsa+pp?[ :+]{0,5}\d{10}',
stripcodeblocks=True)


# Homoglyph obfuscation, used both by trolls and spammers

# Keywords the "obfuscated word" rule tries to recover from 1337-speak.
# Some entries are spelled as concatenated fragments; keep them that way
# (presumably so the plain words do not appear verbatim in this file).
_OBFUSCATION_KEYWORDS = (
    # Trolls && cussing
    "c" + "ock",
    "c" + "unt",
    "d" + "ick",
    "f" + "uck",
    "mother" + "f" + "ucker",
    "p" + "enis",
    "p" + "ussy",
    "w" + "hore",
    # NOTE: the two-word entries below cannot match yet, because the rule
    # looks at one whitespace-delimited word at a time
    "black helicopters",
    "attack helicopters",

    # Phone support scam
    # Rule only covers single words for the time being
    # Also, comment out really short ones to reduce chance for FPs
    # "amazon prime",
    # "avg",
    "binance",
    "coinbase",
    # "ebay",
    "gemini",
    # "hp printer",
    "norton",
    "paypal",
    "printer",  # maybe remove if we enable "hp printer"
    "quickbooks",
    # "sage",
    "sbcglobal",
    "ticketmaster",
    # "trust wallet",
    "turbotax",
    "wallet",  # maybe remove if we enable "trust wallet"

    "support",
    "phone",
    "number",
    "helpline",
)

# Simple 1337 translator: punctuation is dropped, unambiguous digits are
# replaced by the letter they imitate.  '1' is deliberately left alone here
# because it can stand for either "i" or "l"; _decode_obfuscated() resolves it
# per keyword.
_LEET_MAP = {p: '' for p in punctuation}
_LEET_MAP.update({
    '4': 'a',
    '3': 'e',
    '6': 'g',
    '9': 'g',
    '!': 'i',
    '0': 'o',
    '5': 's',
    '7': 't',
    '2': 'z',
})
_LEET_TABLE = str.maketrans(_LEET_MAP)


def _decode_obfuscated(candidate):
    """
    Return the keyword which the translated, lowercased candidate spells.

    A '1' in the candidate matches either "i" or "l" in the keyword; every
    other character must match exactly.  Returns None if no keyword matches.
    """
    for keyword in _OBFUSCATION_KEYWORDS:
        if len(keyword) != len(candidate):
            continue
        if all(c == k or (c == '1' and k in 'il')
               for c, k in zip(candidate, keyword)):
            return keyword
    return None


@create_rule("obfuscated word", max_rep=50, stripcodeblocks=True)
def obfuscated_word(s, site):
    """
    Detect a keyword which has been disguised with digits or punctuation.

    The text is split on whitespace, hyphens and underscores; each word is
    decoded with the 1337 table and compared to the keyword list.  A word
    only counts when its decoded form differs from the word as written, so
    a plainly spelled keyword does not trigger the rule.

    :param s: the text to examine
    :param site: unused by this rule
    :return: (True, reason) for the first obfuscated keyword, else (False, "")
    """
    for word in regex.split(r'[-\s_]+', s):
        # prevent FP on stuff like 'I have this "number": 1111'
        word = word.strip(punctuation)
        keyword = _decode_obfuscated(word.translate(_LEET_TABLE).lower())
        # keyword != word.lower(): ignore keywords which are written normally
        if keyword is not None and keyword != word.lower():
            return True, "%r is obfuscated %r" % (word, keyword)
    return False, ""


# Category: Trolling
# Offensive title: titles are more sensitive
create_rule("offensive {} detected",
Expand Down
1 change: 1 addition & 0 deletions test/test_findspam.py
Expand Up @@ -118,6 +118,7 @@
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
('A title', 'keytones', 'a username', 'superuser.com', True, True, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
('C01nb4s3 support number', 'C01nb4s3 support number', 'spammer', 'stackoverflow.com', True, True, True),
('emoji \U0001f525 emoji', 'emoji \U0001f525 emoji \U0001f525 emoji', 'tripleee', 'stackoverflow.com', True, False, False),
('emoji \U0001f525 emoji \U0001f525 emoji', 'two emojis in title should trigger, others not', 'tripleee', 'stackoverflow.com', True, False, True),
])
Expand Down

0 comments on commit 4bbdf99

Please sign in to comment.