In [29]:
import requests
from collections import defaultdict
import random


### Using unicode database for homoglyphs
https://www.unicode.org/reports/tr39/ :

**Summary**
Because Unicode contains such a large number of characters and incorporates the varied writing systems of the world, incorrect usage can expose programs or systems to possible security attacks. This document specifies mechanisms that can be used to detect possible security problems.

**Status**
This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications.

In [2]:
def build_homoglyph_map():
    url = "https://www.unicode.org/Public/security/latest/confusables.txt" 
    response = requests.get(url) # Fetch the confusables data
    raw_text = response.text     # Get the text content

    homoglyph_map = defaultdict(list) 

    for line in raw_text.splitlines():
        if line.startswith('#') or not line.strip(): # Skip comments and empty lines
            continue
        try:
            src_hex, target_hex, *_ = line.split(';') # 
            src_char = chr(int(src_hex.strip(), 16))
            target_chars = ''.join([chr(int(h, 16)) for h in target_hex.strip().split()])

            # We only want visually similar substitutions that map to 1 character
            if len(src_char) == 1 and len(target_chars) == 1:
                ascii_base = target_chars.lower()
                if ascii_base.isascii() and ascii_base.isalnum():
                    homoglyph_map[ascii_base].append(src_char)
        except Exception as e:
            continue  # skip malformed lines

    # Convert defaultdict to normal dict and deduplicate entries
    homoglyph_map = {k: list(set(v)) for k, v in homoglyph_map.items()}

    return homoglyph_map



In [3]:
# Some letters to test
if __name__ == "__main__":
    homoglyph_map = build_homoglyph_map()
    for letter in ['a', 'e', 'i', 'o', 's', 't']:
        print(f"{letter} → {homoglyph_map.get(letter, [])}")

a → ['𝔞', '𝗔', '𝚨', 'а', '𝔄', '𝘼', '𝘈', '𝛂', 'Α', '𐊠', '𝓐', '𝖆', 'А', '𝕬', '𝐀', '𝞪', '𝖺', '𝞐', '𝙖', '𝑨', '𝒶', '𝖠', '𝐚', 'ꓮ', 'ɑ', '⍺', '𖽀', '𝙰', '𝒜', '𝒂', 'ᗅ', '𝕒', 'Ａ', '𝓪', '𝚊', '\U0001ccd6', '𝜶', '𝗮', 'ａ', '𝛢', '𝝰', '𝝖', 'α', '𝔸', 'Ꭺ', '𝘢', '𝛼', '𝐴', '𝑎', '𝜜']
e → ['𝛦', 'ℯ', '𝗲', '𝕰', '𝑒', '𝖤', 'ⴹ', 'Ε', '𑢮', '𝑬', '℮', '𝙴', '⋿', '𝖾', '𝗘', '𝜠', 'Ｅ', '𝔼', '𝓮', 'ꬲ', '𝔢', '𝓔', 'Е', '𝔈', '𝘦', 'ｅ', '𑢦', '𝚬', '𝚎', 'ⅇ', '𝐸', '𝙀', '𝖊', '𝐄', '𝐞', '𐊆', '𝞔', '\U0001ccda', '𝘌', 'е', 'ℰ', '𝝚', '𝒆', '𝕖', 'ꓰ', 'Ꭼ', 'ҽ', '𝙚']
i → ['𝗶', 'ｉ', 'ι', 'ꙇ', '𝜾', '˛', '𝐢', 'ӏ', '𝓲', '𝒾', '𝙞', '𝘪', 'ι', 'і', '𝗂', '𝑖', '𝒊', '𝚤', '⍳', 'ɩ', 'ı', 'ɪ', '𝕚', '𑣃', '𝖎', '𝚒', '𝞲', '𝔦', 'ꭵ', 'Ꭵ', 'ℹ', '𝛊', 'ⅰ', '𝜄', 'ⅈ', '𝝸', 'ͺ']
o → ['०', '𝚶', '𝘖', 'ⲟ', 'ﮪ', '𝝄', 'ﻬ', '𞺄', '𝜪', '𐊒', '౦', '𝙾', 'ﻪ', '໐', 'О', 'ە', 'ﮧ', '۵', '𝗢', 'о', '𐓪', '๐', '𝕠', '𝛰', '𝘰', 'ಂ', '𑣗', 'ꬽ', 'ഠ', '𝝤', 'Օ', '𝕺', 'ﻩ', '೦', 'ﮫ', '0', '𑣈', '𝐨', '𐓂', '𝟘', '🯰', 'ం', 'ᴑ', '𝞞', '𝒐', '𝑶', '𝛔', '𝗼', '\U0001ccf0', 'ං', '𝓸', '𝛐', 'ﮦ', '𝟎', '𝝈', '𝟬'

### Leetspeak perturbations

https://pypi.org/project/pyleetspeak/

This tool aims to counter new misinformation that emerges in social media platforms by providing a mechanism for simulating and generating leetspeak/word camouflaging data.

In [None]:
# ! pip install pyleetspeak
# ! pip install pyphen
# ! pip install keybert
# ! pip install codetiming

In [4]:
from pyleetspeak.LeetSpeaker import LeetSpeaker
from pyleetspeak.Leet_NER_generator import NER_data_generator


In [5]:
# Example usage
text_in = "I speak leetspeak"
leeter = LeetSpeaker(
    change_prb=0.8, change_frq=0.6, mode="basic", seed=None, verbose=False
)
leet_result = leeter.text2leet(text_in)
print(leet_result)


I sp34k l3etsp3@k


In [6]:
# All possible combinations
leeter = LeetSpeaker(get_all_combs=True, mode="basic")
combinations = leeter.text2leet("hate")
print(combinations)  # list of strings


['hate', 'hat3', 'h@te', 'h4t3', 'h4te', 'h@t3']


In [7]:
leeter = LeetSpeaker(
    change_prb=0.6,     # probability of changing each char
    change_frq=0.6,     # frequency across the whole string
    mode="basic",       # you can try "intermediate" or "advanced" too
    seed=42,            # set for reproducibility
    verbose=False
)

texts = [
    "I hate you",
    "Go back to your country",
    "You are disgusting"
]

augmented_texts = [leeter.text2leet(t) for t in texts]

for clean, aug in zip(texts, augmented_texts):
    print(f"Clean:     {clean}")
    print(f"Augmented: {aug}")
    print("---")



Clean:     I hate you
Augmented: 1 hat3 y0_
---
Clean:     Go back to your country
Augmented: Go b@ck to your country
---
Clean:     You are disgusting
Augmented: You @r3 d1sgust1ng
---


### Combine homoglyph and leetspeak

In [30]:

def homoglyph_augment(text, homoglyph_map, prob=0.3):
    new_text = ""
    for char in text:
        lower_char = char.lower()
        if lower_char in homoglyph_map and random.random() < prob:
            replacement = random.choice(homoglyph_map[lower_char])
            new_text += replacement.upper() if char.isupper() else replacement
        else:
            new_text += char
    return new_text

def augment_text(text, leeter, homoglyph_map, leet_prob=0.5, homoglyph_prob=0.5):
    aug_text = text

    if random.random() < leet_prob:
        aug_text = leeter.text2leet(aug_text)

    if random.random() < homoglyph_prob:
        aug_text = homoglyph_augment(aug_text, homoglyph_map)

    return aug_text


In [31]:
# Your clean dataset
data = [
    ("I hate immigrants", 1),
    ("Have a nice day", 0),
    ("Go back to your country", 1),
    ("Welcome to our community", 0)
]

augmented_data = []

for text, label in data:
    aug_text = augment_text(text, leeter, homoglyph_map, leet_prob=0.7, homoglyph_prob=0.4)
    augmented_data.append((aug_text, label))

# Optionally merge with original
full_dataset = data + augmented_data


In [33]:
full_dataset

[('I hate immigrants', 1),
 ('Have a nice day', 0),
 ('Go back to your country', 1),
 ('Welcome to our community', 0),
 ('I h4t3 ⍳\U0001cce2migr@nts', 1),
 ('HaⅤ3 a n1c3 𝔡Аy', 0),
 ('G0 𝐵@ck t0 y0ur 𝐂ouＮ𝒯ry', 1),
 ('Welc0me t0 0ur community', 0)]

In [32]:
from torch.utils.data import Dataset

class HateSpeechDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels': label
        }


In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [36]:
# Assuming `full_dataset` is already prepared (clean + leetspeak + homoglyph)
dataset = HateSpeechDataset(full_dataset, tokenizer)


In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [38]:
for batch in train_loader:
    print(batch['input_ids'].shape)        # torch.Size([8, 128])
    print(batch['attention_mask'].shape)   # torch.Size([8, 128])
    print(batch['labels'])                 # Tensor of size [8]
    break


torch.Size([8, 128])
torch.Size([8, 128])
tensor([0, 1, 1, 0, 0, 1, 1, 0])
