# Data Loading & Preview

In [34]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight

In [None]:
import random
import nltk
from nltk.corpus import wordnet
import pandas as pd


# Identity terms that synonym replacement is not allowed to rewrite.
# Each term contributes its plural form followed by its singular form.
_PROTECTED_SINGULARS = ("jew", "muslim", "white", "black", "asian", "immigrant")
PROTECTED_KEYWORDS = [
    form
    for singular in _PROTECTED_SINGULARS
    for form in (singular + "s", singular)
]

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` in a deterministic order.

    Every lemma of every synset found for ``word`` is normalised
    (underscores become spaces, text is lower-cased) and collected once.
    The looked-up word itself is excluded from the result.

    Args:
        word: The word to look up. It is compared against the normalised
            lemma names as-is, so pass it lower-cased to have it excluded.

    Returns:
        list[str]: Unique synonyms sorted alphabetically; empty when WordNet
        offers no lemma other than ``word`` itself.
    """
    synonyms = {
        lemma.name().replace("_", " ").lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    synonyms.discard(word)
    # A set of strings iterates in an order that changes between interpreter
    # runs (hash randomisation). Sorting keeps a seeded random.choice() over
    # the result repeatable.
    return sorted(synonyms)


def synonym_replacement(words, n=1):
    """Replace up to ``n`` distinct words with a randomly chosen synonym.

    Words whose lower-cased form is listed in ``PROTECTED_KEYWORDS`` are never
    picked for replacement. When a word is picked, every occurrence that
    matches it exactly (case-sensitively) is replaced by the same synonym.

    Args:
        words: Sequence of tokens; it is not modified.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the sentence unchanged.

    Returns:
        list[str]: A new token list with the replacements applied.
    """
    new_words = list(words)
    if n <= 0:
        return new_words

    protected = set(PROTECTED_KEYWORDS)
    # De-duplicate (order-preserving) so a repeated word is not counted as a
    # second replacement after all its occurrences were already swapped.
    candidates = list(
        dict.fromkeys(w for w in new_words if w.lower() not in protected)
    )
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        syns = get_synonyms(word.lower())
        if not syns:
            continue
        synonym = random.choice(syns)
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return new_words


def random_insertion(words, n=1):
    """Insert synonyms of randomly picked words at random positions.

    On each of the ``n`` rounds a word is drawn from the current sentence
    (including words inserted in earlier rounds). If it has at least one
    synonym, one of them is inserted at a random position. A round whose
    word has no synonym inserts nothing.

    Args:
        words: Sequence of tokens; it is not modified.
        n: Number of insertion attempts.

    Returns:
        list[str]: A new token list. An empty input yields an empty list.
    """
    new_words = list(words)
    if not new_words:
        # random.choice() raises IndexError on an empty sequence.
        return new_words

    for _ in range(n):
        word = random.choice(new_words)
        syns = get_synonyms(word.lower())
        if syns:
            insert_word = random.choice(syns)
            # randint is inclusive on both ends, so the end is a valid slot.
            pos = random.randint(0, len(new_words))
            new_words.insert(pos, insert_word)
    return new_words


def random_swap(words, n=1):
    """Swap two randomly chosen positions of the sentence, ``n`` times.

    Args:
        words: List of tokens; a copy is shuffled, the input is left alone.
        n: Number of swaps to perform.

    Returns:
        list[str]: The copied token list after the swaps. Sentences with
        fewer than two tokens are returned as an unchanged copy.
    """
    shuffled = words.copy()
    # Swapping never changes the length, so a single up-front check suffices.
    if len(shuffled) >= 2:
        for _ in range(n):
            first, second = random.sample(range(len(shuffled)), 2)
            shuffled[first], shuffled[second] = shuffled[second], shuffled[first]
    return shuffled


def random_deletion(words, p=0.05):
    """Drop each token independently with probability ``p``.

    Args:
        words: Sequence of tokens; it is not modified.
        p: Per-token deletion probability.

    Returns:
        list[str]: A new list with the surviving tokens. Inputs of length 0
        or 1 are returned as an unchanged copy. If every token happens to be
        deleted, a single randomly chosen token is returned instead so the
        sentence is never emptied.
    """
    # Covers the empty case too (random.choice() below would raise on it) and
    # returns a copy so callers never receive an alias of their own list.
    if len(words) <= 1:
        return list(words)

    kept = [w for w in words if random.random() > p]
    if not kept:
        return [random.choice(words)]
    return kept


def augment_sentence(sentence, num_aug=4):
    """Produce ``num_aug`` augmented variants of a whitespace-tokenised sentence.

    The four operations are applied in a fixed rotation, each one to the
    original tokens: synonym replacement, random insertion, random swap,
    random deletion. With the default ``num_aug=4`` each operation is used
    exactly once, in that order. Larger values start the rotation again.

    Args:
        sentence: Text to augment; split on whitespace.
        num_aug: Number of variants to return.

    Returns:
        list[str]: The augmented sentences. Empty when the sentence has no
        tokens or ``num_aug`` is not positive.
    """
    words = sentence.split()
    if not words or num_aug <= 0:
        return []

    # Deferred calls so that only the requested operations consume RNG state.
    operations = (
        lambda: synonym_replacement(words, n=1),   # 1. synonym replacement
        lambda: random_insertion(words, n=1),      # 2. random insertion
        lambda: random_swap(words, n=1),           # 3. random swap
        lambda: random_deletion(words, p=0.05),    # 4. random deletion
    )

    return [
        " ".join(operations[i % len(operations)]())
        for i in range(num_aug)
    ]


# Seed Python's RNG so the random draws made during augmentation are
# repeatable from one run of this cell to the next.
AUGMENT_SEED = 42
random.seed(AUGMENT_SEED)

# The file carries a .tsv extension but is read with pandas' default
# separator, which is what the preview cell's recorded output was produced with.
df = pd.read_csv("data/cleaned/hate_speech_dataset.tsv")
df["cleaned_post"] = df["cleaned_post"].astype(str)

# Only the explicit_hate class is augmented.
explicit_df = df[df["class"] == "explicit_hate"]

# NOTE(review): augmentation happens before the train/test split further down,
# so variants of one post can land on both sides of the split. Consider
# splitting first and augmenting the training portion only.
augmented_rows = []
for original, class_label, bi_label, mul_label in zip(
    explicit_df["cleaned_post"],
    explicit_df["class"],
    explicit_df["bi_class"],
    explicit_df["mul_class"],
):
    # Every augmented variant inherits the labels of the post it came from.
    for aug_text in augment_sentence(original, num_aug=4):
        augmented_rows.append({
            "cleaned_post": aug_text,
            "class": class_label,
            "bi_class": bi_label,
            "mul_class": mul_label
        })

# Explicit columns keep the schema intact even if no row was produced.
augmented_df = pd.DataFrame(
    augmented_rows,
    columns=["cleaned_post", "class", "bi_class", "mul_class"],
)

print("Original explicit_hate:", len(explicit_df))
print("Augmented explicit_hate:", len(augmented_df))


Original explicit_hate: 1086
Augmented explicit_hate: 4344


In [38]:
# Stack the augmented rows underneath the existing frame and renumber the index.
# Caution: this rebinds `df`, so running the cell twice appends the augmented
# rows twice.
df = pd.concat(objs=[df, augmented_df], axis=0, ignore_index=True)

In [23]:
# Preview the dataset as stored on disk. It is loaded into its own name on
# purpose: rebinding `df` here would throw away the augmented rows that the
# earlier cell concatenated into `df` when the notebook runs top to bottom.
raw_preview_df = pd.read_csv("data/cleaned/hate_speech_dataset.tsv")

print(raw_preview_df.head())

                                        cleaned_post          class  bi_class  \
0  jewish harvard professor noel ignatiev wants a...  implicit_hate         1   
1  higher education european culture imported con...       not_hate         0   
2           problem whites christians ahead free say       not_hate         0   
3  yasir qadhi hate preacher calling christians f...       not_hate         0   
4    million germans mass murdered destruction reich       not_hate         0   

   mul_class  
0          1  
1          0  
2          0  
3          0  
4          0  


In [39]:
# Pull the text and both label encodings out of the frame as NumPy arrays.
X_text = df["cleaned_post"].values
y_binary = df["bi_class"].values.astype(int)
y_multi = df["mul_class"].values.astype(int)

# One 80/20 split shared by the text and both label sets, stratified on the
# multi-class label.
# NOTE(review): if `df` already contains augmented rows, variants of the same
# post may end up in both the train and the test partition.
split_arrays = train_test_split(
    X_text,
    y_binary,
    y_multi,
    test_size=0.2,
    random_state=42,
    stratify=y_multi,
)
(
    X_train_text, X_test_text,
    y_train_bi, y_test_bi,
    y_train_multi, y_test_multi,
) = split_arrays

In [40]:
MAX_WORDS = 20000   # vocabulary cap passed to the tokenizer
MAX_LEN = 50        # every sequence is padded/truncated to this length

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train_text)

# Texts -> integer id sequences.
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_text, X_test_text)
]

# Id sequences -> fixed-width matrices.
X_train_pad, X_test_pad = [
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (X_train_seq, X_test_seq)
]


In [41]:
EMBED_DIM = 50            # size of each learned token embedding
FILTERS = 64              # number of convolution filters per kernel size
KERNEL_SIZES = [3, 4, 5]  # one parallel convolution branch per kernel size

inputs = Input(shape=(MAX_LEN,))
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is already fixed by the Input layer above.
embedding = Embedding(input_dim=MAX_WORDS, output_dim=EMBED_DIM)(inputs)

# Each branch convolves the embedded sequence with its own kernel size and
# keeps the maximum activation of every filter over the sequence.
conv_pools = []
for k in KERNEL_SIZES:
    conv = Conv1D(filters=FILTERS, kernel_size=k, activation='relu')(embedding)
    pool = GlobalMaxPooling1D()(conv)
    conv_pools.append(pool)

# Branch outputs are concatenated, regularised with dropout and mapped to a
# single sigmoid unit for the binary label.
concat = Concatenate()(conv_pools)
drop = Dropout(0.5)(concat)
output = Dense(1, activation='sigmoid')(drop)

cnn_binary = Model(inputs=inputs, outputs=output)
cnn_binary.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
cnn_binary.summary()



In [42]:
# "Balanced" weights for the two binary labels, computed on the training
# labels and keyed by class id (0 and 1) as Keras expects.
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train_bi.astype(int),
)
class_weights_bi = dict(enumerate(balanced_weights))

# Stop once validation loss has not improved for two epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# A tenth of the training data is held out for validation.
history_bi = cnn_binary.fit(
    X_train_pad,
    y_train_bi,
    batch_size=64,
    epochs=15,
    validation_split=0.1,
    class_weight=class_weights_bi,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/15
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.6819 - loss: 0.5913 - val_accuracy: 0.7689 - val_loss: 0.4964
Epoch 2/15
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8223 - loss: 0.3964 - val_accuracy: 0.7602 - val_loss: 0.5096
Epoch 3/15
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8899 - loss: 0.2683 - val_accuracy: 0.7592 - val_loss: 0.5878


In [43]:
# Loss and accuracy on the held-out test split.
test_metrics_bi = cnn_binary.evaluate(X_test_pad, y_test_bi, verbose=0)
loss_bi, acc_bi = test_metrics_bi
print("[Binary] Test Accuracy:", acc_bi)

# Sigmoid outputs are flattened and thresholded at 0.5 to get hard labels.
y_prob_bi = cnn_binary.predict(X_test_pad)
y_pred_bi = (y_prob_bi.ravel() >= 0.5).astype("int32")

print("\n[Binary] Classification Report:")
report_bi = classification_report(y_test_bi, y_pred_bi, digits=4)
print(report_bi)

print("[Binary] Confusion Matrix:")
matrix_bi = confusion_matrix(y_test_bi, y_pred_bi)
print(matrix_bi)

[Binary] Test Accuracy: 0.7577049732208252
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

[Binary] Classification Report:
              precision    recall  f1-score   support

           0     0.7579    0.7773    0.7675      2654
           1     0.7575    0.7369    0.7471      2505

    accuracy                         0.7577      5159
   macro avg     0.7577    0.7571    0.7573      5159
weighted avg     0.7577    0.7577    0.7576      5159

[Binary] Confusion Matrix:
[[2063  591]
 [ 659 1846]]
