In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    Dense, Dropout
)

print("TensorFlow version:", tf.__version__)

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling are as repeatable as the
# backend allows across "Restart Kernel -> Run All".
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): the file has a .tsv extension but is parsed with pandas'
# default comma delimiter. Pass sep="\t" if the file is really tab-separated.
df = pd.read_csv("data/cleaned/hate_speech_dataset.tsv")

# A bare astype(str) would turn missing posts into the literal token "nan";
# blank them out first so they tokenise to an empty sequence instead.
df["cleaned_post"] = df["cleaned_post"].fillna("").astype(str)

# Three-way label encoding used by the multiclass model.
label_map = {"not_hate": 0, "implicit_hate": 1, "explicit_hate": 2}
df["multi_label"] = df["class"].map(label_map)

# .map() yields NaN for any class string that is not a key of label_map.
if df["multi_label"].isna().any():
    raise ValueError("Some values in 'class' do not match label_map keys.")

# Casting NaN to int32 would silently produce garbage labels, so fail loudly.
if df["bi_class"].isna().any():
    raise ValueError("Column 'bi_class' contains missing values.")

X_text   = df["cleaned_post"].values
y_binary = df["bi_class"].values.astype("int32")
y_multi  = df["multi_label"].values.astype("int32")

print("Binary label counts:", np.bincount(y_binary))
print("Multi label counts:", np.bincount(y_multi))

# A single split shared by both tasks, stratified on the three-way label so the
# class proportions are preserved in train and test.
X_train_text, X_test_text, y_train_bi, y_test_bi, y_train_multi, y_test_multi = train_test_split(
    X_text, y_binary, y_multi,
    test_size=0.2,
    random_state=SEED,
    stratify=y_multi
)




TensorFlow version: 2.20.0
Binary label counts: [13270  8180]
Multi label counts: [13270  7094  1086]
Train shape: (17160, 100)
Test shape: (4290, 100)

 Training Bi-LSTM (Binary)





Epoch 1/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 208ms/step - accuracy: 0.6792 - loss: 0.5977 - val_accuracy: 0.7063 - val_loss: 0.5670
Epoch 2/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 225ms/step - accuracy: 0.8055 - loss: 0.4294 - val_accuracy: 0.6935 - val_loss: 0.5919
Epoch 3/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 228ms/step - accuracy: 0.8681 - loss: 0.3111 - val_accuracy: 0.6871 - val_loss: 0.6741
Epoch 4/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 221ms/step - accuracy: 0.9091 - loss: 0.2220 - val_accuracy: 0.6807 - val_loss: 0.8520
Epoch 5/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 225ms/step - accuracy: 0.9319 - loss: 0.1646 - val_accuracy: 0.6783 - val_loss: 0.9974
Epoch 6/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 238ms/step - accuracy: 0.9503 - loss: 0.1240 - val_accuracy: 0.6568 - val_loss: 1.1865

[Binary] Test A



Epoch 1/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 195ms/step - accuracy: 0.6494 - loss: 0.7673 - val_accuracy: 0.6871 - val_loss: 0.6992
Epoch 2/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 248ms/step - accuracy: 0.7606 - loss: 0.5723 - val_accuracy: 0.6853 - val_loss: 0.7274
Epoch 3/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 207ms/step - accuracy: 0.8381 - loss: 0.4067 - val_accuracy: 0.6620 - val_loss: 0.7934
Epoch 4/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 233ms/step - accuracy: 0.8916 - loss: 0.2823 - val_accuracy: 0.6445 - val_loss: 0.9131
Epoch 5/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 204ms/step - accuracy: 0.9218 - loss: 0.2059 - val_accuracy: 0.6451 - val_loss: 1.0854
Epoch 6/6
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 237ms/step - accuracy: 0.9395 - loss: 0.1605 - val_accuracy: 0.6259 - val_loss: 1.3816

[Multiclass] Te

In [None]:
# Vocabulary cap and fixed sequence length; build_bilstm uses the same two
# constants as its default dimensions.
MAX_WORDS = 20000
MAX_LEN = 100

# Learn the word index from the training split only; the test split is encoded
# with that same index.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train_text)

# Text -> lists of integer word ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train_text, X_test_text)
)

# Integer id lists -> arrays with exactly MAX_LEN columns.
X_train_pad, X_test_pad = (
    pad_sequences(seq, maxlen=MAX_LEN) for seq in (X_train_seq, X_test_seq)
)

print("Train shape:", X_train_pad.shape)
print("Test shape:", X_test_pad.shape)


def build_bilstm(
    max_words=MAX_WORDS,
    max_len=MAX_LEN,
    embed_dim=100,
    lstm_units=128,
    num_classes=1,
    dropout_rate=0.5
):
    """Build and compile a single-layer bidirectional LSTM text classifier.

    Args:
        max_words: Size of the embedding vocabulary (``input_dim`` of the
            Embedding layer).
        max_len: Length of each input sequence.
        embed_dim: Dimensionality of the word embeddings.
        lstm_units: Number of units in each direction of the LSTM.
        num_classes: ``1`` builds a binary classifier (one sigmoid output,
            binary cross-entropy). Any larger value builds a softmax classifier
            with that many outputs, trained with sparse categorical
            cross-entropy (integer class labels).
        dropout_rate: Dropout rate applied to the Bi-LSTM output.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and an accuracy
        metric.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}.")

    inputs = Input(shape=(max_len,))

    # The sequence length is already fixed by the Input layer. Embedding's
    # `input_length` argument is deprecated in Keras 3, so it is not passed.
    x = Embedding(
        input_dim=max_words,
        output_dim=embed_dim
    )(inputs)

    # Bi-LSTM layer: return_sequences=False keeps only the final output of each
    # direction, giving one vector per input sequence.
    x = Bidirectional(LSTM(lstm_units, return_sequences=False))(x)
    x = Dropout(dropout_rate)(x)

    if num_classes == 1:
        outputs = Dense(1, activation="sigmoid")(x)
        loss = "binary_crossentropy"
    else:
        outputs = Dense(num_classes, activation="softmax")(x)
        loss = "sparse_categorical_crossentropy"

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer="adam",
        loss=loss,
        metrics=["accuracy"]
    )
    return model



In [None]:
print("\n=======================")
print(" Training Bi-LSTM (Binary)")
print("=======================\n")

bilstm_binary = build_bilstm(num_classes=1)
bilstm_binary.summary()

# The recorded run of this cell shows val_loss rising after the first epoch
# while training loss keeps falling, i.e. the model overfits almost at once.
# Stop when val_loss has not improved for 2 epochs and roll back to the best
# epoch's weights, so the test metrics below are not taken from the most
# overfit model.
early_stop_bi = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

history_bi = bilstm_binary.fit(
    X_train_pad, y_train_bi,
    validation_split=0.1,
    epochs=6,  # upper bound; early stopping may end training sooner
    batch_size=64,
    callbacks=[early_stop_bi],
    verbose=1
)

loss_bi, acc_bi = bilstm_binary.evaluate(X_test_pad, y_test_bi, verbose=0)
print(f"\n[Binary] Test Accuracy: {acc_bi:.4f}")

# The sigmoid output has shape (n, 1): threshold at 0.5 and flatten to 1-D
# integer labels for the scikit-learn metrics.
y_prob_bi = bilstm_binary.predict(X_test_pad)
y_pred_bi = (y_prob_bi >= 0.5).astype("int32").ravel()

print("\n[Binary] Classification Report:")
print(classification_report(y_test_bi, y_pred_bi, digits=4))

print("[Binary] Confusion Matrix:")
print(confusion_matrix(y_test_bi, y_pred_bi))

In [None]:
print("\n===========================")
print(" Training Bi-LSTM (Multiclass)")
print("===========================\n")

bilstm_multi = build_bilstm(num_classes=3)
bilstm_multi.summary()

# The recorded run of this cell shows val_loss increasing every epoch after the
# first while training loss keeps dropping. Stop when val_loss has not improved
# for 2 epochs and restore the best epoch's weights, so later evaluation does
# not use the most overfit model.
early_stop_multi = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

history_multi = bilstm_multi.fit(
    X_train_pad, y_train_multi,
    validation_split=0.1,
    epochs=6,  # upper bound; early stopping may end training sooner
    batch_size=64,
    callbacks=[early_stop_multi],
    verbose=1
)


In [None]:
loss_multi, acc_multi = bilstm_multi.evaluate(X_test_pad, y_test_multi, verbose=0)
print(f"\n[Multiclass] Test Accuracy: {acc_multi:.4f}")

# The softmax output has one column per class; the predicted label is the
# column with the highest probability.
y_prob_multi = bilstm_multi.predict(X_test_pad)
y_pred_multi = np.argmax(y_prob_multi, axis=1)

# Derive class ids and display names from label_map so the report can never
# drift from the encoding used to build y_multi.
class_ids = sorted(label_map.values())
id_to_name = {idx: name for name, idx in label_map.items()}
target_names = [id_to_name[idx] for idx in class_ids]

print("\n[Multiclass] Classification Report:")
# Pinning `labels` keeps every class in the report even if one is never
# predicted; zero_division=0 reports 0.0 for such a class without a warning.
print(classification_report(y_test_multi, y_pred_multi,
                            labels=class_ids,
                            target_names=target_names, digits=4,
                            zero_division=0))

print("[Multiclass] Confusion Matrix:")
# Fixed label order gives a matrix with one row and column per class id.
print(confusion_matrix(y_test_multi, y_pred_multi, labels=class_ids))