In [1]:
from datasets import load_from_disk

# Load the dataset stored in the local "cleaned_balanced_civil_comments" directory.
DATASET_PATH = "cleaned_balanced_civil_comments"
restored_dataset = load_from_disk(DATASET_PATH)

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
# Materialise the loaded dataset as a pandas DataFrame for inspection and slicing.
df = pd.DataFrame(data=restored_dataset)

# Preview the first five rows (last expression, so the notebook renders it).
df.head(5)

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,haha you guys are a bunch of losers,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0
1,ur a shtty comment,0.666667,0.047619,0.638095,0.0,0.333333,0.0,0.009524
2,its ridiculous that these guys are being calle...,0.6,0.0,0.1,0.1,0.6,0.0,0.0
3,this story gets more ridiculous by the hour an...,0.5,0.0,0.0,0.0,0.3,0.0,0.0
4,angry trolls misogynists and racists oh my it ...,0.5,0.0,0.0,0.0,0.5,0.1,0.0


In [3]:
# Missing-value audit: number of null/None entries in each column.
null_counts = df.isnull().sum()
print("Null or None values:")
print(null_counts)

# Number of rows whose 'text' is exactly the empty string.
# Whitespace-only strings are NOT counted by this comparison.
empty_text_count = (df["text"] == "").sum()
print("\nEmpty string values in 'text':")
print(empty_text_count)

Null or None values:
text               0
toxicity           0
severe_toxicity    0
obscene            0
threat             0
insult             0
identity_attack    0
sexual_explicit    0
dtype: int64

Empty string values in 'text':
0


In [4]:
# Labels: every column after the first one ('text'), as a 2-D NumPy array.
label_columns = df.columns[1:]
labels = df[label_columns].values

# Texts: the 'text' column coerced to `str`, as a 1-D NumPy array.
texts = df['text'].astype(str).values

In [5]:
# Sanity check: container type and shape of the text array, one per line.
print(type(texts), texts.shape, sep="\n")

<class 'numpy.ndarray'>
(297057,)


In [6]:
# Sanity check: container type and shape of the label array, one per line.
print(type(labels), labels.shape, sep="\n")

<class 'numpy.ndarray'>
(297057, 7)


In [7]:
# Keep only (text, label) pairs whose text is non-empty after stripping whitespace.
# This also drops whitespace-only comments, which an exact `== ""` comparison
# would not catch.
kept_pairs = [
    (candidate_text, candidate_label)
    for candidate_text, candidate_label in zip(texts, labels)
    if candidate_text.strip()
]

filtered_texts = [kept_text for kept_text, _ in kept_pairs]
filtered_labels = [kept_label for _, kept_label in kept_pairs]

# From here on `texts` and `labels` are plain Python lists (not NumPy arrays).
texts = filtered_texts
labels = filtered_labels

In [8]:
# Vocabulary size cap, used as `max_tokens` for the TextVectorization layer
# and (plus one) as the Embedding input dimension.
MAX_WORDS = 20_000

In [9]:
# Text -> integer-id vectorizer. Every comment is mapped to a sequence of
# exactly SEQUENCE_LENGTH token ids, with the vocabulary capped at MAX_WORDS.
SEQUENCE_LENGTH = 300
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=MAX_WORDS,
    output_sequence_length=SEQUENCE_LENGTH,
)

# Build the vocabulary from the texts.
# NOTE(review): at this point `texts` still holds ALL comments (the
# train/val/test split happens in a later cell), so validation and test texts
# contribute to the vocabulary. Consider adapting on the training split only.
vectorizer.adapt(texts)

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split

# Turn the list of per-row label vectors into one 2-D NumPy array so it can be
# indexed by `train_test_split` and reduced with `argmax`.
labels = np.array(labels)

# Stratification key: column index of the largest score in each label row.
# NOTE(review): the labels are fractional scores, so this key groups rows by
# "which label is highest", not by toxic / non-toxic - confirm this is intended.
strata_all = labels.argmax(axis=1)

# First cut: 70% train, 30% held out ("temp") for validation + test.
texts_train, texts_temp, labels_train, labels_temp = train_test_split(
    texts,
    labels,
    test_size=0.3,
    stratify=strata_all,
    random_state=42,
)

# Second cut: 33% of the held-out part becomes test, the remaining 67%
# validation (roughly 10% / 20% of the full data).
strata_temp = labels_temp.argmax(axis=1)
texts_val, texts_test, labels_val, labels_test = train_test_split(
    texts_temp,
    labels_temp,
    test_size=0.33,
    stratify=strata_temp,
    random_state=42,
)


In [11]:
# Save split data
import numpy as np

# Persist all six split arrays in one compressed archive; each dict key becomes
# the name under which the array is stored in the .npz file.
split_arrays = {
    'texts_train': texts_train,
    'labels_train': labels_train,
    'texts_val': texts_val,
    'labels_val': labels_val,
    'texts_test': texts_test,
    'labels_test': labels_test,
}
np.savez_compressed('dataset_splits.npz', **split_arrays)

In [12]:
# Helper function
def create_dataset(texts, labels, shuffle=True, batch_size=32, shuffle_buffer=10000):
    """Build a batched, prefetched tf.data pipeline of (token_ids, labels) pairs.

    Args:
        texts: Sequence of raw comment strings.
        labels: Array-like of label rows, aligned with `texts`.
        shuffle: Whether to shuffle examples. Keep True for training data; pass
            False for validation/test data so that every pass over the dataset
            yields rows in the same order (required when labels and predictions
            are gathered in separate passes).
        batch_size: Number of examples per batch.
        shuffle_buffer: Buffer size handed to `Dataset.shuffle` when shuffling.

    Returns:
        A `tf.data.Dataset` yielding `(vectorized_text_batch, label_batch)`.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    # Vectorize each string with the already-adapted module-level `vectorizer`.
    ds = ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    # Cache after the map so vectorization is not repeated on later passes.
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(shuffle_buffer)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Application to the three groups.
# Only the training set is shuffled; evaluation sets keep a stable order.
train = create_dataset(texts_train, labels_train)
val = create_dataset(texts_val, labels_val, shuffle=False)
test = create_dataset(texts_test, labels_test, shuffle=False)

In [13]:
import numpy as np

# For each split, count the rows whose highest-scoring label is NOT the first
# column (argmax index > 0).
# NOTE(review): the printed word "positive" therefore means "some later label
# column outranks the first one", not "toxic" - confirm this is the intended
# definition.
for split_name, split_labels in (
    ("Train", labels_train),
    ("Val", labels_val),
    ("Test", labels_test),
):
    print(f"{split_name} positive:", np.sum(split_labels.argmax(axis=1) > 0))

Train positive: 10224
Val positive: 2935
Test positive: 1446


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Bidirectional, Dense, Embedding

In [15]:
# Multi-label classifier:
# token ids -> embeddings -> bidirectional LSTM -> dense stack -> 7 sigmoid outputs.
model = Sequential([
    # Fixed-length sequences of 300 integer token ids.
    Input(shape=(300,)),
    # Embedding table with MAX_WORDS+1 rows and 32-dimensional vectors.
    Embedding(MAX_WORDS+1, 32),
    # Bidirectional LSTM layer (32 units per direction).
    Bidirectional(LSTM(32, activation='tanh')),
    # Feature-extractor fully connected layers.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer: one independent sigmoid score per label column.
    Dense(7, activation='sigmoid'),
])

In [16]:
from tensorflow.keras.metrics import Precision, Recall, AUC
import tensorflow as tf

# Compile with Adam, binary cross-entropy (one independent binary problem per
# output unit) and three tracked metrics. The metric names set here determine
# the keys that appear in the training logs ('precision', 'val_auc', ...).
# NOTE(review): the label columns hold fractional scores rather than 0/1
# values - confirm how Precision/Recall interpret non-binary `y_true`.
tracked_metrics = [
    Precision(name='precision'),
    Recall(name='recall'),
    AUC(name='auc'),
]
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=tracked_metrics,
)


In [17]:
# Print Keras's textual summary of `model` as a quick architecture check.
model.summary()

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once validation loss has not improved for 4 consecutive epochs,
# and roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=4,
    monitor='val_loss',
)

# Write the model to disk whenever validation AUC reaches a new maximum.
# NOTE(review): this callback tracks 'val_auc' while early stopping tracks
# 'val_loss', so the two may disagree on which epoch is "best".
checkpoint = ModelCheckpoint(
    filepath="best_model.keras",
    mode='max',
    save_best_only=True,
    monitor='val_auc',
)


In [21]:
# Train for up to 10 epochs with early stopping and best-model checkpointing.
# `train` and `val` are tf.data pipelines that are already batched by
# `create_dataset`, so `batch_size` must not be passed to `fit` as well: Keras
# documents that it should be omitted for dataset inputs, and legacy tf.keras
# raises an error when both are given.
history = model.fit(
    train,
    epochs=10,
    validation_data=val,
    callbacks=[early_stop, checkpoint],
)

Epoch 1/10
[1m6498/6498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1191s[0m 182ms/step - auc: 0.8356 - loss: 0.2290 - precision: 0.9218 - recall: 0.2548 - val_auc: 0.9036 - val_loss: 0.1875 - val_precision: 0.9707 - val_recall: 0.3611
Epoch 2/10
[1m6498/6498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1288s[0m 198ms/step - auc: 0.9147 - loss: 0.1841 - precision: 0.9743 - recall: 0.3570 - val_auc: 0.9253 - val_loss: 0.1830 - val_precision: 0.9719 - val_recall: 0.3784
Epoch 3/10
[1m6498/6498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1359s[0m 209ms/step - auc: 0.9275 - loss: 0.1757 - precision: 0.9757 - recall: 0.3713 - val_auc: 0.9181 - val_loss: 0.1815 - val_precision: 0.9625 - val_recall: 0.4032
Epoch 4/10
[1m6498/6498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1038s[0m 160ms/step - auc: 0.9347 - loss: 0.1708 - precision: 0.9780 - recall: 0.3820 - val_auc: 0.9216 - val_loss: 0.1833 - val_precision: 0.9655 - val_recall: 0.3895
Epoch 5/10
[1m6498/6498[0m [32m━━

In [None]:
# Persist the in-memory model under its own filename. The ModelCheckpoint
# callback already writes the best-val_auc model to "best_model.keras"; saving
# to that same path here would silently overwrite that checkpoint.
model.save('final_model.keras')
print("model saved successfully!")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

label_names = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

# Collect ground truth AND predictions in a single pass over `val`.
# `val` is built with `.shuffle(...)`, so two separate passes may visit rows in
# different orders; pairing each label batch with the prediction for that same
# batch keeps rows aligned. The loop variables are deliberately not named
# `labels`, so the notebook-level `labels` array is not overwritten.
y_true_batches = []
y_pred_batches = []
for batch_inputs, batch_labels in val:
    y_true_batches.append(batch_labels.numpy())
    y_pred_batches.append(model(batch_inputs, training=False).numpy())

y_true = np.concatenate(y_true_batches, axis=0)
y_pred = np.concatenate(y_pred_batches, axis=0)

# Binarise both the fractional targets and the sigmoid outputs at 0.5.
y_true_binary = (y_true >= 0.5).astype(int)
y_pred_binary = (y_pred >= 0.5).astype(int)

# Confusion matrices for each classification
for i, label in enumerate(label_names):
    print(f"\nConfusion Matrix for '{label}':")
    # labels=[0, 1] forces a 2x2 matrix even when a label column has no
    # positives (or no negatives) in the validation data.
    cm = confusion_matrix(y_true_binary[:, i], y_pred_binary[:, i], labels=[0, 1])
    print(cm)

    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {label}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

# Full Performance Report
# No `labels=` argument here: for multilabel indicator input it selects label
# COLUMNS, so `labels=[0, 1]` would restrict the report to the first two of the
# seven labels and mismatch `target_names`.
print("\nClassification Report:")
print(classification_report(
    y_true_binary,
    y_pred_binary,
    target_names=label_names,
    zero_division=0,
))


In [None]:
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy

# Overall validation metrics. One Precision, one Recall and one BinaryAccuracy
# object each receive the full label/prediction tensors of every batch, so each
# printed figure is a single value pooled over all label columns.
pre = Precision()
re = Recall()
acc = BinaryAccuracy()

for X_true, y_true in val:
    # Forward pass in inference mode; the result stays a tensor.
    yhat = model(X_true, training=False)

    # Threshold targets and predictions at 0.5 to obtain 0/1 tensors.
    y_true_binary = tf.cast(y_true >= 0.5, tf.float32)
    yhat_binary = tf.cast(yhat >= 0.5, tf.float32)

    # Accumulate this batch into every metric.
    for metric in (pre, re, acc):
        metric.update_state(y_true_binary, yhat_binary)

precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall: {recall_value}, Accuracy: {accuracy_value}')