In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets



In [None]:
from datasets import load_from_disk

# Directory on the mounted Google Drive that holds the saved dataset
# (presumably written earlier with `save_to_disk` — confirm).
dataset_path = "/content/drive/MyDrive/cleaned_balanced_civil_comments"

# Load the dataset from that directory.
dataset = load_from_disk(dataset_path)

# Bare last expression: displays the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
    num_rows: 297057
})

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
# Materialize the loaded dataset as a pandas DataFrame.
# NOTE(review): `dataset.to_pandas()` may be a faster route than passing the dataset
# to the DataFrame constructor — confirm resulting dtypes match before switching.
df = pd.DataFrame(dataset)
# Preview the first rows.
df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,haha you guys are a bunch of losers,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0
1,ur a shtty comment,0.666667,0.047619,0.638095,0.0,0.333333,0.0,0.009524
2,its ridiculous that these guys are being calle...,0.6,0.0,0.1,0.1,0.6,0.0,0.0
3,this story gets more ridiculous by the hour an...,0.5,0.0,0.0,0.0,0.3,0.0,0.0
4,angry trolls misogynists and racists oh my it ...,0.5,0.0,0.0,0.0,0.5,0.1,0.0


In [None]:
# Count missing (null / None) entries in every column
print("Null or None values:")
print(df.isna().sum())

# Count rows whose 'text' value is exactly the empty string
print("\nEmpty string values in 'text':")
print(df["text"].eq("").sum())

Null or None values:
text               0
toxicity           0
severe_toxicity    0
obscene            0
threat             0
insult             0
identity_attack    0
sexual_explicit    0
dtype: int64

Empty string values in 'text':
0


In [None]:
# Label matrix: every column after the first one (the first column is 'text')
labels = df.iloc[:, 1:].values

# Comment texts, coerced to strings
texts = df['text'].astype(str).values

In [None]:
# Container type and shape of the text array, one per line
print(type(texts), texts.shape, sep="\n")

<class 'numpy.ndarray'>
(297057,)


In [None]:
# Container type and shape of the label matrix, one per line
print(type(labels), labels.shape, sep="\n")

<class 'numpy.ndarray'>
(297057, 7)


In [None]:
# Keep only the (text, label) pairs whose text is non-empty after stripping whitespace
non_blank_pairs = [
    (comment, target) for comment, target in zip(texts, labels) if comment.strip()
]

filtered_texts = [comment for comment, _ in non_blank_pairs]
filtered_labels = [target for _, target in non_blank_pairs]

# From here on `texts` and `labels` are plain Python lists of the surviving rows
texts, labels = filtered_texts, filtered_labels

In [None]:
# Vocabulary size cap passed to the text vectorizer (and used to size the embedding table)
MAX_WORDS = 40_000

In [None]:
# Text -> fixed-length sequence of integer token ids
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=MAX_WORDS,
    output_sequence_length=600,
)

# Build the vocabulary from the texts
vectorizer.adapt(texts)

In [None]:
import pickle

# Pull the learned vocabulary out of the vectorizer and persist it with pickle
vocab = vectorizer.get_vocabulary()

with open("vectorizer_vocabulary.pkl", mode="wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Convert the labels into a single numpy array
labels = np.array(labels)

# Stratification key: index of the highest-valued label column in each row
strata_all = labels.argmax(axis=1)

# First cut: 70% train, 30% held out (to be divided into val + test)
texts_train, texts_temp, labels_train, labels_temp = train_test_split(
    texts,
    labels,
    test_size=0.3,
    stratify=strata_all,
    random_state=42,
)

# Second cut: the held-out part goes 67% to validation and 33% to test
strata_temp = labels_temp.argmax(axis=1)
texts_val, texts_test, labels_val, labels_test = train_test_split(
    texts_temp,
    labels_temp,
    test_size=0.33,
    stratify=strata_temp,
    random_state=42,
)


In [None]:
# Save the split data
import numpy as np

# Archive key -> array; insertion order matches the order the arrays are written
split_arrays = {
    'texts_train': texts_train,
    'labels_train': labels_train,
    'texts_val': texts_val,
    'labels_val': labels_val,
    'texts_test': texts_test,
    'labels_test': labels_test,
}

np.savez_compressed('dataset_splits3.npz', **split_arrays)

In [None]:
# Helper function
def create_dataset(texts, labels, shuffle=True, batch_size=64, shuffle_buffer=20000):
    """Build a batched tf.data pipeline of (vectorized text, labels) pairs.

    Args:
        texts: Sequence of raw text strings.
        labels: Label rows aligned with `texts`.
        shuffle: Whether to shuffle examples before batching. Defaults to True
            (the previous behavior); pass False for evaluation splits, where
            shuffling only costs memory and time.
        batch_size: Number of examples per batch.
        shuffle_buffer: Size of the shuffle buffer used when `shuffle` is True.

    Returns:
        A `tf.data.Dataset` yielding `(token_ids, labels)` batches.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    ds = ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    # Cache the vectorized examples before shuffling so the text is vectorized
    # only once while a fresh shuffle is still drawn on every pass.
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(shuffle_buffer)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Apply to the three groups; only the training split needs shuffling
train = create_dataset(texts_train, labels_train)
val = create_dataset(texts_val, labels_val, shuffle=False)
test = create_dataset(texts_test, labels_test, shuffle=False)

In [None]:
import numpy as np

# For each split, count the rows whose largest label value is NOT in column 0
for split_name, split_labels in (
    ("Train", labels_train),
    ("Val", labels_val),
    ("Test", labels_test),
):
    print(f"{split_name} positive:", np.sum(split_labels.argmax(axis=1) > 0))

Train positive: 10224
Val positive: 2935
Test positive: 1446


In [None]:
# Function to calculate class weights for multi-label
def compute_class_weights(labels):
    """Return one weight per label column, inversely proportional to its frequency.

    The frequency of a class is the mean of its column. The most frequent class
    gets weight 1.0 and every other class gets ``max_frequency / frequency``.

    Args:
        labels: 2-D array-like of shape (num_samples, num_classes).

    Returns:
        numpy array of float weights, one per class.

    Raises:
        ValueError: If `labels` contains no samples.
    """
    labels = np.asarray(labels, dtype=float)
    if labels.shape[0] == 0:
        raise ValueError("compute_class_weights requires at least one sample")

    class_freq = np.sum(labels, axis=0) / float(labels.shape[0])
    max_freq = np.max(class_freq)

    # A class with zero frequency would otherwise get an infinite weight, which
    # turns into NaN as soon as it is multiplied by a zero target in the loss.
    # Give such classes the neutral weight 1.0 instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(class_freq > 0, max_freq / class_freq, 1.0)

In [None]:
# Per-class weights derived from the training labels, plus an index -> weight view for display
class_weights = compute_class_weights(labels_train)
class_weights_dict = dict(enumerate(class_weights))
print(f"Calculated Class Weights: {class_weights_dict}")

Calculated Class Weights: {0: np.float64(1.0), 1: np.float64(21.426235112398476), 2: np.float64(5.8609650362053705), 3: np.float64(14.430381899970007), 4: np.float64(1.1795194237116449), 5: np.float64(5.5552130911887225), 6: np.float64(16.016855676405747)}


In [None]:
from keras.saving import register_keras_serializable

# NOTE(review): the decorator is applied to this factory, not to the inner `loss`
# closure it returns, so reloading a model compiled with that closure presumably
# still needs `custom_objects` or `compile=False` — confirm.
@register_keras_serializable()
def weighted_binary_crossentropy(class_weights):
    """Build a binary cross-entropy loss that up-weights positive targets per class.

    Args:
        class_weights: 1-D array-like with one weight per output class.

    Returns:
        A function `loss(y_true, y_pred)` returning the mean weighted binary
        cross-entropy as a scalar tensor.
    """
    def loss(y_true, y_pred):
        # Bring targets and weights to the prediction dtype explicitly so float64
        # labels / numpy weights never get mixed with float32 predictions.
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)
        positive_weights = tf.cast(class_weights, y_pred.dtype)

        # Element weight is the class weight scaled by the target value, plus
        # (1 - target); a fully negative target therefore has weight 1.
        weights = y_true * positive_weights + (1 - y_true)
        bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
        return tf.reduce_mean(bce * weights)
    return loss

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout


# Integer token ids, 600 per example (matches the vectorizer's output_sequence_length)
inputs = Input(shape=(600,))

# Token embeddings of width 32
embedding_layer = Embedding(MAX_WORDS + 1, 32)(inputs)


# Bidirectional LSTM that returns the full sequence so attention can be applied over time steps
bi_lstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(embedding_layer)

dropout = Dropout(0.3)(bi_lstm)

# Self-attention over the dropout-regularized sequence.
# The dropout output used to be left unconnected (attention read `bi_lstm` directly),
# so the Dropout layer was not part of the model at all.
attention = Attention()([dropout, dropout])

# Collapse the time axis with global max pooling
attention = GlobalMaxPooling1D()(attention)

# Fully connected head
dense1 = Dense(128, activation='relu')(attention)
dense2 = Dense(256, activation='relu')(dense1)
dense3 = Dense(128, activation='relu')(dense2)
dense4 = Dense(256, activation='relu')(dense3)

# One independent sigmoid score per label (7 labels)
output = Dense(7, activation='sigmoid')(dense4)

model = Model(inputs=inputs, outputs=output)

In [None]:
# Calculating class weights before training
# (computed from labels_train, the same input used for `class_weights` in an earlier cell)
class_weights_array = compute_class_weights(labels_train)

# Build the loss callable that applies these per-class weights to positive targets
loss_fn = weighted_binary_crossentropy(class_weights_array)

In [None]:
from tensorflow.keras.metrics import Precision, Recall, AUC
import tensorflow as tf

# Metrics tracked during training and validation
tracked_metrics = [
    Precision(name='precision'),
    Recall(name='recall'),
    AUC(name='auc'),
]

# Adam optimizer with the class-weighted binary cross-entropy built earlier
model.compile(optimizer='adam', loss=loss_fn, metrics=tracked_metrics)

In [None]:
# Print a summary of the model that was just built and compiled.
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 4 epochs, restoring the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

# Write "best_model.keras" only when validation AUC reaches a new maximum
checkpoint = ModelCheckpoint(
    filepath="best_model.keras",
    mode='max',
    monitor='val_auc',
    save_best_only=True,
)


In [None]:
# `train` and `val` are tf.data pipelines that create_dataset already batched
# (64 examples per batch), so `batch_size` must not be passed here: for dataset
# inputs it is either ignored or rejected, and the old value (32) did not match
# the batch size actually used.
history = model.fit(
    train,
    epochs=13,
    validation_data=val,
    callbacks=[early_stop, checkpoint],
)

Epoch 1/13
[1m3249/3249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 65ms/step - auc: 0.7904 - loss: 0.4200 - precision: 0.8341 - recall: 0.1907 - val_auc: 0.9070 - val_loss: 0.3309 - val_precision: 0.9710 - val_recall: 0.3535
Epoch 2/13
[1m3249/3249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 55ms/step - auc: 0.9136 - loss: 0.3217 - precision: 0.9693 - recall: 0.3752 - val_auc: 0.9113 - val_loss: 0.3207 - val_precision: 0.9710 - val_recall: 0.3830
Epoch 3/13
[1m3249/3249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 55ms/step - auc: 0.9255 - loss: 0.3085 - precision: 0.9723 - recall: 0.3895 - val_auc: 0.9219 - val_loss: 0.3183 - val_precision: 0.9761 - val_recall: 0.3694
Epoch 4/13
[1m3249/3249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 56ms/step - auc: 0.9344 - loss: 0.2989 - precision: 0.9735 - recall: 0.3988 - val_auc: 0.9225 - val_loss: 0.3204 - val_precision: 0.9649 - val_recall: 0.3985
Epoch 5/13
[1m3249/3249[0m [32m━━━━━━━━━━

In [None]:
# Save the model as it currently stands in memory to 'last_model.keras'.
model.save('last_model.keras')
print("model saved successfully!")

model saved successfully!


In [None]:
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy

# Streaming metrics, accumulated over every batch of the validation dataset
# (all label columns are pooled together; these are not per-label figures)
pre, re, acc = Precision(), Recall(), BinaryAccuracy()

for features, targets in val:
    # Forward pass in inference mode
    scores = model(features, training=False)

    # Threshold both the soft targets and the predicted scores at 0.5
    targets_bin = tf.cast(targets >= 0.5, tf.float32)
    scores_bin = tf.cast(scores >= 0.5, tf.float32)

    # Feed the same binarized pair to each metric, in the original order
    for metric in (pre, re, acc):
        metric.update_state(targets_bin, scores_bin)


print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.8969367146492004, Recall: 0.7150833010673523, Accuracy: 0.9499740600585938


In [None]:
from google.colab import files


# Download the saved model, the dataset splits and the vocabulary, in that order
for artifact in ('last_model.keras', 'dataset_splits3.npz', 'vectorizer_vocabulary.pkl'):
    files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Run one raw sentence through the same vectorizer used for the training data,
# score it with the model, and turn the scores into 0/1 flags at a 0.5 cut-off.
input_text = ["Fuck you son of the pitch!"]
vectorized_input = vectorizer(input_text)
scores = model.predict(vectorized_input)
binary = (scores >= 0.5).astype(int)
print(binary)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[[1 0 1 0 1 0 0]]


In [None]:
# Names of the model's 7 outputs, in output-column order.
# Stored under a dedicated name (and iterated with dedicated loop variables) so this
# cell no longer overwrites the `labels` array or the `val` validation dataset
# defined earlier in the notebook.
LABEL_NAMES = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

# Report the prediction computed in the previous cell instead of a hand-copied result
pred = binary.tolist()

for label_name, is_flagged in zip(LABEL_NAMES, pred[0]):
    if is_flagged == 1:
        print(f"- {label_name}")

- toxicity
- obscene
- insult
