In [1]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import kagglehub
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras import layers, models, regularizers


# File inside the Kaggle dataset to load (the dataset ships a single CSV)
file_path = "youtoxic_english_1000.csv"

# Load the latest version of the dataset straight into a pandas DataFrame.
# `dataset_load` replaces the deprecated `load_dataset` entry point, which
# emits a DeprecationWarning on every call.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "reihanenamdari/youtube-toxicity-data",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

2025-11-28 17:27:15.866395: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  df = kagglehub.load_dataset(


In [2]:
# Preview the first five rows to confirm the expected columns were loaded
df.head(n=5)

Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False,False,False,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False,False,False,False,False,False


In [3]:
# Keep only the comment text and the binary toxicity label.
# `.copy()` makes the subset an independent frame, so the column assignments
# in later cells never write into a slice of the originally loaded frame
# (the pattern behind pandas' SettingWithCopyWarning).
df = df[['Text', 'IsToxic']].copy()

In [4]:
# Cast the boolean toxicity flag to 0/1 integers for use as a binary target
df = df.assign(IsToxic=df['IsToxic'].astype(int))

In [5]:
# cleaning for deep learning
def clean_text(t):
    """Normalize one raw comment into a lowercase, whitespace-collapsed string.

    Steps, in order: lowercase, drop URLs (anything starting with ``http``),
    drop ``@mentions``, replace every character other than ``a-z``, ``0-9``,
    whitespace, ``!``, ``?`` and ``'`` with a space, collapse whitespace runs,
    and strip the ends.

    Args:
        t: The raw comment. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and end up as real tokens in the vocabulary.
    if t is None or (isinstance(t, float) and t != t):
        return ""

    t = str(t).lower()
    t = re.sub(r"http\S+", "", t)                # remove URLs
    t = re.sub(r"@\w+", "", t)                   # remove @username
    t = re.sub(r"[^a-z0-9\s!?']", " ", t)        # keep letters, digits, !, ?, '
    t = re.sub(r"\s+", " ", t)                   # collapse multiple spaces
    return t.strip()
# Run the cleaner element-wise over every comment
df['Text'] = df['Text'].map(clean_text)

In [6]:
# Inputs are the cleaned comment strings, targets the 0/1 toxicity labels
X = df['Text'].values
y = df['IsToxic'].values.astype(int)

# Stage 1: hold out 30% of the rows, stratified on the label (70% train remains)
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: split the held-out 30% in half -> 15% validation, 15% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=42,
)

# Display the size of each partition as the cell output
tuple(map(len, (X_train, X_val, X_test)))

(700, 150, 150)

In [None]:
MAX_VOCAB_SIZE = 5000   # vocabulary cap passed to the tokenizer as num_words
MAX_LEN = 80            # every sequence is padded/truncated to this many tokens

# The vocabulary is fitted on the training texts only; words unseen at fit
# time are mapped to the "<UNK>" token.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer word ids, one list per comment
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, X_test)
)

# Integer lists -> fixed-width arrays, with padding appended at the end
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)


In [8]:
EMBED_DIM = 100
embedding_index = {}

# Each line of the GloVe file is parsed as a token followed by its
# whitespace-separated vector components.
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print("Loaded word vectors:", len(embedding_index))


Loaded word vectors: 400000


In [9]:
# Row i holds the pre-trained vector for the word with tokenizer index i.
# Rows for words missing from the GloVe index stay all-zero.
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, EMBED_DIM))

for token, row in tokenizer.word_index.items():
    # word_index lists every word seen during fitting; skip ids beyond the cap
    if row >= MAX_VOCAB_SIZE:
        continue
    pretrained = embedding_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained


In [None]:
# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout
# and batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# 1D-CNN text classifier on top of frozen pre-trained word vectors:
# Embedding -> Conv1D -> global max pool -> Dense -> Dropout -> sigmoid.
model = models.Sequential([

    # Explicit input spec (replaces the deprecated `input_length` argument of
    # Embedding) so the model is built up front and summary() shows shapes.
    layers.Input(shape=(MAX_LEN,)),

    layers.Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=EMBED_DIM,
        # Load the pre-trained vectors through a Constant initializer; the
        # `weights=[...]` constructor argument is not accepted by every
        # Keras 3 release.
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False             # small dataset necessity (otherwise ruins pre-trained vectors)
    ),

    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),

    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),

    # Single sigmoid unit: probability that the comment is toxic
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()




In [11]:
# Stop once validation loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train for at most 20 epochs, monitoring the validation split each epoch
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=20,
    verbose=1,
    callbacks=[early_stop],
    validation_data=(X_val_pad, y_val),
)

Epoch 1/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.5486 - loss: 0.7277 - val_accuracy: 0.6067 - val_loss: 0.6806
Epoch 2/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6571 - loss: 0.6302 - val_accuracy: 0.6667 - val_loss: 0.6424
Epoch 3/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7571 - loss: 0.5265 - val_accuracy: 0.6533 - val_loss: 0.6363
Epoch 4/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8343 - loss: 0.4143 - val_accuracy: 0.6933 - val_loss: 0.5983
Epoch 5/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8957 - loss: 0.3148 - val_accuracy: 0.6667 - val_loss: 0.5913
Epoch 6/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9471 - loss: 0.2195 - val_accuracy: 0.6600 - val_loss: 0.6453
Epoch 7/20
[1m22/22[0m [32m━━━━

In [12]:
# 1-2. Loss and accuracy on the training split and on the held-out test split
train_loss, train_acc = model.evaluate(X_train_pad, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test_pad, y_test, verbose=0)

print(f"Final Train Loss: {train_loss:.4f}")
print(f"Final Train Accuracy: {train_acc:.4f}")
print(f"Final Test Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_acc:.4f}")

# 3. Predictions: sigmoid probabilities turned into hard 0/1 labels at 0.5
y_pred_prob = model.predict(X_test_pad)
is_toxic = y_pred_prob >= 0.5
y_pred = is_toxic.astype(int)

# 4. Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

# 5. Confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


Final Train Loss: 0.2206
Final Train Accuracy: 0.9700
Final Test Loss: 0.5865
Final Test Accuracy: 0.6933
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step

Classification Report:
              precision    recall  f1-score   support

           0     0.6923    0.7778    0.7326        81
           1     0.6949    0.5942    0.6406        69

    accuracy                         0.6933       150
   macro avg     0.6936    0.6860    0.6866       150
weighted avg     0.6935    0.6933    0.6903       150


Confusion Matrix:
[[63 18]
 [28 41]]
