In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import re
import nltk
from tensorflow import math as tf_math

In [3]:
import keras
print(keras.__version__)
keras.utils.set_random_seed(812)

3.10.0


## Duomenų paruošimas

In [4]:
!wget https://raw.githubusercontent.com/mrthlinh/toxic-comment-classification/refs/heads/master/data/data_train_clean.csv

--2025-10-21 10:13:39--  https://raw.githubusercontent.com/mrthlinh/toxic-comment-classification/refs/heads/master/data/data_train_clean.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46556225 (44M) [text/plain]
Saving to: ‘data_train_clean.csv’


2025-10-21 10:13:39 (533 MB/s) - ‘data_train_clean.csv’ saved [46556225/46556225]



In [5]:
train_filename = 'data_train_clean.csv'
train = pd.read_csv(train_filename)

In [6]:
class_names = train.columns[3:]
n_classes = len(class_names)
class_names

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

1. Kiek kokių klasių turime?
1. Tokenizuokite ir apdorokite duomenis kaip tik norite
1. Sukonstruokite paprasčiausią RNN modelį (naudojant Keras):
   - *Input*
   - *Embedding*
   - *SimpleRNN*
   - *Dense* - išvesties sluoksnis, kuris grąžins komentarų klasės tikimybes (6 skaičiai nuo 0 iki 1).
1. Apmokame modelį. Panaudokite žemiau pateiktus sakinius kaip testą.
1. Pabandykite pakeisti modelį LSTM arba BiLSTM.

In [7]:
insult_comment = "You swine. You vulgar little maggot. You worthless bag of filth. I wager you couldn't empty a boot of excrement were the instructions on the heel. You are a canker. A sore that won't go away. I would rather kiss a lawyer than be seen with you. Try to edit your responses of unnecessary material before attempting to impress us with your insight. The evidence that you are a nincompoop will still be available to readers, but they will be able to access it more rapidly."
obscene_comment = "I'M TIRED OF SEEING IT! My friends on TikTok send me memes, on Discord it's fucking memes. I was in a server, right? And all of the channels are just Among Us stuff. I showed my champion underwear to my girlfriend, and the logo, I flipped it, and I said, 'Hey babe, when the underwear sus! HAHA!' I fucking looked at a trashcan, I said, 'That's a bit sussy!' I looked at my penis, I think of the astronaut’s helmet, and I go, 'PENIS? MORE LIKE SUS!"
wholesome_comment = "You are truly amazing you know that right? Probably not but that is ok that's why i'm still around. You are beautiful, you are amazing and when you are happy it can truly brighten up even the darkest of nights :you know that right? Well if not I d. Life might be hard but it won't always be, there are people willing to help even if it might not seem like it, not every problem can be fixed and not everyone can help with those problems..... But they can try and help make your problems seem less scary and worrying."

In [8]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,clean_comment,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,54568,91da4b4b95fe5e2d,b' conflict of interest note by your user name...,0,0,0,0,0,0
1,3786,0a1df25805d6d2e7,b'update actually i changed this to something ...,0,0,0,0,0,0
2,22938,3c9e75d2b8dcf720,b'mrca article sniperz thanks for your comment...,0,0,0,0,0,0
3,137856,e1b03f4baf9d58ce,b'arguing that bart and caltrain should get mo...,0,0,0,0,0,0
4,143038,fcdf5d299437f2a9,b' blocked hi i blocked you for hours for bein...,0,0,0,0,0,0


In [9]:
print(f"{n_classes} classes: {class_names.tolist()}")

6 classes: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [10]:
tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

sequences = [tokenizer.tokenize(str(text)) for text in train['clean_comment']]

all_lengths = [len(seq) for seq in sequences]
ROW_LENGTH = int(np.percentile(all_lengths, 95))
print(f"Row length: {ROW_LENGTH}")

word_index = {}
for seq in sequences:
    for word in seq:
        if word not in word_index:
            word_index[word] = len(word_index) + 1

sequences_numeric = [[word_index.get(word, 0) for word in seq] for seq in sequences]

X = np.zeros((len(sequences_numeric), ROW_LENGTH), dtype=int)
for i, seq in enumerate(sequences_numeric):
    length = min(len(seq), ROW_LENGTH)
    X[i, :length] = seq[:length]

y = train[class_names].values

VOCAB_SIZE = len(word_index) + 1
print(f"Vocab size: {VOCAB_SIZE}")

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Row length: 238
Vocab size: 177362
Shape of X: (111699, 238)
Shape of y: (111699, 6)


In [11]:
vocab_size = VOCAB_SIZE
max_len = ROW_LENGTH
embedding_dim = 128
rnn_units = 64

model = keras.Sequential([
    keras.layers.Input(shape=(max_len,), dtype='int32'),
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    keras.layers.SimpleRNN(units=rnn_units),
    keras.layers.Dense(units=n_classes, activation='sigmoid')
])

model.summary()

In [12]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'f1_score'])

model.fit(
    X, y,
    batch_size=1024,
    epochs=3
)

Epoch 1/3
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 66ms/step - accuracy: 0.4011 - f1_score: 0.0380 - loss: 0.2639
Epoch 2/3
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 56ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.1404
Epoch 3/3
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 55ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.1402


<keras.src.callbacks.history.History at 0x7945d9c9dac0>

In [13]:
def preprocess_comment(comment, tokenizer, word_index, max_len):
    comment = comment.lower()
    comment = re.sub(r'[^\w\s]', '', comment)
    tokens = tokenizer.tokenize(comment)
    numeric_sequence = [word_index.get(word, 0) for word in tokens]

    padded_sequence = np.zeros((max_len,), dtype=int)
    length = min(len(numeric_sequence), max_len)
    padded_sequence[:length] = numeric_sequence[:length]

    return padded_sequence

insult_sequence = preprocess_comment(insult_comment, tokenizer, word_index, ROW_LENGTH)
obscene_sequence = preprocess_comment(obscene_comment, tokenizer, word_index, ROW_LENGTH)
wholesome_sequence = preprocess_comment(wholesome_comment, tokenizer, word_index, ROW_LENGTH)

X_test = np.array([insult_sequence, obscene_sequence, wholesome_sequence])

print("Shape of X_test:", X_test.shape)

Shape of X_test: (3, 238)


In [14]:
preds = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 586ms/step


In [15]:
for i, comment_preds in enumerate(preds):
    print(f"Predictions for comment {i+1}:")
    for j, class_name in enumerate(class_names):
        print(f"  {class_name}: {comment_preds[j]:.4f}")
    print("-" * 20)

Predictions for comment 1:
  toxic: 0.1053
  severe_toxic: 0.0094
  obscene: 0.0538
  threat: 0.0031
  insult: 0.0494
  identity_hate: 0.0084
--------------------
Predictions for comment 2:
  toxic: 0.1053
  severe_toxic: 0.0094
  obscene: 0.0538
  threat: 0.0031
  insult: 0.0494
  identity_hate: 0.0084
--------------------
Predictions for comment 3:
  toxic: 0.1053
  severe_toxic: 0.0094
  obscene: 0.0538
  threat: 0.0031
  insult: 0.0494
  identity_hate: 0.0084
--------------------


In [23]:
vocab_size = VOCAB_SIZE
max_len = ROW_LENGTH
embedding_dim = 128
rnn_units = 64

model = keras.Sequential([
    keras.layers.Input(shape=(max_len,), dtype='int32'),
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    keras.layers.Bidirectional(keras.layers.LSTM(128)),
    keras.layers.Dense(units=n_classes, activation='sigmoid')
])

model.summary()

In [24]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'f1_score'])

model.fit(
    X, y,
    batch_size=1024,
    epochs=5
)

Epoch 1/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 193ms/step - accuracy: 0.7694 - f1_score: 0.0408 - loss: 0.2039
Epoch 2/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 199ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.0679
Epoch 3/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 202ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.0474
Epoch 4/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 203ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.0405
Epoch 5/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 199ms/step - accuracy: 0.9940 - f1_score: 0.0291 - loss: 0.0382


<keras.src.callbacks.history.History at 0x79453e80b020>

In [25]:
preds = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step


In [26]:
for i, comment_preds in enumerate(preds):
    print(f"Predictions for comment {i+1}:")
    class_predictions = list(zip(class_names, comment_preds))
    sorted_predictions = sorted(class_predictions, key=lambda item: item[1], reverse=True)
    for class_name, probability in sorted_predictions:
        print(f"  {class_name}: {probability:.4f}")
    print("-" * 20)

Predictions for comment 1:
  toxic: 0.9628
  obscene: 0.8397
  insult: 0.6163
  severe_toxic: 0.1822
  identity_hate: 0.0963
  threat: 0.0308
--------------------
Predictions for comment 2:
  toxic: 0.8660
  obscene: 0.6236
  insult: 0.3955
  identity_hate: 0.0796
  severe_toxic: 0.0627
  threat: 0.0277
--------------------
Predictions for comment 3:
  toxic: 0.0745
  obscene: 0.0185
  insult: 0.0151
  identity_hate: 0.0032
  severe_toxic: 0.0015
  threat: 0.0010
--------------------
