In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


# Importing Libraries

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import re

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
from tensorflow.keras.metrics import AUC

In [3]:
# Set global seeds
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Ensure deterministic behavior
os.environ['TF_DETERMINISTIC_OPS'] = '1'


In [4]:
df_train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
df_test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
df_test_labels = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

In [5]:
df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
df_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [7]:
df_test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


In [8]:
df_train.loc[df_train['toxic'] == 1, ['comment_text']].iloc[0]


comment_text    COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
Name: 6, dtype: object

# 1. Preprocessing:

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [10]:
df_train['comment_text'] = df_train['comment_text'].map(lambda com : clean_text(com))
df_test['comment_text'] = df_test['comment_text'].map(lambda com : clean_text(com))

In [11]:
X = df_train['comment_text']
y = df_train.drop(['id', 'comment_text'], axis = 1).values

In [12]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [13]:
Max_Features = 200000

In [14]:
vectorizer = TextVectorization(max_tokens = Max_Features, output_sequence_length = 1800, output_mode = 'int' )

In [15]:
vectorizer.adapt(X.values)

In [16]:
#vectorizer.get_vocabulary()

In [17]:
X_test = df_test['comment_text']

In [18]:
vectorizer('HI, MAMA I WANNA EAT')[:7]

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([  159, 10212,     4,  2884,   981,     0,     0])>

In [19]:
X_vectorized = vectorizer(X.values)
X_test_vectorized = vectorizer(X_test.values)

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((X_vectorized, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [21]:
train = dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(0.7*len(dataset))).take(int(len(dataset)*0.2))

In [22]:
train_generator = train.as_numpy_iterator()

In [23]:
train_generator.next()

(array([[ 2077,     9,    12, ...,     0,     0,     0],
        [  293, 58938,  1593, ...,     0,     0,     0],
        [   43,    38,  4175, ...,     0,     0,     0],
        ...,
        [  190,    28,    55, ...,     0,     0,     0],
        [ 9789,    28,   104, ...,     0,     0,     0],
        [ 1430,    76,    40, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [24]:
len(train)

6981

In [25]:
model = Sequential()

# Embedding Layer
model.add(Embedding(Max_Features + 1, 64))  # 64-dimension for embedding space

# Bidirectional LSTM
model.add(Bidirectional(LSTM(64, activation='tanh')))  # Larger LSTM units to capture more complex patterns

# Dense layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))  # Dropout for regularization
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))  # Another Dropout layer
model.add(Dense(128, activation='relu'))

# Output Layer (6 labels)
model.add(Dense(6, activation='sigmoid'))

In [26]:
model.compile(loss = 'binary_crossentropy',  optimizer = Adam(learning_rate=0.0012))

In [27]:
model.fit(train, epochs = 1, validation_data = val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 126ms/step - loss: 0.0856 - val_loss: 0.0472


<keras.src.callbacks.history.History at 0x7e06e9a575b0>

In [28]:
model.save("NonToxicMe.h5")

In [29]:
model.save_weights("NonToxicMe.weights.h5")


In [30]:
model.summary()

In [31]:
text = vectorizer('YOU ARE A DOG')

In [32]:
res = model.predict(np.expand_dims(text,0))
(res > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step


array([[1, 0, 0, 0, 0, 0]])

In [33]:
df_train.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [34]:
data_test = tf.data.Dataset.from_tensor_slices((X_test_vectorized))
data_test = data_test.cache()
data_test = data_test.batch(16)
data_test = data_test.prefetch(8)

In [35]:
test = data_test.take(int(len(dataset)))

In [36]:
pred = model.predict(test)

[1m9573/9573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 34ms/step


In [37]:
predictions_binary = (pred > 0.32).astype(int)

label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_test[label_columns] = predictions_binary

In [38]:
df_test

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...,1,1,1,0,1,1
1,0000247867823ef7,from rfc the title is fine as it is imo,0,0,0,0,0,0
2,00013b17ad220c46,sources zawe ashton on lapland,0,0,0,0,0,0
3,00017563c3f7919a,if you have a look back at the source the info...,0,0,0,0,0,0
4,00017695ad8997eb,i do not anonymously edit articles at all,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,i totally agree this stuff is nothing but too ...,1,0,0,0,0,0
153160,fffd7a9a6eb32c16,throw from out field to home plate does it get...,0,0,0,0,0,0
153161,fffda9e8d6fafa9e,okinotorishima categories i see your changes a...,0,0,0,0,0,0
153162,fffe8f1340a79fc2,one of the founding nations of the eu germany ...,0,0,0,0,0,0


In [39]:
submission = df_test.drop(['comment_text'], axis = 1)
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1,1,1,0,1,1
1,0000247867823ef7,0,0,0,0,0,0
2,00013b17ad220c46,0,0,0,0,0,0
3,00017563c3f7919a,0,0,0,0,0,0
4,00017695ad8997eb,0,0,0,0,0,0


In [40]:
submission.to_csv('submission.csv', index=False)

In [47]:
# Initialize an empty list to store the labels
y_val = []

# Iterate through the validation dataset to extract labels
for _, labels in val:
    y_val.append(labels.numpy())

# Convert the list of labels into a NumPy array
y_val = np.concatenate(y_val, axis=0)

# Now y_val is the ground truth labels for the validation set
print(y_val.shape)  # Check the shape to ensure it's correct

(31904, 6)


In [49]:
y_val_pred = model.predict(val)

[1m1994/1994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 34ms/step


In [65]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


# Apply threshold (0.5) to convert probabilities to binary predictions
y_val_pred_binary = (y_val_pred >= 0.3).astype(int)

# y_val is the true labels from the validation set
# Evaluate accuracy
accuracy = accuracy_score(y_val, y_val_pred_binary)

# Evaluate AUC
auc = roc_auc_score(y_val, y_val_pred_binary, average='macro', multi_class='ovr')  # for multi-label classification

# Print classification report (precision, recall, f1-score for each label)
print(f'Accuracy: {accuracy}')
print(f'AUC: {auc}')
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_binary))


Accuracy: 0.8536860581745236
AUC: 0.4999083758839428

Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.05      0.06      3063
           1       0.00      0.00      0.00       323
           2       0.05      0.02      0.03      1680
           3       0.00      0.00      0.00       110
           4       0.05      0.00      0.00      1532
           5       0.00      0.00      0.00       306

   micro avg       0.08      0.03      0.04      7014
   macro avg       0.03      0.01      0.02      7014
weighted avg       0.06      0.03      0.04      7014
 samples avg       0.00      0.00      0.00      7014



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
y_val_pred_binary

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])