In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from sklearn.model_selection import train_test_split
import re


In [None]:
# Load the labelled comments from a CSV expected in the notebook's working
# directory (relative path). The bare `df` on the last line renders the
# frame as this cell's output.
df = pd.read_csv("toxic-comments.csv")
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
df.shape

(159571, 8)

In [None]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [None]:
# Label columns used as prediction targets (adapt to your needs).
# The order matters: it fixes the column order of `targets` and therefore
# the order of the model's six outputs.
toxicities = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [None]:
def clean_text(text):
    """Normalise a raw comment before tokenisation.

    The text is lower-cased, then every run of characters that are not ASCII
    letters or digits (punctuation, whitespace, newlines, non-ASCII symbols)
    is collapsed into a single space. Leading/trailing spaces created by
    this replacement are kept.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The lower-cased text with non-alphanumeric runs replaced by spaces.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9]+", " ", lowered)

In [None]:
# Clean every comment, then overwrite the raw column in place: from this
# cell on `df["comment_text"]` holds the normalised text, not the original.
cleaned_comments = df["comment_text"].apply(clean_text)
df["comment_text"] = cleaned_comments

In [None]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Define input and output variables

In [None]:
# Model inputs and labels, both in the row order of `df`.
label_frame = df[toxicities]            # one column per toxicity label
targets = label_frame.values            # 2-D array: rows = comments, columns = labels
comments = df["comment_text"].tolist()  # plain Python list of cleaned comment strings

In [None]:
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [None]:
targets.shape

(159571, 6)

#### Prepare the data

In [None]:
# Tokenization and padding.
# num_words=5000 restricts the ids emitted by texts_to_sequences to the most
# frequent words (ids below 5000); rarer words are dropped because no
# oov_token is configured.
tokenizer = Tokenizer(num_words=5000)
# Learn the word -> id vocabulary from the cleaned comments.
tokenizer.fit_on_texts(comments)
# One list of word ids per comment (lists have different lengths).
sequences = tokenizer.texts_to_sequences(comments)
# Force every sequence to exactly 200 ids: shorter ones are left-padded with
# 0 and longer ones are cut down to 200.
padded_sequences = pad_sequences(sequences, maxlen=200)

In [None]:
padded_sequences.shape

(159571, 200)

In [None]:
padded_sequences

array([[   0,    0,    0, ..., 4539, 2252,  972],
       [   0,    0,    0, ...,  980,  577,  185],
       [   0,    0,    0, ...,    1,  732,  464],
       ...,
       [   0,    0,    0, ...,   12, 3463, 4381],
       [   0,    0,    0, ...,  153,   36,   10],
       [   0,    0,    0, ..., 1614, 2037,   89]])

### Train/test split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    targets,
    test_size=0.2,
    random_state=0,
)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((127656, 200), (127656, 6), (31915, 200), (31915, 6))

### Build Model

In [None]:
# GRU classifier: 200 token ids -> 120-d embeddings -> 64-unit GRU -> 6 outputs.
# Multi-label output: each of the 6 units has its own sigmoid, so every
# toxicity label gets an independent probability (they need not sum to 1).
model = Sequential([
    Embedding(5000, 120, input_length=200),
    GRU(64),
    Dense(6, activation="sigmoid"),
])





In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 120)          600000    
                                                                 
 gru (GRU)                   (None, 64)                35712     
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 636102 (2.43 MB)
Trainable params: 636102 (2.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from keras.utils import plot_model

In [None]:
# The architecture diagram is optional and depends on `pydot` plus a system
# Graphviz install. When they are missing or incompatible, plot_model raises
# (e.g. the AttributeError recorded in this cell's saved output), which
# would abort a Restart & Run All. Degrade to a message instead of failing.
try:
    model_diagram = plot_model(model, show_dtype=True, show_layer_activations=True,
                               show_layer_names=True, show_shapes=True)
except (ImportError, AttributeError, OSError) as err:
    model_diagram = None
    print(f"Skipping model diagram ({type(err).__name__}: {err}); "
          "see model.summary() for the layer overview.")
# Last expression: renders the diagram when available, nothing otherwise.
model_diagram

AttributeError: module 'pydot' has no attribute 'InvocationException'

In [None]:
# Each of the 6 sigmoid outputs is its own yes/no decision, so the loss is
# binary cross-entropy and accuracy has to be measured per label.
# NOTE: the generic 'accuracy' alias is resolved from the output shape; with
# 6 output units Keras selects categorical accuracy (argmax match), which is
# not meaningful for multi-label targets. 'binary_accuracy' thresholds every
# output at 0.5 and compares it with its own label.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['binary_accuracy'])




In [None]:
# Train for 3 passes over the training split in mini-batches of 32.
# The held-out split is passed as validation data, so its loss/metrics are
# reported after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=32,
    validation_data=(X_test, y_test),
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x27f7f3e6390>

In [None]:
# Predict on new data: a single hand-written example comment.
# (Typo "commnet" fixed so the word can match a vocabulary entry instead of
# being dropped as unknown by the tokenizer.)
new_comment = "This is an awful and offensive comment"

In [None]:
# Run the new comment through the same cleaning, tokenisation and padding
# as the training data, then score it.
new_sequences = tokenizer.texts_to_sequences([clean_text(new_comment)])
# Pad the NEW comment's ids. Passing `sequences` here (the whole tokenised
# training corpus) would re-score every training comment and make
# `prediction` the scores of dataset row 0 rather than of `new_comment`.
padded_new_sequence = pad_sequences(new_sequences, maxlen=200)
# The batch holds exactly one comment, so take row 0: six probabilities in
# the order of `toxicities`.
prediction = model.predict(padded_new_sequence)[0]



In [None]:
prediction

array([1.0947000e-03, 5.5665150e-05, 1.7184627e-04, 5.1757552e-06,
       3.7895644e-04, 2.5187201e-05], dtype=float32)

In [None]:
# Print every label next to its predicted probability, rounded to two
# decimals (very small probabilities therefore show as 0.00).
for label, score in zip(toxicities, prediction):
    print(f"{label}: {score:.2f}")

toxic: 0.00
severe_toxic: 0.00
obscene: 0.00
threat: 0.00
insult: 0.00
identity_hate: 0.00
