In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GRU
from sklearn.model_selection import train_test_split
import re

##### Load dataset

In [2]:
df = pd.read_csv("toxic-comments.csv")


In [3]:

df.shape

(159571, 8)

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
df.drop(["id"], axis = True, inplace = True)

In [6]:
toxicities = df.columns[1:].to_list()
toxicities

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
x = df["comment_text"]
y = df.iloc[:, 1:]

In [8]:
z = y.copy()
x[z.sum(axis = 1) == 0].count() # Neutral Comments

143346

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9]+", " ", text)
    return text
x = x.apply(clean_text)

In [10]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)
padded_sequences = pad_sequences(sequences, maxlen = 200)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, y,
                                                    test_size = 0.2,
                                                    random_state = 0)

In [12]:
model = Sequential()
model.add(Embedding(5000, 64, input_length = 200))
model.add(GRU(64))
model.add(Dense(6, activation = "sigmoid"))



In [13]:
model.compile(loss = "binary_crossentropy",
              optimizer = "adam",
              metrics = ["accuracy"])

In [14]:
history = model.fit(x_train, y_train, epochs = 3, batch_size = 32,
          validation_data = (x_test, y_test))

Epoch 1/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 10ms/step - accuracy: 0.9588 - loss: 0.0969 - val_accuracy: 0.9936 - val_loss: 0.0515
Epoch 2/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 10ms/step - accuracy: 0.9905 - loss: 0.0481 - val_accuracy: 0.9914 - val_loss: 0.0494
Epoch 3/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 10ms/step - accuracy: 0.9888 - loss: 0.0424 - val_accuracy: 0.9785 - val_loss: 0.0488


In [15]:
sample1 = "You are waste of space. World is better without you."
sample2 = "Your life will be so beautifyl with her."
sample3 = "You are stupid. No one cares about your opinion. Just shut up."

In [16]:
def predict(new_comment):
    new_sequences = tokenizer.texts_to_sequences(clean_text(new_comment))
    new_seq_pad = pad_sequences(new_sequences, maxlen = 200)
    prediction = model.predict(new_seq_pad, verbose = False)[0]
    return prediction

In [17]:
prediction = predict(sample1)
for toxicity, prob in zip(toxicities, prediction):
    print(toxicity, "-->", prob)

toxic --> 0.02575773
severe_toxic --> 0.0020827686
obscene --> 0.02797228
threat --> 0.000119254546
insult --> 0.009806298
identity_hate --> 0.00086628756


In [18]:
prediction = predict(sample2)
for toxicity, prob in zip(toxicities, prediction):
    print(toxicity, "-->", prob)

toxic --> 0.02575773
severe_toxic --> 0.0020827686
obscene --> 0.02797228
threat --> 0.000119254546
insult --> 0.009806298
identity_hate --> 0.00086628756


In [19]:
prediction = predict(sample3)
for toxicity, prob in zip(toxicities, prediction):
    print(toxicity, "-->", prob)

toxic --> 0.02575773
severe_toxic --> 0.0020827686
obscene --> 0.02797228
threat --> 0.000119254546
insult --> 0.009806298
identity_hate --> 0.00086628756
