In [1]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM , Dense, GRU
from sklearn.model_selection import train_test_split




In [2]:
df = pd.read_csv('../Day_09/toxic-comments.csv')

In [3]:
df.shape

(159571, 8)

In [4]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
toxicity = ['toxic', 'severe_toxic', 'obscene', 'threat',
            'insult', 'identity_hate']

In [6]:
def clean_text(text):
    text = text.lower() # Lower case
    text = re.sub(r"[\W]+", " ", text) # Remove Non-Alphanumeric characters
    return text

In [7]:
df['comment_text'] = df['comment_text'].apply(clean_text)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Define Input and Output variables

In [8]:
# Features and target preparation

comments = df['comment_text'].tolist()
targets = df[toxicity].values

In [9]:
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [10]:
targets.shape

(159571, 6)

### Data Preparation

In [11]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=200) # Default is pre-padding

In [12]:
padded_sequences.shape

(159571, 200)

In [13]:
padded_sequences

array([[   0,    0,    0, ..., 4538, 2252,  972],
       [   0,    0,    0, ...,  980,  577,  185],
       [   0,    0,    0, ...,    1,  729,  464],
       ...,
       [   0,    0,    0, ...,   12, 3477, 4378],
       [   0,    0,    0, ...,  153,   36,   10],
       [   0,    0,    0, ..., 1613, 2037,   89]])

### Cross Validation

In [14]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, targets, test_size=0.2, random_state=0)

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((127656, 200), (31915, 200), (127656, 6), (31915, 6))

### Model Building

In [16]:
model = Sequential()

model.add(Embedding(5000, 128, input_length = 200))
model.add(GRU(64))
model.add(Dense(6, activation = 'sigmoid')) # 6 neurons, each would have its own activation; Multi Label O/P with sigmoid activation




In [17]:
model.summary()

# Number of parameters = 5000 * 128 = 640000
# GRU Layer: 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 677638 (2.58 MB)
Trainable params: 677638 (2.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
from keras.utils import plot_model

In [19]:
# plot_model(model, show_shapes = True, show_layer_names = True, show_layer_activations=True, show_dtype = True)

In [20]:
# !pip install pydot graphviz

In [21]:
model.compile(loss='binary_crossentropy',
                optimizer = 'adam',
                metrics=['accuracy'])




In [22]:
model.fit(X_train, y_train, epochs = 3, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1f49bd62450>

### Inferencing on New Data

In [23]:
prediction = "This is an awful and offensive comment"

In [24]:
text = clean_text(prediction)

seq = tokenizer.texts_to_sequences(text)

pad = pad_sequences(seq, maxlen=200)

# Predict the sentiment
pred_prob = model.predict(pad)[0]



In [25]:
pred_prob

array([0.04817288, 0.00073952, 0.02943421, 0.00160738, 0.02164207,
       0.00739084], dtype=float32)

In [26]:
for toxicity, prob in zip(toxicity, pred_prob):
    print(f'{toxicity}: {prob:.2f}')

toxic: 0.05
severe_toxic: 0.00
obscene: 0.03
threat: 0.00
insult: 0.02
identity_hate: 0.01
