In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
# Load the training data. The path components are passed to os.path.join
# separately so the separator is chosen by the OS (a single pre-joined string
# makes os.path.join a no-op).
df = pd.read_csv(os.path.join('dataset_tox', 'train.csv'))

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
# Quick look at the raw frame. Only the last expression of a cell is rendered,
# so the lookups below are evaluated but not displayed.
df.iloc[0]                    # first complete row
df['comment_text'].iloc[0]    # comment text of the first row

df.loc[df['toxic'] == 1].head()   # first five rows flagged as toxic
df.columns

# Features are the raw comment strings; targets are every column from
# position 2 onward, taken as a plain NumPy array.
X = df['comment_text']
label_columns = df.columns[2:]
y = df[label_columns].values

y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [6]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES = 200_000

# Maps each text to a fixed-length sequence of 1800 integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [7]:
# Fit the vectorizer on the comment texts (passed as a NumPy array).
vectorizer.adapt(X.values)

In [8]:
# Sanity check: show the first six token ids produced for a sample sentence.
vectorizer('I am going to kill you')[:6]

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([  8,  74, 164,   3, 942,   7], dtype=int64)>

In [9]:
# Vectorize every comment up front; each row has the vectorizer's configured
# output_sequence_length (1800) ids.
# NOTE(review): this materializes the whole corpus as one tensor, which may be
# memory-heavy for large datasets — confirm it fits on the target machine.
vectorized_text = vectorizer(X.values)

In [10]:
# Build the input pipeline: (token ids, labels) pairs -> cache -> shuffle -> batch -> prefetch.
SHUFFLE_SEED = 42

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The shuffle order must be identical on every iteration: the dataset is later
# partitioned with take()/skip(), and the default per-iteration reshuffle would
# move examples between the train/validation/test partitions from one epoch to
# the next (data leakage). The seed makes the split reproducible across runs.
dataset = dataset.shuffle(160000, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [11]:
# Partition the batched dataset by batch count: the first 70% for training,
# the next 20% for validation, and 10% starting at the 90% mark for testing.
# The partitions are only disjoint across iterations if the upstream element
# order is the same every time the dataset is iterated.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)
n_test = int(n_batches * .1)
test_offset = int(n_batches * .9)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(test_offset).take(n_test)

In [12]:
# Pull one (token ids, labels) batch from the training split to inspect it.
next(train.as_numpy_iterator())

(array([[  1525,    477,    390, ...,      0,      0,      0],
        [  4631, 100749,      0, ...,      0,      0,      0],
        [    88,      7,    130, ...,      0,      0,      0],
        ...,
        [   428,    180,      3, ...,      0,      0,      0],
        [   414,   1496,     36, ...,      0,      0,      0],
        [     1,      1,     13, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [17]:
# Embedding -> bidirectional LSTM -> three dense layers -> six sigmoid outputs
# (one independent probability per output unit).
model = Sequential([
    # Token-id embedding; input dimension is the vocabulary cap plus one.
    Embedding(MAX_FEATURES + 1, 32),
    # Reads the sequence in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer.
    Dense(6, activation='sigmoid'),
])

In [18]:
# Multi-label setup: six independent sigmoid outputs trained with binary
# cross-entropy. The metric is named explicitly as 'binary_accuracy' because
# the generic 'accuracy' string is resolved by Keras from the output shape and,
# for a 6-wide output, becomes categorical (argmax) accuracy — which does not
# measure per-label correctness.
model.compile(loss='BinaryCrossentropy', optimizer='Adam', metrics=['binary_accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [19]:
# Train for two epochs on the training split, using the validation split as
# validation data; the returned object is kept in `history`.
history = model.fit(train, epochs=2, validation_data=val)

Epoch 1/2
Epoch 2/2


In [20]:
# Sample sentence used to spot-check the trained model.
input_text = ' I am going to kill you'

In [22]:
# Vectorize the sample sentence and add a leading batch axis before predicting.
sample_batch = np.expand_dims(vectorizer(input_text), axis=0)
res = model.predict(sample_batch)

# Flatten the prediction to 1-D and binarize each score at 0.4.
flat_array = np.array(res).reshape(-1)
output_array = [int(score > 0.4) for score in flat_array]

res





array([[0.63701105, 0.00314104, 0.09066574, 0.01846391, 0.19383292,
        0.03145851]], dtype=float32)

In [23]:
# Print the layer-by-layer summary of the trained model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [24]:
# Make sure the target directory exists so this cell also works on a fresh
# checkout, then save the full model.
os.makedirs('Model_comments_toxicity', exist_ok=True)
model.save('Model_comments_toxicity/Comments_Toxicity.keras')



In [25]:
# Save only the model weights to a separate file.
# NOTE(review): the `.keras` suffix is normally associated with full-model
# archives; some Keras versions may expect a different suffix for weights-only
# files — confirm against the installed version.
model.save_weights('Model_comments_toxicity/Comments_Toxicity_weights.keras')

In [26]:
# Reload the saved model from disk into a separate variable.
new_model =  tf.keras.models.load_model("Model_comments_toxicity/Comments_Toxicity.keras")


In [27]:
# Print the summary of the reloaded model for comparison with the original.
new_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [29]:
# Spot-check the reloaded model on the same sample sentence.
input_text = ' I am going to kill you'

# Vectorize the sentence and add a leading batch axis.
sample_batch = np.expand_dims(vectorizer(input_text), 0)
res = new_model.predict(sample_batch)

# Verify the save/load round trip explicitly instead of comparing printed
# outputs by eye: the reloaded model must reproduce the in-memory model's scores.
np.testing.assert_allclose(
    res, model.predict(sample_batch, verbose=0), rtol=1e-5, atol=1e-6
)

# Flatten to 1-D and binarize each score at 0.4.
flat_array = np.array(res).flatten()
output_array = [1 if value > 0.4 else 0 for value in flat_array]

res




array([[0.63701105, 0.00314104, 0.09066574, 0.01846391, 0.19383292,
        0.03145851]], dtype=float32)