In [3]:
import os
import numpy as np
import pandas as pd
import zipfile
import timeit
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import metrics
from tensorflow.keras.models import Sequential




In [4]:
full_data = pd.read_csv(r"C:\Users\aakas\Downloads\datasets all\train.csv")
test_data_x = pd.read_csv(r"C:\Users\aakas\Downloads\datasets all\test.csv")
test_data_y = pd.read_csv(r"C:\Users\aakas\Downloads\datasets all\test_labels.csv")

In [8]:
full_data.shape

(159571, 8)

In [23]:
full_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [24]:
test_data_x.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [25]:
test_data_y.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [26]:
# Merging the X and y part together
test_dataframe = pd.merge(test_data_x, test_data_y, how="inner", on="id")

# Remove all the rows having missing values (-1)
test_dataframe = test_dataframe[test_dataframe["toxic"] != -1].reset_index(drop=True)

test_dataframe.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
42418,a92dd151871276c4,:Didn't realize there was such a disparity on ...,0,0,0,0,0,0
13523,35e38a97d0fff9bc,Should not the article say how universally det...,0,0,0,0,0,0
57259,e4a998728074eb04,==Cause or Effect?== \n The Boxing Day Tsunami...,0,0,0,0,0,0
52673,d2236585537ca236,""" \n\n == Edit summary == \n\n You're right. R...",0,0,0,0,0,0
42378,a901d6c6a31bfb1a,""" \n\n Please stop adding unreferenced contro...",0,0,0,0,0,0


In [27]:
print(f"We have {len(test_dataframe)} observations in test data.")

We have 63978 observations in test data.


In [28]:
# Remove the unnecessary data

del test_data_x
del test_data_y

In [29]:
full_data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [30]:
full_data["comment_text"].values

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [31]:
full_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [32]:
# Convert the dataframes into tensorflow Dataset objects

train_data = tf.data.Dataset.from_tensor_slices(
    (
        full_data["comment_text"].tolist(), 
        full_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()
    )
)

test_data = tf.data.Dataset.from_tensor_slices(
    (
        test_dataframe["comment_text"].tolist(), 
        test_dataframe[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()
    )
)

In [33]:
batch_size = 64

train_data = train_data.cache().shuffle(1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_data = test_data.cache().batch(batch_size)

In [34]:
max_tokens = 100000

start = timeit.default_timer()

text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    standardize='lower_and_strip_punctuation',
    output_mode="int",
    output_sequence_length=1800
)

text_vectorization.adapt(full_data["comment_text"].tolist())

end = timeit.default_timer()

print(f"It took {end - start} seconds to adapt.")

It took 16.258457799995085 seconds to adapt.


In [35]:
del full_data
del test_dataframe

In [36]:
model = Sequential([
    text_vectorization,
    layers.Embedding(max_tokens+1, 32),
    layers.Bidirectional(layers.LSTM(32, return_sequences=False)),
    layers.Dense(256, activation="relu"),
    layers.Dense(256, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(6, activation="sigmoid")
])

In [37]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=metrics.BinaryAccuracy()
)

In [38]:
train_split = 0.8

train_data = train_data.take(int(len(train_data) * train_split))
valid_data = train_data.skip(int(len(train_data) * train_split))

In [39]:
print("The cardinality of train data is ", train_data.cardinality().numpy())
print("The cardinality of validation data is ", valid_data.cardinality().numpy())
print("The cardinality of test data is ", test_data.cardinality().numpy())

The cardinality of train data is  1995
The cardinality of validation data is  399
The cardinality of test data is  1000


In [40]:
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5, 
        restore_best_weights=True, 
        start_from_epoch=1, 
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", 
        factor=0.5, 
        patience=3, 
        verbose=1, 
        min_lr=1e-6
    )
]

In [41]:
start = timeit.default_timer()

model_history = model.fit(
    train_data,
    epochs=15,
    verbose=1,
    callbacks=callbacks,
    validation_data=valid_data
).history

end = timeit.default_timer()

print(f"It took {end - start} seconds to train the model.")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
It took 26570.956885400003 seconds to train the model.


In [42]:
model.evaluate(test_data)



[0.15989956259727478, 0.9566934704780579]

In [43]:
text = "Yo bitch he is more succesful then you'll be in your life"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 1    
severe_toxic        : 1    
obscene             : 1    
threat              : 0    
insult              : 1    
identity_hate       : 0    



In [44]:
text = "hello world"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 0    
severe_toxic        : 0    
obscene             : 0    
threat              : 0    
insult              : 0    
identity_hate       : 0    



In [45]:
text = "how is everyone"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 0    
severe_toxic        : 0    
obscene             : 0    
threat              : 0    
insult              : 0    
identity_hate       : 0    



In [46]:
text = "fuck you nigger"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 1    
severe_toxic        : 0    
obscene             : 1    
threat              : 0    
insult              : 1    
identity_hate       : 1    



In [47]:
text = "jevlis ka <3"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 0    
severe_toxic        : 0    
obscene             : 0    
threat              : 0    
insult              : 0    
identity_hate       : 0    



In [48]:
text = "I think that you are a Faggot get a life and burn in Hell I hate you, im going to find out where u live and kill everyone of you"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 1    
severe_toxic        : 0    
obscene             : 1    
threat              : 1    
insult              : 1    
identity_hate       : 0    



In [49]:
text = "how are you"
predictions = model.predict(np.array([text])) > 0.5
prediction_text = ""
for i, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    prediction_text += f"{col:<{20}}: {predictions[0][i]:<{5}}\n"
print(prediction_text)

toxic               : 0    
severe_toxic        : 0    
obscene             : 0    
threat              : 0    
insult              : 0    
identity_hate       : 0    



In [50]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, 1800)              0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, 1800, 32)          3200032   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 256)               16640     
                                                                 
 dense_5 (Dense)             (None, 256)               65792     
                                                                 
 dense_6 (Dense)             (None, 128)              

In [59]:
model.save('C:/Users/aakas/Downloads/datasets all/TCC_model', save_format='tf')

INFO:tensorflow:Assets written to: C:/Users/aakas/Downloads/datasets all/TCC_model\assets


INFO:tensorflow:Assets written to: C:/Users/aakas/Downloads/datasets all/TCC_model\assets
