# **Project : Toxic Comment Detection**

# **0. Install Dependencies and Bring in Data**

In [1]:
!pip install tensorflow pandas matplotlib sklearn

Collecting sklearn
  Downloading sklearn-0.0.post11.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post11-py3-none-any.whl size=2959 sha256=866c9a3e914bfb5b1bcb184948cefd06f1c57ab70ce2c628402065f6a937107e
  Stored in directory: /root/.cache/pip/wheels/aa/9c/60/f67813603a52fc35057868f1aba0003cc75b72583dcaa2c341
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post11


In [2]:
# import dependencies
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
# as the input of the input layer in the deep learning model is BATCH form, we need to expand the dimesion of the input, hence used "np.expand_dims()" function.
np.expand_dims??

In [4]:
os.listdir('/content/drive/MyDrive/Datasets')

['train.csv', 'test.csv', 'test_labels.csv', 'sample_submission.csv']

In [5]:
# load the train dataset
df = pd.read_csv('/content/drive/MyDrive/Datasets/train.csv')

In [6]:
# checking the dataframe
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# lets grab one of the comment from the csv
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
#to retrive the columns from 2 to last of 4th row
df[df.columns[2:]].iloc[5]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 5, dtype: int64

In [9]:
# get the rows that have the toxic =1
df[df['toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [10]:
# lets retrive the comment text of 6th row
df['comment_text'].iloc[16]

"Bye! \n\nDon't look, come or think of comming back! Tosser."

# **1. Preprocess Data**

In [11]:
# import the dependencies
from tensorflow.keras.layers import TextVectorization

In [12]:
# splitting the dataset into Input and Label dataset
X = df['comment_text']
y = df[df.columns[2:]].values    # TO convert the dataframe into numpy array we used, ".values()" function

In [13]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [14]:
MAX_FEATURE = 200000  # Number of words in vocabulary

In [15]:
# Tokenization of words
vectorizer = TextVectorization(max_tokens = MAX_FEATURE, output_sequence_length=1800)

# output_sequence_length refers to maximum length of the comment

In [16]:
  # Vectorize the Comments
vectorizer.adapt(X.values)

In [17]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [18]:
# tokenize all the comments
vectorized_text = vectorizer(X.values)

In [19]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [20]:
# Creating the Data Pipeline for the deep Learning model
# MCSHBAP : Map, Cache, Shuffle, Batch, Prefetch

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)  #creating buffer
dataset = dataset.batch(16)  #preparing the batchs
dataset = dataset.prefetch(8)

In [21]:
# preview our dataset
dataset.as_numpy_iterator().next()

(array([[   484,     28,   2809, ...,      0,      0,      0],
        [  1368,    865,   1540, ...,      0,      0,      0],
        [   171,    265,     21, ...,      0,      0,      0],
        ...,
        [    16,    413,    324, ...,      0,      0,      0],
        [   171,   1586,    139, ...,      0,      0,      0],
        [134538,    288,    312, ...,      0,      0,      0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]]))

In [22]:
batch_X , batch_y = dataset.as_numpy_iterator().next()

In [23]:
int(len(dataset))

9974

**1.1 Splitting into Train, Test, Validation set**

In [24]:
train_set = dataset.take(int(len(dataset)*.7))
validation_set = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)+ 1)
test_set = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1) + 1)

In [25]:
print(int(len(train_set)))
print(int(len(validation_set)))
print(int(len(test_set)))


6981
1995
998


In [26]:
# generate the generator for train set numpy array
train_generator = train_set.as_numpy_iterator()

In [27]:
# used to give the batch from the train set only ----> To train the model
train_generator.next()

(array([[  136,    58,   782, ...,     0,     0,     0],
        [   67,    41,    20, ...,     0,     0,     0],
        [   76, 60446,  2756, ...,     0,     0,     0],
        ...,
        [ 1467,     2,   565, ...,     0,     0,     0],
        [    8,   710,   136, ...,     0,     0,     0],
        [   23, 10800,     9, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

# **2. Create Sequential Model**

**2.1 Import Dependencies**

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional

**2.2 Building Model**

In [29]:
model = Sequential()
model.add(Embedding(MAX_FEATURE+1, 32))
model.add(Bidirectional(LSTM(32, activation = 'tanh')))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(6, activation = 'sigmoid'))

In [30]:
model.compile(optimizer = 'Adam',loss = 'BinaryCrossentropy')

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

**2.3 Train Model**

In [32]:
hist = model.fit(train_set , epochs = 5, validation_data = validation_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# **3. Model Prediction**

In [33]:
input_text = vectorizer('You Freaking Suck!')

In [34]:
# we used np.expand_dims() : function that used to increase the dimension of the array
res = model.predict(np.expand_dims(input_text ,0))



In [35]:
res

array([[0.999696  , 0.47539067, 0.99388653, 0.03180437, 0.9467536 ,
        0.0952545 ]], dtype=float32)

In [36]:
batch_X1 , batch_y1 = test_set.as_numpy_iterator().next()

In [37]:
y1hat = (model.predict(batch_X1) > 0.5 ).astype(int)



In [38]:
y1hat

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [39]:
(batch_y1 > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

# **4. Evalutaion**

**4.1 Import Dependencies**

In [40]:
from tensorflow.keras.metrics import Precision , Recall ,CategoricalAccuracy

In [41]:
# create the instances of each of the classes
pre = Precision()
re = Recall()
Cacc = CategoricalAccuracy()

In [None]:
# Evaluating
for batch in test_set.as_numpy_iterator():

  # unpack the batch
  X_true,y_true = batch

  # make a prediction
  yhat = model.predict(X_true)

  # Flatten the values
  y_true = y_true.flatten()
  yhat = yhat.flatten()

  # Evaluating the model
  pre.update_state(y_true, yhat)
  re.update_state(y_true, yhat)
  Cacc.update_state(y_true, yhat)

In [48]:
print(f"Precision : {pre.result().numpy()}, Recall : {re.result().numpy()}, CategoricalAccuracy:{Cacc.result().numpy()}")

Precision : 0.8535397052764893, Recall : 0.8498858213424683, CategoricalAccuracy:0.5120240449905396


# **5. Test and Gradio**

**5.1 Import Dependencies**

In [None]:
!pip install gradio

In [50]:
# saving the model
model.save('Toxicity.h5')

  saving_api.save_model(


In [51]:
Model = tf.keras.models.load_model('Toxicity.h5')

In [76]:
# input the text
input_str = vectorizer('may i help you')

In [77]:
# making the prediction
res = model.predict(np.expand_dims(input_str,0))



In [78]:
(res > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0]])

In [56]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

**Model**

In [101]:
def score_comment(comment):
  input_comment = vectorizer(comment)
  result = model.predict(np.expand_dims(input_comment,0))

  text = ''
  for idx, col in enumerate(df.columns[2:]):
    text+= (f"{col} :{result[0][idx]>0.5}   ")

  return text

In [111]:
score_comment("hey i kill you")



'toxic :True   severe_toxic :False   obscene :False   threat :False   insult :True   identity_hate :True   '