# **0.INSTALLING DEPENDENCIES**

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

# **1.LOADING DATA**

In [2]:
data=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
data.iloc[42][1:] #SAMPLE COMMENT

comment_text     You are gay or antisemmitian? \n\nArchangel WH...
toxic                                                            1
severe_toxic                                                     0
obscene                                                          1
threat                                                           0
insult                                                           1
identity_hate                                                    1
Name: 42, dtype: object

# **2.PRE-PROCESSING**

In [5]:
from tensorflow.keras.layers import TextVectorization

In [6]:
x=data['comment_text']
y=data[data.columns[2:]].values

In [7]:
MAX_WORDS=300000

In [8]:
vectorizer=TextVectorization(max_tokens=MAX_WORDS,output_mode='int',
    output_sequence_length=1500)

In [9]:
vectorizer.adapt(x.values)

In [10]:
vectorizer.get_vocabulary()[:15]

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this']

In [11]:
vectorized_text=vectorizer(x.values)

In [12]:
vectorized_text

<tf.Tensor: shape=(159571, 1500), dtype=int64, numpy=
array([[   645,     76,      2, ...,      0,      0,      0],
       [219427,     54,   2489, ...,      0,      0,      0],
       [   425,    441,     70, ...,      0,      0,      0],
       ...,
       [ 32445,   7392,    383, ...,      0,      0,      0],
       [     5,     12,    534, ...,      0,      0,      0],
       [     5,      8,    130, ...,      0,      0,      0]])>

In [13]:
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [14]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [15]:
batch_x

array([[  171, 14384,     8, ...,     0,     0,     0],
       [ 1262,  1048,  1102, ...,     0,     0,     0],
       [  865,   635,   179, ...,     0,     0,     0],
       ...,
       [   73,    74,     8, ...,     0,     0,     0],
       [  125,     2,   474, ...,     0,     0,     0],
       [ 2051,  1171,   171, ...,     0,     0,     0]])

In [16]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [17]:
train=dataset.take(int(len(dataset)*0.7))
val=dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test=dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

# **3.SEQUENTIAL MODEL**

In [18]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional,Dense, Dropout

In [19]:
model=Sequential()
model.add(Embedding(MAX_WORDS+1,32))

model.add(Bidirectional(LSTM(32,activation='tanh')))

model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))

model.add(Dense(6,activation='sigmoid'))

In [20]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          9600032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [22]:
history=model.fit(train,validation_data=val,epochs=1)



In [23]:
history.history

{'loss': [0.063534215092659], 'val_loss': [0.048871755599975586]}

In [24]:
import matplotlib.pyplot as plt

# **4.PREDICTIONS**

In [28]:
input_text=vectorizer('You look freaking ugly')

In [31]:
batch=test.as_numpy_iterator().next()

In [33]:
test_x,test_y=test.as_numpy_iterator().next()

In [34]:
(model.predict(test_x)>0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [36]:
result=model.predict(np.expand_dims(input_text,0))



In [44]:
data.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [41]:
result

array([[0.95858455, 0.0454697 , 0.7335786 , 0.02539132, 0.4629536 ,
        0.05035597]], dtype=float32)

# **5.MODEL EVALUATIONS**

In [38]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [39]:
precision=Precision()
recall=Recall()
accuracy=CategoricalAccuracy()

In [52]:
for batch in test.as_numpy_iterator():
    x_true,y_true=batch
    y_pred=model.predict(x_true)
    y_true=y_true.flatten()
    y_pred=y_pred.flatten()
    precision.update_state(y_true,y_pred)
    recall.update_state(y_true,y_pred)
    accuracy.update_state(y_true,y_pred)




In [53]:
print(f'Precision: {precision.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

Precision: 0.8315646648406982, Recall:0.6559800505638123, Accuracy:0.46759846806526184
