In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Reading the .csv file

In [7]:
df = pd.read_csv(os.path.join('C:\\Users\\ameym\\Downloads\\jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv'))
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [16]:
df.shape

(159571, 8)

In [17]:
df.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [18]:
df.nunique()

id               159571
comment_text     159571
toxic                 2
severe_toxic          2
obscene               2
threat                2
insult                2
identity_hate         2
dtype: int64

In [19]:
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [14]:
df.iloc[14]['comment_text']

"Oh, and the girl above started her arguments with me. She stuck her nose where it doesn't belong. I believe the argument was between me and Yvesnimmo. But like I said, the situation was settled and I apologized. Thanks,"

In [15]:
df.iloc[14][2:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 14, dtype: object

## Preprocessing

In [21]:
from tensorflow.keras.layers import TextVectorization

In [34]:
x = df['comment_text']
y = df[df.columns[2:]].values

In [35]:
x.head(5)

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [38]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [39]:
max_features = 200000  #noof words to be tokenized

In [40]:
vectorizer = TextVectorization(max_tokens = max_features,
                               output_sequence_length = 1800,
                               output_mode = 'int')

In [42]:
vectorizer.adapt(x.values)

In [54]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [52]:
vectorizer('hi everyone. how are you?')[0 : 5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([171, 523,  73,  20,   7], dtype=int64)>

In [55]:
vectorized_test = vectorizer(x.values)

In [58]:
vectorized_test.shape

TensorShape([159571, 1800])

### Creating the Pipeline

In [60]:
#MCSBAP
dataset = tf.data.Dataset.from_tensor_slices((vectorized_test, y))
dataset = dataset.cache()  # C - CACHE
dataset = dataset.shuffle(160000)  # S - SHUFFLE
dataset = dataset.batch(16)  # B -BATCH
dataset = dataset.prefetch(8)   # P - PREFETCH - HELPS PREVENT BOTTLENECKS

In [65]:
batch_x, batch_y = dataset.as_numpy_iterator().next()

In [66]:
batch_x.shape

(16, 1800)

In [67]:
batch_y.shape

(16, 6)

### Train-Test-Validation split

In [71]:
int(len(dataset)*0.7)

6981

In [72]:
train = dataset.take(int(len(dataset)*0.7))
validation = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [76]:
len(train)

6981

In [77]:
len(validation)

1994

In [75]:
len(test)

997

In [78]:
train_generator = train.as_numpy_iterator()

In [82]:
train_generator.next()

(array([[   458,      4, 134201, ...,      0,      0,      0],
        [  1288,  17752,    526, ...,      0,      0,      0],
        [168146,      9,     34, ...,      0,      0,      0],
        ...,
        [  1219,     16,   1643, ...,      0,      0,      0],
        [  1673,  66276,    425, ...,      0,      0,      0],
        [   170,  25734,    138, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

## Buiding the model

In [85]:
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [91]:
model = Sequential()

# EMBEDDING LAYER
model.add(Embedding(max_features+1, 32))

# RNN LAYER - LSTM
model.add(Bidirectional(LSTM(32, activation = 'tanh')))  #GPU requires LSTM layers to be having tanh activation

# DENSE LAYERS
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(6, activation = 'sigmoid'))

In [92]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy')

In [93]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 128)               8320      
                                                                 
 dense_6 (Dense)             (None, 256)               33024     
                                                                 
 dense_7 (Dense)             (None, 128)               32896     
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                      

In [99]:
history = model.fit(train, epochs = 1 , validation_data = validation)



## Making Predictions

In [117]:
input_text = vectorizer('you suck! i hate you! GO TO HELL!')

In [118]:
model.predict(np.expand_dims(input_text, 0))



array([[0.9617963 , 0.3114075 , 0.92030746, 0.04552443, 0.78974414,
        0.18155797]], dtype=float32)

In [119]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [120]:
batch = test.as_numpy_iterator().next()

In [122]:
(batch_x, batch_y) = test.as_numpy_iterator().next()

In [124]:
(model.predict(batch_x) > 0.5).astype(int)



array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [125]:
batch_y

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

## Calculating Results and Evaluating Model

In [131]:
from keras.metrics import Precision, Recall, CategoricalAccuracy

In [132]:
pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [134]:
for batch in test.as_numpy_iterator():
    (x_true, y_true)  = batch
    y_pred = model.predict(x_true)
    
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    
    pre.update_state(y_true, y_pred)
    rec.update_state(y_true, y_pred)
    acc.update_state(y_true, y_pred)



In [136]:
print(f'Precision : {pre.result().numpy()}, Recall : {rec.result().numpy()}, Accuracy : {acc.result().numpy()}')

Precision : 0.8394449949264526, Recall : 0.6179474592208862, Accuracy : 0.48028045892715454
