In [1]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

In [2]:
from keras.layers import TextVectorization,Dense,LSTM,Bidirectional,Dropout,Embedding
from collections import Counter,OrderedDict
from keras.models import Model,Sequential
from keras.metrics import Precision,Recall,CategoricalAccuracy
from sklearn.metrics import precision_score
import string

In [3]:
df=pd.read_csv('train.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
x=df['comment_text'].values
y=df.iloc[:,2:].values

The below function is used to find out the no of unique words in the dataset

In [7]:
punctuationsCounter=Counter(string.punctuation)
words=OrderedDict()
n=0
for sentence in x:
    for word in sentence.split():
        if word not in string.punctuation and word not in words:
            words[n]=word
            n+=1

In [8]:
len(words)

10593734

In [7]:
max_features=500000
output_sequence_length=1000

Text Vectorization is a preprocessing layer that converts words in the sentence to integers that could be understood by the deep learning model. This layer automatically converts the words into lower case and also removes punctuation marks.

In [8]:
vectorizer=TextVectorization(max_tokens=max_features,
                            output_sequence_length=output_sequence_length,
                            output_mode='int')

In [9]:
vectorizer.adapt(x)

In [12]:
vectorizer(['She is a good human being explanation'])[:,:7]

<tf.Tensor: shape=(1, 7), dtype=int64, numpy=array([[220,   9,   6,  98, 764,  91, 645]], dtype=int64)>

In [10]:
vectorized_x=vectorizer(x)

The below code creates a tensorflow dataset pipeline.

dataset.cache() helps to cache a dataset in local storage so that some operations like opening and data reading is prevented during every epoch.

dataset.prefetch() overlaps the preprocessing and model building steps during training. When the model is executing training step s the pipeline prepares data for the next step(s+1).

In [11]:
dataset=tf.data.Dataset.from_tensor_slices((vectorized_x,y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

There are 9974 batches with 16 observations per batch

In [15]:
len(dataset)

9974

In [16]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [17]:
batch_x.shape

(16, 1000)

In [18]:
batch_y.shape

(16, 6)

In [12]:
##Takes the first 70% of the datset for training
train=dataset.take(int(len(dataset)*0.7))

##Skips the first 70% and takes the next 20% of the dataset for validation
val=dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))

##Skips the first 90% and takes the remaining 10% of the dataset for testing
test=dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [20]:
model=Sequential()

model.add(Embedding(max_features+1,50))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(6,activation='sigmoid'))

In [21]:
model.compile(loss='BinaryCrossentropy',metrics=['CategoricalAccuracy','AUC'],optimizer='adam')

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          25000050  
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                21248     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 390       
Total params: 25,079,480
Trainable params: 25,079,480
Non-trainable params: 0
____________________________________________

In [23]:
history=model.fit(train,epochs=1,validation_data=val)



In [55]:
model.save('toxicityClassifier.h5')

In the below statement, function numpy_iterator() is a generator function and next() returns a single batch from the test dataset.

In [29]:
batch_test_x,batch_test_y=test.as_numpy_iterator().next()

In [36]:
predictions_test=model.predict(batch_test_x)

In [46]:
precision=Precision()
recall=Recall()
accuracy=CategoricalAccuracy()

In [47]:
for batch_x,batch_y in test.as_numpy_iterator():
    y_pred=model.predict(batch_x)
    
    y_pred=y_pred.flatten()
    y_true=batch_y.flatten()

    precision.update_state(y_true,y_pred)
    recall.update_state(y_true,y_pred)
    accuracy.update_state(y_true,y_pred)

In [54]:
print('Precision ',precision.result().numpy(),'Recall ',recall.result().numpy(),'Accuracy ',accuracy.result().numpy())

Precision  0.7938018 Recall  0.7224239 Accuracy  0.4663992


In [13]:
import gradio as gr

  "class": algorithms.Blowfish,
  utils.DeprecatedIn35,


In [14]:
model=keras.models.load_model('toxicityClassifier.h5')

In [15]:
comment='I hate you idiot'

In [16]:
vectorized_comment=vectorizer([comment])

In [19]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [17]:
model.predict(vectorized_comment)

array([[0.9960322 , 0.0616574 , 0.94112086, 0.01346555, 0.7446796 ,
        0.03615546]], dtype=float32)

In [21]:
for idx,col in enumerate(df.columns[2:]):
    print(idx,' ',col)

0   toxic
1   severe_toxic
2   obscene
3   threat
4   insult
5   identity_hate


In [42]:
def classify_comment(comment):
    vectorized_comment=vectorizer([comment])
    results=model.predict(vectorized_comment)
    
    toxicity=''
    for i,col in enumerate(df.columns[2:]):
        toxicity+='{}: {}\n'.format(col,results[0][i]>0.5)
    
    return toxicity

In [44]:
interface=gr.Interface(fn=classify_comment,
                      inputs=gr.inputs.Textbox(lines=3,placeholder='Enter your comment to test the toxicity'),
                      outputs='text')

  utils.DeprecatedIn35,


In [46]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861/


  utils.DeprecatedIn35,


Running on public URL: https://40838.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


  utils.DeprecatedIn35,


(<fastapi.applications.FastAPI at 0x1952dc0d9b0>,
 'http://127.0.0.1:7861/',
 'https://40838.gradio.app')