#Importing dependencies and Data

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras as keras
import os

In [11]:
df = pd.read_csv(os.path.join('train.csv'))

In [13]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#Preprocessing the data- Tokenisation

In [19]:
from tensorflow.keras.layers import TextVectorization

In [23]:
X=df['comment_text']
Y=df[df.columns[2:]].values          ## This step converts the labeles into a numpy array (using values) to store in y

In [31]:
Max_Features= 200000                 ## Number of words in the vocab, which would be tokenised

In [35]:
vectorizer = TextVectorization(max_tokens=Max_Features, output_sequence_length=1800, output_mode='int')      ##1800 is the limit to number of words in any sentence

In [37]:
vectorizer.adapt(X.values)

In [41]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [45]:
vectorizer('My name is Dhawal')[:4]

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([ 29, 109,   9,   1], dtype=int64)>

In [47]:
vectorized_text=vectorizer(X.values)

In [49]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

# Making tensorflow data pipeline

In [53]:
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,Y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset= dataset.batch(16)
dataset=dataset.prefetch(8)

In [55]:
train = dataset.take(int(len(dataset)*.7))
val= dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Deep Learning Model

In [64]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dropout,Dense,Embedding

In [89]:
model= Sequential()
# Create an embedding layer
model.add(Embedding(Max_Features+1,32))
# Create the bidirectional layer
model.add(Bidirectional(LSTM(32,activation='tanh')))

# Feature extracting layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
# Final layer with 6 values 
model.add(Dense(6,activation='sigmoid'))

In [91]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [93]:
history = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4696s[0m 672ms/step - loss: 0.0842 - val_loss: 0.0475


In [97]:
history.history

{'loss': [0.06397261470556259], 'val_loss': [0.047459740191698074]}

# Make prediction

In [110]:
input_text= vectorizer('You freaking idiot! I am gonna kill you')
res = model.predict(np.expand_dims(input_text,0))
res

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step


array([[0.9979334 , 0.319404  , 0.95060223, 0.02693745, 0.8021605 ,
        0.1398113 ]], dtype=float32)

# Evaluate the model

In [115]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [117]:
pre=Precision()
re=Recall()
acc= CategoricalAccuracy()

In [121]:
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, Y_true= batch
    # Make a prediction
    yhat=model.predict(X_true)

    Y_true= Y_true.flatten()
    yhat=yhat.flatten()

    pre.update_state(Y_true,yhat)
    re.update_state(Y_true,yhat)
    acc.update_state(Y_true,yhat)
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [124]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.7735849022865295, Recall:0.7279496192932129, Accuracy:0.4714142382144928


#Gradio App

In [126]:
!pip install gradio jinja2


[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting gradio
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.2.0 (from gradio)
  Downloading gradio_client-1.2.0-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.19.3 (from gradio)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting importlib-resources<7.0,>=1.3 (from gradio)
  Downloading importlib_resources-6.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.6-cp312-none-win_amd64.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.6 kB ? eta -:--:--
     ---------------------------------------- 51.6/51.6 kB 2.6 MB/s eta 0:00

In [128]:
import tensorflow as tf
import gradio as gr

In [140]:
model.save('toxicity.keras')

In [142]:
model = tf.keras.models.load_model('toxicity.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [144]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [148]:
interface = gr.Interface(
    fn=score_comment, 
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs=gr.Textbox()
)

interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


