# Comment Toxicity Model

### Installing Dependencies

In [1]:
!pip install numpy pandas matplotlib tensorflow scikit-learn




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv','train.csv'))

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df.shape

(159571, 8)

### Preprocessing Data

In [7]:
#Tokenising the data
from tensorflow.keras.layers import TextVectorization

In [8]:
X = df['comment_text'] # these will be the comments
y = df[df.columns[2:]].values # these are the features

In [9]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [14]:
df[df.columns[2:]].head() 

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [45]:
MAX_FEATURES = 100000 #number os words in vocab

In [46]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=1800, output_mode='int')

In [47]:
vectorizer.adapt(X.values)

In [48]:
vectorizer("Hello, you are great")[:5] #example of how vectorizer works

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([286,   7,  20, 275,   0], dtype=int64)>

In [49]:
vectorized_text = vectorizer(X.values)

In [50]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

#### Creating Data PipeLines

##### Here we use the pneumonic "MCHBAP" - map, cache, shuffle, batch, prefetch and this is done using the from_tensor_slices library or list_file. This helps handle data that cannot fit into memory because of its size

In [51]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) #help bottlenecks

In [52]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [53]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [54]:
train_generator = train.as_numpy_iterator()

In [55]:
train_generator.next()

(array([[  103,   191,   388, ...,     0,     0,     0],
        [  383,    67,     6, ...,     0,     0,     0],
        [62450,   107,     1, ...,     0,     0,     0],
        ...,
        [  794,    76,    88, ...,     0,     0,     0],
        [ 1611,   224,    56, ...,     0,     0,     0],
        [    8,   243,    10, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

## Creating the Model

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [57]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh'))) #for tensorflow
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(6, activation='sigmoid')) #number of layers

In [58]:
model.compile(loss="BinaryCrossentropy", optimizer='Adam')

In [59]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          3200032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [60]:
history = model.fit(train, epochs=1, validation_data=val)



In [61]:
history.history

{'loss': [0.062171146273612976], 'val_loss': [0.0462687723338604]}

## Make Predictions

In [62]:
input_text = vectorizer("you suck") #to test

In [68]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [66]:
res = model.predict(np.expand_dims(input_text,0))



In [67]:
res

array([[0.99703574, 0.3954282 , 0.9320081 , 0.02949994, 0.83166945,
        0.1238213 ]], dtype=float32)

In [69]:
model.predict(batch_X)



array([[8.70228163e-04, 1.91457072e-09, 1.11807793e-04, 8.00275359e-07,
        1.84668788e-05, 2.91648757e-06],
       [1.78981072e-03, 1.07694555e-08, 2.40223366e-04, 2.50276162e-06,
        4.78610018e-05, 8.54981863e-06],
       [1.03085963e-02, 6.83529493e-07, 1.49320764e-03, 4.44110228e-05,
        4.85236815e-04, 1.23417543e-04],
       [6.46968465e-03, 2.44577848e-07, 9.54716816e-04, 2.20271195e-05,
        2.70559889e-04, 6.34616517e-05],
       [9.53770280e-01, 9.03560594e-02, 6.44936085e-01, 2.72559319e-02,
        5.71926773e-01, 7.93238580e-02],
       [1.01807930e-01, 1.35595750e-04, 1.60078779e-02, 1.60635670e-03,
        1.04388390e-02, 3.77518730e-03],
       [3.03791072e-02, 7.77527475e-06, 4.45130374e-03, 2.38872410e-04,
        1.96537119e-03, 5.90603217e-04],
       [3.21767875e-03, 4.63454732e-08, 4.60404844e-04, 6.97367341e-06,
        1.07156171e-04, 2.18088608e-05],
       [1.18242577e-03, 4.52934534e-09, 1.68536688e-04, 1.51070708e-06,
        2.99005023e-05, 

## Evaluate Model

In [76]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [77]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [79]:
for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat= yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
    















In [80]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8098966479301453, Recall:0.6975675821304321, Accuracy:0.4824473559856415


## Gradio

In [113]:
!pip install gradio jinja2




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [119]:
pip install --upgrade gradio

Collecting gradio
  Using cached gradio-4.37.2-py3-none-any.whl.metadata (15 kB)
Collecting gradio-client==1.0.2 (from gradio)
  Using cached gradio_client-1.0.2-py3-none-any.whl.metadata (7.1 kB)
Using cached gradio-4.37.2-py3-none-any.whl (12.3 MB)
Using cached gradio_client-1.0.2-py3-none-any.whl (318 kB)
Installing collected packages: gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installation: gradio_client 0.5.0
    Uninstalling gradio_client-0.5.0:
      Successfully uninstalled gradio_client-0.5.0
  Attempting uninstall: gradio
    Found existing installation: gradio 3.43.1
    Uninstalling gradio-3.43.1:
      Successfully uninstalled gradio-3.43.1
Successfully installed gradio-4.37.2 gradio-client-1.0.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [120]:
import gradio as gr

model.save('toxicity.h5')

In [121]:
model=tf.keras.models.load_model('toxicity.h5')

In [125]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [126]:
interface = gr.Interface( fn=score_comment, inputs=gr.Textbox(lines=2, placeholder="Enter Comment"),  # Input text box
outputs='text')

In [128]:
interface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


