0. Install Dependencies and Bring in Data

In [101]:
!pip3 install tensorflow pandas matplotlib scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [102]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [103]:
df = pd.read_csv('/Users/swami/Documents/IITK Stuff/Reference Books/ML/Comment toxicity model/jigsaw-toxic-comment-classification-challenge/train.csv')

In [104]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0



1. Preprocess

In [105]:
from tensorflow.keras.layers import TextVectorization

In [106]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [107]:
MAX_FEATURES = 200000 # number of words in the vocab

In [108]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [109]:
vectorizer.adapt(X.values)

In [110]:
vectorized_text = vectorizer(X.values)

In [111]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [112]:
# partition the data
train = dataset.take(int(len(dataset)*.7)) 
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

2. Create Sequential Model

In [113]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [114]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh'))) #required for GPU acceleration
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [115]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [116]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_4 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_16 (Dense)            (None, 128)               8320      
                                                                 
 dense_17 (Dense)            (None, 256)               33024     
                                                                 
 dense_18 (Dense)            (None, 128)               32896     
                                                                 
 dense_19 (Dense)            (None, 6)                 774       
                                                      

In [117]:
history = model.fit(train, epochs=5, validation_data=val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [118]:
model.save('toxicity.h5')

  saving_api.save_model(


In [133]:
from matplotlib import pyplot as plt
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()

<Figure size 800x500 with 0 Axes>

<Figure size 640x480 with 1 Axes>

3. Make Predictions

In [121]:
input_text = vectorizer('You freaking suck! I am going to hit you.') # testing for single example

In [122]:
res = model.predict(np.expand_dims(input_text,0)) # gives the array of predictons where a vlaue of >0.5 indicates its prescence



In [123]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [124]:
batch_X, batch_y = test.as_numpy_iterator().next() # testing for a batch 

In [125]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

4. Evaluate Model

In [126]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [127]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [128]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [129]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8818973898887634, Recall:0.8104982376098633, Accuracy:0.5045135617256165


5. Test and Gradio

In [134]:
!pip3 install gradio jinja2 

Defaulting to user installation because normal site-packages is not writeable
Collecting typing-extensions~=4.0 (from gradio)
  Using cached typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)
Using cached typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-macos 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.[0m[31m
[0mSuccessfully installed typing-extensions-4.9.0


In [135]:
import tensorflow as tf
import gradio as gr
!pip3 install --upgrade gradio

Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-4.11.0-py3-none-any.whl.metadata (17 kB)
Downloading gradio-4.11.0-py3-none-any.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: gradio
  Attempting uninstall: gradio
    Found existing installation: gradio 4.10.0
    Uninstalling gradio-4.10.0:
      Successfully uninstalled gradio-4.10.0
Successfully installed gradio-4.11.0


In [137]:
model = tf.keras.models.load_model('toxicity.h5')

In [138]:
input_str = vectorizer('hey i freaken hate you!')

In [139]:
res = model.predict(np.expand_dims(input_str,0))



In [140]:
res

array([[0.8800649 , 0.00237633, 0.07773779, 0.00760644, 0.35426593,
        0.02690181]], dtype=float32)

In [141]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [142]:

interface = gr.Interface(fn=score_comment, inputs="text", outputs="text")

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://34667996bf0f80cf96.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




