# 0. Install Dependencies and Bring in Data

In [1]:
# !pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [51]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [52]:
df = pd.read_csv(os.path.join('dataset', 'train.csv'))

In [53]:
df=df.head(100000)

In [54]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             100000 non-null  object
 1   comment_text   100000 non-null  object
 2   toxic          100000 non-null  int64 
 3   severe_toxic   100000 non-null  int64 
 4   obscene        100000 non-null  int64 
 5   threat         100000 non-null  int64 
 6   insult         100000 non-null  int64 
 7   identity_hate  100000 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 6.1+ MB


# 1. Preprocess

In [56]:
# !pip list

In [57]:
from tensorflow.keras.layers import TextVectorization

In [9]:
def clean_text(text):
    # Remove non-ASCII characters
    return text.encode("ascii", "ignore").decode("ascii")

In [10]:
df['comment_text']=df['comment_text'].apply(clean_text)

In [58]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [59]:
MAX_FEATURES = 200000 # number of words in the vocab

In [60]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [61]:
vectorizer.adapt(X.values)# Create a temporary model to wrap the TextVectorization layer


In [15]:
import pickle
pickle.dump(vectorizer,open('vectorizer.pkl','wb'))

In [16]:
vectorized_text = vectorizer(X.values)

In [17]:
vectorized_text

<tf.Tensor: shape=(100000, 1800), dtype=int64, numpy=
array([[   649,     76,      2, ...,      0,      0,      0],
       [154448,     54,   2520, ...,      0,      0,      0],
       [   408,    431,     70, ...,      0,      0,      0],
       ...,
       [    12,    289,     11, ...,      0,      0,      0],
       [ 24165,     23,      7, ...,      0,      0,      0],
       [   171,    454,    325, ...,      0,      0,      0]])>

In [18]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [19]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [20]:
train

<_TakeDataset element_spec=(TensorSpec(shape=(None, 1800), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.int64, name=None))>

# 2. Create Sequential Model

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

callback

In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [23]:
checkpoint_callback= ModelCheckpoint(
    filepath="comment_checkpoint/toxicity.keras",
    save_best_only=False,
    monitor='loss',
    verbose=1
)

In [24]:
INPUT_LENGTH = 1800  # Adjust according to your preprocessing
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))
model.build(input_shape=(None, INPUT_LENGTH))

In [25]:
model.summary()

In [26]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [27]:
# history = model.fit(train, epochs=6, validation_data=val , callbacks=[checkpoint_callback])

In [126]:
from matplotlib import pyplot as plt

In [28]:
# plt.figure(figsize=(8,5))
# pd.DataFrame(history.history).plot()
# plt.show()

# 3. Make Predictions

In [29]:
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.saving import register_keras_serializable

In [30]:
# Register LSTM as a serializable layer
register_keras_serializable()(LSTM)
model = load_model('comment_checkpoint/toxicity.keras', custom_objects={'LSTM': LSTM})

In [31]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [32]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([   7, 7041,  410, ...,    0,    0,    0])>

In [33]:
res = model.predict(np.array([input_text]))
res

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


array([[1.9249821e-02, 6.6645580e-06, 9.7844226e-04, 9.8925352e-04,
        3.5230853e-03, 1.0456189e-03]], dtype=float32)

In [34]:
(res > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0]])

In [35]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [36]:
(model.predict(batch_X) > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 864ms/step


array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [37]:
res.shape

(1, 6)

# 4. Evaluate Model

In [38]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [39]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [151]:
# for batch in test.as_numpy_iterator(): 
#     # Unpack the batch 
#     X_true, y_true = batch
#     # Make a prediction 
#     yhat = model.predict(X_true)
    
#     # Flatten the predictions
#     y_true = y_true.flatten()
#     yhat = yhat.flatten()
    
#     pre.update_state(y_true, yhat)
#     re.update_state(y_true, yhat)
#     acc.update_state(y_true, yhat)

In [140]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.814711332321167, Recall:0.7932910323143005, Accuracy:0.5120000243186951


# 5. Test and Gradio

In [40]:
# !pip install gradio jinja2

In [62]:
import tensorflow as tf
import gradio as gr

In [63]:
# model.save('toxicity.h5')

In [64]:
model = tf.keras.models.load_model('comment_checkpoint/toxicity.keras')

In [65]:
input_str = vectorizer('hey i freaken hate you!')

In [66]:
res = model.predict(np.expand_dims(input_str,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 644ms/step


In [67]:
res

array([[0.7414135 , 0.01107552, 0.20539875, 0.02094989, 0.30451754,
        0.07696238]], dtype=float32)

In [68]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [69]:
interface = gr.Interface(
    fn=score_comment, 
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs="text"
)

In [70]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://1c1be4b4b2d07fd95d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step
