In [None]:
!pip3 install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [38]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [39]:
df = pd.read_csv(os.path.join('/Users/admin/Desktop/AI:ML/Comment_Toxicity/Comments/train.csv'))

In [40]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
!pip3 list

In [42]:
from tensorflow.keras.layers import TextVectorization

In [43]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [44]:
MAX_FEATURES = 200000 # number of words in the vocab

In [45]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [46]:
vectorizer.adapt(X.values)

In [47]:
vectorized_text = vectorizer(X.values)

In [48]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [49]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [51]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [52]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [53]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [54]:
history = model.fit(train, epochs=5, validation_data=val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [80]:
from matplotlib import pyplot as plt

In [84]:

plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot(figsize=(8,5))
plt.show()


<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 1 Axes>

In [85]:
input_text = vectorizer(['You freaking suck! I am going to hit you.'])


In [86]:
res = model.predict(input_text)



In [87]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [88]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [89]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [90]:
res.shape

(1, 6)

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy



In [64]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [66]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8901331424713135, Recall:0.8124309182167053, Accuracy:0.49949848651885986


In [None]:
!pip3 install gradio jinja2

In [68]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicity.keras')



In [70]:
model = tf.keras.models.load_model('toxicity.h5')

In [71]:
input_str = vectorizer('hey i freaken hate you!')
res = model.predict(np.expand_dims(input_str,0))



In [72]:
res

array([[0.8523209 , 0.00156536, 0.03623167, 0.02344071, 0.21181281,
        0.06226681]], dtype=float32)

In [73]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [93]:
import gradio as gr

interface = gr.Interface(fn=score_comment, 
                         inputs="text",
                         outputs="text")

interface.launch()


Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




In [75]:
interface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on public URL: https://45664d23d9e30ce138.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


