In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np




In [17]:
# Build the CSV location from its path components, then load it into a frame.
train_csv_path = os.path.join('dataset', 'train.csv', 'train.csv')
df = pd.read_csv(train_csv_path)

In [18]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [19]:
# Inspect the raw text of the row labelled 5 (single .loc lookup instead of
# chained indexing).
df.loc[5, 'comment_text']

'"\n\nCongratulations from me as well, use the tools well. \xa0Â· talk "'

In [20]:
# Every column from position 2 onward is a label flag; show them for row 5.
label_columns = df.columns[2:]
df[label_columns].iloc[5]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 5, dtype: int64

In [21]:
from tensorflow.keras.layers import TextVectorization

In [22]:
# Features: the raw comment text. Targets: the label columns (position 2
# onward) as a plain NumPy matrix.
label_columns = df.columns[2:]
X = df['comment_text']
Y = df[label_columns].values

In [23]:
# Display the comment Series (pandas truncates the repr for long Series).
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [24]:
# Display the label matrix (NumPy truncates the repr for large arrays).
Y


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [25]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
VOCAB = 200_000

In [26]:
# Text -> integer-id vectorizer: each word maps to an integer and every comment
# comes out as a sequence of exactly 2000 ids.
vectors = TextVectorization(
    max_tokens=VOCAB,
    output_sequence_length=2000,
    output_mode='int',
)
# Learn the vocabulary from the full set of comments.
vectors.adapt(X.values)





In [27]:
# Spot-check the learned vocabulary: which token sits at index 500?
vectors.get_vocabulary()[500]

'able'

In [28]:
# Vectorize a sample sentence and keep only its first six ids (one per word);
# the rest of the fixed-length output is not shown.
vectors('This is my deep learning project')[:6]

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([  14,    9,   29, 2630, 1988,  463], dtype=int64)>

In [29]:
# Vectorize every comment in X in one call. Despite the name, this tensor holds
# the whole corpus (it is split into train/validation/test later), not just a
# test subset.
vectorized_test = vectors(X.values)
vectorized_test

<tf.Tensor: shape=(159571, 2000), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [30]:
# Input pipeline: slices -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_test, Y))
dataset = dataset.cache()
# The train/validation/test splits are carved out of this dataset with
# take()/skip(). By default shuffle() draws a NEW order every time the dataset
# is iterated, so the examples landing in each split would change on every
# epoch and for every new iterator, and the "held-out" batches would contain
# examples the model has already trained on. Shuffle once, with a fixed seed,
# so the three splits stay disjoint and reproducible.
# Trade-off: the training order is the same in every epoch.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(32)
# Prepare up to 8 batches ahead of the consumer.
dataset = dataset.prefetch(8)


In [31]:
# Pull a single (features, labels) batch from the pipeline as NumPy arrays.
batch_X, batch_Y = next(dataset.as_numpy_iterator())

In [32]:
# Sanity-check the batch shapes: (batch, sequence length) and (batch, labels).
batch_X.shape , batch_Y.shape

((32, 2000), (32, 6))

In [33]:
# Split by batch count: first 70% for training, the next 20% for validation,
# and the final 10% (starting at the 90% mark) for testing.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_validation = int(n_batches * .2)
n_test = int(n_batches * .1)
test_offset = int(n_batches * .9)

train = dataset.take(n_train)
validation = dataset.skip(n_train).take(n_validation)
test = dataset.skip(test_offset).take(n_test)

In [34]:
# Grab the first training batch as a (features, labels) tuple of NumPy arrays.
train_gen = next(train.as_numpy_iterator())


In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Bidirectional , Dense , Embedding

In [36]:
model = Sequential([
    # Token ids -> 32-dimensional embeddings over sequences of 2000 ids.
    Embedding(input_dim=VOCAB + 1, output_dim=32, input_length=2000),
    # Read each sequence in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extraction stack.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer: one sigmoid output per label (6 labels).
    Dense(6, activation='sigmoid'),
])

In [37]:
# Six independent sigmoid outputs -> binary cross-entropy per label.
# The bare 'accuracy' string is resolved by Keras from the output shape: with a
# 6-wide output it becomes categorical (argmax) accuracy, which is meaningless
# for multi-label targets. BinaryAccuracy thresholds every label at 0.5
# instead; name='accuracy' keeps the same key in the training logs and History.
model.compile(
    loss='BinaryCrossentropy',
    optimizer='Adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)




In [38]:
# Build for batches of 2000-id sequences (batch size left open), then print
# the layer-by-layer summary.
model.build((None, 2000))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2000, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [39]:
# Train for 3 epochs, evaluating on the validation split after each one.
# model.fit's return value is kept under the name trained_model.
trained_model = model.fit(
    train,
    epochs=3,
    validation_data=validation,
)

Epoch 1/3

Epoch 2/3
Epoch 3/3


In [40]:
# Label names, in the same order as the model's six outputs.
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [54]:
# Vectorize one comment, add a leading batch axis, and score it.
text = vectors('You freaking idiot')
single_batch = np.expand_dims(text.numpy(), axis=0)
model.predict(single_batch)



array([[0.98793554, 0.10535523, 0.80579853, 0.02998092, 0.68297017,
        0.1286179 ]], dtype=float32)

In [46]:
# Take the first batch of the test split as NumPy arrays.
batch_X, batch_Y = next(test.as_numpy_iterator())

In [47]:
# Per-label probabilities for the batch, turned into 0/1 flags at 0.5.
batch_probs = model.predict(batch_X)
(batch_probs > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [48]:
from tensorflow.keras.metrics  import Precision , Recall, CategoricalAccuracy

In [58]:
# Streaming metric objects, updated batch by batch during evaluation.
pre, re = Precision(), Recall()


In [59]:
# Start from a clean slate so that re-running this cell does not stack counts
# on top of a previous pass.
pre.reset_state()
re.reset_state()

for X_true, y_true in test.as_numpy_iterator():
    # Each item is already a single batch, so run it directly; this avoids
    # model.predict's per-call overhead and one progress bar per batch.
    yhat = model.predict_on_batch(X_true)

    # Flatten both arrays so all six labels are pooled into one set of
    # binary decisions before updating the metrics.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)



In [67]:
# Read the accumulated metric values and report them.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}')

Precision: 0.8422289490699768, Recall:0.6868327260017395


In [57]:
pip install gradio jinja2

Collecting gradio
  Downloading gradio-5.33.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.3 (from gradio)
  Downloading gradio_client-1.10.3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting huggingface-hub>=0.28.1 (from gradio)
  Downloading huggingface_hub-0.32.6-py3-none-any.whl.metadata (14 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.18-cp311-cp311-win_amd64.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.0 kB ? eta -:--:--
     ------------------- -------------------- 20.5/43.0 kB ? eta -:--:--
     -----------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [68]:
import gradio as gr

In [69]:
# Persist the trained model to disk; the .h5 extension selects the HDF5 file
# format. The same filename is loaded back in the next cell.
model.save('toxicity.h5')

  saving_api.save_model(


In [70]:
# Reload the model from the file written above, replacing the in-memory
# `model`, so everything below runs against the saved artifact.
model = tf.keras.models.load_model('toxicity.h5')

In [84]:
# Label names again, for reading the prediction vectors below.
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [89]:
# Score one hand-written comment: wrap it in a 1-element string tensor,
# vectorize it, and predict the six label probabilities.
sample_comment = tf.constant(['You piece of shit, keep yourself off the social media'])
input_com = vectors(sample_comment)
res = model.predict(input_com)
res



array([[0.99344486, 0.09705123, 0.9322427 , 0.01054964, 0.6398469 ,
        0.06109979]], dtype=float32)

In [105]:
def score_comment(comment):
    """Score one comment against every toxicity label.

    Args:
        comment: Raw comment text (a single string).

    Returns:
        A string with one ``label: True`` / ``label: False`` line per label
        column of ``df`` (each line newline-terminated). A label is True when
        its predicted probability exceeds 0.5.
    """
    # The vectorizer is fed a batch, so wrap the comment in a 1-element tensor.
    vectorized_comment = vectors(tf.constant([comment]))

    # verbose=0 keeps the Keras progress bar out of the console on every request.
    results = model.predict(vectorized_comment, verbose=0)

    # Both outcomes use the same capitalisation (the negative case used to be
    # rendered as lowercase 'false').
    return ''.join(
        '{}: {}\n'.format(col, 'True' if results[0][idx] > 0.5 else 'False')
        for idx, col in enumerate(df.columns[2:])
    )


In [106]:
# Gradio UI: a two-line text box feeding score_comment, with plain-text output.
comment_box = gr.Textbox(lines=2, placeholder='Comment to score')
interface = gr.Interface(
    fn=score_comment,
    inputs=comment_box,
    outputs='text',
)

In [107]:
# Start the app. share=True also publishes a temporary public URL in addition
# to the local one, so anyone with that link can reach the model while it runs.
interface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7864
* Running on public URL: https://2624f3981e5786ba88.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




