In [None]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

!pip list

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding, Input

In [None]:
# Read the training CSV from the Colab working directory.
df = pd.read_csv(r'/content/train.csv')

df.head()

# Model input is the raw comment text; targets are every column from
# position 2 onward, taken as a NumPy array.
X = df['comment_text']
y = df.loc[:, df.columns[2:]].to_numpy()

# Vocabulary cap handed to the text vectorizer.
MAX_FETURES = 200000
# Sequence length fed to the model's Input layer.
SEQUENCE_LENGTH = 1800

In [None]:
# Turn raw comment strings into fixed-length integer token sequences.
# The output length comes from SEQUENCE_LENGTH (not a repeated literal) so it
# always matches the shape declared on the model's Input layer.
vectorizer = TextVectorization(max_tokens=MAX_FETURES,
                               output_sequence_length=SEQUENCE_LENGTH,
                               output_mode='int')

# Learn the vocabulary from the full set of comments.
vectorizer.adapt(X.values)

# Vectorize every comment up front; the dataset pipeline slices this tensor.
vectorized_txt = vectorizer(X.values)

# Pipeline order (MCSHBAP): map, cache, shuffle, batch, prefetch.
# Sources: from_tensor_slices, list_files.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_txt, y))
dataset = dataset.cache()
# Shuffle ONCE with a fixed order. By default shuffle() draws a new order on
# every iteration, so the take()/skip() splits below would pick different
# examples on each pass and train/val/test would overlap (validation and test
# data leaking into training).
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

# Split by batch count: first 70% train, next 20% validation, final 10% test.
n_batches = len(dataset)
train = dataset.take(int(n_batches*.7))
val = dataset.skip(int(n_batches*.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))




In [None]:
# Classifier: token ids -> embedding -> bidirectional LSTM -> dense head ->
# six sigmoid outputs, each an independent probability.
model = Sequential()
model.add(Input(shape=(SEQUENCE_LENGTH,)))
# The sequence length is already fixed by the Input layer above, so the
# deprecated `input_length` argument is not passed to Embedding.
# NOTE(review): TextVectorization documents max_tokens as already counting the
# padding and OOV entries, so the +1 looks like a spare row rather than a
# required slot. It is kept so the layer's weight shape does not change.
model.add(Embedding(input_dim=MAX_FETURES+1,
                    output_dim=32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
# Sigmoid (not softmax): each output is scored on its own.
model.add(Dense(6, activation='sigmoid'))



In [None]:

# Configure training with the Adam optimizer and binary cross-entropy loss,
# then print the layer-by-layer summary.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

model.summary()

In [None]:
# Train for 10 epochs on the train split, evaluating on val after each epoch.
history = model.fit(train, validation_data=val, epochs=10)

Epoch 1/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m759s[0m 107ms/step - loss: 0.0840 - val_loss: 0.0445
Epoch 2/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m746s[0m 107ms/step - loss: 0.0467 - val_loss: 0.0405
Epoch 3/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 106ms/step - loss: 0.0420 - val_loss: 0.0368
Epoch 4/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m744s[0m 107ms/step - loss: 0.0381 - val_loss: 0.0354
Epoch 5/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m738s[0m 106ms/step - loss: 0.0352 - val_loss: 0.0310
Epoch 6/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m732s[0m 105ms/step - loss: 0.0321 - val_loss: 0.0282
Epoch 7/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 105ms/step - loss: 0.0289 - val_loss: 0.0277
Epoch 8/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m731s[0m 105ms/step - loss: 0.0273 - val_loss:

In [None]:
# Score one hand-written comment: vectorize it and add a leading batch axis.
input_text = tf.expand_dims(
    vectorizer('You freaking suck! I am going to hit you.'), 0)

res = model.predict(input_text)
print("Prediction probabilities:", res)
print("Binary predictions:", (res > 0.5).astype(int))
print("Result shape:", res.shape)

# Score the first batch drawn from the test split.
batch_X, batch_y = next(test.as_numpy_iterator())
batch_predictions = model.predict(batch_X)
print("\nBatch predictions (binary):", (batch_predictions > 0.5).astype(int))
print("Batch predictions shape:", batch_predictions.shape)

# Per-label breakdown of the single-comment prediction: probability and the
# 0/1 decision at a 0.5 threshold, labelled with the target column names.
categories = df.columns[2:]
for i, category in enumerate(categories):
    print(f"{category}: {res[0][i]:.4f} -> {int(res[0][i] > 0.5)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 366ms/step
Prediction probabilities: [[0.9980788  0.06591689 0.9807509  0.0068792  0.9053064  0.00975615]]
Binary predictions: [[1 0 1 0 1 0]]
Result shape: (1, 6)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 376ms/step

Batch predictions (binary): [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Batch predictions shape: (16, 6)
toxic: 0.9981 -> 1
severe_toxic: 0.0659 -> 0
obscene: 0.9808 -> 1
threat: 0.0069 -> 0
insult: 0.9053 -> 1
identity_hate: 0.0098 -> 0


In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics accumulated over every batch of the test split.
pre = Precision()
re = Recall()
# The targets are independent 0/1 flags, so accuracy must threshold each score
# on its own. CategoricalAccuracy compares argmax positions instead, which is
# meaningless on flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()

for batch in test.as_numpy_iterator():
    # Unpack the batch into inputs and ground-truth labels
    X_true, y_true = batch
    # Predict quietly: verbose=0 avoids printing one progress bar per batch
    yhat = model.predict(X_true, verbose=0)

    # Flatten so every (sample, label) pair counts as one binary decision
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

In [None]:
!pip install gradio jinja2

In [None]:
import gradio as gr

In [None]:
# Round-trip the trained network through an HDF5 file and rebind `model` to
# the reloaded copy.
model.save('toxicityModel.h5')
model = tf.keras.models.load_model('toxicityModel.h5')

# Sanity check on the reloaded model: vectorize one comment, add a batch axis,
# and score it.
input_str = vectorizer('hey i freaken hate you!')
res = model.predict(np.expand_dims(input_str, axis=0))

res



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step


array([[0.89187646, 0.00154522, 0.0268265 , 0.01062214, 0.14537996,
        0.01921617]], dtype=float32)

In [None]:
import gradio as gr

def score_comment(comment):
    """Score one comment and report a True/False flag per label column.

    The comment is wrapped in a single-element list for ``vectorizer``, the
    result is passed to ``model.predict``, and each score in the first output
    row is compared against 0.5.

    Args:
        comment: Raw comment text to classify.

    Returns:
        A string with one ``"<label>: <True|False>"`` line per column of
        ``df.columns[2:]``, each terminated by a newline.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]
    return ''.join(
        '{}: {}\n'.format(label, scores[pos]>0.5)
        for pos, label in enumerate(label_names)
    )

# Wrap score_comment in a text-in / text-out Gradio app.
interface = gr.Interface(
    title="Toxicity Classifier",
    description="Enter a comment to check for different types of toxicity.",
    fn=score_comment,
    inputs=gr.Textbox(placeholder='Comment to score', lines=2),
    outputs=gr.Textbox(),
)

# share=True asks Gradio for a public URL in addition to the local server.
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7782f35da6c63ab689.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import pickle
# Serialize the in-memory model object to model.pkl in binary mode.
with open('model.pkl', mode='wb') as f:
    pickle.dump(model, f)