In [5]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [6]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [7]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:

from tensorflow.keras.layers import TextVectorization

In [9]:
# Model input: the raw comment strings.
X = df.loc[:, 'comment_text']
# Targets: every column from position 2 onward, as a 2-D NumPy array.
y = df.iloc[:, 2:].values

In [10]:
# Upper bound on the vocabulary size; passed to TextVectorization as max_tokens
# and used to size the embedding table.
MAX_FEATURES = 80_000

In [13]:
# Text -> integer-id layer: vocabulary capped at MAX_FEATURES, with a fixed
# output length of 800 token ids per comment.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=800,
)

In [14]:
# Fit the vectorizer's vocabulary on the full set of comments.
vectorizer.adapt(X.values)

In [16]:
# Convert every comment to its token-id sequence in a single call.
# NOTE(review): the whole result is materialized at once — confirm it fits in memory.
vectorized_text = vectorizer(X.values)

In [17]:
import json

# Snapshot the fitted vectorizer: its layer configuration and its vocabulary.
config = vectorizer.get_config()
vocab = vectorizer.get_vocabulary()

# Write both snapshots as pretty-printed JSON next to the notebook.
with open('vectorizer_config.json', 'w') as f:
    f.write(json.dumps(config, indent=4))

with open('vectorizer_vocab.json', 'w') as f:
    f.write(json.dumps(vocab, indent=4))


In [18]:
# tf.data pipeline order (MCSHBAP): map, cache, shuffle, batch, prefetch.
# Typical sources are from_tensor_slices and list_files.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/val/test partitions are carved out of this dataset with take()/skip().
# By default shuffle() draws a new order on every iteration, so each epoch and
# each evaluation pass would see a different partition and held-out samples would
# leak into training. Shuffling once, with a fixed seed, keeps the partitions
# disjoint and reproducible.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # prepare upcoming batches while the current one is consumed


In [19]:
# Partition the batched dataset 70% / 20% / 10% by batch count.
n_batches = len(dataset)
train = dataset.take(int(n_batches*.7))
val = dataset.skip(int(n_batches*.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Dropout, Bidirectional, Dense, Embedding

In [21]:
# Toxicity classifier: embedding -> bidirectional LSTM -> dense stack -> 6 sigmoid outputs.
model = Sequential([
    # 32-dimensional embedding for each token id
    Embedding(MAX_FEATURES+1, 32),
    # Read each sequence in both directions
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # One independent sigmoid score per label
    Dense(6, activation='sigmoid'),
])

In [22]:
# Build the model for 2-D input, leaving both the batch size and the sequence length unspecified.
model.build(input_shape=(None, None))

In [23]:
# Loss and optimizer are selected by name; no extra metrics are configured,
# so training reports the loss only.
model.compile(loss='BinaryCrossentropy', optimizer='Adam')


In [24]:
# Print the layer-by-layer architecture.
model.summary()

In [None]:
# Train for 5 epochs on the training split, validating on the validation split;
# `history.history` is plotted further down.
history = model.fit(train, epochs=5, validation_data=val)

Epoch 1/5
[1m 852/6981[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m4:33[0m 45ms/step - loss: 0.1567

In [None]:
from matplotlib import pyplot as plt

In [None]:
# DataFrame.plot() opens its own figure, so the size has to be passed to it;
# a preceding plt.figure(figsize=...) only produces an extra, empty figure.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

In [None]:
# Vectorize a single sample comment for a manual prediction.
input_text = vectorizer('You freaking suck! I am going to kill you.')

In [None]:
# Label column names, in the same order as the columns of `y`.
df.columns[2:]

In [None]:
# Pull a single batch from the test split as NumPy arrays.
batch = next(test.as_numpy_iterator())

In [None]:
# Display one test batch (features and labels) as NumPy arrays.
next(test.as_numpy_iterator())

In [None]:
# Add a leading batch axis so the single sequence is scored as a batch of one.
model.predict(np.expand_dims(input_text, 0))

In [None]:
# Same prediction as above, kept in `res` for the inspection that follows.
res = model.predict(np.expand_dims(input_text, 0))

In [None]:
# Binarize the scores: 1 where the score exceeds 0.4, else 0.
(res > 0.4).astype(int)

In [None]:
# Take one test batch and split it into features and labels.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Score the sampled test batch and binarize at the 0.4 threshold.
(model.predict(batch_X) > 0.4).astype(int)

In [None]:
# Shape of the single-comment prediction array.
res.shape

In [None]:
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy

In [None]:
# Metric objects that are updated batch by batch in the evaluation loop.
pre = Precision()
re = Recall()  # NOTE: this name would shadow the stdlib `re` module if it were ever imported here
acc = BinaryAccuracy()

In [None]:
# Start from a clean slate so re-running this cell does not double-count batches.
pre.reset_state()
re.reset_state()
acc.reset_state()

for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Score the batch directly; predict() would print a progress bar for
    # every one of these small batches.
    yhat = model.predict_on_batch(X_true)

    # Flatten so every label of every sample is scored as one binary decision
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
# Report the metric values accumulated over the test batches.
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')


In [None]:
# Column sums of the label columns, largest first (the count of positives per label, assuming 0/1 labels).
df[df.columns[2:]].sum().sort_values(ascending=False)

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
# Persist the trained model and the fitted vectorizer for use outside this notebook.
import pickle

# The model is written in the .keras format
model.save('toxicity_model.keras')

# The vectorizer is stored separately as two pickles: its config dict and its weight list.
# NOTE(review): depending on the Keras version, get_weights() may not contain the
# learned vocabulary — confirm before relying on vectorizer_weights.pkl alone.
vectorizer_config = vectorizer.get_config()
vectorizer_weights = vectorizer.get_weights()

for pkl_path, payload in (
    ('vectorizer_config.pkl', vectorizer_config),
    ('vectorizer_weights.pkl', vectorizer_weights),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Alternative: store only the settings and vocabulary needed to rebuild the vectorizer
def save_vectorizer_data():
    """Pickle the vectorizer's key settings and vocabulary to 'vectorizer_data.pkl'.

    Reads the module-level `vectorizer` and writes a dict with the keys
    'max_tokens', 'output_sequence_length' and 'vocabulary'.
    """
    import pickle  # local import so the function does not depend on cell order

    # Read the settings through get_config() rather than layer attributes:
    # TextVectorization is not guaranteed to expose `max_tokens` and
    # `output_sequence_length` as public attributes in every Keras version.
    layer_config = vectorizer.get_config()
    config = {
        'max_tokens': layer_config['max_tokens'],
        'output_sequence_length': layer_config['output_sequence_length'],
        'vocabulary': vectorizer.get_vocabulary()
    }
    with open('vectorizer_data.pkl', 'wb') as f:
        pickle.dump(config, f)

save_vectorizer_data()

# Download the saved files when running in Google Colab. Outside Colab the
# google.colab package does not exist, so the download step is skipped instead
# of aborting the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; skipping browser download")
else:
    files.download('toxicity_model.keras')
    files.download('vectorizer_config.pkl')
    files.download('vectorizer_weights.pkl')
    files.download('vectorizer_data.pkl')

print("Models and vectorizer saved successfully!")
print("Copy these files to your project's 'models' folder")

In [None]:
# Reload the model from the file written by model.save(), replacing the in-memory `model`.
model = tf.keras.models.load_model('toxicity_model.keras')

In [None]:
# Vectorize a sample string with the in-memory `vectorizer` (it is not reloaded from disk here).
input_str = vectorizer('hey i freaken hate you!')

In [None]:
# Score the sample as a batch of one with the reloaded model.
res = model.predict(np.expand_dims(input_str,0))

In [None]:
# Display the raw per-label scores.
res

In [None]:
def score_comment(comment):
    """Return one 'label: True/False' line per label column for a single comment.

    The comment is vectorized as a batch of one and scored by the module-level
    `model`; each label's score is compared against 0.5. Label names come from
    `df.columns[2:]`.
    """
    scores = model.predict(vectorizer([comment]))
    return ''.join(
        '{}: {}\n'.format(label, scores[0][position]>0.5)
        for position, label in enumerate(df.columns[2:])
    )

In [None]:
# Vectorize another sample comment (overwrites the earlier `input_text`).
input_text = vectorizer('I will murder you')

In [None]:
# Score the new sample as a batch of one.
model.predict(np.expand_dims(input_text, 0))

In [None]:
import gradio as gr

# One two-line text box in, plain text out; score_comment produces the report.
comment_box = gr.Textbox(lines=2, placeholder='Comment to score')
report_box = gr.Text()
interface = gr.Interface(fn=score_comment, inputs=comment_box, outputs=report_box)
interface.launch()


In [None]:
# Launch again with share=True.
# NOTE(review): the interface was already launched in the previous cell — confirm both launches are intended.
interface.launch(share=True)