Loading the data

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle

In [None]:
# Directory containing train.csv, test.csv and test_labels.csv.
# Set the TOXIC_DATA_DIR environment variable to run on another machine;
# when it is unset, the original local path is used unchanged.
file_path = os.environ.get(
    'TOXIC_DATA_DIR',
    'C:\\Users\\dell\\OneDrive\\Desktop\\Toxic_comments_classification',
)

In [None]:
# Read the three CSV files from the data directory, in the original order:
# test comments, test labels, then the training set.
tests, labels, trains = (
    pd.read_csv(os.path.join(file_path, csv_name))
    for csv_name in ('test.csv', 'test_labels.csv', 'train.csv')
)

In [None]:
# Preview the first rows of the frame loaded from test.csv.
tests.head()


In [None]:
# Preview the first rows of the frame loaded from test_labels.csv.
labels.head()

In [None]:
# Preview the first rows of the frame loaded from train.csv.
trains.head()

Preprocessing the data

In [None]:
from tensorflow.keras.layers import TextVectorization #TextVectorization converts text data to numerical format

In [None]:
# Model input: the raw comment text column.
X = trains.loc[:, 'comment_text']
# Targets: every column from the third one onward, as a NumPy array (one row per comment).
Y = trains.iloc[:, 2:].values

In [None]:
# Upper bound on the vocabulary size passed to the text vectorizer.
MAX_WORDS = 200_000

In [None]:
# Maps each comment to integer token ids with a fixed output length of 1800,
# using a vocabulary capped at MAX_WORDS entries.
vectorizer = TextVectorization(
    output_mode='int',
    output_sequence_length=1800,
    max_tokens=MAX_WORDS,
)

In [None]:
# Fit the vectorizer's vocabulary on the training comments.
vectorizer.adapt(X.values)

In [None]:
# Convert every training comment into a fixed-length sequence of integer token ids.
# NOTE(review): this materializes the whole vectorized training set in memory at once.
vectorized_data = vectorizer(X.values)

In [None]:
# Print the vectorized comments as a quick sanity check.
print(vectorized_data)

In [None]:
# Build the input pipeline from the vectorized comments and their label rows.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_data, Y))
dataset = dataset.cache()
# reshuffle_each_iteration=False freezes the shuffled order. With the default (True)
# the data is re-shuffled on every pass, so train/val/test subsets carved out with
# take()/skip() would contain different samples each epoch and leak into each other.
# If per-epoch shuffling of the training data is wanted, shuffle the training subset
# itself after it has been split off.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)  # 16 samples per batch
dataset = dataset.prefetch(8)  # prepare upcoming batches while the current one is processed

In [None]:
# Pull a single (inputs, labels) batch out of the pipeline as NumPy arrays.
batch_X, batch_Y = next(dataset.as_numpy_iterator())

In [None]:
# Shapes of one batch of inputs and of the matching batch of labels.
print(batch_X.shape)
print(batch_Y.shape)

In [None]:
# Split the batched dataset roughly 70% / 20% / 10% into train / validation / test.
# Boundaries are cumulative so the three parts are contiguous and disjoint: truncating
# each fraction independently can leave a gap (or an overlap) between validation and
# test, and silently drops the trailing batches.
num_batches = len(dataset)
train_size = int(num_batches * .7)
val_size = int(num_batches * .2)

train = dataset.take(train_size)
val = dataset.skip(train_size).take(val_size)
test = dataset.skip(train_size + val_size)  # everything that remains (about 10%)

In [None]:
# Iterator over the training batches as NumPy arrays; used below to inspect one batch.
train_gen = train.as_numpy_iterator() #yields (inputs, labels) batches one at a time

In [None]:
# Fetch (and display) the next training batch from the iterator.
next(train_gen)

Creating the Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
#Sequetial allows to define a linear stack of layers
model = Sequential()

# Embedding converts integer indices(representing words) into dense vectors of fixed size
model.add(Embedding(MAX_WORDS+1, 32))

# Bidirectional(Both forward and backward directions) LSTM(Process sequential data and updates its internal state and memory over time) 
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Dense layers used for feature extraction and non-linear transformation
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Output layer
model.add(Dense(6, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 15 epochs, evaluating on the validation split after each epoch.
history = model.fit(train, epochs = 15, validation_data=val)

In [None]:
# NOTE(review): this fits the same `model` object again, so it trains one more epoch
# on top of the previous run. `hist` is not read anywhere else in the visible notebook.
hist = model.fit(train, epochs = 1, validation_data=val)

In [None]:
# NOTE(review): another 10 epochs on the same `model` object. This overwrites `history`,
# so later uses of `history` only reflect these 10 epochs, not the earlier fit() calls.
history = model.fit(train, epochs = 10, validation_data=val)

In [None]:
# Per-epoch values recorded by the most recent fit() call assigned to `history`.
history.history

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the per-epoch values recorded by the last fit() call stored in `history`.
# DataFrame.plot() opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(figsize=...) only produces an extra, empty figure.
ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
ax.set(xlabel='epoch', ylabel='loss', title='Training history')
plt.show()

Make Predictions

In [None]:
input_text = 'You nigga!'

In [None]:
# Vectorize the sample string with the same vectorizer that was fitted on the training comments.
vectorized_input_text = vectorizer(input_text)

In [None]:
# Inspect the resulting token ids.
vectorized_input_text

In [None]:
# Label column names, in the same order as the columns used to build Y.
trains.columns[2:]

In [None]:
# Add a leading batch axis so the single vectorized comment can be passed to predict().
model.predict(np.expand_dims(vectorized_input_text, axis=0))

In [None]:
# Grab one (inputs, labels) batch from the test split.
batch = next(test.as_numpy_iterator())

In [None]:
# Grab one test batch again, this time unpacked into inputs and labels.
batch_X, batch_Y = next(test.as_numpy_iterator())

In [None]:
# Ground-truth label rows of the sampled test batch.
batch_Y

In [None]:
# Turn predicted probabilities into 0/1 decisions: 1 where the probability exceeds 0.5.
np.greater(model.predict(batch_X), 0.5).astype(int)

Model Evaluation

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics that are accumulated over the test batches.
pre = Precision()
re = Recall()
# The model emits six independent sigmoid outputs (multi-label), so accuracy has to be
# measured per label with a 0.5 threshold. CategoricalAccuracy compares argmax positions
# instead, which is meaningless for the flattened 0/1 label vectors it is updated with.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Start from clean accumulators so that re-running this cell does not count batches twice.
pre.reset_state()
re.reset_state()
acc.reset_state()

for batch in test.as_numpy_iterator():
    X_true, Y_true = batch

    # verbose=0: one predict() call per batch would otherwise print a progress bar each time.
    yhat = model.predict(X_true, verbose=0)

    # Flatten to 1-D so that every individual label decision counts as one sample.
    Y_true = Y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(Y_true, yhat)
    re.update_state(Y_true, yhat)
    acc.update_state(Y_true, yhat)

In [None]:
# Report the metrics accumulated over the test batches.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

In [None]:
# NOTE(review): exact duplicate of the previous cell; it prints the same accumulated metrics.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

In [None]:
# NOTE(review): exact duplicate of the two previous cells; candidate for removal.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Saving the model and vectorizer

In [None]:
# Save the trained model to 'toxicity.h5' (HDF5 format) in the current working directory.
model.save('toxicity.h5')

In [None]:
# Persist what is needed to rebuild the fitted TextVectorization layer later.
vectorizer_config = vectorizer.get_config()
vectorizer_weights = vectorizer.get_weights()

vectorizer_data = {
    'config': vectorizer_config,
    'weights': vectorizer_weights,
    # NOTE(review): depending on the TensorFlow version, get_weights() may come back
    # empty for this layer because the adapted vocabulary is kept in a lookup table.
    # Storing the vocabulary explicitly lets a loader restore it via set_vocabulary().
    'vocabulary': vectorizer.get_vocabulary(),
}

# Only unpickle this file from a trusted location: pickle can execute code on load.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer_data, f)