**Training toxicity classifier model**

In [10]:
! pip install -q kaggle

Upload kaggle.json

In [None]:
## Opens the Colab upload dialog; pick the kaggle.json API token here so that
## the next cell can copy it into ~/.kaggle.
from google.colab import files
files.upload()

In [12]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Download dataset

In [None]:
! kaggle competitions download -c 'jigsaw-toxic-comment-classification-challenge'
! unzip 'jigsaw-toxic-comment-classification-challenge.zip'

In [None]:
! unzip 'train.csv.zip'
! unzip 'test.csv.zip'
! unzip 'test_labels.csv.zip'

Download the pretrained GloVe word embeddings

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [7]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding
from tensorflow.keras import layers

Load and preprocess dataset

In [16]:
## read the three competition CSVs
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test_labels = pd.read_csv('test_labels.csv')

## keep only the test rows whose 'toxic' label is not -1, then attach the
## comment text to them by id (right join -> one row per kept label row)
is_scored = df_test_labels['toxic'] != -1
df_test_labels_normalized = df_test_labels[is_scored]
df_test_normalized = df_test.set_index('id').join(
    df_test_labels_normalized.set_index('id'),
    how='right',
)

In [17]:
## input column and the six label columns
feature = ['comment_text']
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def _frame_to_dataset(frame):
  """Slice a DataFrame into a tf.data.Dataset of (text, labels) pairs."""
  return tf.data.Dataset.from_tensor_slices((frame[feature], frame[target]))


## convert both splits into tf.data.Dataset
train_data = _frame_to_dataset(df_train)
test_data = _frame_to_dataset(df_test_normalized)

Use TextVectorization to convert text to sequences

In [18]:
## max_tokens             -> upper bound on the number of words to be tokenized
## output_sequence_length -> every comment becomes a sequence of 50 token ids
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=50,
    pad_to_max_tokens=True,
)


def _text_only(text, label):
  """Drop the label so that only the input text is passed on."""
  return text


## the dataset yields (input, label) pairs; adapt() needs the input only
vectorizer.adapt(train_data.map(_text_only).batch(2000))

## lookup table: word -> its position in the learned vocabulary
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

Load pretrained word embedding and put it into dictionary

In [19]:
path_to_glove_file = "glove.6B.200d.txt"

## word -> float32 vector; every line of the file is "<word> <v1> <v2> ..."
embeddings_index = {}
## The GloVe text files are UTF-8 encoded. Passing the encoding explicitly
## avoids decode errors on systems whose default text encoding differs.
with open(path_to_glove_file, encoding="utf-8") as f:
  for line in f:
    ## split off the word; the remainder is the space-separated vector
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


Use only those pretrained word vectors whose words exist in vectorizer.get_vocabulary()

In [20]:
## The matrix has two more rows than the vocabulary. Only the rows listed in
## word_index (0 .. len(voc)-1) are ever written; all other rows stay zero.
num_tokens = len(voc) + 2
embedding_dim = 200  ## dimension of the vectors in the pretrained GloVe file

## Prepare embedding matrix: row i holds the pretrained vector of word i
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for token, row in word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    ## no pretrained vector for this word: its row is left all-zeros
    misses += 1
    continue
  embedding_matrix[row] = vector
  hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18416 words (1584 misses)


Initialize the embedding layer with the pretrained embedding vectors of the (up to 20000-word) vocabulary

In [None]:
## Lookup layer initialised from embedding_matrix; trainable=False keeps the
## pretrained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

Create sequential model

In [None]:
## 50 token ids -> pretrained embeddings -> two stacked bidirectional LSTMs
## -> dense layer -> 6 sigmoid outputs (one per target column)
classifier_layers = [
    layers.Input(shape=(50,), dtype='int64'),
    embedding_layer,
    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32, return_sequences=False)),
    layers.Dense(32, activation='relu'),
    layers.Dense(6, activation='sigmoid'),
]
model = tf.keras.Sequential(classifier_layers)

In [None]:
## print the layers of the model built in the previous cell
model.summary()

Transform train data and test data from raw text to sequences

In [28]:
## batch sizes used below
## batch size tradeoff (as observed by the author for this model):
## - big batch   -> better GPU utilization -> faster training -> lower accuracy
## - small batch -> worse GPU utilization  -> slower training -> higher accuracy
TRAIN_BATCH_SIZE = 32     ## small and slow to train, but gave the better f1 score here
TEST_BATCH_SIZE = 20000   ## test data is not used for training, so a big batch is fine


def to_sequence(x, y):
  """Convert the raw text `x` to sequences with the vectorizer; `y` is passed through."""
  sequences = vectorizer(x)
  return sequences, y


## batch, vectorize, cache, and prefetch
## NOTE: this rebinds train_data / test_data, so run the cell once per fresh kernel
train_data = (
    train_data
    .batch(TRAIN_BATCH_SIZE)
    .map(to_sequence)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)
test_data = (
    test_data
    .batch(TEST_BATCH_SIZE)
    .map(to_sequence)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

Compile and train model

In [None]:
## binary cross-entropy loss, with recall and precision reported at a
## decision threshold of 0.5
loss_fn = tf.keras.losses.BinaryCrossentropy()
threshold_metrics = [
    tf.keras.metrics.Recall(thresholds=0.5),
    tf.keras.metrics.Precision(thresholds=0.5),
]
model.compile(loss=loss_fn, optimizer='adam', metrics=threshold_metrics)

## train for 3 epochs; the test split is used as validation data
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=3,
    verbose=2,
)

Find out f1 score of the model

In [36]:
def _collect_labels(dataset):
  """Stack the label part of every (input, label) batch into one array.

  Concatenating whole batches avoids unbatching the dataset and unpacking it
  one example at a time, which is slow.
  """
  return np.concatenate([labels.numpy() for _, labels in dataset], axis=0)


def _precision_recall(labels, predictions, threshold=0.5):
  """Return (precision, recall) of `predictions` against `labels`.

  Both values are numpy scalars computed with the Keras metrics at the given
  decision threshold.
  """
  precision = tf.keras.metrics.Precision(thresholds=threshold)
  recall = tf.keras.metrics.Recall(thresholds=threshold)
  precision.update_state(labels, predictions)
  recall.update_state(labels, predictions)
  return precision.result().numpy(), recall.result().numpy()


def _f1_score(precision, recall):
  """Harmonic mean of precision and recall; 0.0 when both are zero."""
  total = float(precision) + float(recall)
  if total == 0.0:
    ## nothing predicted correctly: avoid 0/0 (nan) and report 0 instead
    return 0.0
  return 2.0 * float(precision) * float(recall) / total


## make prediction to all train and test dataset
train_prediction = model.predict(train_data.map(lambda x, y: x).rebatch(20000)) ## rebatch to big batch size so prediction is much faster
test_prediction = model.predict(test_data.map(lambda x, y: x))

## get the correct labels in the same order as the predictions
train_label = _collect_labels(train_data)
test_label = _collect_labels(test_data)

## calculate the metrics
train_precision, train_recall = _precision_recall(train_label, train_prediction)
test_precision, test_recall = _precision_recall(test_label, test_prediction)

## print the metric scores
print('precision train: ', train_precision)
print('recall train: ', train_recall)
print('precision test: ', test_precision)
print('recall test: ', test_recall)
print('f1 train: ', _f1_score(train_precision, train_recall))
print('f1 test: ', _f1_score(test_precision, test_recall))

precision train:  0.8649898
recall train:  0.67704713
precision test:  0.63464135
recall test:  0.68161124
f1 train:  0.7595652663133241
f1 test:  0.6572882447429294


Try predicting with the model

In [None]:
## Sanity check on one hand-written comment. The output has one score per
## target column, in the order of the `target` list.
np.set_printoptions(precision=8, suppress=True)  ## print in decimal number, not scientific
seq = vectorizer([['my nigga']])  ## turn text into sequence first using vectorizer
model.predict(seq)

Prepend the TextVectorization layer to the model, so that no separate preprocessing step is needed and raw text can be fed to the model directly

In [None]:
# The explicit input has shape (1,) (exactly one string per sample) and
# dtype string, so raw text can be fed in directly.
raw_text_input = tf.keras.Input(shape=(1,), dtype=tf.string)

# raw text -> vectorizer (token ids) -> trained classifier
end_to_end_model = tf.keras.Sequential([raw_text_input, vectorizer, model])

end_to_end_model.summary()

Try the end-to-end model

In [None]:
## same sample as before, but passed as raw text: vectorization now happens inside the model
end_to_end_model.predict([['my nigga']])  ## can directly input raw text, no need vectorizer

Save both models (with and without the TextVectorization layer) in the SavedModel format

In [None]:
## Both models are written as directories below ./saved_model.
## NOTE(review): newer Keras releases reject save() paths without a .keras/.h5
## extension - confirm against the TensorFlow version this notebook runs on.
model.save('./saved_model/model')   ## model without TextVectorization layer
end_to_end_model.save('./saved_model/end-to-end')  ## model with TextVectorization layer

Save to google drive

In [4]:
## Mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
## The copy below is commented out, so this cell only mounts the drive; enable
## the line to actually copy saved_model into Drive.
## NOTE(review): the relative path assumes the working directory is /content - confirm.
# ! cp -r saved_model gdrive/MyDrive

Mounted at /content/gdrive


Try to load the saved models

In [34]:
## Paths are relative to the current working directory and match the
## directories written by the save cell.

## load model without TextVectorization layer
loaded_model = tf.keras.models.load_model('saved_model/model') 

## load model with TextVectorization layer
loaded_end_to_end_model = tf.keras.models.load_model('saved_model/end-to-end')

Make prediction with loaded models

In [24]:
## Both loaded models should print identical scores for the same sample.
## The first call relies on the in-memory `vectorizer` from the vectorizer
## cell, so that cell must have been run in this kernel session.
print('loaded_model: ', loaded_model.predict( vectorizer([['my nigga']]) )) ## need vectorizer
print('loaded_end_to_end_model: ', loaded_end_to_end_model.predict( [['my nigga']] ))  ## no need vectorizer

loaded_model:  [[0.9763663  0.19614795 0.79277015 0.01046047 0.81799424 0.7410841 ]]
loaded_end_to_end_model:  [[0.9763663  0.19614795 0.79277015 0.01046047 0.81799424 0.7410841 ]]
