# Bert Model for Classification Toxic Comments

# Notebook content:

1. Import Libraries
2. Run Bert Model on TPU
3. Functions and Variables<br>
    3.1 Function for Encoding the comment<br>
    3.2 Function for Neural Network model<br>
4. Preprocessing
    4.1 Import Datasets<br>
    4.2 tokenaizer <br>
    4.3 Encode The Comments<br>
    4.4 Prepare tensorflow dataset for modeling<br>
5. Machine Learning<br>
    5.1 Training The Model, Tuning Hyper-Parameters<br>
    5.2 Testing The Model

    
    

# 1. Import Libraries

In [30]:
from numpy.random import seed
seed(40)
import tensorflow as tf
tf.random.set_seed(40)

In [31]:
import numpy as np 
import pandas as pd 
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

import os
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import BertWordPieceTokenizer, Tokenizer, models, pre_tokenizers, decoders, processors
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from tqdm.notebook import tqdm


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



# 2. Run Bert Model on TPU

In [33]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


# 3. Functions and Variables

## 3.1 Function for Encoding the comment

In [34]:
def regular_encode(texts, tokenizer, maxlen=512):
        # encode the word to vector of integer

    encode_dictionary = tokenizer.batch_encode_plus(texts, return_attention_masks=False, return_token_type_ids=False,
    pad_to_max_length=True,max_length=maxlen)
    
    return np.array(encode_dictionary['input_ids'])

## 3.2 Function for Neural Network Model

In [38]:
def build_model(transformer, max_len=512):

#Input: for define input layer
#shape is vector with 512-dimensional vectors
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
# to get the vector
    cls_token = sequence_output[:, 0, :]
# define output layer
    out = Dense(1, activation='sigmoid')(cls_token)
# initiate the model with inputs and outputs
    model = Model(inputs=input_word_ids, outputs=out)
# get the learning rate adam(1e-5) and the metrica
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
    
    return model

# 4. Preprocessing

In [40]:

# define variables for modeling use
EPOCHS = 3 #number of epochs in model
BATCH_SIZE = 16 * strategy.num_replicas_in_sync # the batch size in each epoch (128)
MAX_LEN = 192

# distilbert pre-trained model is faster than the bert base model, but it give lower accuracy than the bert base
#MODEL ='distilbert-base-multilingual-cased'

MODEL='bert-base-multilingual-cased'

In [41]:
#API to build highly flexible and efficient TensorFlow input pipelines.
AUTO = tf.data.experimental.AUTOTUNE

## 4.1 Import Datasets


In [42]:
train1 = pd.read_csv("../data/jigsaw-toxic-comment-train.csv")

valid = pd.read_csv('../data/validation.csv')
test = pd.read_csv('../data/test.csv')
sub = pd.read_csv('../data/sample_submission.csv')

## 4.2 tokenaizer 

In [43]:
#use the pre-trained model bert as a tokenizer 
#bert tokenizer has vocabulary for emoji. this is the reason we don't need to remove emoji from 
#datasets, for more details see the (EDA & data cleaning) notebook

tokenizer = AutoTokenizer.from_pretrained(MODEL)


## 4.3 Encode The Comments

In [44]:
%%time 
#call the function regular encode on for all the 3 dataset to convert each words after the tokenizer
#into a vector
#x_train,x_test, and x_validation will have the comment text column only,(in test called "content")
x_train = regular_encode(train1.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

#y_train,y_valid will have te target column "toxic"
y_train = train1.toxic.values
y_valid = valid.toxic.values

CPU times: user 14min 22s, sys: 442 ms, total: 14min 22s
Wall time: 14min 22s


## 4.4 Prepare Tensorflow Dataset For Modeling

In [45]:
# Create and prepare a source dataset from your input data to fit the model in the next step.
# Apply dataset transformations to preprocess the data.
# Iterate over the dataset and process the elements.

train_dataset = (
    tf.data.Dataset # create dataset
    .from_tensor_slices((x_train, y_train)) # Once you have a dataset, you can apply transformations 
    .repeat()
    .shuffle(2048,seed=40) # Combines consecutive elements of this dataset into batches.
    .batch(BATCH_SIZE)
    .prefetch(AUTO)) #This allows later elements to be prepared while the current element is being processed (pipline).


valid_dataset = (tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(BATCH_SIZE)
    .cache().prefetch(AUTO))

test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE))

# 5. Machine Learning

In [46]:
%%time
# in the TPU
with strategy.scope():
    #take the encoder results of bert from transformers and use it as an input in the NN model
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_bert_model_2 (TFBertModel ((None, 192, 768), (None, 177853440 
_________________________________________________________________
tf_op_layer_strided_slice_2  [(None, 768)]             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 769       
Total params: 177,854,209
Trainable params: 177,854,209
Non-trainable params: 0
_________________________________________________________________
CPU times: user 13.4 s, sys: 9.68 s, total: 23.1 s
Wall time: 23.5 s


## 5.1 Training The Model, Tuning Hyper-Parameters

In [47]:
#train the model
# training the data and tune our model with the results of the metrics we get from the validation dataset
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset, steps_per_epoch=n_steps, validation_data=valid_dataset,
                epochs=EPOCHS)

Train for 1746 steps, validate for 63 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


## 5.2 Testing The Mode

In [48]:
#test the model on validation
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(valid_dataset.repeat(), steps_per_epoch=n_steps,epochs=EPOCHS*2)

Train for 62 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [49]:
#predict and submit
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission.csv', index=False)



In [52]:
sub.head()

Unnamed: 0,id,toxic
0,0,0.001167953
1,1,3.72529e-06
2,2,0.9748423
3,3,2.104044e-05
4,4,9.536743e-07
