# BERT Transformer Classifier
### with HuggingFace and Tensorflow 2

In [1]:
# slient install
!pip install -q -r requirements.txt
!pip install  -q -i https://test.pypi.org/simple/ EuroPy

## TODO: 
- [ ] publish model to a s3.

In [None]:
# all imports
import os, time
from datetime import datetime
from tqdm.notebook import trange, tqdm

import europy
from europy.utils import load_global_params
from europy.decorators import using_params
from europy.decorators import model_details
from europy.lifecycle import reporting

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten
print(f'TensorFlow Version: {tf.__version__}')
print(f'GPU Devices: {tf.config.list_physical_devices("GPU")}')

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer
from transformers import TFBertModel
from transformers import create_optimizer

from model import BertClassifier

In [2]:
params = load_global_params('params.yml')

  - global.pre_trained_model: bert-base-uncased
  - global.max_seq_len: 128
  - global.train_percent: 0.1
  - global.batch_size: 32
  - global.num_epochs: 1
  - global.label_cols: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  - global.test_size: 0.1
  - global.learning_rate: 2e-05


## Load Data

In [6]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /home/b/dev/EuroPy-Examples/toxic_comment_classification
 97%|████████████████████████████████████▊ | 51.0M/52.6M [00:02<00:00, 29.4MB/s]
100%|██████████████████████████████████████| 52.6M/52.6M [00:02<00:00, 25.8MB/s]


In [7]:
!mkdir data
!unzip -o jigsaw-toxic-comment-classification-challenge.zip -d data/
!rm jigsaw-toxic-comment-classification-challenge.zip
!unzip -o data/sample_submission.csv.zip -d data/
!rm data/sample_submission.csv.zip
!unzip -o data/train.csv.zip -d data/
!rm data/train.csv.zip
!unzip -o data/test.csv.zip -d data/
!rm data/test.csv.zip
!unzip -o data/test_labels.csv.zip -d data/
!rm data/test_labels.csv.zip

mkdir: cannot create directory ‘data’: File exists
Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv.zip       
  inflating: data/test_labels.csv.zip  
  inflating: data/train.csv.zip      
Archive:  data/sample_submission.csv.zip
  inflating: data/sample_submission.csv  
Archive:  data/train.csv.zip
  inflating: data/train.csv          
Archive:  data/test.csv.zip
  inflating: data/test.csv           
Archive:  data/test_labels.csv.zip
  inflating: data/test_labels.csv    


In [3]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
test_labels_path = 'data/test_labels.csv'
subm_path = 'data/sample_submission'

In [4]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_test_labels = pd.read_csv(test_labels_path).set_index('id')

df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Tokenization

In [5]:
# define DistilBERT Tokenizer
tokenizer = BertTokenizer.from_pretrained(
    params['pre_trained_model'],
    do_lower_case=True
)

In [6]:
@using_params('params.yml')
def tokenize_sentences(sentences, tokenizer, max_seq_len=128):
    tokenized_sentences = []
    
    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
            sentence,
            add_special_tokens=True,
            max_length=max_seq_len
        )
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences

In [7]:
def create_attention_masks(tokenized_add_padded_sentences):
    attention_masks = []
    
    for sentence in tqdm(tokenized_add_padded_sentences):
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)
    return np.asarray(attention_masks)

In [8]:
# create tokenized sentences (will take a few minutes)
input_ids = tokenize_sentences(
    df_train['comment_text'],
    tokenizer
)

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



  - global.max_seq_len: 128


In [9]:
input_ids = pad_sequences(
    input_ids,
    maxlen=params['max_seq_len'],
    dtype='long',
    value=0,
    truncating='post',
    padding='post'
)

In [10]:
attention_masks = create_attention_masks(input_ids)

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




## Training

In [11]:
# Split data
labels = df_train[params['label_cols']].values

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids,
    labels,
    random_state=0,
    test_size=params['test_size']
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks,
    labels,
    random_state=0,
    test_size=params['test_size']
)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [13]:
@using_params('params.yml')
def create_dataset(data_tuple, num_epochs=1, batch_size=32, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset.prefetch(1)
    
    return dataset

In [13]:
# train & validation datasets
train_dataset = create_dataset(
    (train_inputs, train_masks, train_labels)
)
validation_dataset = create_dataset(
    (validation_inputs, validation_masks, validation_labels)
)

  - global.batch_size: 32
  - global.num_epochs: 1
  - global.batch_size: 32
  - global.num_epochs: 1


In [8]:
# define the model
class BertClassifier(tf.keras.Model):
    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')
        
    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask
        )
        
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
        
        return cls_output

In [9]:
# init the model
model = BertClassifier(TFBertModel.from_pretrained(params['pre_trained_model']), len(params['label_cols']))

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
# setup optimizer
steps_per_epoch = train_size // params['batch_size']
validation_steps = validation_size // params['batch_size']

loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * params['num_epochs'] - warmup_steps
optimizer, lr_scheduler = create_optimizer(
    init_lr=params['learning_rate'],
    num_train_steps=total_steps,
    num_warmup_steps=warmup_steps
)

train_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(params['label_cols']))]
validation_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(params['label_cols']))]

In [21]:
# training loops
@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    
    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    print(optimizer)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    
    for i, auc in enumerate(train_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])

@tf.function
def validation_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    
    predictions = model(token_ids, attention_mask=masks, training=False)
    v_loss = loss_object(labels, predictions)
    
    validation_loss(v_loss)
    for i, auc in enumerate(validation_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
        

@model_details('model_details.yml')
@using_params('params.yml')
def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, num_epochs=1, label_cols=[]):
    for epoch in range(num_epochs):
        start = time.time()
        
        for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if i % 1000 == 0:
                print(f'\nTrain Step: {i}, Loss: {train_loss.result()}')
                for i, label_name in enumerate(label_cols):
                    print(f'{label_name} roc_auc {train_auc_metrics[i].result()}')
                    train_auc_metrics[i].reset_states()
                    
        for i, (token_ids, masks, labels) in enumerate(tqdm(val_dataset, total=val_steps_per_epoch)):
            validation_step(model, token_ids, masks, labels)
            print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Time: {time.time()-start}\n')
            for i, label_name in enumerate(label_cols):
                print(f'{label_name} roc_auc {validation_auc_metrics[i].result()}')
                validation_auc_metrics[i].reset_states()
            print('\n')

In [22]:
# run the training loop (GPU Required)
#  - ~45 minutes per epoch on GTC 1080 8gb GDDR5
train(
    model,
    train_dataset,
    validation_dataset,
    train_steps_per_epoch=steps_per_epoch,
    val_steps_per_epoch=validation_steps
)

HBox(children=(FloatProgress(value=0.0, max=4487.0), HTML(value='')))

<tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7f47829100d0>

Train Step: 0, Loss: 0.0479954294860363
toxic roc_auc 0.9859084486961365
severe_toxic roc_auc 0.9881337285041809
obscene roc_auc 0.9925582408905029
threat roc_auc 0.9754266142845154
insult roc_auc 0.9881128072738647
identity_hate roc_auc 0.9877621531486511

Train Step: 1000, Loss: 0.046741850674152374
toxic roc_auc 0.9872294664382935
severe_toxic roc_auc 0.9913434386253357
obscene roc_auc 0.9892247319221497
threat roc_auc 0.9791333079338074
insult roc_auc 0.9895721077919006
identity_hate roc_auc 0.9878745675086975

Train Step: 2000, Loss: 0.04517212137579918
toxic roc_auc 0.9924564361572266
severe_toxic roc_auc 0.9913961291313171
obscene roc_auc 0.9951384663581848
threat roc_auc 0.9867134690284729
insult roc_auc 0.9931793808937073
identity_hate roc_auc 0.9916273355484009

Train Step: 3000, Loss: 0.04408381134271622
toxic roc_auc 0.9921501874923706
severe_toxic roc_auc 0.9888016581535339
obscene roc_auc 0.9931632

HBox(children=(FloatProgress(value=0.0, max=498.0), HTML(value='')))


Epoch 1, Validation Loss: 0.04559934511780739, Time: 2752.4213852882385

toxic roc_auc 0.9888392686843872
severe_toxic roc_auc 0.0
obscene roc_auc 0.9916665554046631
threat roc_auc 1.0
insult roc_auc 0.9672130942344666
identity_hate roc_auc 0.0



Epoch 1, Validation Loss: 0.04397928714752197, Time: 2752.6515316963196

toxic roc_auc 0.977011501789093
severe_toxic roc_auc 0.0
obscene roc_auc 1.0
threat roc_auc 0.0
insult roc_auc 0.0
identity_hate roc_auc 0.0



Epoch 1, Validation Loss: 0.04496576637029648, Time: 2752.8782136440277

toxic roc_auc 0.98333340883255
severe_toxic roc_auc 0.0
obscene roc_auc 0.9999998807907104
threat roc_auc 0.0
insult roc_auc 0.9942529201507568
identity_hate roc_auc 0.0



Epoch 1, Validation Loss: 0.04607111215591431, Time: 2753.1121504306793

toxic roc_auc 1.0
severe_toxic roc_auc 1.0
obscene roc_auc 1.0
threat roc_auc 0.0
insult roc_auc 1.0
identity_hate roc_auc 1.0



Epoch 1, Validation Loss: 0.0488148033618927, Time: 2753.352360010147

toxic roc_auc 

## Testing Model Inference

In [15]:
# create testing tokens (will take a few minutes)
test_input_ids = tokenize_sentences(df_test['comment_text'], tokenizer)
test_input_ids = pad_sequences(test_input_ids, maxlen=params['max_seq_len'], dtype='long', value=0, truncating='post', padding='post')
test_attention_masks = create_attention_masks(test_input_ids)

HBox(children=(FloatProgress(value=0.0, max=153164.0), HTML(value='')))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



  - global.max_seq_len: 128


HBox(children=(FloatProgress(value=0.0, max=153164.0), HTML(value='')))




In [16]:
# run inference
# - ~15 minutes on GTX 1080
test_step = len(df_test) // params['batch_size']
print(test_step)

test_dataset = create_dataset(
    (test_input_ids, test_attention_masks),
    batch_size=params['batch_size'],
    train=False, 
    num_epochs=1
)

4786
  - global.batch_size: 32
  - global.num_epochs: 1


In [18]:
test_dataset

<BatchDataset shapes: ((None, 128), (None, 128)), types: (tf.int64, tf.int64)>

In [None]:
df_submission = pd.read_csv('data/sample_submission.csv', index_col='id')

for i, (token_ids, masks) in enumerate(tqdm(test_dataset, total=test_step)):
    sample_ids = df_test.iloc[i*params['batch_size']:(i+1)*params['batch_size']]['id']
    predictions = model(token_ids, attention_mask=masks).numpy()
    
    df_submission.loc[sample_ids, params['label_cols']] = predictions

In [36]:
# dump results
df_submission.to_csv('data/submission.csv')

## Save Model

In [37]:
!mkdir models
now = datetime.now()
model.save_weights(f'models/{now:%Y-%m-%d %H:%M}_bert.h5')


mkdir: cannot create directory ‘models’: File exists


## Model Testing

In [None]:
# load model
model()
model.load_weights('models/2020-11-23 08:42_bert.h5')

In [11]:
model

<__main__.BertClassifier at 0x7f83bf4b9790>