# BERT Transformer Classifier
### with HuggingFace and Tensorflow 2

In [1]:
# slient install
!pip install -q -r requirements.txt
!pip install  -q -i https://test.pypi.org/simple/ EuroPy

In [2]:
# all imports
import os, time
from datetime import datetime
from tqdm import trange, tqdm

import europy
from europy.helpers import load_global_params
from europy.decorators import using_params
from europy.decorators import model_details
from europy.lifecycle import reporting

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten
print(f'TensorFlow Version: {tf.__version__}')
print(f'GPU Devices: {tf.config.list_physical_devices("GPU")}')

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer
from transformers import TFBertModel
from transformers import create_optimizer



TensorFlow Version: 2.3.1
GPU Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
params = load_global_params('params.yml')

## Load Data

In [6]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /home/b/dev/EuroPy-Examples/toxic_comment_classification
 97%|████████████████████████████████████▊ | 51.0M/52.6M [00:02<00:00, 29.4MB/s]
100%|██████████████████████████████████████| 52.6M/52.6M [00:02<00:00, 25.8MB/s]


In [7]:
!mkdir data
!unzip -o jigsaw-toxic-comment-classification-challenge.zip -d data/
!rm jigsaw-toxic-comment-classification-challenge.zip
!unzip -o data/sample_submission.csv.zip -d data/
!rm data/sample_submission.csv.zip
!unzip -o data/train.csv.zip -d data/
!rm data/train.csv.zip
!unzip -o data/test.csv.zip -d data/
!rm data/test.csv.zip
!unzip -o data/test_labels.csv.zip -d data/
!rm data/test_labels.csv.zip

mkdir: cannot create directory ‘data’: File exists
Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv.zip       
  inflating: data/test_labels.csv.zip  
  inflating: data/train.csv.zip      
Archive:  data/sample_submission.csv.zip
  inflating: data/sample_submission.csv  
Archive:  data/train.csv.zip
  inflating: data/train.csv          
Archive:  data/test.csv.zip
  inflating: data/test.csv           
Archive:  data/test_labels.csv.zip
  inflating: data/test_labels.csv    


In [4]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
test_labels_path = 'data/test_labels.csv'
subm_path = 'data/sample_submission'

In [5]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_test_labels = pd.read_csv(test_labels_path).set_index('id')

df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Tokenization

In [6]:
# define DistilBERT Tokenizer
tokenizer = BertTokenizer.from_pretrained(
    params['pre_trained_model'],
    do_lower_case=True
)

In [7]:
@using_params('params.yml')
def tokenize_sentences(sentences, tokenizer, max_seq_len=128):
    tokenized_sentences = []
    
    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
            sentence,
            add_special_tokens=True,
            max_length=max_seq_len
        )
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences

In [8]:
def create_attention_masks(tokenized_add_padded_sentences):
    attention_masks = []
    
    for sentence in tqdm(tokenized_add_padded_sentences):
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)
    return np.asarray(attention_masks)

In [9]:
# create tokenized sentences (will take a few minutes)
input_ids = tokenize_sentences(
    df_train['comment_text'],
    tokenizer
)

  0%|          | 0/159571 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 159571/159571 [02:22<00:00, 1121.56it/s]


In [10]:
input_ids = pad_sequences(
    input_ids,
    maxlen=params['max_seq_len'],
    dtype='long',
    value=0,
    truncating='post',
    padding='post'
)

In [11]:
attention_masks = create_attention_masks(input_ids)

100%|██████████| 159571/159571 [00:07<00:00, 21584.30it/s]


## Training

In [12]:
# Split data
labels = df_train[params['label_cols']].values

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids,
    labels,
    random_state=0,
    test_size=params['test_size']
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks,
    labels,
    random_state=0,
    test_size=params['test_size']
)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [13]:
@using_params('params.yml')
def create_dataset(data_tuple, num_epochs=1, batch_size=32, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset.prefetch(1)
    
    return dataset

In [14]:
# train & validation datasets
train_dataset = create_dataset(
    (train_inputs, train_masks, train_labels)
)
validation_dataset = create_dataset(
    (validation_inputs, validation_masks, validation_labels)
)

In [15]:
# define the model
class BertClassifier(tf.keras.Model):
    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')
        
    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask
        )
        
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
        
        return cls_output

In [16]:
# init the model
model = BertClassifier(TFBertModel.from_pretrained(params['pre_trained_model']), len(params['label_cols']))

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [17]:
# setup optimizer
steps_per_epoch = train_size // params['batch_size']
validation_steps = validation_size // params['batch_size']

loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * params['num_epochs'] - warmup_steps
optimizer, lr_scheduler = create_optimizer(
    init_lr=params['learning_rate'],
    num_train_steps=total_steps,
    num_warmup_steps=warmup_steps
)

train_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(params['label_cols']))]
validation_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(params['label_cols']))]

In [20]:
# training loops
@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    
    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    print(optimizer)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    
    for i, auc in enumerate(train_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])

@tf.function
def validation_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    
    predictions = model(token_ids, attention_mask=masks, training=False)
    v_loss = loss_object(labels, prediction)
    
    validation_loss(v_loss)
    for i, auc in enumerate(validation_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
        

@model_details('model_details.yml')
@using_params('params.yml')
def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, num_epochs=1, label_cols=[]):
    for epoch in range(num_epochs):
        start = time.time()
        
        for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if i % 1000 == 0:
                print(f'\nTrain Step: {i}, Loss: {train_loss.result()}')
                for i, label_name in enumerate(label_cols):
                    print(f'{label_name} roc_auc {train_auc_metrics[i].result()}')
                    train_auc_metrics[i].reset_states()
                    
        for i, (token_ids, masks, labels) in enumerate(tqdm(val_dataset, val_steps_per_epoch)):
            validation_step(model, token_ids, masks, labels)
            print(f'\n{Epoch} {epoch+1}, Validation Loss: {validation_loss.result()}, Time: {time.time()-start}\n')
            for i, label_name in enumerate(label_cols):
                print(f'{label_name} roc_auc {validation_auc_metrics[i].result()}')
                validation_auc_metrics[i].reset_states()
            print('\n')

In [None]:
# run the training loop (GPU Required)
#  - ~45 minutes per epoch on GTC 1080 8gb GDDR5
train(
    model,
    train_dataset,
    validation_dataset,
    train_steps_per_epoch=steps_per_epoch,
    val_steps_per_epoch=validation_steps
)

  0%|          | 0/4487 [00:00<?, ?it/s]

<tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7fc598cb2040>


  0%|          | 1/4487 [00:03<3:55:32,  3.15s/it]


Train Step: 0, Loss: 1.0569530725479126
toxic roc_auc 0.3895631730556488
severe_toxic roc_auc 0.7484276294708252
obscene roc_auc 0.6311532855033875
threat roc_auc 0.2912317216396332
insult roc_auc 0.5377174019813538
identity_hate roc_auc 0.4254571199417114


  4%|▎         | 167/4487 [01:39<41:42,  1.73it/s]

## Testing Model Inference

In [31]:
# create testing tokens (will take a few minutes)
test_input_ids = tokenize_sentences(df_test['comment_text'], tokenizer)
test_input_ids = pad_sequences(test_input_ids, maxlen=params.max_len, dtype='long', value=0, truncating='post', padding='post')
test_attention_masks = create_attention_masks(test_input_ids)

100%|██████████| 153164/153164 [02:10<00:00, 1172.72it/s]


AttributeError: 'dict' object has no attribute 'max_len'

In [None]:
# run inference
# - ~15 minutes on GTX 1080
test_step = len(df_test) // params.batch_size
print(test_step)

test_dataset = create_dataset(
    (test_input_ids, test_attention_masks),
    batch_size=params.batch_size,
    train=False, 
    epochs=1
)

df_submission = pd.read_csv('data/sample_submission.csv', index_col='id')

for i, (token_ids, masks) in enumerate(tqdm(test_dataset, total=test_step)):
    sample_ids = df_test.iloc[i*params.batch_size:(i+1)*params.batch_size]['id']
    predictions = model(token_ids, attention_mask=masks).numpy()
    
    def_submission.loc[sample_ids, label_cols] = predictions

In [None]:
# dump results
df_submission.to_csv('data/submission.csv')

## Save Model

In [None]:
!mkdir models
now = datetime.now()
model.save_weights(f'models/{now:%Y-%m-%d %H:%M}_bert.h5')


## TODO: Explore Results

In [None]:
reporting.flush()