In 2018 we saw the rise of pre-training and fine-tuning in natural language processing. Large neural networks have been trained on general tasks like language modelling and then fine-tuned for classification tasks. One of the latest milestones in this development is the release of BERT. BERT is a model that broke several records for how well models can handle language-based tasks. The model is based on the transformer architecture from “Attention is all you need”. It was pre-trained in a bidirectional way on several language modelling tasks. So probably the new slogan should read “Attention and pre-training is all you need”. If you want more details about the model and its pre-training, you can find some resources [here](https://github.com/google-research/bert).

### Data preparation

In [1]:
import os
from urllib.request import urlretrieve
import zipfile
import glob

# Create the target directory; exist_ok makes this a no-op when it is already there.
os.makedirs('data', exist_ok=True)

# Download data
# NOTE(review): this is a signed URL with an `Expires` query parameter, so the
# link is presumably time-limited -- confirm it is still valid before running.
url ='https://storage.googleapis.com/kaggle-datasets/1014/4361/entity-annotated-corpus.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1568009584&Signature=hSdo87OA6a0gwPDkkK1eHg1l3bfLg%2FO9CTZ%2Fma6B%2F%2BmcwwU7OQiEmjKpgJ8ROWbPXrwjhED3u3dkas63MRbL1Rin3XUeWKU3y6TqgK%2FmleA3SVf6jBqXTOfRjyDaPXPNYdJLYFCWIDbygZPxoNEmXel3ZV%2B3MQgDOKH%2FzAP1NLuU5y6VHaFePdsruHAb1KICRY6qvsl5gFTYyBkJw3xO0qoF8oNkG3C4uUDaTEaqVK7FOfAw7OkkpTXqc9GtjUdsI3Dr11QNYgTmIOdreqk0fgr89QaenXBTfZlS8hqMu46Ik1VrX0Y5zfOSH7Rd3T5ltDvNNANlh%2FA%2BpJr0y16cHA%3D%3D'

urlretrieve(url, 'data/kaggle_ner.zip')

# Unpack the archive next to the downloaded zip file.
with zipfile.ZipFile('data/kaggle_ner.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

# List what ended up in the data directory (shown as the cell output).
glob.glob('data/*')

['data/ner_dataset.csv', 'data/kaggle_ner.zip', 'data/ner.csv']

In [0]:
import pandas as pd    
import numpy as np

# Load the token-per-row NER corpus extracted into data/.
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")
# Forward-fill missing cells from the row above. `DataFrame.ffill()` replaces
# the deprecated `fillna(method="ffill")` spelling.
# NOTE(review): presumably this propagates the "Sentence #" label to every
# token row of a sentence -- confirm against the CSV layout.
data = data.ffill()

class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences.

    The frame is grouped by its "Sentence #" column; each group becomes a
    list of ``(word, pos, tag)`` tuples built from the "Word", "POS" and
    "Tag" columns.

    Attributes:
        n_sent: 1-based number of the sentence `get_next` will return next.
        data: the DataFrame passed to the constructor.
        empty: always ``False``; kept for backward compatibility.
        grouped: Series mapping each "Sentence #" value to its tuple list.
        sentences: list of all tuple lists, in the order of `grouped`.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence group row by row.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when there is none left.

        Sentences are looked up by the label ``"Sentence: <n_sent>"``. The
        counter only advances when the lookup succeeds.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other
            # error is a real problem and should propagate.
            return None
        self.n_sent += 1
        return s
        
# Collect every sentence as a list of (word, pos, tag) tuples.
getter = SentenceGetter(data)
sentences = getter.sentences

max_len = 50
max_len_char = 10

# Sort the unique values so that the vocabulary and, more importantly, the
# tag -> index mapping are identical on every run. Iteration order of a set
# of strings can change between interpreter sessions, which would make
# predictions of a saved model impossible to decode reliably.
words = sorted(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)

tags = sorted(set(data["Tag"].values))
n_tags = len(tags)
tag2idx = {t: i for i, t in enumerate(tags)}

In [3]:
!pip install bert-tensorflow
import bert
from bert import tokenization



In [4]:
import tensorflow as tf
#from tensorflow.python.framework.ops import disable_eager_execution
import tensorflow_hub as hub

#disable_eager_execution()


# This is a path to a cased (case-preserving) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"


def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the settings published by the Hub module.

  The module's "tokenization_info" signature is evaluated in a throwaway
  graph and session to obtain the vocab file and the do_lower_case flag,
  which are then handed to bert.tokenization.FullTokenizer.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)


tokenizer = create_tokenizer_from_hub_module()

W0906 07:21:35.690908 139686195558272 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [0]:
# X collects the word-piece sequences, y the matching tag sequences.
X = []
y = []
for sentence in sentences:
  # Every sequence opens with [CLS], which is labelled 'O'.
  bert_tokens = ["[CLS]"]
  ner_tag = ['O']
  orig_to_tok_map = []

  # Each entry is a (word, pos, tag) tuple; the POS tag is not used here.
  for word, _pos, word_tag in sentence:
    # Remember where the first word piece of this word will land.
    orig_to_tok_map.append(len(bert_tokens))

    # A word may split into several word pieces; each piece gets the
    # word's tag so tokens and labels stay aligned one-to-one.
    pieces = tokenizer.tokenize(word)
    bert_tokens += pieces
    ner_tag += [word_tag] * len(pieces)

  # Close the sequence with [SEP], also labelled 'O'.
  bert_tokens.append("[SEP]")
  ner_tag.append('O')

  X.append(bert_tokens)
  y.append(ner_tag)

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length: longer sequences are cut at the end, shorter ones
# are padded at the end.
MAX_LEN = 75

# Word pieces -> vocabulary ids, then pad/truncate to MAX_LEN.
token_id_seqs = [tokenizer.convert_tokens_to_ids(tokens) for tokens in X]
input_ids = pad_sequences(token_id_seqs, maxlen=MAX_LEN, dtype="long",
                          truncating="post", padding="post")

# Tag names -> tag indices, padded with the index of the "O" tag.
# NOTE: this rebinds `tags`, which up to here held the list of tag names.
tag_id_seqs = [[tag2idx.get(label) for label in labels] for labels in y]
tags = pad_sequences(tag_id_seqs, maxlen=MAX_LEN, value=tag2idx["O"],
                     padding="post", dtype="long", truncating="post")

The BERT model supports something called an attention mask, which is similar to masking in Keras. Here we create the mask so that the padded elements in the sequences are ignored.

In [0]:
# One mask row per sequence: 1.0 where the token id is positive, 0.0 elsewhere.
# NOTE(review): assumes padding positions hold id 0 -- confirm against the
# pad_sequences call that builds input_ids.
attention_masks = [
    [1.0 if token_id > 0 else 0.0 for token_id in id_row]
    for id_row in input_ids
]

In [0]:
from sklearn.model_selection import train_test_split

# Split ids, tags and masks in a single call so the three stay row-aligned by
# construction, instead of relying on two separate calls sharing a
# random_state. 10% of the sequences are held out for validation.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

### Building model

#### Wrap the BERT model into a tensorflow keras layer

In [0]:
from tensorflow.keras import backend as K

class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping the BERT TF-Hub module for token-level output.

    The layer is called with a list of three tensors
    ``[input_ids, input_mask, segment_ids]`` and returns the module's
    ``sequence_output`` (one vector of size ``output_size`` per token).

    Args:
        n_fine_tune_layers: how many of the module's trailing *variables*
            (after dropping the "/cls/" and "pooler" ones) are registered as
            trainable. Note that this counts variables, not transformer
            blocks. ``0`` registers none as trainable.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and register its variables as layer weights."""
        self.bert = hub.Module(
            BERT_MODEL_HUB,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        trainable_vars = self.bert.variables

        # Remove unused layers
        trainable_vars = [var for var in trainable_vars if not ("/cls/" in var.name or 'pooler' in var.name)]

        # Select how many variables to fine tune. A plain `[-0:]` slice would
        # return the whole list, so zero is handled explicitly.
        if self.n_fine_tune_layers:
            trainable_vars = trainable_vars[-self.n_fine_tune_layers:]
        else:
            trainable_vars = []

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Add non-trainable weights. Membership is checked by object identity
        # so the test never depends on how variables implement `==`.
        trainable_ids = {id(var) for var in self._trainable_weights}
        for var in self.bert.variables:
            if id(var) not in trainable_ids:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]``."""
        # The three inputs are cast to int32 before being fed to the module.
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )

        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
        return result['sequence_output']

    def compute_output_shape(self, input_shape):
        """Return ``(batch, seq_len, output_size)`` for the token-level output.

        ``input_shape`` is a list with one shape per input; batch size and
        sequence length are taken from the first one (the input ids).
        """
        ids_shape = input_shape[0]
        return (ids_shape[0], ids_shape[1], self.output_size)

Build the model

In [34]:
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Lambda
from tensorflow.keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D, add

# The three model inputs expected by BertLayer, each of length MAX_LEN.
in_id, in_mask, in_segment = (
    Input(shape=(MAX_LEN,), name=input_name)
    for input_name in ("input_ids", "input_masks", "segment_ids")
)
bert_inputs = [in_id, in_mask, in_segment]


def _bilstm(sequence):
    """Feed `sequence` through a new bidirectional LSTM that returns sequences."""
    recurrent = LSTM(units=512, return_sequences=True,
                     recurrent_dropout=0.2, dropout=0.2)
    return Bidirectional(recurrent)(sequence)


# Instantiate the custom Bert Layer defined above
bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)

# Two stacked BiLSTMs; the output of the first is added to the output of the
# second before classification.
x = _bilstm(bert_output)
x_1 = _bilstm(x)
x_add = add([x, x_1])

# Per-token softmax over the n_tags labels.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x_add)

model = Model(bert_inputs, out)

# Labels are integer class ids, hence the sparse loss.
adam = tf.keras.optimizers.Adam(clipnorm = 1.)
model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 75)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 75)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 75)]         0                                            
__________________________________________________________________________________________________
bert_layer_6 (BertLayer)        (None, None, 768)    108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]          

In [0]:
# Add a trailing axis to the tag ids before handing them to model.fit.
tr_y = np.expand_dims(tr_tags, axis=2)
val_y = np.expand_dims(val_tags, axis=2)
batch_size = 32

# Inputs in the order [ids, masks, segment ids]; segment ids are all zeros.
train_features = [tr_inputs, np.array(tr_masks), np.zeros_like(tr_inputs)]
val_features = [val_inputs, np.array(val_masks), np.zeros_like(val_inputs)]

with tf.Session() as sess:
    # Initialize all graph variables in this session before training.
    sess.run(tf.compat.v1.global_variables_initializer())
    history = model.fit(
        train_features,
        tr_y,
        validation_data=(val_features, val_y),
        batch_size=batch_size,
        epochs=5,
        verbose=1,
    )

Train on 43163 samples, validate on 4796 samples
Epoch 1/5
Epoch 2/5

[The Illustrated BERT, ELMo, and co. (How NLP Cracked Transfer Learning)](https://jalammar.github.io/illustrated-bert/)