# Name Entity Recognition
## Bidirectional-LSTM-CRF model

In [3]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import tensorflow_addons

In [5]:
tf.keras.backend.clear_session

<function tensorflow.python.keras.backend.clear_session>

In [10]:
!git clone https://github.com/quocdat32461997/BiLSTM-CRF.git
!mv BiLSTM-CRF BiLSTM_CRF

fatal: destination path 'BiLSTM-CRF' already exists and is not an empty directory.


### Download Entity-Annotated-Corpus

#### Option 1: through Google Drive
* Download corpus dataset from link: https://drive.google.com/file/d/1JZ4JXuJrEG1e9OiM1PEoRVtd9wcAIVz9/view?usp=sharing
* Upload corpus dataset back to your Google Colab under **/content** directory


#### Option 2: using Kaggle API 
* Generate and download Kaggle API token as **kaggle.json** file to **/content** directory of Google Colab
* Move **kaggle.json** to **~/.kaggle/kaggle.json** by command: **!mv kaggle.json ~/.kaggle/kaggle.json**
* Provide access by command: **!chmod 600 ~/.kaggle/kaggle.json**
* Download corpus dataset: **!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus**

In [15]:
try:
    !mv kaggle.json ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle datasets download -d abhinavwalia95/entity-annotated-corpus --unzip --force
except:
    print("Please see Option 1 to get Entity-Annotated-Corpus")

Downloading entity-annotated-corpus.zip to /content
 64% 17.0M/26.4M [00:00<00:00, 18.4MB/s]
100% 26.4M/26.4M [00:01<00:00, 26.0MB/s]


In [16]:
"""
utils.py - module to implement utils for BiLSTM-CRF
"""

import tensorflow as tf
import pandas as pd
from sklearn.utils import shuffle
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

class SentenceGetter(Sequence):
    """
    Inheritted class from tf.keras.utils.Sequence class to efficiently load
    data to Tensorflow/Keras model
    """
    def __init__(self, data, words, tags, maxlen, batch_size = 16, shuffle = False):
        """
        __init__ - initializer for SentenceGetter class
        Inputs:
            - data : String or Pandas DataFrame object
                Path string to file or dataframe object
            - words : set
                Set of distinct words
            - tags : set
                Set of distinct tags
        """

        if isinstance(data, str):
            # load data from Pandas file path
            data = pd.read_csv(data, encoding = 'latin1')
        elif isinstance(data, pd.DataFrame):
            # load data from Pandas DataFrame
            data = data
        else:
            raise Exception('Data is None or not found')
        self.word2dix = {w : i + 1 for i, w in enumerate(words)}
        self.tag2dix = {t : i for i, t in enumerate(tags)}
        self.n_tags = len(tags)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.maxlen = maxlen
        n_sent = 1
        self.grouped = data.groupby('Sentence #').apply(self.agg_func)\
            .reset_index().rename(columns = {0 : 'sentence'})['sentence']
        self.sentences = [s for s in self.grouped]

    def agg_func(self, input):
        """
        agg_func - function to group words/tags of sentences together
        """
        return [(w, p, t) for w, p, t in zip(input['Word'].values.tolist(),
            input['POS'].values.tolist(), input['Tag'].values.tolist())]

    def pad_sentences(self, input):
        input = [[self.word2dix[w[0]] for w in s] for s in input]
        return pad_sequences(maxlen = self.maxlen, sequences = input, padding = 'post', value = 0)
    
    def generate_labels(self, input):
        input = [[self.tag2dix[w[2]] for w in s] for s in input]
        input = pad_sequences(maxlen = self.maxlen, sequences = input, padding = 'post', value = self.tag2dix['O'])
        #return input
        return np.array([to_categorical(x, num_classes = self.n_tags) for x in input])
    def __len__(self):
        """
        __len__ - function to compute length of SentenceGetter
        """
        return int(self.grouped.shape[0] // self.batch_size)

    def __getitem__(self, index):
        if index == 0 and self.shuffle:
            # shuffle dataset for every iteration
            self.grouped = shuffle(self.grouped).reset_index()['sentence']

        # get batch of data
        sentences = self.grouped[index * self.batch_size : (index + 1) * self.batch_size]
        
        # generate sentences and labels
        labels = self.generate_labels(sentences)
        sentences = self.pad_sentences(sentences)
        
        return sentences, labels

### Import corpus

In [17]:
data = pd.read_csv('ner_dataset.csv', encoding = 'latin1')
data = data.fillna(method="ffill")

In [18]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


#### Build Dictionary of Words and Tags

In [19]:
# build list of distinct words
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

35179

In [20]:
# build list of distinct tags
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [21]:
# build dictionary of words and tags
max_len = 75
word2dix = {w : i + 1 for i, w in enumerate(words)}
tag2dix = {t : i for i, t in enumerate(tags)}

In [22]:
word2dix['Obama']

23870

In [23]:
tag2dix['O']

7

#### Generate Sentence Getter

In [37]:
batch_size = 32
getter = SentenceGetter(data = data, words = words, tags = tags, maxlen = max_len, batch_size = batch_size)

In [26]:
sentences = getter.sentences

#### Tokenize and prepare sentences

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
X = [[word2dix[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen = max_len, sequences = X, padding = 'post', value = 0)

In [29]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [15]:
X[0].shape

(75,)

In [16]:
y = [[tag2dix[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2dix["O"])

In [17]:
y[1]

array([ 8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  0,
        0,  0, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)

#### Build BiLSTM-CRF model

In [38]:
"""
output.py - module to store BiLSTM-CRF model
"""

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LSTM, Lambda, Embedding, TimeDistributed, Dropout, Bidirectional, Dense, Layer, InputSpec
import tensorflow_addons as tfa
from tensorflow_addons.text import crf_log_likelihood, viterbi_decode, crf_decode
import tensorflow.keras.backend as K

def embedding_layer(input_dim, output_dim, input_length, mask_zero):
    return Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length, mask_zero = mask_zero)
    
def bilstm_crf(maxlen, n_tags, embedding_dim, n_words, mask_zero, training = True):
    """
    bilstm_crf - module to build BiLSTM-CRF model
    Inputs:
        - input_shape : tuple
            Tensor shape of inputs, excluding batch size
    Outputs:
        - output : tensorflow.keras.outputs.output
            BiLSTM-CRF output
    """
    input = Input(shape = (maxlen,))
    # Embedding layer
    embeddings = embedding_layer(input_dim = n_words + 1, output_dim = embedding_dim, input_length = maxlen, mask_zero = mask_zero)
    output = embeddings(input)

    # BiLSTM layer
    output = Bidirectional(LSTM(units = 50, return_sequences = True, recurrent_dropout = 0.1))(output)

    # Dense layer
    output = TimeDistributed(Dense(n_tags, activation = 'relu'))(output)
    
    output = CRF(n_tags, name = 'crf_layer')(output)
    return Model(input, output)


class CRF(Layer):
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """    
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the the ground-truth label represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim) 
        self.sparse_target = sparse_target
        self.input_spec = InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

In [39]:
model = bilstm_crf(maxlen = max_len, n_tags = n_tags, embedding_dim = 20, n_words = n_words, mask_zero = True)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 75)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 75, 20)            703600    
_________________________________________________________________
bidirectional (Bidirectional (None, 75, 100)           28400     
_________________________________________________________________
time_distributed (TimeDistri (None, 75, 17)            1717      
_________________________________________________________________
crf_layer (CRF)              (None, 75, 17)            289       
Total params: 734,006
Trainable params: 734,006
Non-trainable params: 0
_________________________________________________________________


### Configure and train model

In [40]:
from tensorflow.keras.optimizers import Adam
model.compile(optimizer = Adam(learning_rate = 0.01), loss = model.layers[-1].loss, metrics = model.layers[-1].accuracy)

In [None]:
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
logging = TensorBoard(log_dir = './logs')
checkpoint = ModelCheckpoint('logs' + '/v1/ep{epoch:03d}-oss{loss:.3f}.h5', monitor = 'loss', save_weights_only = True, save_best_only = True, period = 3)
reduce_lr = ReduceLROnPlateau(monitor = 'loss', factor = 0.1, patience = 5, verbose = 1)
early_stopping = EarlyStopping(monitor = 'loss', min_delta = 0, patience = 10, verbose = 1)
steps_per_epoch = getter.__len__() / 4
model.fit(getter, epochs = 200, initial_epoch = 0, steps_per_epoch=steps_per_epoch, callbacks = [logging, checkpoint, reduce_lr], verbose = 1, shuffle = True)

Epoch 1/200
Instructions for updating:
use `tf.profiler.experimental.stop` instead.

In [None]:
tf.keras.models.save_model(model, filepath = 'bilstm_crf.tf', save_format = 'tf')