#### Importing libraries

In [43]:
import os
import numpy as np
import pandas as pd

import csv
from collections import Counter

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout

from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras import backend as K

import tensorflow_addons as tfa

#### Getting the data

For
this task, the Groningen Meaning Bank (GMB) data set will be used. This dataset is
not considered a gold standard. This means that this data set is built using automatic
tagging software, followed by human raters updating subsets of the data. 

The following named entities are tagged in
this corpus:
* geo = Geographical entity
* org = Organization
* per = Person
* gpe = Geopolitical entity
* tim = Time indicator
* art = Artifact
* eve = Event
* nat = Natural phenomenon


To download dataset:

In [44]:
!wget https://gmb.let.rug.nl/releases/gmb-2.2.0.zip
!unzip gmb-2.2.0.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


#### Looking at data

We will be using only
files named en.tags in various subdirectories. These files are tab-separated files with
each word of a sentence in a row. 

In [3]:
data_path = 'gmb-2.2.0'
output_fn = 'gmb-2.2.0/cleaned.csv'

In [3]:
def get_filenames_by_extension(data_path, extension):
    fnames = []
    
    for root, dirs, files in os.walk(data_path):
        for filename in files:
            if filename.endswith(extension):
                file_path = os.path.join(root, filename)
                fnames.append(file_path)
                
    return fnames

In [4]:
tags = get_filenames_by_extension(data_path, '.tags')

print('Length of tags: ', len(tags))

Length of tags:  10000


A few processing steps need to happen. Each file has a number of sentences, with
each words in a row. The entire sentence as a sequence and the corresponding
sequence of NER tags need to be fed in as inputs while training the model. As
mentioned above, the NER tags also need to be simplified to the top-level entities
only. Secondly, the NER tags need to be converted to the IOB format.

In [5]:
def strip_ner_subcat(tag):
    # NER tags are of form {cat}-{subcat}
    # eg tim-dow. We only want first part
    return tag.split("-")[0]

def iob_format(ners):
    # converts IO tags into IOB format
    # input is a sequence of IO NER tokens
    # convert this: O, PERSON, PERSON, O, O, LOCATION, O
    # into: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    iob_tokens = []
    for idx, token in enumerate(ners):
        if token != 'O': # !other
            if idx == 0:
                token = "B-" + token #start of sentence
            elif ners[idx-1] == token:
                token = "I-" + token # continues
            else:
                token = "B-" + token
        iob_tokens.append(token)
        iob_tags[token] += 1
    return iob_tokens

def process_data(tags):
    total_sentences = 0
    outfiles = []
    rows = []
    for idx, file in enumerate(tags):
        with open(file, 'rb') as content:
            data = content.read().decode('utf-8').strip()
            sentences = data.split("\n\n")

            total_sentences += len(sentences)

            for sentence in sentences:
                toks = sentence.split('\n')
                words, pos, ner = [], [], []

                for tok in toks:
                    t = tok.split("\t")
                    words.append(t[0])
                    pos.append(t[1])
                    ner_tags[t[3]] += 1
                    ner.append(strip_ner_subcat(t[3]))
                rows.append([" ".join(words), " ".join(iob_format(ner)), " ".join(pos)])
    return rows

In [6]:
ner_tags = Counter()
iob_tags = Counter()

In [7]:
data = process_data(tags)
df = pd.DataFrame(data)
df.columns = ['text', 'label', 'pos']
df.to_csv(os.path.join(data_path, 'dataset.csv'), index=False)

In [4]:
df = pd.read_csv(os.path.join(data_path, 'dataset.csv'))

In [5]:
text_tok = Tokenizer(filters='[\\]^\t\n', lower=False,
                    split=' ', oov_token='<OOV>')

pos_tok = Tokenizer(filters='[\\]^\t\n', lower=False,
                    split=' ', oov_token='<OOV>')

ner_tok = Tokenizer(filters='[\\]^\t\n', lower=False,
                    split=' ', oov_token='<OOV>')

text_tok.fit_on_texts(df['text'])
pos_tok.fit_on_texts(df['pos'])
ner_tok.fit_on_texts(df['label'])

In [6]:
ner_config = ner_tok.get_config()
text_config = text_tok.get_config()

In [7]:
text_vocab = eval(text_config['index_word'])
ner_vocab = eval(ner_config['index_word'])

In [8]:
ner_vocab

{'1': '<OOV>',
 '2': 'O',
 '3': 'B-geo',
 '4': 'B-tim',
 '5': 'B-org',
 '6': 'I-per',
 '7': 'B-per',
 '8': 'I-org',
 '9': 'B-gpe',
 '10': 'I-geo',
 '11': 'I-tim',
 '12': 'B-art',
 '13': 'B-eve',
 '14': 'I-art',
 '15': 'I-eve',
 '16': 'I-gpe',
 '17': 'B-nat',
 '18': 'I-nat'}

In [9]:
x_tok = text_tok.texts_to_sequences(df['text'])
y_tok = ner_tok.texts_to_sequences(df['label'])

In [10]:
max_len = 50

x_pad = sequence.pad_sequences(x_tok, padding='post', maxlen=max_len)
y_pad = sequence.pad_sequences(y_tok, padding='post', maxlen=max_len)

print(x_pad.shape)

(62010, 50)


Since there are
multiple labels, each label token needs to be one-hot encoded like so:

In [11]:
num_classes = len(ner_vocab) + 1
Y = tf.keras.utils.to_categorical(y_pad, num_classes=num_classes)
Y.shape

(62010, 50, 19)

In [12]:
y_pad

array([[ 2,  2,  2, ...,  0,  0,  0],
       [ 2,  2,  2, ...,  0,  0,  0],
       [ 2,  2,  2, ...,  0,  0,  0],
       ...,
       [ 2,  2,  2, ...,  0,  0,  0],
       [ 2,  4, 11, ...,  0,  0,  0],
       [ 2,  2,  2, ...,  0,  0,  0]])

In [13]:
Y

array([[[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

BiLSTM

In [14]:
vocab_size = len(text_vocab) + 1
# The embedding dimension
embedding_dim = 64
# Number of RNN units
rnn_units = 100
#batch size
BATCH_SIZE=90
# num of NER classes
num_classes = len(ner_vocab)+1
dropout=0.2

In [15]:
def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, classes):
    model = tf.keras.Sequential([
        
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  batch_input_shape=[batch_size,None]),
        
        Bidirectional(LSTM(units=rnn_units,
                           return_sequences=True,
                           dropout=dropout,
                           kernel_initializer=tf.keras.initializers.he_normal())),
        
        TimeDistributed(Dense(rnn_units, activation='relu')),
        Dense(num_classes, activation="softmax")
    ])
    
    return model

After the embedding layer,
there is a BiLSTM layer, followed by a TimeDistributed dense layer. This last
layer is different from the sentiment analysis model, where there was only a single
unit for binary output. In this problem, for each word in the input sequence, an
NER token needs to be predicted. So, the output has as many tokens as the input
sequence. Consequently, output tokens correspond 1-to-1 with input tokens and
are classified as one of the NER classes. The TimeDistributed layer provides this
capability.

In [30]:
model = build_model_bilstm(vocab_size = vocab_size,
                           embedding_dim=embedding_dim,
                           rnn_units=rnn_units,
                           batch_size=BATCH_SIZE,
                           classes=num_classes)
model.summary()

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (90, None, 64)            2523072   
_________________________________________________________________
bidirectional (Bidirectional (90, None, 200)           132000    
_________________________________________________________________
time_distributed (TimeDistri (None, None, 100)         20100     
_________________________________________________________________
dense_1 (Dense)              (None, None, 19)          1919      
Total params: 2,677,091
Trainable params: 2,677,091
Non-trainable params: 0
_________________________________________________________________


In [31]:
X = x_pad
# create training and testing splits
total_sentences = 62010
test_size = round(total_sentences / BATCH_SIZE * 0.2)

X_train = X[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]
X_test = X[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]

In [32]:
model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x18b7af349a0>

In [33]:
model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)



[0.10129605978727341, 0.9614012837409973]

### BiLSTM + Conditional Random Fields

In [16]:
!pip install tensorflow_addons==0.11.2

Collecting tensorflow_addons==0.11.2
  Downloading tensorflow_addons-0.11.2-cp38-cp38-win_amd64.whl (911 kB)
Collecting typeguard>=2.7
  Downloading typeguard-2.12.0-py3-none-any.whl (16 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.11.2 typeguard-2.12.0


In [45]:
class CRFLayer(Layer):
    def __init__(self, label_size, mask_id=0, trans_params=None, name='crf', **kwargs):
        
        super(CRFLayer, self).__init__(name=name, **kwargs)
        
        self.label_size = label_size
        self.mask_id = mask_id
        self.transition_params = None
        
        if trans_params is None:
            self.transition_params = tf.Variable(tf.random.uniform(shape=(label_size, label_size)),
                                                 trainable=False)
        else:
            self.transition_params = trans_params
            
    def call(self, inputs, seq_length, training=None):
        
        if training is None:
            training = L.learning_phase()
            
        if training:
            return inputs
        return inputs
    
    
    def loss(self, y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(self.get_proper_labels(y_true), y_pred.dtype)

        seq_lengths = self.get_seq_lengths(y_true)
        log_likelihoods, self.transition_params = tfa.text.crf_log_likelihood(y_pred, y_true, seq_lengths)
        
        self.transition_params = tf.Variable(self.transition_params, trainable=False)
        loss = - tf.reduce_mean(log_likelihoods)
        
        return loss
    
    def get_proper_labels(self, y_true):
        shape = y_true.shape
        if len(shape) > 2:
            return tf.argmax(y_true, -1, output_type=tf.int32)
        return y_true
    
    def get_seq_lengths(self, matrix):
        mask = tf.not_equal(matrix, self.mask_id)
        seq_lengths = tf.math.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=-1)
        return seq_lengths

In [46]:
class NerModel(tf.keras.Model):
    
    def __init__(self, hidden_num, vocab_size, label_size, embedding_size, name='BilstmCrfModel', **kwargs):
        
        super(NerModel, self).__init__(name=name, **kwargs)
        
        self.num_hidden = hidden_num
        self.vocab_size = vocab_size
        self.label_size = label_size
        
        self.embedding = Embedding(vocab_size, embedding_size, mask_zero=True, name='embedding')
        self.biLSTM = Bidirectional(LSTM(hidden_num, return_sequences=True, name='bilstm'))
        self.dense = TimeDistributed(tf.keras.layers.Dense(label_size), name='dense')
        self.crf = CRFLayer(self.label_size, name='crf')
        
    def call(self, text, labels=None, training=None):
        seq_length = tf.math.reduce_sum(tf.cast(tf.math.not_equal(text, 0), dtype=tf.int32), axis=-1)
        
        if training is None:
            training = K.learning_phase()
            
        inputs = self.embedding(text)
        bilstm = self.biLSTM(inputs)
        logits = self.dense(bilstm)
        outputs = self.crf(logits, seq_length, training)
        
        return outputs


In [47]:
vocab_size = len(text_vocab) + 1

embedding_dim = 64
# Number of RNN units
rnn_units = 100
#batch size
BATCH_SIZE=90
# num of NER classes
num_classes = len(ner_vocab) + 1
blc_model = NerModel(rnn_units, vocab_size, num_classes, embedding_dim, dynamic=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [48]:
total_sentences = 62010
test_size = round(total_sentences / BATCH_SIZE * 0.2)

X_train = x_pad[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]
X_test = x_pad[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]

Y_train_int = tf.cast(Y_train, dtype=tf.int32)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train,
Y_train_int))

train_dataset = train_dataset.batch(BATCH_SIZE,
drop_remainder=True)

In [49]:
loss_metric = tf.keras.metrics.Mean()
epochs = 5
# Iterate over epochs.
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))
    # Iterate over the batches of the dataset.
    for step, (text_batch, labels_batch) in enumerate(train_dataset):
        
        labels_max = tf.argmax(labels_batch, -1, output_type=tf.int32)
        
        with tf.GradientTape() as tape:
            logits = blc_model(text_batch, training=True)
            loss = blc_model.crf.loss(labels_max, logits)
            
            grads = tape.gradient(loss, blc_model.trainable_weights)
            optimizer.apply_gradients(zip(grads, blc_model.trainable_weights))

            loss_metric(loss)
        if step % 50 == 0:
            print('step %s: mean loss = %s' % (step, loss_metric.result()))

Start of epoch 0
step 0: mean loss = tf.Tensor(62.06899, shape=(), dtype=float32)
step 50: mean loss = tf.Tensor(30.476456, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(23.678583, shape=(), dtype=float32)
step 150: mean loss = tf.Tensor(20.412226, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(17.945261, shape=(), dtype=float32)
step 250: mean loss = tf.Tensor(16.17172, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(14.79893, shape=(), dtype=float32)
step 350: mean loss = tf.Tensor(13.732441, shape=(), dtype=float32)
step 400: mean loss = tf.Tensor(12.8564625, shape=(), dtype=float32)
step 450: mean loss = tf.Tensor(12.078372, shape=(), dtype=float32)
step 500: mean loss = tf.Tensor(11.394629, shape=(), dtype=float32)
step 550: mean loss = tf.Tensor(10.793428, shape=(), dtype=float32)
Start of epoch 1
step 0: mean loss = tf.Tensor(10.781557, shape=(), dtype=float32)
step 50: mean loss = tf.Tensor(10.2322645, shape=(), dtype=float32)
step 100: mean loss =