# NER with LSTM-CRF, PoS and GloVe

In [1]:
import os
import pickle
import re
import sys

import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional, concatenate, Flatten
from keras.models import Model
from keras.utils import to_categorical
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

# Add functions in lib folder
module_path = os.path.abspath(os.path.join('../code'))
if module_path not in sys.path:
    sys.path.append(module_path)
from lib.DataProcess import DataProcess
from lib.Jaccard import Jaccard

Using TensorFlow backend.


## Read the dataset

In [2]:
DATA_PATH = '../data/'

train_file = 'NCBI_corpus_training.txt'
test_file = 'NCBI_corpus_testing.txt'


def read_corpus_lines(path):
    """Return all lines of the text file at `path` (line endings are kept).

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.
    NOTE(review): assumes the corpus files are UTF-8 (or plain ASCII) - confirm.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return fp.readlines()


# One list element per line of the corpus file.
train_dataset = read_corpus_lines(DATA_PATH + train_file)
test_dataset = read_corpus_lines(DATA_PATH + test_file)

print('Train texts: %d' % len(train_dataset))
print('Test texts: %d' % len(test_dataset))

Train texts: 593
Test texts: 100


In [3]:
# The Jaccard helper is handed to DataProcess here and is used again later
# in the notebook to score the predictions.
jaccard = Jaccard()
data_process = DataProcess(jaccard)

## Prepare data

This is a minimal preprocessing of the data, as I do not want to remove any essential information from the texts.

Each text in the dataset starts with a number, so I need to remove it. I also replace the `category` tags with `<entity>`, so that I can add them to the spaCy vocabulary.

In [4]:
# NOTE: both names are rebound to the cleaned texts, so re-running this cell
# alone feeds already-cleaned data back into the cleaner. Re-run the loading
# cell first (or restart the kernel) before executing it again.
train_dataset = data_process.apply_initial_cleaner(train_dataset)
test_dataset = data_process.apply_initial_cleaner(test_dataset)

print(train_dataset[0]) # Sample

Identification of APC2, a homologue of the <entity>adenomatous polyposis coli tumour</entity> suppressor .	The <entity>adenomatous polyposis coli ( APC ) tumour</entity>-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta ( GSK-3beta ) , axin / conductin and betacatenin . Complex formation induces the rapid degradation of betacatenin . In <entity>colon carcinoma</entity> cells , loss of APC leads to the accumulation of betacatenin in the nucleus , where it binds to and activates the Tcf-4 transcription factor ( reviewed in [ 1 ] [ 2 ] ) . Here , we report the identification and genomic structure of APC homologues . Mammalian APC2 , which closely resembles APC in overall domain structure , was functionally analyzed and shown to contain two SAMP domains , both of which are required for binding to conductin . Like APC , APC2 regulates the formation of active betacatenin-Tcf complexes , as demonstrated using transient transcriptio

I split the texts using the **Spacy Tokenizer**. Note that I split not only the words but also the sentences, so I consider each sentence as an independent input of the model.

In [5]:
# Tokenize the cleaned texts. As the sample output shows, each element is one
# sentence given as a list of (lower-cased text, PoS tag, token) triples, with
# the <entity> / </entity> markers still present as tokens.
tok_train_dataset = data_process.tokenize_texts(train_dataset)
tok_test_dataset = data_process.tokenize_texts(test_dataset)

print(tok_train_dataset[0]) # Sample

[('identification', 'NOUN', Identification), ('of', 'ADP', of), ('apc2', 'PROPN', APC2), (',', 'PUNCT', ,), ('a', 'DET', a), ('homologue', 'NOUN', homologue), ('of', 'ADP', of), ('the', 'DET', the), ('<entity>', 'X', <entity>), ('adenomatous', 'ADJ', adenomatous), ('polyposis', 'NOUN', polyposis), ('coli', 'NOUN', coli), ('tumour', 'NOUN', tumour), ('</entity>', 'X', </entity>), ('suppressor', 'NOUN', suppressor), ('.', 'PUNCT', .)]


Now, let's prepare the input of the model:
- List of words.
- List of 1s and 0s, where 1 indicates that the token is part of an entity and 0 that it is just a normal word.
- PoS tags.

In [6]:
# Word sequences, one list per sentence (the sample output no longer contains
# the <entity> markers).
train_words = data_process.get_texts_words(tok_train_dataset)
test_words = data_process.get_texts_words(tok_test_dataset)

# PoS-tag sequences extracted from the same tokenized sentences.
train_pos = data_process.get_texts_pos(tok_train_dataset)
test_pos = data_process.get_texts_pos(tok_test_dataset)

print(train_words[0]) # Sample

['identification', 'of', 'apc2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.']


In [7]:
# Per-token 0/1 indicator sequences; in the sample output the 1s line up with
# the words that were wrapped in <entity> tags.
train_indicators = data_process.get_indicator_sequences(tok_train_dataset)
test_indicators = data_process.get_indicator_sequences(tok_test_dataset)

print(train_indicators[0]) # Sample

[0 0 0 0 0 0 0 0 1 1 1 1 0 0]


The model only accepts numbers, so I must transform each word into a unique number. Similarly, I repeat the process with the PoS sequences.

In [8]:
# Extra entry added to both vocabularies on top of the training tokens
# (presumably the placeholder for out-of-vocabulary items - confirm in DataProcess).
additional = [DataProcess.UNK]

# Both vocabularies are built from the training split only. Each call is
# unpacked as (vocabulary, item -> id mapping, vocabulary size).
vocab_words, word2id, vocab_words_size = data_process.get_vocab_dictionary(train_words, additional)
vocab_pos, pos2id, vocab_pos_size = data_process.get_vocab_dictionary(train_pos, additional)

In [9]:
# Replace every word by its integer id, using the mapping built from the training split.
train_words_enc = data_process.encode_texts(train_words, word2id)
test_words_enc = data_process.encode_texts(test_words, word2id)

# Same encoding step for the PoS-tag sequences.
train_pos_enc = data_process.encode_texts(train_pos, pos2id)
test_pos_enc = data_process.encode_texts(test_pos, pos2id)

print(train_words_enc[0]) # Sample

[4093, 5418, 1287, 14, 904, 3963, 5418, 7171, 1026, 5844, 2104, 7367, 7047, 100]


Finally, I apply **zero-padding** so that all sequences have the same length.

In [10]:
# Bring every encoded sequence (words, PoS tags and indicators, train and test)
# to DataProcess.MAX_SEQUENCE_LENGTH. The six names are rebound in place, in
# the same order in which they are listed.
(train_words_enc, test_words_enc,
 train_pos_enc, test_pos_enc,
 train_indicators, test_indicators) = [
    data_process.to_sequences(sequences, DataProcess.MAX_SEQUENCE_LENGTH)
    for sequences in (train_words_enc, test_words_enc,
                      train_pos_enc, test_pos_enc,
                      train_indicators, test_indicators)
]

In [11]:
# For training, the 0/1 indicator sequences are additionally converted to a
# categorical representation; only the training targets need it, because the
# test indicators are compared directly with the arg-maxed predictions later on.
train_indicators_cat = data_process.to_categorical(train_indicators)

## Create model

In [12]:
# Hyper-parameters read by create_model(): embedding width, LSTM size and the
# (input, recurrent) dropout rates of the word and PoS branches.
model_params = dict(
    embedding_dim=300,
    lstm_cells=128,
    word_lstm_dropout=0.3,
    word_lstm_rec_dropout=0.3,
    pos_lstm_dropout=0.3,
    pos_lstm_rec_dropout=0.3,
)

In [13]:
def load_embedding_matrix(vocab_words, embedding_dim=300, glove_path=None):
    """Build an embedding matrix for `vocab_words` from a GloVe text file.

    Each line of the GloVe file is expected to be ``<token> v1 v2 ... vN`` with
    ``N == embedding_dim``, separated by single spaces. The vector is always
    taken from the last ``embedding_dim`` fields, so tokens that themselves
    contain spaces are kept whole instead of being truncated to their first
    piece (which previously also produced vectors of the wrong length).

    Parameters
    ----------
    vocab_words : sized iterable of str
        Vocabulary. Row ``i`` of the returned matrix belongs to the i-th word
        in iteration order.
    embedding_dim : int
        Dimensionality of the GloVe vectors.
    glove_path : str, optional
        Path of the GloVe file. Defaults to
        ``DATA_PATH + 'glove.840B.<embedding_dim>d.txt'``.

    Returns
    -------
    (embedding_matrix, embedding_dim)
        ``embedding_matrix`` has shape ``(len(vocab_words) + 1, embedding_dim)``;
        rows of words without a GloVe vector (and the extra last row) are zero.

    Raises
    ------
    ValueError
        If a vector field of a vocabulary word cannot be parsed as a float.
    """
    if glove_path is None:
        glove_path = DATA_PATH + 'glove.840B.{}d.txt'.format(embedding_dim)

    # Only vocabulary words are ever looked up, so the other GloVe vectors
    # are neither parsed nor kept in memory.
    wanted_words = set(vocab_words)
    embeddings_index = {}

    print('Reading GloVe file...')

    with open(glove_path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if line_number % 100000 == 0:
                print('- At line %d' % line_number)

            # Split from the right: the last `embedding_dim` fields are the
            # vector, whatever precedes them is the (possibly multi-piece) token.
            fields = line.rstrip().rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                continue  # malformed or truncated line

            word = fields[0]
            if word not in wanted_words:
                continue

            embeddings_index[word] = np.asarray(fields[1:], dtype='float32')

    print('Building embedding matrix...')
    # NOTE(review): rows are assigned by position in `vocab_words` and one
    # spare row is allocated at the end. Confirm that this matches the ids in
    # `word2id` (and the id used for padding) - a shifted id scheme would
    # attach vectors to the wrong words.
    embedding_matrix = np.zeros((len(vocab_words) + 1, embedding_dim))
    found_embeddings = 0
    for row, word in enumerate(vocab_words):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Words not found in the embedding index stay all-zeros.
            continue
        embedding_matrix[row] = embedding_vector
        found_embeddings += 1

    print('- Found %d embeddings of %d words' % (found_embeddings, len(vocab_words)))

    return embedding_matrix, embedding_dim

In [15]:
# Only the matrix is kept; the second return value (the embedding dimension) is discarded.
embedding_matrix, _ = load_embedding_matrix(vocab_words, model_params['embedding_dim'])

Reading GloVe file...
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
Building embedding matrix...
- Found 6103 embeddings of 7791 words


In [16]:
def _build_sequence_branch(input_name, vocab_size, sequence_length, embedding_dim,
                           lstm_cells, dropout, recurrent_dropout, **embedding_kwargs):
    """Build one input branch: Input -> Embedding -> BiLSTM -> TimeDistributed(Flatten).

    Extra keyword arguments are forwarded to the Embedding layer.
    Returns the pair ``(input_tensor, branch_output_tensor)``.
    """
    branch_input = Input(shape=(sequence_length,), name=input_name)
    branch = Embedding(input_dim=vocab_size + 1,
                       output_dim=embedding_dim,
                       trainable=True,
                       **embedding_kwargs)(branch_input)
    branch = Bidirectional(
        LSTM(lstm_cells,
             return_sequences=True,
             dropout=dropout,
             recurrent_dropout=recurrent_dropout),
        merge_mode='concat')(branch)
    branch = TimeDistributed(Flatten())(branch)
    return branch_input, branch


def create_model(model_params, vocab_words_size, vocab_pos_size, sequence_length, embedding_matrix):
    """Build and compile the two-input (words + PoS) BiLSTM-CRF tagger.

    Parameters
    ----------
    model_params : dict
        Required keys:
        - embedding_dim: output size of both embedding layers
        - lstm_cells: units of every LSTM
        - word_lstm_dropout / word_lstm_rec_dropout: dropout rates of the word BiLSTM
        - pos_lstm_dropout / pos_lstm_rec_dropout: dropout rates of the PoS BiLSTM
        Optional keys:
        - dense_units: width of the time-distributed dense layer (default 64)
        - num_classes: number of CRF output classes (default 2)
    vocab_words_size, vocab_pos_size : int
        Vocabulary sizes; each embedding layer gets ``size + 1`` rows.
    sequence_length : int
        Length of the input sequences.
    embedding_matrix : array-like
        Initial weights of the word embedding, of shape
        ``(vocab_words_size + 1, model_params['embedding_dim'])``.

    Returns
    -------
    A Keras ``Model`` with inputs ``words_input`` and ``pos_input`` and a CRF
    layer named ``output``, compiled with ``crf_loss``, the ``adam`` optimizer
    and the ``crf_accuracy`` metric.

    Raises
    ------
    ValueError
        If `embedding_matrix` does not have the expected shape.
    """
    embedding_dim = model_params['embedding_dim']
    lstm_cells = model_params['lstm_cells']

    # Fail early with a readable message instead of deep inside Keras.
    expected_shape = (vocab_words_size + 1, embedding_dim)
    if np.shape(embedding_matrix) != expected_shape:
        raise ValueError('embedding_matrix has shape %s, expected %s'
                         % (np.shape(embedding_matrix), expected_shape))

    # Word branch, initialised with the pretrained vectors (still trainable).
    word_input, word_pipe = _build_sequence_branch(
        'words_input', vocab_words_size, sequence_length, embedding_dim, lstm_cells,
        model_params['word_lstm_dropout'], model_params['word_lstm_rec_dropout'],
        weights=[embedding_matrix])

    # PoS branch, embedding learned from scratch.
    pos_input, pos_pipe = _build_sequence_branch(
        'pos_input', vocab_pos_size, sequence_length, embedding_dim, lstm_cells,
        model_params['pos_lstm_dropout'], model_params['pos_lstm_rec_dropout'],
        input_length=sequence_length)

    # Concatenate both inputs
    comb_pipe = concatenate([word_pipe, pos_pipe])

    # Main BiLSTM model
    comb_pipe = Bidirectional(
        LSTM(lstm_cells, return_sequences=True),
        merge_mode='concat')(comb_pipe)
    comb_pipe = TimeDistributed(Dense(model_params.get('dense_units', 64)))(comb_pipe)

    output = CRF(model_params.get('num_classes', 2), name='output')(comb_pipe)

    model = Model(inputs=[word_input, pos_input], outputs=output)
    model.compile(
        loss=crf_loss,
        optimizer='adam',
        metrics=[crf_accuracy]
    )

    return model

## Training

I did not optimize the following parameters. If you want to get the best possible results, use some library to find the optimal parameter values: https://cloud.google.com/ai-platform/training/docs/hyperparameter-tuning-overview#whats_a_hyperparameter

In [17]:
model = create_model(model_params,
                     vocab_words_size,
                     vocab_pos_size,
                     DataProcess.MAX_SEQUENCE_LENGTH,
                     embedding_matrix)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words_input (InputLayer)        (None, 100)          0                                            
__________________________________________________________________________________________________
pos_input (InputLayer)          (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     2337600     words_input[0][0]                
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 300)     5700        pos_input[0][0]                  
____________________________________________________________________________________________

In [18]:
EPOCHS = 5

# Stop early when the training loss has not improved for 2 consecutive epochs.
# The callback is taken from the standalone `keras` package, the same one the
# model is built with, instead of mixing in a `tf.keras` callback.
# NOTE(review): no validation data is passed to fit(), so only the training
# loss can be monitored here.
early_stop = EarlyStopping(monitor='loss', patience=2)

history = model.fit(
    {'words_input': train_words_enc, 'pos_input': train_pos_enc},
    train_indicators_cat,
    epochs=EPOCHS,
    callbacks=[early_stop],
    verbose=1
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluation

I evaluate the model by computing the Jaccard Index with the pseudo-binary sequences. Details: https://en.wikipedia.org/wiki/Jaccard_index

In [19]:
# Class scores for every token of every test sequence.
test_pred_indicators_cat = model.predict({'words_input': test_words_enc, 'pos_input': test_pos_enc})
# Collapse the class axis to the most likely class index with one vectorized
# argmax; this gives the same array as looping over the samples.
test_pred_indicators = np.argmax(test_pred_indicators_cat, axis=-1)

In [20]:
# Compare the gold indicator sequences of the test split with the predicted ones.
jaccard_score = jaccard.bin_jaccard(test_indicators, test_pred_indicators)
print('Jaccard score: %.4f' % jaccard_score)

Jaccard score: 0.8435


## Save model

In [None]:
MODEL_NAME = 'lstm-crf-glove'

model_path = '../artifacts/' + MODEL_NAME + '/'
# makedirs(exist_ok=True) also creates a missing '../artifacts/' parent and
# does not fail when the directory already exists.
os.makedirs(model_path, exist_ok=True)

# Save model
# NOTE(review): reloading a model that contains the keras-contrib CRF layer
# presumably needs `custom_objects` (CRF, crf_loss, crf_accuracy) - confirm.
model.save(model_path + 'model.h5')

# Save vocabularies
with open(model_path + 'word_vocab.pickle', 'wb') as fp:
    pickle.dump(vocab_words, fp)
with open(model_path + 'pos_vocab.pickle', 'wb') as fp:
    pickle.dump(vocab_pos, fp)