# Advanced Approach 9 for COVID-19 NER

This approach uses an embedding layer followed by a bidirectional GRU, trained with the Adam optimizer.

This run experiments with the size of the embedding layer and the size of the bidirectional GRU layer.

## Load the Datasets

In [1]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl

# Load the three dataset splits. Each pickle holds a dict keyed by 'id' and
# 'word_seq', plus 'tag_seq' for the train and validation splits (see the
# printed keys).
# NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
# The `with` blocks close every file handle as soon as its split is loaded.
with open("data/train.pkl", "rb") as pkl_file:
    train_dict = pkl.load(pkl_file)
with open("data/val.pkl", "rb") as pkl_file:
    val_dict = pkl.load(pkl_file)
with open("data/test.pkl", "rb") as pkl_file:
    test_dict = pkl.load(pkl_file)
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [2]:
# an entry of the dataset
#print("index:", train_dict["id"][0])
#print(*zip(train_dict["word_seq"][0], train_dict["tag_seq"][0]))

In [3]:
# Collect every distinct NER tag that occurs in the training split and list
# them with a running number (set order is arbitrary, so the numbering is only
# for display).
from itertools import chain

tag_set = set(chain.from_iterable(train_dict["tag_seq"]))
print("All the NER tags:")
for idx, tag in enumerate(tag_set):
    print("{:2d}: {}".format(idx, tag))

All the NER tags:
 0: LABORATORY_OR_TEST_RESULT
 1: VIRAL_PROTEIN
 2: PHYSICAL_SCIENCE
 3: _t_pad_
 4: FAC
 5: INJURY_OR_POISONING
 6: GPE
 7: TIME
 8: PRODUCT
 9: ORGANISM
10: O
11: MATERIAL
12: WILDLIFE
13: GROUP_ATTRIBUTE
14: LABORATORY_PROCEDURE
15: MOLECULAR_FUNCTION
16: INDIVIDUAL_BEHAVIOR
17: ARCHAEON
18: LIVESTOCK
19: MACHINE_ACTIVITY
20: SIGN_OR_SYMPTOM
21: GENE_OR_GENOME
22: CARDINAL
23: SUBSTRATE
24: QUANTITY
25: BODY_SUBSTANCE
26: LOC
27: RESEARCH_ACTIVITY
28: SOCIAL_BEHAVIOR
29: HUMAN-CAUSED_PHENOMENON_OR_PROCESS
30: BODY_PART_ORGAN_OR_ORGAN_COMPONENT
31: DIAGNOSTIC_PROCEDURE
32: CELL_COMPONENT
33: DATE
34: VIRUS
35: ORGAN_OR_TISSUE_FUNCTION
36: FOOD
37: IMMUNE_RESPONSE
38: ORG
39: LAW
40: CELL
41: GROUP
42: LANGUAGE
43: TISSUE
44: EUKARYOTE
45: BACTERIUM
46: ORDINAL
47: CHEMICAL
48: EXPERIMENTAL_MODEL_OF_DISEASE
49: PERSON
50: THERAPEUTIC_OR_PREVENTIVE_PROCEDURE
51: WORK_OF_ART
52: CORONAVIRUS
53: DISEASE_OR_SYNDROME
54: CELL_OR_MOLECULAR_DYSFUNCTION
55: GOVERNMENTAL_OR_R

## Prepare the Data for Training

In [4]:
# Build the word and tag vocabularies from the training split only.

# Indices 0 and 1 are reserved for the '_unk_' and '_w_pad_' tokens.
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for doc in train_dict['word_seq']:
    for word in doc:
        # setdefault only inserts unseen words, giving them the next free index
        vocab_dict.setdefault(word, len(vocab_dict))

# Index 0 is reserved for the tag padding token.
tag_dict = {'_t_pad_': 0}
for tag_seq in train_dict['tag_seq']:
    for tag in tag_seq:
        tag_dict.setdefault(tag, len(tag_dict))

# Forward lookups alias the dicts above; reverse lookups map index -> string.
word2idx, tag2idx = vocab_dict, tag_dict
idx2word = {index: token for token, index in word2idx.items()}
idx2tag = {index: label for label, index in tag2idx.items()}

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

size of word vocab: 82275 size of tag_dict: 65


In [5]:
# Sentence length fed to the model. The sequences are used exactly as stored;
# no padding or truncation happens in this cell.
max_sent_length = 128

# Words -> vocabulary indices. Training words are all in the vocabulary by
# construction; unseen validation/test words fall back to index 0 ('_unk_').
train_tokens = np.array([[word2idx[word] for word in sentence]
                         for sentence in train_dict['word_seq']])
val_tokens = np.array([[word2idx.get(word, 0) for word in sentence]
                       for sentence in val_dict['word_seq']])
test_tokens = np.array([[word2idx.get(word, 0) for word in sentence]
                        for sentence in test_dict['word_seq']])

# Tags -> indices -> one-hot vectors of width len(tag_dict).
num_tag_classes = len(tag_dict)

train_tag_ids = [[tag2idx[label] for label in labels] for labels in train_dict['tag_seq']]
train_tags = np.array([to_categorical(ids, num_classes=num_tag_classes) for ids in train_tag_ids])

val_tag_ids = [[tag2idx[label] for label in labels] for labels in val_dict['tag_seq']]
val_tags = np.array([to_categorical(ids, num_classes=num_tag_classes) for ids in val_tag_ids])

# The test split has no 'tag_seq', so there are no test tags to encode.

In [6]:
# Show the array shapes: tokens are (sentences, positions) and the one-hot
# tags add a trailing axis of width len(tag_dict).
print("training size:", train_tokens.shape, "tag size:", train_tags.shape)
print("validating size:", val_tokens.shape, "tag size:", val_tags.shape)

training size: (23600, 128) tag size: (23600, 128, 65)
validating size: (2950, 128) tag size: (2950, 128, 65)


In [7]:
# Example: the first 10 word indices of the first training sentence, followed by
# the matching tag indices (argmax over the last axis undoes the one-hot encoding).
print(train_tokens[0,:10], np.argmax(train_tags[0, :10, :], axis=1))

[ 2  3  4  5  6  7  8  9 10 11] [1 1 2 1 1 3 3 1 4 4]


In [8]:
# Full word-index sequence of the first training sentence.
train_tokens[0]

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 17, 18, 21, 22, 23, 24, 25, 12, 13, 14, 26, 27, 28, 25, 29,
       19, 13, 30, 31, 21, 32, 33, 34, 35, 14, 35, 36, 33, 37, 23, 38, 39,
       21, 40, 33, 14, 35, 36, 33, 37, 23, 41, 42, 43, 44, 45, 46, 47, 42,
       48, 44, 49, 50, 22, 51, 52, 53, 54, 55, 56, 57, 51, 58, 59, 60, 14,
       26, 61, 62, 63, 64, 65, 38, 66, 67, 14, 26, 19, 20, 68,  3, 69, 70,
       71, 38, 70, 72,  4, 73, 74, 61, 75, 76, 77, 24, 78, 79, 80, 10, 11,
       81, 22, 82, 83, 84, 85,  9, 86, 31])

In [9]:
# One-hot tag matrix of the first training sentence (one row per position).
train_tags[0]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Build the Model

In [10]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
    Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed

In [11]:
def build_RNN(input_length, vocab_size, embedding_size,
              hidden_size, output_size, num_tags,
              num_rnn_layers, num_mlp_layers,
              rnn_type="lstm",
              bidirectional=False,
              activation="tanh",
              dropout_rate=0.0,
              batch_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.001,
              metric="accuracy"):
    """
    Build and compile a recurrent tagger that predicts one tag per token.

    Architecture: embedding -> num_rnn_layers recurrent layers (each followed
    by dropout) -> num_mlp_layers-1 dense layers -> per-token softmax over
    num_tags classes.

    :param input_length: the maximum length of sentences, type: int
    :param vocab_size: the vocabulary size, type: int
    :param embedding_size: the dimension of word representations, type: int
    :param hidden_size: the dimension of the hidden states (per direction when
        bidirectional) and of the dense layers, type: int
    :param output_size: accepted for backward compatibility but not used by the
        network; the output width is num_tags, type: int
    :param num_tags: the number of tag types, type: int
    :param num_rnn_layers: the number of layers of the RNN, type: int
    :param num_mlp_layers: the number of layers of the MLP; values <= 1 add no
        hidden dense layer, only the softmax output layer, type: int
    :param rnn_type: the type of RNN, one of "rnn", "lstm", "gru", type: str
    :param bidirectional: whether to use bidirectional rnn, type: bool
    :param activation: the activation type of the dense layers, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: float
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, one of "SGD", "RMSprop", "Adam", type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    :return: a compiled RNN model for NER with num_tags tag types
    :raises NotImplementedError: if rnn_type or optimizer is not supported
    # activation document: https://keras.io/activations/
    # dropout document: https://keras.io/layers/core/#dropout
    # embedding document: https://keras.io/layers/embeddings/#embedding
    # recurrent layers document: https://keras.io/layers/recurrent
    # batch normalization document: https://keras.io/layers/normalization/
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_length,))

    ################################
    ###### Word Representation #####
    ################################
    # word representation layer
    emb = Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    input_length=input_length,
                    embeddings_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=0))(x)

    ################################
    ####### Recurrent Layers #######
    ################################
    # recurrent layers
    # Reference: https://keras.io/api/layers/#recurrent-layers
    if rnn_type == "rnn":
        fn = SimpleRNN
    elif rnn_type == "lstm":
        fn = LSTM
    elif rnn_type == "gru":
        fn = GRU
    else:
        raise NotImplementedError("unsupported rnn_type: {!r}".format(rnn_type))

    h = emb
    for i in range(num_rnn_layers):
        # return_sequences=True keeps the full sequence [h_1, ..., h_n] instead
        # of only h_n, because a tag is predicted for every position.
        rnn_layer = fn(hidden_size,
                       kernel_initializer=keras.initializers.glorot_uniform(seed=0),
                       recurrent_initializer=keras.initializers.Orthogonal(gain=1.0, seed=0),
                       return_sequences=True)
        if bidirectional:
            rnn_layer = Bidirectional(rnn_layer)
        h = rnn_layer(h)
        h = Dropout(dropout_rate, seed=0)(h)

    ################################
    #### Fully Connected Layers ####
    ################################
    # multi-layer perceptron
    for i in range(num_mlp_layers-1):
        new_h = Dense(hidden_size,
                      kernel_initializer=keras.initializers.he_normal(seed=0),
                      bias_initializer="zeros",
                      kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
        # add batch normalization layer
        if batch_norm:
            new_h = BatchNormalization()(new_h)
        # add residual connection; the first dense layer has none because its
        # input comes from the RNN stack, whose width can differ from
        # hidden_size (a bidirectional layer outputs 2 * hidden_size)
        if i == 0:
            h = new_h
        else:
            h = Add()([h, new_h])
        # add activation
        h = Activation(activation)(h)
    y = TimeDistributed(Dense(num_tags, activation='softmax'))(h)

    # set the loss, the optimizer, and the metric
    # The configured optimizer instance (not the bare name) must reach
    # compile(); otherwise learning_rate would be silently ignored.
    if optimizer == "SGD":
        optimizer_instance = keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer == "RMSprop":
        optimizer_instance = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optimizer_instance = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError("unsupported optimizer: {!r}".format(optimizer))
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer_instance, metrics=[metric])

    return model

## Train the Model

In [12]:
# Seed NumPy's and TensorFlow's global random generators (both with 0) before
# the model is built and trained.
from numpy.random import seed
import tensorflow as tf
seed(0)
tf.random.set_seed(0)

In [13]:
# Directory for the model checkpoint; exist_ok=True makes this cell safe to re-run.
os.makedirs("models", exist_ok=True)

In [14]:
# Hyperparameters under test in this run.
# Width of each word embedding vector.
embedding_size = 500
# Recurrent units per direction: twice the sentence length (2 * 128 = 256).
hidden_size = max_sent_length*2
# A single recurrent layer.
num_rnn_layers = 1
# With 1, build_RNN adds no hidden dense layer, only the softmax output layer.
num_mlp_layers = 1

In [15]:
# Single-layer bidirectional GRU tagger trained with Adam. Every argument is
# passed by keyword so the long signature is easy to follow.
model = build_RNN(
    input_length=max_sent_length,
    vocab_size=len(vocab_dict),
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=max_sent_length,
    num_tags=len(tag_dict),
    num_rnn_layers=num_rnn_layers,
    num_mlp_layers=num_mlp_layers,
    rnn_type="gru",
    bidirectional=True,
    activation="tanh",
    dropout_rate=0.0,
    batch_norm=False,
    l2_reg=0.0,
    loss="categorical_crossentropy",
    optimizer="Adam",
    learning_rate=0.001,
    metric="accuracy",
)

In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 500)          41137500  
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 512)          1164288   
_________________________________________________________________
dropout (Dropout)            (None, 128, 512)          0         
_________________________________________________________________
time_distributed (TimeDistri (None, 128, 65)           33345     
Total params: 42,335,133
Trainable params: 42,335,133
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Sanity check: shape of the model input (sentences, positions).
train_tokens.shape

(23600, 128)

In [18]:
# Sanity check: shape of the one-hot targets (sentences, positions, tag classes).
train_tags.shape

(23600, 128, 65)

In [19]:
#tf.config.run_functions_eagerly(False)

In [20]:
# Keep only the best weights on disk (judged by validation accuracy) and stop
# early once validation loss has failed to improve for 5 epochs.
checkpoint_path = os.path.join("models", "bi_gru_emb500_hidden2unit.hdf5")
checkpointer = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                               monitor="val_accuracy",
                                               verbose=0,
                                               save_best_only=True)
earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=5,
                                              verbose=0)
training_callbacks = [checkpointer, earlystopping]

# validation_split carves the validation data for the val_* metrics out of the
# training arrays; val_tokens / val_tags are not used while fitting.
rnn_history = model.fit(
    train_tokens,
    train_tags,
    validation_split=0.1,
    epochs=10,
    batch_size=100,
    verbose=1,
    callbacks=training_callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Reload the checkpoint saved by ModelCheckpoint (the epoch with the best
# val_accuracy) and score it on the training and validation splits.
model = keras.models.load_model(os.path.join("models", "bi_gru_emb500_hidden2unit.hdf5"))
train_score = model.evaluate(train_tokens, train_tags,
                             batch_size=100)
# The test split has no gold tags, so the held-out score comes from the
# validation split and is labelled as such (it was previously printed as "test").
val_score = model.evaluate(val_tokens, val_tags,
                           batch_size=100)
# Backward-compatible alias for cells that still read the old variable name.
test_score = val_score
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("validation loss:", val_score[0], "validation accuracy", val_score[1])

training loss: 0.11543704569339752 training accuracy 0.9673775434494019
test loss: 0.37453693151474 test accuracy 0.9129475355148315


## Predict

### For Validation Set

In [22]:
# Predict per-token tag probabilities for the validation sentences, take the
# most likely tag index at each position, and map the indices back to tag names.
val_probs = model.predict(val_tokens, batch_size=10)
val_pred_ids = np.argmax(val_probs, axis=2)
val_preds = [[idx2tag[tag_id] for tag_id in sentence_ids] for sentence_ids in val_pred_ids]

In [23]:
# Sanity check: number of predicted validation sentences.
len(val_preds)

2950

In [24]:
# Sanity check: length of the first predicted tag sequence.
len(val_preds[0])

128

In [25]:
# Sanity check: number of validation input sentences (should match the predictions).
len(val_tokens)

2950

### For Test Set

In [26]:
# Predict per-token tag probabilities for the test sentences, take the most
# likely tag index at each position, and map the indices back to tag names.
test_probs = model.predict(test_tokens, batch_size=10)
test_pred_ids = np.argmax(test_probs, axis=2)
test_preds = [[idx2tag[tag_id] for tag_id in sentence_ids] for sentence_ids in test_pred_ids]

In [27]:
# Sanity check: number of predicted test sentences.
len(test_preds)

2950

In [28]:
# Sanity check: length of the first predicted tag sequence.
len(test_preds[0])

128

In [29]:
# Sanity check: number of test input sentences (should match the predictions).
len(test_tokens)

2950

## Output

In [30]:
import json
import pandas as pd

In [31]:
def write_predictions(ids, predictions,fileName):
    """
    Write predictions to a CSV file with two columns, 'id' and 'labels'.

    :param ids: one identifier per sentence
    :param predictions: one tag sequence per sentence, aligned with ids
    :param fileName: path of the CSV file to write (no index column)
    """
    # Each tag sequence becomes a single JSON-encoded string cell.
    encoded_labels = []
    for preds in predictions:
        encoded_labels.append(json.dumps(np.array(preds).tolist()))
    table = pd.DataFrame({'id': ids, 'labels': encoded_labels})
    table.to_csv(fileName, index=False)

### For Validation Set

In [32]:
# Save the validation predictions; this file is scored further below.
write_predictions(val_dict["id"], val_preds, 'val_preds.csv')

### For Test Set

In [33]:
# Save the test predictions (there are no gold test tags to score them against here).
write_predictions(test_dict["id"], test_preds, 'test_preds.csv')

## Helper Functions

In [34]:
# Provided function to test accuracy
# You could check the validation accuracy to select the best of your models
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """
        Token-level accuracy with padding positions excluded.

        Input:
            preds (array-like): (num_data, length_sentence) predicted tags
            tags  (array-like): (num_data, length_sentence) gold tags
            padding_id: the tag value that marks padding in `tags`
        Output:
            Proportion of correct prediction. The padding tokens are filtered out.
        Raises:
            ValueError: preds and tags do not hold the same number of elements.
            ZeroDivisionError: every gold tag is padding, so nothing can be scored.
    """
    # np.asarray lets plain nested lists be passed as well as ndarrays.
    preds_flatten = np.asarray(preds).flatten()
    tags_flatten = np.asarray(tags).flatten()
    if preds_flatten.shape != tags_flatten.shape:
        raise ValueError(
            "preds and tags must have the same number of elements, got {} and {}".format(
                preds_flatten.size, tags_flatten.size))

    # Positions are selected by the gold tag only; a predicted padding tag at a
    # real position counts as an error.
    non_padding_mask = tags_flatten != padding_id
    num_non_padding = np.count_nonzero(non_padding_mask)
    if num_non_padding == 0:
        raise ZeroDivisionError("no non-padding tokens to score")

    # Vectorised count; the builtin sum() would loop over the array in Python.
    num_correct = np.sum(preds_flatten[non_padding_mask] == tags_flatten[non_padding_mask])
    return num_correct / num_non_padding

In [35]:
def evaluate(pred_file, ground_file):
    """
    Score a prediction CSV against the gold tags stored in a pickle file.

    :param pred_file: path to a CSV with a "labels" column whose cells are
        JSON-encoded tag lists (the format written by write_predictions)
    :param ground_file: path to a pickle holding a dict with a "tag_seq" entry
    :return: the value calc_accuracy returns for (predicted tags, gold tags)
    """
    # NOTE: pickle can execute arbitrary code while loading -- only pass
    # trusted files. The `with` block closes the file handle promptly.
    with open(ground_file, "rb") as ground_handle:
        file_dict = pkl.load(ground_handle)
    file_preds = pd.read_csv(pred_file)
    pred_tags = np.array([json.loads(line) for line in file_preds["labels"]])
    gold_tags = np.array(file_dict["tag_seq"])
    return calc_accuracy(pred_tags, gold_tags)

### For Validation Set

In [36]:
# Token-level accuracy of the saved validation predictions against the gold tags.
evaluate("val_preds.csv", "data/val.pkl")

0.9049119152998352