In [1]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import numpy as np
import os
import time

print(tf.__version__)

2.0.0


In [2]:
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, BatchNormalization
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers

In [3]:
labels = set()

def file2Examples(file_name):
  '''
  Read data files and return input/output pairs
  '''
  
  examples=[]

  with open(file_name,"r") as f:

    next(f)
    next(f)

    example = [[],[]]

    for line in f:

      input_output_split= line.split()

      if len(input_output_split)==4:
        example[0].append(input_output_split[0])
        example[1].append(input_output_split[-1])
        labels.add(input_output_split[-1])

      elif len(input_output_split)==0:
        examples.append(example)
        example=[[],[]]
      else:
        example=[[],[]]

    f.close()
    
    return examples
  
# Extract examples from train, validation, and test files which can be found at 
# https://github.com/davidsbatista/NER-datasets/tree/master/CONLL2003
train_examples = file2Examples("train.txt")
test_examples = file2Examples("test.txt")
valid_examples = file2Examples("valid.txt")

In [4]:
# create character vocab
all_text = " ".join([" ".join(x[0]) for x in train_examples+valid_examples+test_examples])
vocab = sorted(set(all_text))
    
    # create character/id and label/id mapping
char2idx = {u:i+1 for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
label2idx = {u:i+1 for i, u in enumerate(labels)}
idx2label = np.array(labels)
    
print(idx2label)
print(char2idx)

{'B-PER', 'B-MISC', 'B-LOC', 'O', 'B-ORG', 'I-MISC', 'I-PER', 'I-LOC', 'I-ORG'}
{' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '=': 29, '?': 30, '@': 31, 'A': 32, 'B': 33, 'C': 34, 'D': 35, 'E': 36, 'F': 37, 'G': 38, 'H': 39, 'I': 40, 'J': 41, 'K': 42, 'L': 43, 'M': 44, 'N': 45, 'O': 46, 'P': 47, 'Q': 48, 'R': 49, 'S': 50, 'T': 51, 'U': 52, 'V': 53, 'W': 54, 'X': 55, 'Y': 56, 'Z': 57, '[': 58, ']': 59, '`': 60, 'a': 61, 'b': 62, 'c': 63, 'd': 64, 'e': 65, 'f': 66, 'g': 67, 'h': 68, 'i': 69, 'j': 70, 'k': 71, 'l': 72, 'm': 73, 'n': 74, 'o': 75, 'p': 76, 'q': 77, 'r': 78, 's': 79, 't': 80, 'u': 81, 'v': 82, 'w': 83, 'x': 84, 'y': 85, 'z': 86}


In [5]:
def split_char_labels(eg):
      '''
      For a given input/output example, break tokens into characters while keeping 
      the same label.
      '''

      tokens = eg[0]
      labels=eg[1]

      input_chars = []
      output_char_labels = []

      for token,label in zip(tokens,labels):

        input_chars.extend([char for char in token])
        input_chars.extend(' ')
        output_char_labels.extend([label]*len(token))
        output_char_labels.extend('O')

      return [[char2idx[x] for x in input_chars[:-1]],np.array([label2idx[x] for x in output_char_labels[:-1]])]
   
train_formatted = [split_char_labels(eg) for eg in train_examples]
test_formatted = [split_char_labels(eg) for eg in test_examples]
valid_formatted = [split_char_labels(eg) for eg in valid_examples]

print(len(train_formatted))
print(len(test_formatted))
print(len(valid_formatted))

14986
3683
3465


In [6]:
# training generator
def gen_train_series():

    for eg in train_formatted:
        yield eg[0],eg[1]

# validation generator
def gen_valid_series():

    for eg in valid_formatted:
        yield eg[0],eg[1]

# test generator
def gen_test_series():

    for eg in test_formatted:
        yield eg[0],eg[1]
    
# create Dataset objects for train, test and validation sets  
series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_valid = tf.data.Dataset.from_generator(gen_valid_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_test = tf.data.Dataset.from_generator(gen_test_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))

BATCH_SIZE = 128
BUFFER_SIZE=1000

# create padded batch series objects for train, test and validation sets
ds_series_batch = series.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_valid = series_valid.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_test = series_test.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)

# print example batches
for input_example_batch, target_example_batch in ds_series_batch_valid.take(1):
    print(input_example_batch)
    print(target_example_batch)

tf.Tensor(
[[34 49 40 ...  0  0  0]
 [43 46 45 ...  0  0  0]
 [54 65 79 ...  0  0  0]
 ...
 [ 3  1 36 ...  0  0  0]
 [40 66  1 ...  0  0  0]
 [35 81 78 ...  0  0  0]], shape=(128, 228), dtype=int32)
tf.Tensor(
[[4 4 4 ... 0 0 0]
 [3 3 3 ... 0 0 0]
 [2 2 2 ... 0 0 0]
 ...
 [4 4 4 ... 0 0 0]
 [4 4 4 ... 0 0 0]
 [1 1 1 ... 0 0 0]], shape=(128, 228), dtype=int32)


In [10]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense

def biLSTM(result):
    model = Sequential()
    model.add(Bidirectional(LSTM(64), input_shape=(len(result[0]), len(result[0][0]))))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation="softmax"))
    model.summary()
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    # history = model.fit(result,
    #                     total_label,
    #                     epochs=10,
    #                     validation_split=0.1,
    #                     batch_size=512)
    history = model.fit(ds_series_batch, epochs=EPOCHS, validation_data=ds_series_batch_valid,callbacks=[checkpoint_callback])

In [12]:
biLSTM(train_formatted)


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 128)               57856     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 78,529
Trainable params: 78,529
Non-trainable params: 0
_________________________________________________________________


NameError: name 'total_label' is not defined

In [None]:
vocab_size = len(vocab)+1

  # The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

label_size = len(labels)  

# build LSTM model
def build_model(vocab_size,label_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                            batch_input_shape=[batch_size, None],mask_zero=True),
        tf.keras.layers.LSTM(rnn_units,
                    return_sequences=True,
                    stateful=True,
                    recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(label_size)
        ])
    return model

model = build_model(
    vocab_size = len(vocab)+1,
    label_size=len(labels)+1,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

model.summary()

In [None]:

import os

# define loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss,metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
EPOCHS=20
  
history = model.fit(ds_series_batch, epochs=EPOCHS, validation_data=ds_series_batch_valid,callbacks=[checkpoint_callback])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

preds = np.array([])
y_trues= np.array([])

# iterate through test set, make predictions based on trained model
for input_example_batch, target_example_batch in ds_series_batch_test:

  pred=model.predict(input_example_batch)
  pred_max=tf.argmax(tf.nn.softmax(pred),2).numpy().flatten()
  y_true=target_example_batch.numpy().flatten()

  preds=np.concatenate([preds,pred_max])
  y_trues=np.concatenate([y_trues,y_true])

# remove padding from evaluation
remove_padding = [(p,y) for p,y in zip(preds,y_trues) if y!=0]

r_p = [x[0] for x in remove_padding]
r_t = [x[1] for x in remove_padding]

# print confusion matrix and classification report
print(confusion_matrix(r_p,r_t))
print(classification_report(r_p,r_t))