In [63]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.layers import (
    Embedding, 
    TextVectorization, 
    Bidirectional, 
    LSTM,
    Dense,
    Input,
    TextVectorization,
    TimeDistributed,
    GRU,
)
from conlleval import evaluate



In [2]:
(ds_train, ds_test), info = tfds.load(
  name="conll2003",
  split=["train", "test"],
  with_info=True,
)


2022-12-29 11:16:40.754209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-29 11:16:40.754252: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-29 11:16:40.754279: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (calcifer-Inspiron-7370): /proc/driver/nvidia/version does not exist
2022-12-29 11:16:40.754572: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [64]:
print(info)
print( "NER tags:", info.features["ner"].feature.names)
label_names = info.features["ner"].feature.names

tfds.core.DatasetInfo(
    name='conll2003',
    full_name='conll2003/conll2022/1.0.0',
    description="""
    The shared task of CoNLL-2003 concerns language-independent named entity
    recognition and concentrates on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not belong
    to the previous three groups.
    """,
    homepage='https://www.aclweb.org/anthology/W03-0419/',
    data_path='/home/calcifer/tensorflow_datasets/conll2003/conll2022/1.0.0',
    file_format=tfrecord,
    download_size=959.94 KiB,
    dataset_size=3.87 MiB,
    features=FeaturesDict({
        'chunks': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=23)),
        'ner': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=9)),
        'pos': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=47)),
        'tokens': Sequence(Text(shape=(), dtype=tf.string)),
    }),
    supervised_keys=None,
    disable_shuffling=False,
  

In [65]:
# Get max sequence length
max_sequence_length = (
  ds_train
  .map(lambda r: len(r['tokens']))
  .reduce(tf.constant(0, tf.int32), lambda t1, t2: tf.math.maximum(t1, t2) )
  .numpy()
)
print("Max sequence length in training set:", max_sequence_length)

Max sequence length in training set: 113


In [66]:
ds_train_cld = ds_train.filter(lambda r: len(r["tokens"]) > 0)
tokens = ds_train_cld.map(lambda r: r['tokens'])
labels = ds_train_cld.map(lambda r: r["ner"])

In [67]:
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100
vectorizer = TextVectorization(
  max_tokens=MAX_VOCAB_SIZE,
  output_sequence_length=MAX_SEQUENCE_LENGTH,
  standardize="lower",
  output_mode="int",
  split=None,
)

vectorizer.adapt(tokens)

In [68]:
train_data = (
  tokens
  .map(vectorizer)
  .filter(lambda r: len(r) > 0)
)

train_arr = np.vstack([x.numpy() for x in train_data])

In [69]:
train_labels = pad_sequences(
  sequences=[x.numpy()+1 for x in labels],
  maxlen=MAX_SEQUENCE_LENGTH,
  padding="post",
)

In [88]:
keras.backend.clear_session()
tf.random.set_seed(0)

EMBEDDING_SIZE = 100
OUTPUT_DIM = len(info.features["ner"].feature.names)+1

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=np.int64)

x = Embedding(
  input_dim=MAX_VOCAB_SIZE, 
  output_dim=EMBEDDING_SIZE,
  mask_zero=True,
)(inputs)
x = Bidirectional(
        layer=GRU(
        units=64, 
        return_sequences=True, 
        recurrent_dropout=0.2, 
        recurrent_initializer='glorot_uniform',
        dropout=0.2,
    ),
)(x)
x = TimeDistributed(Dense(units=OUTPUT_DIM, activation="softmax"))(x)

model = Model(inputs=inputs, outputs=x)


In [89]:
# https://keras.io/examples/nlp/ner_transformers/
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)


loss = CustomNonPaddingTokenLoss()


In [90]:
model.compile(
  optimizer="adam", 
  loss=loss,
  metrics=["accuracy"],
)

In [91]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         63744     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 100, 10)          1290      
 ibuted)                                                         
                                                                 
Total params: 1,065,034
Trainable params: 1,065,034
Non-trainable params: 0
_________________________________________________________________


In [92]:
# Test the model
"""
output_shape: [records x sequence_length x  possible_tags]

"""
out = model.predict(train_arr[:5])
print(out.shape)


(5, 100, 10)


In [93]:
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

train_dataset = tf.data.Dataset.from_tensor_slices((train_arr, train_labels)).shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)

In [94]:
# Train
model.fit(
  train_dataset,
  epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f45654136a0>

In [96]:
x = train_arr
y_padded = train_labels
preds_padded = np.argmax(model.predict(x), axis=-1)
y = list()
preds = list()
for p, l in zip(preds_padded, y_padded):
    mask = l > 0
    preds.append(p[mask])
    y.append(l[mask])

preds_concat = [label_names[tag-1] for tag in np.concatenate(preds)]
y_concat = [label_names[tag-1] for tag in np.concatenate(y)]

evaluate(y_concat, preds_concat)

processed 203608 tokens with 23498 phrases; found: 22636 phrases; correct: 21295.
accuracy:  90.33%; (non-O)
accuracy:  98.20%; precision:  94.08%; recall:  90.62%; FB1:  92.32
              LOC: precision:  95.44%; recall:  91.65%; FB1:  93.51  6857
             MISC: precision:  92.03%; recall:  80.89%; FB1:  86.10  3022
              ORG: precision:  92.20%; recall:  87.72%; FB1:  89.91  6013
              PER: precision:  95.28%; recall:  97.36%; FB1:  96.31  6744


(94.07580844672204, 90.62473401991659, 92.31803008627043)