In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

from keras.models import Model
from keras.layers import (
    TimeDistributed,
    Dense,
    GRU,
    Embedding,
    TextVectorization,
    Input,
    StringLookup,
    Bidirectional,
)

import numpy as np

from utils import CustomNonPaddingTokenLoss
from conlleval import evaluate

2022-12-29 22:54:50.313257: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-29 22:54:50.446175: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-29 22:54:50.446209: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-29 22:54:51.241356: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [2]:
(ds_dev, ds_train, ds_test), info = tfds.load(
    name="conll2003",
    split=["dev", "train", "test",],
    with_info=True,
)

2022-12-29 22:54:52.744082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-29 22:54:52.744113: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-29 22:54:52.744138: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (calcifer-Inspiron-7370): /proc/driver/nvidia/version does not exist
2022-12-29 22:54:52.744591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
info

tfds.core.DatasetInfo(
    name='conll2003',
    full_name='conll2003/conll2022/1.0.0',
    description="""
    The shared task of CoNLL-2003 concerns language-independent named entity
    recognition and concentrates on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not belong
    to the previous three groups.
    """,
    homepage='https://www.aclweb.org/anthology/W03-0419/',
    data_path='/home/calcifer/tensorflow_datasets/conll2003/conll2022/1.0.0',
    file_format=tfrecord,
    download_size=959.94 KiB,
    dataset_size=3.87 MiB,
    features=FeaturesDict({
        'chunks': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=23)),
        'ner': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=9)),
        'pos': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=47)),
        'tokens': Sequence(Text(shape=(), dtype=tf.string)),
    }),
    supervised_keys=None,
    disable_shuffling=False,
  

In [4]:
tokens_train = ds_train.map(lambda r: r.get("tokens"))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [5]:
VOCAB_SIZE = 10000

In [6]:
vectorizer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    split=None,
    ragged=True,
    # output_sequence_length=...,
)

vectorizer.adapt(tokens_train)

In [7]:
label_names = info.features["ner"].names
print("label_names:", label_names)

label_names: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [25]:
BATCH_SIZE = 64
MAX_SEQUENCE_LENGTH = 125

ds_train_cld = (
    ds_train
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .shuffle(buffer_size=100)
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)
ds_dev_cld = (
    ds_dev
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)
ds_test_cld = (
    ds_test
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)

In [26]:
for x in ds_dev_cld.take(1).as_numpy_iterator():
    print("Tokens shape:", x[0].shape)
    print("Tags shape:", x[1].shape)  # should be equal to tokens shape

Tokens shape: (64, 41)
Tags shape: (64, 41)


In [27]:
EMBEDDING_DIM = 128

In [28]:
keras.backend.clear_session()
tf.random.set_seed(0)

inputs = Input(shape=(None,), name="tokens")
x = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
)(inputs)
x = Bidirectional(layer=GRU(
    units=128,
    dropout=0.2,
    recurrent_dropout=0.2,
    return_sequences=True,
))(x)
outputs = TimeDistributed(layer=Dense(units=len(label_names)+1, activation="softmax"), name="tags")(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tokens (InputLayer)         [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         1280000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        198144    
 l)                                                              
                                                                 
 tags (TimeDistributed)      (None, None, 10)          2570      
                                                                 
Total params: 1,480,714
Trainable params: 1,480,714
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.compile(
    optimizer="adam",
    loss=CustomNonPaddingTokenLoss(),
    metrics=["accuracy"]
)

In [30]:
NUM_EPOCHS = 25

model.fit(
    ds_train_cld,
    epochs=NUM_EPOCHS,
    validation_data=ds_dev_cld,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fd0ec16a590>

In [31]:
prob_preds_test = model.predict(ds_test_cld)
preds_test = [np.argmax(x, axis=-1) for x in prob_preds_test.to_list()]

labels_test = [r[1] for r in ds_test_cld.unbatch().as_numpy_iterator()]

for idx in range(len(preds_test)):
    assert preds_test[idx].shape == labels_test[idx].shape



In [32]:
preds_idx_concat, labels_idx_concat = list(), list()

for (p, l) in zip(preds_dev, labels_dev):
    mask = l > 0
    preds_idx_concat += p[mask].tolist()
    labels_idx_concat += l[mask].tolist()
    
preds_concat = [label_names[tag-1] for tag in preds_idx_concat]
labels_concat = [label_names[tag-1] for tag in labels_idx_concat]

In [33]:
evaluate(labels_concat, preds_concat)

processed 51362 tokens with 5942 phrases; found: 5347 phrases; correct: 4531.
accuracy:  76.22%; (non-O)
accuracy:  95.44%; precision:  84.74%; recall:  76.25%; FB1:  80.27
              LOC: precision:  89.67%; recall:  82.25%; FB1:  85.80  1685
             MISC: precision:  86.82%; recall:  70.72%; FB1:  77.94  751
              ORG: precision:  74.65%; recall:  63.24%; FB1:  68.47  1136
              PER: precision:  85.63%; recall:  82.52%; FB1:  84.05  1775


(84.73910604077052, 76.2537866038371, 80.27283196031534)