In [31]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

from keras.models import Model
from keras import losses as L
from keras.layers import (
    TimeDistributed,
    Dense,
    GRU,
    Embedding,
    TextVectorization,
    Input,
    StringLookup,
    Bidirectional,
)

import numpy as np

from utils import CustomNonPaddingTokenLoss
from conlleval import evaluate

In [4]:
(ds_dev, ds_train, ds_test), info = tfds.load(
    name="conll2003",
    split=["dev", "train", "test",],
    with_info=True,
)

2022-12-30 12:13:40.342085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-30 12:13:40.342413: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-30 12:13:40.342442: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (calcifer-Inspiron-7370): /proc/driver/nvidia/version does not exist
2022-12-30 12:13:40.343980: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
info

tfds.core.DatasetInfo(
    name='conll2003',
    full_name='conll2003/conll2022/1.0.0',
    description="""
    The shared task of CoNLL-2003 concerns language-independent named entity
    recognition and concentrates on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not belong
    to the previous three groups.
    """,
    homepage='https://www.aclweb.org/anthology/W03-0419/',
    data_path='/home/calcifer/tensorflow_datasets/conll2003/conll2022/1.0.0',
    file_format=tfrecord,
    download_size=959.94 KiB,
    dataset_size=3.87 MiB,
    features=FeaturesDict({
        'chunks': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=23)),
        'ner': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=9)),
        'pos': Sequence(ClassLabel(shape=(), dtype=tf.int64, num_classes=47)),
        'tokens': Sequence(Text(shape=(), dtype=tf.string)),
    }),
    supervised_keys=None,
    disable_shuffling=False,
  

In [22]:
VOCAB_SIZE = 10000
BATCH_SIZE = 64
MAX_SEQUENCE_LENGTH = 125
EMBEDDING_DIM = 128
NUM_EPOCHS = 20

In [11]:
vectorizer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    split=None,
    ragged=True,
    # output_sequence_length=...,
)

vectorizer.adapt(ds_train.map(lambda r: r.get("tokens")))

In [12]:
label_names = info.features["ner"].names
print("label_names:", label_names)

label_names: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [13]:
ds_train_cld = (
    ds_train
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .shuffle(buffer_size=100)
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)
ds_dev_cld = (
    ds_dev
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)
ds_test_cld = (
    ds_test
    .map(lambda r: (vectorizer(r["tokens"]), r["ner"]+1))
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)

In [14]:
for x in ds_dev_cld.take(1).as_numpy_iterator():
    print("Tokens shape:", x[0].shape)
    print("Tags shape:", x[1].shape)  # should be equal to tokens shape

Tokens shape: (64, 41)
Tags shape: (64, 41)


2022-12-30 12:18:20.233140: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Utils

In [15]:
def compute_metrics(ds, model):
    prob_preds_test = model.predict(ds)
    preds_test = [np.argmax(x, axis=-1) for x in prob_preds_test.to_list()]

    labels_test = [r[1] for r in ds.unbatch().as_numpy_iterator()]

    for idx in range(len(preds_test)):
        assert preds_test[idx].shape == labels_test[idx].shape
    
    preds_idx_concat, labels_idx_concat = list(), list()

    for (p, l) in zip(preds_dev, labels_dev):
        mask = l > 0
        preds_idx_concat += p[mask].tolist()
        labels_idx_concat += l[mask].tolist()

    preds_concat = [label_names[tag-1] for tag in preds_idx_concat]
    labels_concat = [label_names[tag-1] for tag in labels_idx_concat]
    
    evaluate(labels_concat, preds_concat)

## Bi-LSTM

In [28]:
keras.backend.clear_session()
tf.random.set_seed(0)

inputs = Input(shape=(None,), name="tokens")
x = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
)(inputs)
x = Bidirectional(layer=GRU(
    units=128,
    dropout=0.2,
    recurrent_dropout=0.2,
    return_sequences=True,
))(x)
outputs = TimeDistributed(layer=Dense(units=len(label_names)+1, activation="softmax"), name="tags")(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

In [29]:
model.compile(
    optimizer="adam",
    loss=CustomNonPaddingTokenLoss(),
    metrics=["accuracy"]
)

In [30]:
model.fit(
    ds_train_cld,
    epochs=NUM_EPOCHS,
    validation_data=ds_dev_cld,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fd0ec16a590>

In [31]:
prob_preds_test = model.predict(ds_test_cld)
preds_test = [np.argmax(x, axis=-1) for x in prob_preds_test.to_list()]

labels_test = [r[1] for r in ds_test_cld.unbatch().as_numpy_iterator()]

for idx in range(len(preds_test)):
    assert preds_test[idx].shape == labels_test[idx].shape



In [32]:
preds_idx_concat, labels_idx_concat = list(), list()

for (p, l) in zip(preds_dev, labels_dev):
    mask = l > 0
    preds_idx_concat += p[mask].tolist()
    labels_idx_concat += l[mask].tolist()
    
preds_concat = [label_names[tag-1] for tag in preds_idx_concat]
labels_concat = [label_names[tag-1] for tag in labels_idx_concat]

In [33]:
evaluate(labels_concat, preds_concat)

processed 51362 tokens with 5942 phrases; found: 5347 phrases; correct: 4531.
accuracy:  76.22%; (non-O)
accuracy:  95.44%; precision:  84.74%; recall:  76.25%; FB1:  80.27
              LOC: precision:  89.67%; recall:  82.25%; FB1:  85.80  1685
             MISC: precision:  86.82%; recall:  70.72%; FB1:  77.94  751
              ORG: precision:  74.65%; recall:  63.24%; FB1:  68.47  1136
              PER: precision:  85.63%; recall:  82.52%; FB1:  84.05  1775


(84.73910604077052, 76.2537866038371, 80.27283196031534)

## Bi-LSTM with CRF

Source: https://colab.research.google.com/drive/1kUUrn622sG9LeVz42XPpkLZEeoVW59Aq?usp=sharing#scrollTo=3b38225d9464

In [104]:
keras.backend.clear_session()
tf.random.set_seed(0)

inputs = Input(shape=(None,), name="tokens")
x = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
)(inputs)
x = Bidirectional(layer=GRU(
    units=128,
    dropout=0.2,
    recurrent_dropout=0.2,
    return_sequences=True,
))(x)
x = TimeDistributed(layer=Dense(units=16, activation="relu"), name="tags")(x)
decoded_sequence, potentials, sequence_length, chain_kernel = tfa.layers.CRF(units=len(label_names)+1, )(x)
model = Model(inputs=inputs, outputs=[decoded_sequence, potentials, sequence_length, chain_kernel])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tokens (InputLayer)         [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         1280000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        198144    
 l)                                                              
                                                                 
 tags (TimeDistributed)      (None, None, 16)          4112      
                                                                 
 crf (CRF)                   [(None, None),            290       
                              (None, None, 10),                  
                              (None,),                           
                              (10, 10)]                      

In [105]:
@tf.function
def crf_loss_func(potentials, sequence_length, kernel, y):
    crf_likelihood, _ = tfa.text.crf_log_likelihood(
        potentials, y, sequence_length, kernel
    )
    # likelihood to loss
    flat_crf_loss = -1 * crf_likelihood
    crf_loss = tf.reduce_mean(flat_crf_loss)

    return crf_loss

In [106]:
optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name="train_loss")

@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    with tf.GradientTape() as tape:
        decoded_sequence, potentials, sequence_length, kernel = model(x)
        crf_loss = crf_loss_func(potentials, sequence_length, kernel, y)
        loss = crf_loss + tf.reduce_sum(model.losses)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_loss(loss)

In [107]:
for epoch in range(NUM_EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()

    for x, y in ds_train_cld:
        train_step(x, y)

    print(f"Epoch {epoch + 1}, " f"Loss: {train_loss.result()}")

2022-12-30 13:18:37.068467: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: StatefulPartitionedCall/cond_5/branch_executed/_1220


Epoch 1, Loss: 9.223627090454102
Epoch 2, Loss: 2.494995355606079
Epoch 3, Loss: 1.4105294942855835
Epoch 4, Loss: 0.9632676839828491
Epoch 5, Loss: 0.7145354747772217
Epoch 6, Loss: 0.5659372210502625
Epoch 7, Loss: 0.44172558188438416
Epoch 8, Loss: 0.3533775806427002
Epoch 9, Loss: 0.29467251896858215
Epoch 10, Loss: 0.25024157762527466
Epoch 11, Loss: 0.2025570273399353
Epoch 12, Loss: 0.16012346744537354
Epoch 13, Loss: 0.14195403456687927
Epoch 14, Loss: 0.11544784903526306
Epoch 15, Loss: 0.09686145186424255
Epoch 16, Loss: 0.0824529379606247
Epoch 17, Loss: 0.07007083296775818
Epoch 18, Loss: 0.06280367076396942
Epoch 19, Loss: 0.06254170835018158
Epoch 20, Loss: 0.052692051976919174


In [108]:
for idx, r in enumerate(ds_test.take(10).as_numpy_iterator()):
    print("Tokens:", " ".join(t.decode("utf-8") for t in r["tokens"]))
    print("True:", [label_names[tag-1] for tag in labels_test[idx] if tag > 0])
    print("Pred:", [label_names[tag-1] for tag in prob_preds_test[idx] if tag > 0])
    print("")

Tokens: Stefanel Milan ( Italy ) 9 6 3 15
True: ['B-ORG', 'I-ORG', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O']
Pred: ['B-PER', 'B-ORG', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O']

Tokens: Bowling : Wasim Akram 8.1-0-43-3 ( 9w , 1nb ) , Waqar Younis
True: ['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER']
Pred: ['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER']

Tokens: Some said the central bank may have been concerned a weaker yen would lead to unfounded pessimism about Japan 's economy .
True: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O']
Pred: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O']

Tokens: Zaragoza 15 2 8 5 18 23 14
True: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Tokens: SATURDAY , DECEMBER 7 SCHEDULE
True: ['O', 'O', 'O', 'O', 'O']


In [109]:
def compute_metrics(ds, model):
    preds_test, *_ = model.predict(ds)
    labels_test = [r[1] for r in ds.unbatch().as_numpy_iterator()]

    for idx in range(len(labels_test)):
        assert preds_test[idx].shape == labels_test[idx].shape, f"error at id: {idx}"
    
    preds_idx_concat, labels_idx_concat = list(), list()

    for (p, l) in zip(preds_test, labels_test):
        mask = l > 0
        preds_idx_concat += p.numpy()[mask].tolist()
        labels_idx_concat += l[mask].tolist()

    preds_concat = [label_names[tag-1] for tag in preds_idx_concat]
    labels_concat = [label_names[tag-1] for tag in labels_idx_concat]
    
    evaluate(labels_concat, preds_concat)

In [111]:
compute_metrics(ds_test_cld, model)

processed 46435 tokens with 5648 phrases; found: 5219 phrases; correct: 3719.
accuracy:  69.30%; (non-O)
accuracy:  92.74%; precision:  71.26%; recall:  65.85%; FB1:  68.45
              LOC: precision:  80.52%; recall:  76.32%; FB1:  78.36  1581
             MISC: precision:  64.60%; recall:  60.83%; FB1:  62.66  661
              ORG: precision:  61.99%; recall:  61.95%; FB1:  61.97  1660
              PER: precision:  75.17%; recall:  61.22%; FB1:  67.48  1317
