In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

tf.keras.utils.set_random_seed(33)

In [6]:
def load_data(file_path):

    with open(file_path,'r') as f:
        data = [line.strip() for line in f.readlines()]

    return data

In [7]:
train_sentences =  load_data('train_sentences.txt')
train_labels = load_data('train_labels.txt')

val_sentences =  load_data('val_sentences.txt')
val_labels = load_data('val_labels.txt')

test_sentences =  load_data('test_sentences.txt')
test_labels = load_data('test_labels.txt')

In [8]:
def get_sentence_vectorizer(sentences):

    # TextVectorization used to build vocab, transform sentence to vector
    sentence_vectorizer = tf.keras.layers.TextVectorization(
        standardize = None, # By default, standardize = 'lower_and_strip_punctuation', but in NER task we don't need that
        split='whitespace', # split tokens by whitespace
        max_tokens=5000,    # Maximum number of tokens in the vocabulary
    )

    # fit TextVectorization on the data
    sentence_vectorizer.adapt(sentences)

    vocab = sentence_vectorizer.get_vocabulary()

    return sentence_vectorizer, vocab

In [9]:
test_vectorizer, test_vocab = get_sentence_vectorizer(train_sentences[:1000])
print(f"Test vocab size: {len(test_vocab)}")

sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
print(f"Sentence: {sentence}\nSentence vectorized: {sentence_vectorized}")

Test vocab size: 4650
Sentence: I like learning new NLP models !
Sentence vectorized: [ 296  314    1   59    1    1 4649]


In [10]:
print(f"Sentence: {train_sentences[0]}")
print(f"Labels: {train_labels[0]}")

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Labels: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


In [11]:
def get_tags(labels):

    "get all tags without repetition"

    tag_set = set()

    for sent_label in labels:
        for tag in sent_label.split(" "):
            tag_set.add(tag)

    tag_list = list(tag_set)
    tag_list.sort()

    return tag_list

In [12]:
tags = get_tags(train_labels)
print(tags)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [13]:
def make_tag_map(tags):

    "returns dict for tag, index"

    tag_map = {}

    for i, tag in enumerate(tags):
        tag_map[tag] = i

    return tag_map

In [14]:
tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


In [15]:
def label_vectorizer(labels, tag_map):

    "transform labels into vectors"

    label_ids = []

    for sent_labels in labels:

        splitted_labels = sent_labels.split(' ')
        label_id = []

        for label in splitted_labels:
            label_id.append(tag_map[label])

        label_ids.append(label_id)

    # padding value will be -1
    label_ids_padded = tf.keras.utils.pad_sequences(
        label_ids, padding = 'post', value = -1)

    return label_ids_padded

In [16]:
print(f"Sentence: {train_sentences[5]}")
print(f"Labels: {train_labels[5]}")
print(f"Vectorized labels: {label_vectorizer([train_labels[5]], tag_map)}")

Sentence: The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
Labels: O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O
Vectorized labels: [[16 16 16 16 16  3 16 16 16 16  2 16 16 16 16 16 16 16  3 16 16 16 16 16]]


In [17]:
sentence_vectorizer, vocab = get_sentence_vectorizer(train_sentences)

In [18]:
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):

    sentences_ids = sentence_vectorizer(sentences)
    labels_ids = label_vectorizer(labels, tag_map)

    dataset = tf.data.Dataset.from_tensor_slices((sentences_ids, labels_ids))

    return dataset

In [19]:
train_dataset = generate_dataset(
    train_sentences, train_labels,sentence_vectorizer,tag_map)

val_dataset = generate_dataset(
    val_sentences,val_labels,  sentence_vectorizer, tag_map)

test_dataset = generate_dataset(
    test_sentences, test_labels,  sentence_vectorizer, tag_map)

In [20]:
# Exploring information about the training data
print(f'The number of outputs is {len(tags)}')
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words in the training set: {g_vocab_size}")
print('The training size is', len(train_dataset))
print('The validation size is', len(val_dataset))
print('An example of the first sentence is\n\t', next(iter(train_dataset))[0].numpy())
print('An example of its corresponding label is\n\t', next(iter(train_dataset))[1].numpy())

The number of outputs is 17
Num of vocabulary words in the training set: 5000
The training size is 33570
The validation size is 7194
An example of the first sentence is
	 [1046    6 1121   18 1832  232  543    7  528    2  158    5   60    9
  648    2  922    6  192   87   22   16   54    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
An example of its corresponding label is
	 [16 16 16 16 16 16  2 16 16 16 16 16  2 16 16 16 16 16  3 16 16 16 16 16
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -

In [21]:
def NER(num_tags, vocab_size, embedding_dim = 50):

    model = tf.keras.Sequential([

        # if mask_zero = True, so the index 0 will reserve for padding
        # and we need to increase the vocab size by 1 because the first idx is reserved
        tf.keras.layers.Embedding(
            input_dim = vocab_size + 1,
            output_dim = embedding_dim,
            mask_zero = True),

        tf.keras.layers.LSTM(
            units = embedding_dim, return_sequences = True),

        tf.keras.layers.Dense(
            num_tags, activation = tf.nn.log_softmax)
    ])

    return model

In [22]:
def masked_loss(y_true, y_pred):

    "the loss function with ignoring the padding"

    loss_fun = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits = True, # output is log probability, so it's logits
        ignore_class = -1,  # the padding value in y_true
    )

    loss = loss_fun(y_true, y_pred)

    return loss

In [23]:
true_labels = np.array([0,1,2,0])

predicted_logits = np.array([[0.4,0.6,0.3] , [0.22,0.79,0.2], [1, 4.5,0.4], [0.4,0.4,7.2]])

masked_loss(true_labels, predicted_logits)

<tf.Tensor: shape=(), dtype=float32, numpy=3.2097778>

In [24]:
def masked_accuracy(y_true, y_pred):

    "get accuracy with ignoring for padding values in true labels"

    mask = tf.cast(tf.math.not_equal(y_true, -1), dtype = tf.int32)

    y_pred_labels = tf.argmax(y_pred, axis = -1)
    y_pred_labels = tf.cast(y_pred_labels, dtype = tf.int32)

    y_true = tf.cast(y_true, tf.int32)

    matches_true_pred = tf.equal(y_true, y_pred_labels)
    matches_true_pred = tf.cast(matches_true_pred,tf.int32)
    matches_true_pred *= mask

    masked_acc = tf.reduce_sum(matches_true_pred) / tf.reduce_sum(mask)

    return masked_acc

In [25]:
true_labels = np.array([0,1,2,0, -1])

predicted_logits = np.array([
    [0.1,0.6,0.36],
    [0.1,0.7,0.1],
    [0.1, 6.5,0.4],
    [1.4,0.4,0.2],
    [0.1,0.6,9.4]
    ])

print(masked_accuracy(true_labels, predicted_logits))

tf.Tensor(0.5, shape=(), dtype=float64)


In [42]:
n_tags = len(tag_map)
vocab_size = len(vocab)

model = NER(n_tags, vocab_size, embedding_dim = 50)
model.build(input_shape=(None, None))
model.summary()

**A note on padding**

let's check now how padding does not affect the model's output. Of course the output dimension will change. If ten zeros are added at the end of the tensor, then the resulting output dimension will have 10 more elements (more specifically, 10 more arrays of length 17 each). However, those are removed from any calculation further on, so it won't impact at all the model's performance and training. You will be using the function tf.expand_dims.

In [27]:
x = tf.expand_dims(np.array([545, 467, 896]), axis = 0)

x_padded = tf.expand_dims(np.array([545, 467, 896, 0, 0, 0]), axis = 0)


pred_x = model(x)
pred_x_padded = model(x_padded)
print(f'x shape: {pred_x.shape}\nx_padded shape: {pred_x_padded.shape}')

x shape: (1, 3, 17)
x_padded shape: (1, 6, 17)


In [28]:
# check if pred_x the same as pred_x_padded when we neglect padding vectors
np.allclose(pred_x, pred_x_padded[:, :3, :])

True

In [29]:
y_true =tf.expand_dims([16, 6, 12], axis = 0)
y_true_padded =tf.expand_dims([16,6,12,-1,-1,-1], axis = 0) # Remember you mapped the padded values to -1 in the labels
print(f"masked_loss is the same: {np.allclose(masked_loss(y_true, pred_x), masked_loss(y_true_padded, pred_x_padded))}")
print(f"masked_accuracy is the same: {np.allclose(masked_accuracy(y_true, pred_x), masked_accuracy(y_true_padded, pred_x_padded))}")

masked_loss is the same: True
masked_accuracy is the same: True


In [30]:
model.compile(loss = masked_loss,
              optimizer = tf.keras.optimizers.Adam(0.01),
              metrics = [masked_accuracy])

In [31]:
val_dataset = val_dataset.batch(64)
train_dataset = train_dataset.batch(64)
test_dataset = test_dataset.batch(64)

In [32]:
model.fit(train_dataset,
          validation_data = val_dataset,
          shuffle = True,
          epochs = 10)

Epoch 1/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.4659 - masked_accuracy: 0.8934 - val_loss: 0.1675 - val_masked_accuracy: 0.9497
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.1697 - masked_accuracy: 0.9494 - val_loss: 0.1592 - val_masked_accuracy: 0.9508
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 0.1541 - masked_accuracy: 0.9523 - val_loss: 0.1600 - val_masked_accuracy: 0.9510
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.1439 - masked_accuracy: 0.9546 - val_loss: 0.1596 - val_masked_accuracy: 0.9509
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - loss: 0.1361 - masked_accuracy: 0.9566 - val_loss: 0.1619 - val_masked_accuracy: 0.9503
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.1303 - masked_accuracy:

<keras.src.callbacks.history.History at 0x790dcfd7add0>

In [33]:
save_path = 'weights.keras'
model.save(save_path)

In [34]:
test_sents_ids = sentence_vectorizer(test_sentences)
true_test_labels = label_vectorizer(test_labels,tag_map)

y_pred_test = model.predict(test_sents_ids)

test_acc = masked_accuracy(true_test_labels, y_pred_test)
test_acc

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


<tf.Tensor: shape=(), dtype=float64, numpy=0.9472714005914206>

In [35]:
def predict(sentence, model, sentence_vectorizer, tag_map):

    sent_ids = sentence_vectorizer(sentence)
    sent_ids = tf.expand_dims(sent_ids,axis=0)

    pred_logits = model(sent_ids)
    pred_labels = tf.argmax(pred_logits, axis = -1)

    labels = list(tag_map.keys())
    pred = []

    for label in pred_labels.numpy()[0]:
        tag = labels[label]
        pred.append(tag)

    return pred

In [36]:
sentence = "OpenAI and Google are great companies to work for in US"

predict(sentence, model, sentence_vectorizer, tag_map)

['O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo']