In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from datasets import Dataset, Features, Sequence, Value
from transformers import BertTokenizerFast
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers

In [2]:
df = pd.read_csv("NER_dataset.csv", encoding='windows-1252')

In [3]:
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [4]:
df[:10]

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [5]:
grouped = df.groupby("Sentence #").agg(list)
sentences = grouped['Word'].tolist()
ner_tags = grouped['Tag'].tolist()

In [6]:
grouped[:4]

Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]"


In [7]:
sentences[:4]

[['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 ['Iranian',
  'officials',
  'say',
  'they',
  'expect',
  'to',
  'get',
  'access',
  'to',
  'sealed',
  'sensitive',
  'parts',
  'of',
  'the',
  'plant',
  'Wednesday',
  ',',
  'after',
  'an',
  'IAEA',
  'surveillance',
  'system',
  'begins',
  'functioning',
  '.'],
 ['Helicopter',
  'gunships',
  'Saturday',
  'pounded',
  'militant',
  'hideouts',
  'in',
  'the',
  'Orakzai',
  'tribal',
  'region',
  ',',
  'where',
  'many',
  'Taliban',
  'militants',
  'are',
  'believed',
  'to',
  'have',
  'fled',
  'to',
  'avoid',
  'an',
  'earlier',
  'military',
  'offensive',
  'in',
  'nearby',
  'South',
  'Waziristan',
  '.'],
 ['They',
  'left',
  'after',
  'a',
  'tense',
  'hour-long',
  'standoff',
  'wi

In [8]:
ner_tags[:4]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'I-geo',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

## Label encoding

In [9]:
label_encoder = LabelEncoder()
flat_tags = [tag for sublist in ner_tags for tag in sublist]
label_encoder.fit(flat_tags)
encoded_tags = [label_encoder.transform(tags).tolist() for tags in ner_tags]
label_list = label_encoder.classes_.tolist()
num_classes = len(label_list)

In [10]:
num_classes

18

In [11]:
label_list

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O',
 'nan']

In [12]:
encoded_tags[:4]

[[16,
  16,
  16,
  16,
  16,
  16,
  2,
  16,
  16,
  16,
  16,
  16,
  2,
  16,
  16,
  16,
  16,
  16,
  3,
  16,
  16,
  16,
  16,
  16],
 [3,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  7,
  16,
  16,
  16,
  5,
  16,
  16,
  16,
  16,
  16],
 [16,
  16,
  7,
  16,
  16,
  16,
  16,
  16,
  2,
  16,
  16,
  16,
  16,
  16,
  5,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  2,
  10,
  16],
 [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]]

# Create Dataset

In [13]:
print(f"tokens: {sentences[0]}")
print(f"tag encoded: {encoded_tags[0]}")

tokens: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
tag encoded: [16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16, 16]


In [14]:
data_dicts = [{"tokens": tokens, "ner_tags": tags} for tokens, tags in zip(sentences, encoded_tags)]

features = Features({
    'tokens': Sequence(Value(dtype='string')),
    'ner_tags': Sequence(Value(dtype='int64'))
})

dataset = Dataset.from_list(data_dicts, features=features)

In [15]:
data_dicts[0]

{'tokens': ['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 'ner_tags': [16,
  16,
  16,
  16,
  16,
  16,
  2,
  16,
  16,
  16,
  16,
  16,
  2,
  16,
  16,
  16,
  16,
  16,
  3,
  16,
  16,
  16,
  16,
  16]}

## Tokenize

In [16]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [17]:
special_tokens = {
    "additional_special_tokens": ["<PAD>"]
}
tokenizer.add_special_tokens(special_tokens)

1

In [None]:
def encode(input):
    tokens = input["tokens"]
    encoding = tokenizer(tokens, is_split_into_words=True, truncation=True, padding="max_length", max_length=128)
    word_ids = encoding.word_ids()
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(0)
        elif word_idx != previous_word_idx:
            labels.append(input['ner_tags'][word_idx])
        else:
            labels.append(input['ner_tags'][word_idx])
        previous_word_idx = word_idx
    encoding["labels"] = labels
    return encoding

encoded_dataset = dataset.map(encode)


Map:   0%|          | 0/16571 [00:00<?, ? examples/s]

In [19]:
split_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset_hf = split_dataset['train']
test_dataset_hf = split_dataset['test']


In [20]:
def gen(dataset):
    for item in dataset:
        labels = tf.one_hot(item["labels"], depth=num_classes)
        yield ({
            "input_ids": tf.convert_to_tensor(item["input_ids"], dtype=tf.int32),
            "attention_mask": tf.convert_to_tensor(item["attention_mask"], dtype=tf.int32)
        }, labels)

output_signature = (
    {
        "input_ids": tf.TensorSpec(shape=(128,), dtype=tf.int32),
        "attention_mask": tf.TensorSpec(shape=(128,), dtype=tf.int32),
    },
    tf.TensorSpec(shape=(128, num_classes), dtype=tf.float32),
)

batch_size=32

train_dataset = tf.data.Dataset.from_generator(lambda: gen(train_dataset_hf), output_signature=output_signature).shuffle(1000).batch(batch_size)
test_dataset = tf.data.Dataset.from_generator(lambda: gen(test_dataset_hf), output_signature=output_signature).batch(batch_size)


## Positional Encoding

In [21]:
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth
    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

## Positional Embedding

In [22]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, max_length=128):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[tf.newaxis, :length, :]
        return x

## Attention

In [23]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization()

In [24]:
class GlobalSelfAttention(BaseAttention):
    def call(self, x, mask=None):
        attn_output = self.mha(query=x, value=x, key=x, attention_mask=mask)
        x = self.add([x, attn_output])
        x = self.norm(x)
        return x

## Feedforward

In [25]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.norm(x)
        return x

## Encoder

In [26]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.attention = GlobalSelfAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, mask):
        x = self.attention(x, mask)
        x = self.ffn(x)
        return x

## Transformer

In [27]:
class TransformerEncoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, num_classes, dropout_rate=0.1):
        super().__init__()
        self.embedding = PositionalEmbedding(vocab_size, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.classifier = tf.keras.layers.Dense(num_classes)

    def call(self, inputs):
        x = inputs['input_ids']
        mask = tf.cast(tf.math.not_equal(inputs['attention_mask'], 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
        x = self.embedding(x)
        x = self.dropout(x)
        for enc_layer in self.enc_layers:
            x = enc_layer(x, mask)
        return self.classifier(x)

## Train

In [28]:
model = TransformerEncoder(
    num_layers=4,
    d_model=128,
    num_heads=8,
    dff=512,
    vocab_size=len(tokenizer),
    num_classes=num_classes,
    dropout_rate=0.1
)

In [29]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['categorical_accuracy']
)

In [None]:
model.fit(train_dataset, validation_data=test_dataset, epochs=1)

    366/Unknown [1m1186s[0m 3s/step - categorical_accuracy: 0.9328 - loss: 0.2977