## 介紹
- NER: 命名實體識別，對於以 word 為單位會給予更細緻的資訊，在一些特定可以產生大用。[參考資訊](https://medium.com/royes-researchcraft/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-3-%E5%91%BD%E5%90%8D%E5%AF%A6%E9%AB%94%E6%A8%99%E8%A8%BB-name-entity-recognition-%E7%90%86%E8%AB%96%E8%A8%AD%E8%A8%88%E7%AF%87-923348c31a7b)


In [1]:
# 取得資料集

!pip install datasets
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2021-09-06 02:35:35--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py.1’


2021-09-06 02:35:35 (40.7 MB/s) - ‘conlleval.py.1’ saved [7502/7502]



In [2]:
# 模組

import os
import numpy as np  
from datasets import load_dataset       # HuggingFace datasets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from conlleval import evaluate      # *

In [3]:
# 先從模型建構開始
# Transformer block, 需要encoder部分

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.ffn = keras.Sequential([
            layers.Dense(units=ff_dim, activation='relu'),
            layers.Dense(units=embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attention_output = self.att(inputs, inputs)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

In [5]:
# NER model

class NERTransformer(keras.Model):
    def __init__(self, num_tags, maxlen=128, vocab_size=30000, embed_dim=32, num_heads=2, ff_dim=32, rate=0.1):
        super(NERTransformer, self).__init__()
        self.embedding = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_encoder = TransformerEncoder(embed_dim, num_heads, ff_dim, rate)
        self.dropout1 = layers.Dropout(rate)
        self.ffn = layers.Dense(ff_dim, activation='relu')
        self.dropout2 = layers.Dropout(rate)
        self.classifier = layers.Dense(units=num_tags, activation='softmax')
    
    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.transformer_encoder(x)
        x = self.dropout1(x, training=training)
        x = self.ffn(x)
        x = self.dropout2(x, training=training)
        return self.classifier(x)

In [6]:
def create_model():
    inputs = keras.Input(shape=(128, ), dtype=tf.int32)
    outputs = NERTransformer(num_tags=10)(inputs)
    
    return keras.Model(inputs=inputs, outputs=outputs)

ner_model = create_model()
ner_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
ner_transformer (NERTransfor (None, 128, 10)           976138    
Total params: 976,138
Trainable params: 976,138
Non-trainable params: 0
_________________________________________________________________


In [7]:
ner_model(np.random.randint(2, 100, size=(1,128)))

<tf.Tensor: shape=(1, 128, 10), dtype=float32, numpy=
array([[[0.06237963, 0.09621482, 0.02808117, ..., 0.18108463,
         0.07764017, 0.06209963],
        [0.13527536, 0.02988676, 0.10768166, ..., 0.2491994 ,
         0.10046712, 0.04631824],
        [0.17903793, 0.12856357, 0.11218224, ..., 0.23698992,
         0.07657179, 0.02472914],
        ...,
        [0.18590496, 0.13187802, 0.00812685, ..., 0.29874146,
         0.22643621, 0.00977837],
        [0.18811896, 0.08036596, 0.00589105, ..., 0.07163271,
         0.20709743, 0.06145992],
        [0.04475464, 0.07802353, 0.01901839, ..., 0.19102167,
         0.30132622, 0.05419953]]], dtype=float32)>

## Load the CoNLL 2003 dataset from the datasets library and process it

In [8]:
conll_data = load_dataset("conll2003")
conll_data

Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [9]:
conll_data['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [10]:
print(conll_data['train']['tokens'][:10])
print(conll_data['train']['ner_tags'][:10])

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22'], ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['"', 'We', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'Commission', "'s", 'chief', 'spokesman', 'Nikolaus', 'van', 'der', 'Pas', 'told', 'a', 'news', 'briefing', '.'], ['He', 'said

In [12]:
# 將其轉換為tf.data.Dataset
# len   sentence of tokens sentence of ner_tags

def export_to_file(export_file_path, data):
    with open(export_file_path, "w") as f:
        for record in data:
            ner_tags = record["ner_tags"]
            tokens = record["tokens"]
            f.write(
                str(len(tokens))
                + "\t"
                + "\t".join(tokens)
                + "\t"
                + "\t".join(map(str, ner_tags))
                + "\n"
            )


os.mkdir("data")
export_to_file("./data/conll_train.txt", conll_data["train"])
export_to_file("./data/conll_val.txt", conll_data["validation"])

In [13]:
# 查詢table

def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "MISC"]
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))


mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}


In [14]:
# all_tokens = []
# for sub_tokens in conll_data['train']['tokens']:
#     all_tokens += sub_tokens
all_tokens = sum(conll_data["train"]["tokens"], [])
all_tokens_array = np.array(list(map(str.lower, all_tokens)))       # 小寫所有token

counter = Counter(all_tokens_array)
print(len(counter))            

num_tags = len(mapping)
vocab_size = len(counter)

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(
    vocabulary=vocabulary
)

21009


In [15]:
counter.most_common(10)

[('the', 8390),
 ('.', 7374),
 (',', 7290),
 ('of', 3815),
 ('in', 3621),
 ('to', 3424),
 ('a', 3199),
 ('and', 2872),
 ('(', 2861),
 (')', 2861)]

In [16]:
# 創造 tf.data.Dataset
# a line = a row data

train_data = tf.data.TextLineDataset("./data/conll_train.txt")
val_data = tf.data.TextLineDataset("./data/conll_val.txt")

In [17]:
for x in train_data:
    print(x)
    break

tf.Tensor(b'9\tEU\trejects\tGerman\tcall\tto\tboycott\tBritish\tlamb\t.\t3\t0\t7\t0\t0\t0\t7\t0\t0', shape=(), dtype=string)


In [18]:
print(list(train_data.take(1).as_numpy_iterator()))

[b'9\tEU\trejects\tGerman\tcall\tto\tboycott\tBritish\tlamb\t.\t3\t0\t7\t0\t0\t0\t7\t0\t0']


In [19]:
# 將dataset, 透過函數轉換成可使用的input

def map_record_to_training_data(record):
    record = tf.strings.split(input=record, sep='\t')
    length = tf.strings.to_number(record[0], out_type=tf.int32) # 此位置為 length
    tokens = record[1:length+1]     
    tags = record[length+1:]
    tags = tf.strings.to_number(tags, out_type=tf.int64)        # *
    tags += 1
    return tokens, tags

def lowercase_and_convert_to_ids(tokens):
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)

In [20]:
# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 32
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

# ner_model = create_model(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model = NERTransformer(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [21]:
# 透過自制loss去忽略padding計算, 用sample_weight也可以做到。

class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name='custom_ner_loss'):
        super(CustomNonPaddingTokenLoss, self).__init__(name=name)
    
    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast(x=(y_true > 0), dtype=tf.float32)        # 轉型
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

loss = CustomNonPaddingTokenLoss()

In [22]:
# 模型設定以及訓練

ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=10)


def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)


# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    "eu rejects german call to boycott british lamb"
)
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

# eu -> B-ORG, german -> B-MISC, british -> B-MISC
print(prediction)

Epoch 1/10


  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
tf.Tensor([[  988 10950   204   628     6  3938   215  5773]], shape=(1, 8), dtype=int64)
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O']


In [23]:
def calculate_metrics(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)


calculate_metrics(val_dataset)

processed 51362 tokens with 5942 phrases; found: 8790 phrases; correct: 4113.
accuracy:  66.10%; (non-O)
accuracy:  89.40%; precision:  46.79%; recall:  69.22%; FB1:  55.84
              LOC: precision:  50.95%; recall:  81.93%; FB1:  62.83  2954
             MISC: precision:  55.34%; recall:  68.00%; FB1:  61.02  1133
              ORG: precision:  30.57%; recall:  69.80%; FB1:  42.52  3062
              PER: precision:  63.68%; recall:  56.73%; FB1:  60.01  1641
