## 介紹
- NER: 命名實體識別，對於以 word 為單位會給予更細緻的資訊，在一些特定可以產生大用。[參考資訊](https://medium.com/royes-researchcraft/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-3-%E5%91%BD%E5%90%8D%E5%AF%A6%E9%AB%94%E6%A8%99%E8%A8%BB-name-entity-recognition-%E7%90%86%E8%AB%96%E8%A8%AD%E8%A8%88%E7%AF%87-923348c31a7b)


In [1]:
# 取得資料集

!pip install datasets
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[?25l[K     |█                               | 10 kB 17.2 MB/s eta 0:00:01[K     |██▏                             | 20 kB 22.5 MB/s eta 0:00:01[K     |███▏                            | 30 kB 12.2 MB/s eta 0:00:01[K     |████▎                           | 40 kB 9.1 MB/s eta 0:00:01[K     |█████▍                          | 51 kB 5.7 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 5.8 MB/s eta 0:00:01[K     |███████▌                        | 71 kB 5.7 MB/s eta 0:00:01[K     |████████▋                       | 81 kB 6.4 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 6.4 MB/s eta 0:00:01[K     |██████████▊                     | 102 kB 5.4 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 5.4 MB/s eta 0:00:01[K     |████████████▉                   | 122 kB 5.4 MB/s eta 0:00:01[K     |██████████████                  | 133 kB 5.4 MB/s eta 0:00:01

In [2]:
# 模組

import os
import numpy as np  
from datasets import load_dataset       # HuggingFace datasets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from conlleval import evaluate      

In [3]:
# 先從模型建構開始
# Transformer block, 需要encoder部分

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.ffn = keras.Sequential([
            layers.Dense(units=ff_dim, activation='relu'),
            layers.Dense(units=embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attention_output = self.att(inputs, inputs)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        
        return self.layernorm2(out1 + ffn_output)

In [8]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

In [9]:
# NER model

class NERTransformer(keras.Model):
    def __init__(self, num_tags, maxlen=128, vocab_size=30000, embed_dim=32, num_heads=2, ff_dim=32, rate=0.1):
        super(NERTransformer, self).__init__()
        self.embedding = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_encoder = TransformerEncoder(embed_dim, num_heads, ff_dim, rate)
        self.dropout1 = layers.Dropout(rate)
        self.ffn = layers.Dense(ff_dim, activation='relu')
        self.dropout2 = layers.Dropout(rate)
        self.classifier = layers.Dense(units=num_tags, activation='softmax')
    
    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.transformer_encoder(x)
        x = self.dropout1(x, training=training)
        x = self.ffn(x)
        x = self.dropout2(x, training=training)
        return self.classifier(x)

In [10]:
def create_model():
    inputs = keras.Input(shape=(128, ), dtype=tf.int32)
    outputs = NERTransformer(num_tags=10)(inputs)
    
    return keras.Model(inputs=inputs, outputs=outputs)

ner_model = create_model()
ner_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128)]             0         
                                                                 
 ner_transformer_1 (NERTrans  (None, 128, 10)          976138    
 former)                                                         
                                                                 
Total params: 976,138
Trainable params: 976,138
Non-trainable params: 0
_________________________________________________________________


In [11]:
# test 模型建構

ner_model(np.random.randint(2, 100, size=(1,128)))

<tf.Tensor: shape=(1, 128, 10), dtype=float32, numpy=
array([[[0.00557135, 0.1340287 , 0.06035688, ..., 0.09155902,
         0.03129598, 0.06532156],
        [0.00953622, 0.29689997, 0.20168866, ..., 0.0962644 ,
         0.06850299, 0.02784863],
        [0.08990407, 0.13213487, 0.02095668, ..., 0.10830377,
         0.14824402, 0.1417393 ],
        ...,
        [0.03120897, 0.09387311, 0.25704387, ..., 0.18323582,
         0.08552855, 0.05473125],
        [0.07588514, 0.23372371, 0.09378874, ..., 0.18578942,
         0.09087655, 0.03129123],
        [0.03479563, 0.2554504 , 0.14394003, ..., 0.23075898,
         0.15841122, 0.03928217]]], dtype=float32)>

## Load the CoNLL 2003 dataset from the datasets library and process it
- 經典的資料集，[中文參考文章](https://yuanxiaosc.github.io/2018/12/26/%E5%91%BD%E5%90%8D%E5%AE%9E%E4%BD%93%E8%AF%86%E5%88%ABCoNLL2003/)

In [12]:
conll_data = load_dataset("conll2003")
conll_data

Downloading:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/146k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [13]:
# 觀察 train data
conll_data['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [15]:
len(conll_data['train']['tokens'])

14041

In [16]:
# 每一個row 是一個句子，已經以word為單位分開，對應的ner_tags 則是標籤

for i in range(2):
    print(conll_data['train']['tokens'][i])
    print(conll_data['train']['ner_tags'][i])
    print('-'*20)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]
--------------------
['Peter', 'Blackburn']
[1, 2]
--------------------


In [17]:
# 先將檔案轉換成txt, 後續透過tf.data.Dataset導入

def export_to_file(export_file_path, data):
    """
        將檔案寫成txt檔案。
        
        token數目   整句話  每一個token的標籤(str)
    """
    with open(export_file_path, "w") as f:
        for record in data:
            ner_tags = record["ner_tags"]
            tokens = record["tokens"]
            f.write(
                str(len(tokens))
                + "\t"
                + "\t".join(tokens)
                + "\t"
                + "\t".join(map(str, ner_tags))
                + "\n"
            )

# 創建資料夾並且寫入
os.mkdir("data")
export_to_file("./data/conll_train.txt", conll_data["train"])
export_to_file("./data/conll_val.txt", conll_data["validation"])

In [18]:
# 查詢table

def make_tag_lookup_table():
    """
        建立查詢標籤表，共10種標籤。
    """
    iob_labels = ["B", "I"]         # Begin, Inside
    ner_labels = ["PER", "ORG", "LOC", "MISC"]  # 人、組織、地點、
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))


mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}


In [19]:
all_tokens = sum(conll_data["train"]["tokens"], [])
all_tokens_array = np.array(list(map(str.lower, all_tokens)))       # 小寫所有token

counter = Counter(all_tokens_array)
print(len(counter))            

num_tags = len(mapping)
vocab_size = len(counter)

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(
    vocabulary=vocabulary
)

21009


In [20]:
counter.most_common(10)

[('the', 8390),
 ('.', 7374),
 (',', 7290),
 ('of', 3815),
 ('in', 3621),
 ('to', 3424),
 ('a', 3199),
 ('and', 2872),
 ('(', 2861),
 (')', 2861)]

In [22]:
# 簡單查找

lookup_layer('apple')

<tf.Tensor: shape=(), dtype=int64, numpy=9001>

In [23]:
# 創造 tf.data.Dataset
# a line = a row data

train_data = tf.data.TextLineDataset("./data/conll_train.txt")
val_data = tf.data.TextLineDataset("./data/conll_val.txt")

In [24]:
# 看一row資料
for x in train_data:
    print(x)
    break

tf.Tensor(b'9\tEU\trejects\tGerman\tcall\tto\tboycott\tBritish\tlamb\t.\t3\t0\t7\t0\t0\t0\t7\t0\t0', shape=(), dtype=string)


In [26]:
# 將dataset, 透過函數轉換成可使用的input

def map_record_to_training_data(record):
    record = tf.strings.split(input=record, sep='\t')           # 因為整理資料時，是用\t分割。
    length = tf.strings.to_number(record[0], out_type=tf.int32) # 此位置為 length
    tokens = record[1:length+1]                                 # 這位置為 所有tokens
    tags = record[length+1:]                                    # 此位置為 標籤
    tags = tf.strings.to_number(tags, out_type=tf.int64)        # 將標籤轉換成 int 才不會模型後續出錯
    tags += 1
    return tokens, tags

def lowercase_and_convert_to_ids(tokens):
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)

### [padded_batch](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#padded_batch)

In [33]:
# We use `padded_batch` here because each record in the dataset has a
# different length.

# 因為句子長度不依，透過padded_batch將其轉換

batch_size = 32
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)       # 不給定padded_shapes, 則會根據每一個batch延伸至最大的長度
)
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERTransformer(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [34]:
# 透過自制loss去忽略padding計算, 用sample_weight也可以做到。

class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name='custom_ner_loss'):
        super(CustomNonPaddingTokenLoss, self).__init__(name=name)
    
    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False,                           # from_logits=True 代表出來是值而不是機率，當沒有使用softmax時使用。
            reduction=keras.losses.Reduction.NONE       # 代表不要縮減
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast(x=(y_true > 0), dtype=tf.float32)        # 轉型, y_true > 0 剛好排除了 [PAD]
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)        # 因為前面reduction沒有使用，故此處使用。

loss = CustomNonPaddingTokenLoss()

In [35]:
# 模型設定以及訓練

ner_model.compile(optimizer="adam", loss=loss)     # metrics 新加的
ner_model.fit(train_dataset, epochs=10)


def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)


# Sample inference using the trained model
# 拿一個樣本當做預測
sample_input = tokenize_and_convert_to_ids(
    "eu rejects german call to boycott british lamb"
)
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

# 預測並透過np.argmax取得最大機率的位置也就是標籤，再透過mapping 轉換
output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

# eu -> B-ORG, german -> B-MISC, british -> B-MISC
print(prediction)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
tf.Tensor([[  988 10950   204   628     6  3938   215  5773]], shape=(1, 8), dtype=int64)
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O']


In [36]:
def calculate_metrics(dataset):
    """
        計算真實的準確率，NER 任務比較看重 recall(所有正樣本中，預測多少百分比正確)
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x)
        predictions = np.argmax(output, axis=-1)        # 預測標籤
        predictions = np.reshape(predictions, [-1])     

        true_tag_ids = np.reshape(y, [-1])              # 真實標籤

        mask = (true_tag_ids > 0) & (predictions > 0)   # 只算除了PAD只外的標籤
        true_tag_ids = true_tag_ids[mask]               # 真實標籤基數
        predicted_tag_ids = predictions[mask]           # 預測標籤基數

        all_true_tag_ids.append(true_tag_ids)           # 丟入batch
        all_predicted_tag_ids.append(predicted_tag_ids) # 丟入batch

    all_true_tag_ids = np.concatenate(all_true_tag_ids)             # 將一個個batch concatenate
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)   # 將一個個batch concatenate

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)
    return predicted_tags, real_tags


predicted_tags, real_tags = calculate_metrics(val_dataset)

processed 51362 tokens with 5942 phrases; found: 8493 phrases; correct: 3853.
accuracy:  62.68%; (non-O)
accuracy:  89.04%; precision:  45.37%; recall:  64.84%; FB1:  53.38
              LOC: precision:  40.61%; recall:  77.30%; FB1:  53.24  3497
             MISC: precision:  38.72%; recall:  69.63%; FB1:  49.77  1658
              ORG: precision:  44.86%; recall:  63.16%; FB1:  52.46  1888
              PER: precision:  65.10%; recall:  51.25%; FB1:  57.35  1450


## 換上預訓練 BERT 來實作