In [None]:
! pip install transformers
! pip install seqeval

# 1. Load Data

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from transformers import shape_list, BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import f1_score, classification_report
import tensorflow as tf
import urllib.request

* Bring train and test data, and labels

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_train_data.csv", filename="ner_train_data.csv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_test_data.csv", filename="ner_test_data.csv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_label.txt", filename="ner_label.txt")

('ner_label.txt', <http.client.HTTPMessage at 0x7f7f4e32e910>)

In [4]:
train_ner_df = pd.read_csv("ner_train_data.csv")

In [5]:
test_ner_df = pd.read_csv("ner_test_data.csv")

In [6]:
train_ner_df.head()

Unnamed: 0,Sentence,Tag
0,"정은 씨를 힘들게 한 가스나그, 가만둘 수 없겠죠 .",PER-B O O O O O O O O
1,▶ 쿠마리 한동수가 말하는 '가넷 & 에르덴',O PER-B PER-I O PER-B O PER-B
2,슈나이더의 프레젠테이션은 말 청중을 위한 특별한 쇼다 .,PER-B O O CVL-B O O O O
3,지구 최대 연료탱크 수검 회사 구글이 연내 22명 안팎의 인력을 갖춘 연구개발(R&...,O O TRM-B O O ORG-B DAT-B NUM-B O O O ORG-B LO...
4,5. <10:00:TI_HOUR> 도이치증권대 <0:1:QT_SPORTS> 연예오락...,NUM-B O ORG-B O ORG-B


In [7]:
test_ner_df.head()

Unnamed: 0,Sentence,Tag
0,"라티은-원윤정, 휘닉스파크클래식 프로골퍼",PER-B EVT-B CVL-B
1,5원으로 맺어진 애인까지 돈이라는 민감한 원자재를 통해 현대인의 물질만능주의를 꼬집...,NUM-B O O O O O O O O O O O FLD-B O
2,-날로 삼키면 맛이 어떤지 일차 드셔보시겠어요 .,O O O O NUM-B O O
3,"-네, 지었습니다 .",O O O
4,◇신규 투자촉진에 방점=이번 접속료 조정결과에서 눈에 띄는 지점은 WCDMA/HSD...,O O O O O O O O TRM-B O TRM-B TRM-I ORG-B O TR...


In [8]:
print("Number of Training Sample :", len(train_ner_df))
print("Number of Test Sample :", len(test_ner_df))

Number of Training Sample : 81000
Number of Test Sample : 9000


* Separate sentences and labels in traning and test dataset

In [9]:
train_data_sentence = [sent.split() for sent in train_ner_df['Sentence'].values]
test_data_sentence = [sent.split() for sent in test_ner_df['Sentence'].values]
train_data_label = [tag.split() for tag in train_ner_df['Tag'].values]
test_data_label = [tag.split() for tag in test_ner_df['Tag'].values]

* Arbitrary Sampling a sentence and label

In [10]:
print(train_data_sentence[2])
print(train_data_label[2])

['슈나이더의', '프레젠테이션은', '말', '청중을', '위한', '특별한', '쇼다', '.']
['PER-B', 'O', 'O', 'CVL-B', 'O', 'O', 'O', 'O']


* Labels are tagged based on white-space divisions
* Read 'ner_label.txt' and check information on tagging

In [11]:
labels = [label.strip() for label in open('ner_label.txt', 'r', encoding='utf-8')]
print('Entity taggings :', labels)

Entity taggings : ['O', 'PER-B', 'PER-I', 'FLD-B', 'FLD-I', 'AFW-B', 'AFW-I', 'ORG-B', 'ORG-I', 'LOC-B', 'LOC-I', 'CVL-B', 'CVL-I', 'DAT-B', 'DAT-I', 'TIM-B', 'TIM-I', 'NUM-B', 'NUM-I', 'EVT-B', 'EVT-I', 'ANM-B', 'ANM-I', 'PLT-B', 'PLT-I', 'MAT-B', 'MAT-I', 'TRM-B', 'TRM-I']


In [12]:
tag_to_index = {tag: index for index, tag in enumerate(labels)}
index_to_tag = {index: tag for index, tag in enumerate(labels)}

In [13]:
print(tag_to_index)
print(index_to_tag)

{'O': 0, 'PER-B': 1, 'PER-I': 2, 'FLD-B': 3, 'FLD-I': 4, 'AFW-B': 5, 'AFW-I': 6, 'ORG-B': 7, 'ORG-I': 8, 'LOC-B': 9, 'LOC-I': 10, 'CVL-B': 11, 'CVL-I': 12, 'DAT-B': 13, 'DAT-I': 14, 'TIM-B': 15, 'TIM-I': 16, 'NUM-B': 17, 'NUM-I': 18, 'EVT-B': 19, 'EVT-I': 20, 'ANM-B': 21, 'ANM-I': 22, 'PLT-B': 23, 'PLT-I': 24, 'MAT-B': 25, 'MAT-I': 26, 'TRM-B': 27, 'TRM-I': 28}
{0: 'O', 1: 'PER-B', 2: 'PER-I', 3: 'FLD-B', 4: 'FLD-I', 5: 'AFW-B', 6: 'AFW-I', 7: 'ORG-B', 8: 'ORG-I', 9: 'LOC-B', 10: 'LOC-I', 11: 'CVL-B', 12: 'CVL-I', 13: 'DAT-B', 14: 'DAT-I', 15: 'TIM-B', 16: 'TIM-I', 17: 'NUM-B', 18: 'NUM-I', 19: 'EVT-B', 20: 'EVT-I', 21: 'ANM-B', 22: 'ANM-I', 23: 'PLT-B', 24: 'PLT-I', 25: 'MAT-B', 26: 'MAT-I', 27: 'TRM-B', 28: 'TRM-I'}


In [14]:
tag_size = len(tag_to_index)
print("Tag Size :", tag_size)

Tag Size : 29


# 2. Understanding Preprocessing

In [15]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

* Sample a sentence and label and process it
* Which will help understanding pre-processing process

In [16]:
sent = train_data_sentence[1]
label = train_data_label[1]

print("Sentence :", sent)
print("Label :", label)
print("Integer encoding of label", [tag_to_index[idx] for idx in label])
print("Length of sentence", len(sent))
print("Length of Label", len(label))

Sentence : ['▶', '쿠마리', '한동수가', '말하는', "'가넷", '&', "에르덴'"]
Label : ['O', 'PER-B', 'PER-I', 'O', 'PER-B', 'O', 'PER-B']
Integer encoding of label [0, 1, 2, 0, 1, 0, 1]
Length of sentence 7
Length of Label 7


* With the sampled data, understand the process of BERT Tokenizer. 
* For sentences with word-level tokenization, we need to apply sub-word tokenization first

In [17]:
tokens = []

for one_word in sent:
    # Separate a word into subwords
    # ex) one_word = 'embeddings' ===> subword_tokens = ['embed', '##ing', '##s']
    subword_tokens = tokenizer.tokenize(one_word)
    tokens.extend(subword_tokens)

print("Sentence after BERT Tokenize :", tokens)
print("Label", label)
print("Integer Encoding :", [tag_to_index[idx] for idx in label])
print("Sentence Length :", len(tokens))
print("Label Length :", len(label))

Sentence after BERT Tokenize : ['▶', '쿠', '##마리', '한동', '##수', '##가', '말', '##하', '##는', "'", '가', '##넷', '&', '에르', '##덴', "'"]
Label ['O', 'PER-B', 'PER-I', 'O', 'PER-B', 'O', 'PER-B']
Integer Encoding : [0, 1, 2, 0, 1, 0, 1]
Sentence Length : 16
Label Length : 7


* Now, we can see that the length of label and that of sentence become vary as we adopted subword tokenizer
* We need to match both lengths. What can be done?
    * Assign label only to the first subword as give -100
    * This will be treated in same manner as with padding token
    * token with -100 will be ignored in loss function also

In [18]:
tokens = []
labels_ids = []

for one_word, label_token in zip(train_data_sentence[1], train_data_label[1]):
    subword_tokens = tokenizer.tokenize(one_word)
    tokens.extend(subword_tokens)
    # 1st for-loop: label_1 label_2 -100
    # 2nd for-loop: label_1 label_2 -100 label_3 l1bel_4 -100 ...
    # 3rd ...
    labels_ids.extend([tag_to_index[label_token]] + [-100] * (len(subword_tokens) - 1))

print("Sentence tokenized :", tokens)
print("Label :", ['[PAD]' if idx == -100 else index_to_tag[idx] for idx in labels_ids])
print("Integer Encoding of Label :", labels_ids)
print("Sentence Length :", len(tokens))
print("Label Length :", len(labels_ids))

Sentence tokenized : ['▶', '쿠', '##마리', '한동', '##수', '##가', '말', '##하', '##는', "'", '가', '##넷', '&', '에르', '##덴', "'"]
Label : ['O', 'PER-B', '[PAD]', 'PER-I', '[PAD]', '[PAD]', 'O', '[PAD]', '[PAD]', 'PER-B', '[PAD]', '[PAD]', 'O', 'PER-B', '[PAD]', '[PAD]']
Integer Encoding of Label : [0, 1, -100, 2, -100, -100, 0, -100, -100, 1, -100, -100, 0, 1, -100, -100]
Sentence Length : 16
Label Length : 16


# Pre-processing

In [19]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer, 
                                 pad_token_id_for_segment=0, 
                                 pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        tokens = []
        labels_ids = []
        for one_word, label_token in zip(example, label):
            # Subword tokenize for each word
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            # Assign entity label only to the first subword, do -100 to others
            labels_ids.extend([tag_to_index[label_token]] + 
                              [pad_token_id_for_label] * (len(subword_tokens) - 1))
            
        # Considering that we will add [CLS] and [SEP], 
        # Truncate sample with length over max_seq_len to (max_seq_len - 2)
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            labels_ids = labels_ids[:(max_seq_len - special_tokens_count)]
        
        # Code for adding [SEP]
        # 1. Add [SEP] at the end of sequence
        # 2. Same for labels
        tokens += [sep_token]
        labels_ids += [pad_token_id_for_label]

        # Code for adding [CLS]
        # 1. Add [CLS] at the commece of sequence
        # 2. Same for labels

        tokens = [cls_token] + tokens
        labels_ids = [pad_token_id_for_label] + labels_ids

        # Integer Encoding
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        # Attention Mask
        attention_mask = [1] * len(input_id)

        # Calc padding length for integer encoding
        padding_count = max_seq_len - len(input_id)

        # Pad to integer encoding and attention mask
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)

        # Segment encoding
        token_type_id = [pad_token_id_for_segment] * max_seq_len

        # Pad Labels (Idx of padding token is -100)
        label = labels_ids + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with input length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with input length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label) == max_seq_len, "Error with input length {} vs {}".format(len(label), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.asarray(input_ids, dtype=int)
    attention_masks = np.asarray(attention_masks, dtype=int)
    token_type_ids = np.asarray(token_type_ids, dtype=int)
    data_labels = np.asarray(data_labels, dtype=int)

    return (input_ids, attention_masks, token_type_ids), data_labels


In [20]:
X_train, y_train = convert_examples_to_features(train_data_sentence, train_data_label, max_seq_len=128, tokenizer=tokenizer)
X_test, y_test = convert_examples_to_features(test_data_sentence, test_data_label, max_seq_len=128, tokenizer=tokenizer)

100%|██████████| 81000/81000 [00:42<00:00, 1887.64it/s]
100%|██████████| 9000/9000 [00:04<00:00, 1907.80it/s]


In [21]:
print('Original Sentence :', train_data_label[0])
print('-' * 50)
print('Tokenized Sentence :', [tokenizer.decode([word]) for word in X_train[0][0]])
print('Label after Tokenization :', ['[PAD]' if idx == -100 else index_to_tag[idx] for idx in y_train[0]])
print('-' * 50)
print('Integer Encoding :', X_train[0][0])
print('Integer Encoded Label :', y_train[0])

Original Sentence : ['PER-B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------------------------------------------------
Tokenized Sentence : ['[CLS]', '정은', '씨', '##를', '힘들', '##게', '한', '가스', '##나', '##그', ',', '가만', '##둘', '수', '없', '##겠', '##죠', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

* In NER, we don't need to discern two sentences
    * Thus all segment encodings will have same value of 0

* In case of Attention Mask, Assign 1 from [CLS] to [SEP] and Assign 0 to [PAD]

In [22]:
print("Segment Encoding :", X_train[2][0])
print("Attention Mask :", X_train[1][0])

Segment Encoding : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Attention Mask : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Modeling

* output[1] is a way to approach to [CLS] token, which used for Many-to-One like Text Classification
* NER is Many-to-Many, so we need to access outputs[0]
* Pass over num_layers to output layer connected with outputs[0]. 
* Intentionally won't use softmax in output layer. Instead, loss function will handle this 

In [23]:
class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')
    
    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # Make sure outputs[0] in order to classify entire sequence
        all_output = outputs[0]
        prediction = self.classifier(all_output)

        return prediction

# Exclude -100 labels in Loss Function (Example)

* -100 is an encoding for label, which we want to ignore when calculating loss 
* Let's assume that logits is predicted value of a model (3-dimensional probability distribution after softmax function, so sum to 1)

In [24]:
labels = tf.constant([[-100, 2, 1, -100]])
logits = tf.constant([[ [0.8, 0.1, 0.1], 
                        [0.06, 0.04, 0.9],
                        [0.75, 0.1, 0.15],
                        [0.4, 0.5, 0.1] ]])

* active_loss returns False when a label is -100, True otherwise

In [25]:
active_loss = tf.reshape(labels, (-1)) != -100
print(active_loss)

tf.Tensor([False  True  True False], shape=(4,), dtype=bool)


* Remain values that are not -100

In [26]:
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
print(reduced_logits)

tf.Tensor(
[[0.06 0.04 0.9 ]
 [0.75 0.1  0.15]], shape=(2, 3), dtype=float32)


* Do same thing on labels

In [27]:
labels = tf.boolean_mask(tf.reshape(labels, (-1)), active_loss)
print(labels)

tf.Tensor([2 1], shape=(2,), dtype=int32)


# Define Loss Fuction

* As a loss function, Use SparseCategoricalCrossentropy for multi-label classification
* We didn't use softmax fuction in our output layer of TFBertForTokenClassification. Thus the sum of a vector is not 1. 
* In this case, if we set 'from_logits=True', SparseCategoricalCrossentropy calculates loss considering that.  

In [38]:
def compute_loss(labels, logits):

    # When we don't use softmax in multiclass classification, 
    # Set from_logit = True

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    
    # revises labels to not reflect loss for label with -100 
    active_loss = tf.reshape(labels, (-1,)) != -100

    # Get reduced_logits and labels from active_loss
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
    labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)

    return loss_fn(labels, reduced_logits)

# def compute_loss(labels, logits):
#     # 다중 클래스 분류 문제에서 소프트맥스 함수 미사용 시 from_logits=True로 설정
#     loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
#                                                             reduction=tf.keras.losses.Reduction.NONE)
#     # -100의 값을 가진 정수에 대해서는 오차를 반영하지 않도록 labels를 수정.
#     active_loss = tf.reshape(labels, (-1,)) != -100

#     # activa_loss 로 부 터 reduced_logits 과 labels 를 각 각 얻 는 다 .
#     reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
#     labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) 13
#     return loss_fn(labels, reduced_logits)

# Learning / Fitting

In [39]:
#######################
# Code for TPU
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

strategy = tf.distribute.experimental.TPUStrategy(resolver)

with strategy.scope():
#######################
    model = TFBertForTokenClassification("klue/bert-base", num_labels=tag_size)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=compute_loss)    

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [42]:
class F1score(tf.keras.callbacks.Callback):
    def __init__(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test
    
    def sequences_to_tags(self, label_ids, pred_ids):
        label_list = []
        pred_list = []

        for i in range(0, len(label_ids)):
            label_tag = []
            pred_tag = []

            # If the value of a lbel is -100, don't calc F1 score
            # ex) label decoding process
            # label_index : [1 -100 2 -100] ===> [1 2] ===> label_tag : [PER-B PER-I]

            for label_index, pred_index in zip(label_ids[i], pred_ids[i]):
                if label_index != -100:
                    label_tag.append(index_to_tag[label_index])
                    pred_tag.append(index_to_tag[pred_index])
            
            label_list.append(label_tag)
            pred_list.append(pred_tag)
        
        return label_list, pred_list
    
    def on_epoch_end(self, epoch, logs={}):

        y_predicted = self.model.predict(self.X_test)
        y_predicted = np.argmax(y_predicted, axis = 2)

        label_list, pred_list = self.sequences_to_tags(self.y_test, y_predicted)

        score = f1_score(label_list, pred_list, suffix=True)
        print('  - f1: {:04.2f}'.format(score * 100))
        print(classification_report(label_list, pred_list, suffix=True))

In [43]:
f1_score_report = F1score(X_test, y_test)
model.fit(
    X_train, y_train, 
    epochs=3, batch_size=32,
    callbacks = [f1_score_report])

Epoch 1/3
   6/2532 [..............................] - ETA: 2:58 - loss: 0.1554



  - f1: 86.11
              precision    recall  f1-score   support

         AFW       0.72      0.57      0.64       394
         ANM       0.72      0.79      0.75       701
         CVL       0.82      0.86      0.84      5758
         DAT       0.92      0.92      0.92      2521
         EVT       0.73      0.79      0.76      1094
         FLD       0.68      0.57      0.62       228
         LOC       0.87      0.86      0.86      2126
         MAT       0.33      0.08      0.13        12
         NUM       0.90      0.93      0.92      5590
         ORG       0.87      0.88      0.87      4086
         PER       0.89      0.90      0.89      4426
         PLT       0.67      0.29      0.41        34
         TIM       0.86      0.88      0.87       314
         TRM       0.80      0.72      0.76      1964

   micro avg       0.86      0.87      0.86     29248
   macro avg       0.77      0.72      0.73     29248
weighted avg       0.86      0.87      0.86     29248

Epoch 2/3
 

<keras.callbacks.History at 0x7f7e4465f9d0>

# Prediction

In [46]:
def convert_examples_to_features_for_prediction(examples, max_seq_len, 
                                                tokenizer, 
                                                pad_token_id_for_segment=0,
                                                pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            # Subword tokenize each word
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            # Except first subword, fill other labels with -100
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) -1))
        
        # Considering that we will add tokens for [CLS] and [SEP], 
        # Truncate sample with length over max_seq_len to (max_seq_len - 2)
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[ : (max_seq_len - special_tokens_count)]
            label_mask = label_mask[: (max_seq_len - special_tokens_count)]
        
         # Code for adding [SEP]
        # 1. Add [SEP] at the end of sequence
        # 2. Same for labels
        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]

        # Code for adding [CLS]
        # 1. Add [CLS] at the commece of sequence
        # 2. Same for labels
        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask

        # Integer Encoding
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        # Attention Mask
        attention_mask = [1] * len(input_id)

        # Calc padding length for integer encoding
        padding_count = max_seq_len - len(input_id)

        # Pad to integer encoding and attention mask
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)

        # Segment encoding
        token_type_id = [pad_token_id_for_segment] * max_seq_len

        # Pad Labels (Idx of padding token is -100)
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with input length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with input length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with input length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label)

    input_ids = np.asarray(input_ids, dtype=int)
    attention_masks = np.asarray(attention_masks, dtype=int)
    token_type_ids = np.asarray(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=int)

    return (input_ids, attention_masks, token_type_ids), label_masks

* Sample 5 data from test set and do test whether NER model works

In [None]:
X_pred, label_masks = convert_examples_to_features_for_prediction(
    test_data_sentence[:5], max_seq_len=128, tokenizer=tokenizer)

In [None]:
print("Original Sentence :", test_data_sentence[0])
print('-' * 50)
print("Tokenized Sentence :", [tokenizer.decode([word]) for word in X_pred[0][0]])
print("Label Mask :", ['[PAD]' if idx == -100 else '[FIRST]' for idx in label_masks[0]])

In [48]:
def ner_prediction(examples, max_seq_len, tokenizer):
    examples = [sent.split() for sent in examples]
    X_pred, label_masks = convert_examples_to_features_for_prediction(examples, 
                                                                      max_seq_len=128, 
                                                                      tokenizer=tokenizer)
    y_predicted = model.predict(X_pred)
    y_predicted = np.argmax(y_predicted, axis=2)

    pred_list = []
    result_list = []

    for i in range(0, len(label_masks)):
        pred_tag = []

        # ex) Model's decoding process
        # From y_predicted, remove the value located in the place where the value of label_masks is -100
        # label_masks : [-100 0 -100 0 -100]
        # y_predicted : [  0  1   0   2   0 ] ===> [1 2] ===> final_prediction (pred_tag) : [PER-B PER-I]

        for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
            if label_index != -100:
                pred_tag.append(index_to_tag[pred_index])
        
        pred_list.append(pred_tag)

    for example, pred in zip(examples, pred_list):
        one_sample_result = []
        for one_word, label_token in zip(example, pred):
            one_sample_result.append((one_word, label_token))
        result_list.append(one_sample_result)
    
    return result_list

* Let's predicy Entity name for two sentences that are not exist in training data

In [None]:
sent1 = '오리온스는 리그 최정상급 포인트가드 김동훈을 앞세우는 빠른 공수전환이 돋보이는 팀이다'
sent2 = '하이신사에 속한 섬들도 위로 솟아 있는데 타인은 살고 있어요'
test_samples = [sent1, sent2]
result_list = ner_prediction(test_samples, max_seq_len=128, tokenizer=tokenizer )
result_list