# Kashgari NER Benchmarks

- Kashgari: 2.1.0
- TensorFlow: 2.1.0

## Prepare Environment and Data

Download the embeddings into the embeddings folder (`EMBEDDING_FOLDER`) and unzip them.
- [BERT-Base, Chinese](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)

Requirements:

- Kashgari: 2.1.0
- TensorFlow: 2.1.0

Macros:

- SEQUENCE_LENGTH: training sequence length

In [None]:
!pip uninstall -y kashgari
!pip install git+https://github.com/BrikerMan/Kashgari.git@kashgari2

In [85]:
# Setup macros
# Central configuration for the benchmark; the cells below read these names.
SEQUENCE_LENGTH = 200  # passed as `sequence_length` when each model is constructed
EPOCHS = 30  # maximum number of epochs handed to `model.fit`
EARL_STOPPING_PATIENCE = 5  # `patience` of the EarlyStopping callback
REDUCE_RL_PATIENCE = 5  # `patience` of the ReduceLROnPlateau callback

BATCH_SIZE = 64  # intended training mini-batch size

EMBEDDING_FOLDER = './embeddings'  # root folder containing the unzipped pre-trained embeddings
TF_LOG_FOLDER = './tf_dir'  # parent directory for the per-run TensorBoard logs
LOG_FILE_PATH = './ner_training_log.json'  # JSON file accumulating evaluation logs keyed by run name

In [86]:
# Optional machine-specific override of the embeddings location.
# Set the KASHGARI_EMBEDDING_FOLDER environment variable to point at a local
# copy of the embeddings; otherwise the value from the config cell is kept.
# (Replaces a hard-coded absolute path that only existed on one machine.)
import os

EMBEDDING_FOLDER = os.environ.get('KASHGARI_EMBEDDING_FOLDER', EMBEDDING_FOLDER)

In [87]:
DOWNLOAD_DATASET = False

if DOWNLOAD_DATASET:
    !mkdir embeddings
    # Download BERT
    !wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
    !unzip embeddings/chinese_L-12_H-768_A-12.zip
    !mv chinese_L-12_H-768_A-12 embeddings/chinese_L-12_H-768_A-12
    # Download Albert-Tiny-Google
    !wget https://storage.googleapis.com/albert_zh/albert_tiny_zh_google.zip
    !unzip albert_tiny_zh_google.zip
    !mv albert_tiny_zh_google embeddings/albert_tiny_zh_google

In [88]:
from kashgari.corpus import ChineseDailyNerCorpus

# Load the train / test / valid splits of ChineseDailyNerCorpus, in that order.
(train_x, train_y), (test_x, test_y), (valid_x, valid_y) = (
    ChineseDailyNerCorpus.load_data(split_name)
    for split_name in ('train', 'test', 'valid')
)

# Keep only the first 100 training samples.
train_x, train_y = train_x[:100], train_y[:100]

In [89]:
import os
import json
from tensorflow import keras
from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiGRU_Model, BiGRU_CRF_Model
from kashgari.callbacks import EvalCallBack

from kashgari.embeddings import WordEmbedding, BertEmbedding
from IPython.display import clear_output

In [90]:
# `TransformerEmbedding` is used below but was missing from the import cell,
# which raised a NameError on a fresh kernel.
from kashgari.embeddings import TransformerEmbedding

# voidful/albert_chinese_tiny https://github.com/brightmart/albert_zh

# Paths of the ALBERT-tiny files inside EMBEDDING_FOLDER.
albert_tiny_path = os.path.join(EMBEDDING_FOLDER, 'albert_tiny_zh_google')
albert_tiny_vocab = os.path.join(albert_tiny_path, 'vocab.txt')
albert_tiny_config = os.path.join(albert_tiny_path, 'albert_config_tiny_g.json')
albert_tiny_checkpoint = os.path.join(albert_tiny_path, 'albert_model.ckpt')
albert_tiny = TransformerEmbedding(vocab_path=albert_tiny_vocab,
                                   config_path=albert_tiny_config,
                                   checkpoint_path=albert_tiny_checkpoint,
                                   model_type='albert')

# Google Bert
bert_chinese = BertEmbedding(os.path.join(EMBEDDING_FOLDER, 'chinese_L-12_H-768_A-12'))

# (display name, embedding) pairs to benchmark. Uncomment an entry to include
# it; `None` is handed to the model constructor as-is (presumably selecting the
# library's default embedding — confirm against the Kashgari docs).
embeddings = [
    # ('Tiny-Albert-Chinse', albert_tiny),
    # ('Bert-Chinese', bert_chinese),
    ('Bare', None)
]

# (display name, model class) pairs; every embedding is paired with every class.
model_classes = [
    ('BiLSTM', BiLSTM_Model),
    ('BiLSTM_CRF', BiLSTM_CRF_Model),
    ('BiGRU', BiGRU_Model),
    ('BiGRU_CRF', BiGRU_CRF_Model)
]

In [81]:
# Train one model per (embedding, architecture) pair and append its evaluation
# logs to LOG_FILE_PATH under the key "<embedding name>-<model name>".
for embed_name, embed in embeddings:
    for model_name, model_class in model_classes:
        run_name = f"{embed_name}-{model_name}"
        model = model_class(embed, sequence_length=SEQUENCE_LENGTH)

        early_stop = keras.callbacks.EarlyStopping(patience=EARL_STOPPING_PATIENCE)
        reduce_lr_callback = keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                               patience=REDUCE_RL_PATIENCE)

        # Collects evaluation results on the validation split; they are read
        # back from `eval_callback.logs` after training.
        eval_callback = EvalCallBack(task_model=model,
                                     valid_x=valid_x,
                                     valid_y=valid_y,
                                     step=1)

        # One TensorBoard sub-directory per run so runs can be compared.
        tf_board = keras.callbacks.TensorBoard(
            log_dir=os.path.join(TF_LOG_FOLDER, run_name),
            update_freq=1000
        )

        callbacks = [early_stop, reduce_lr_callback, eval_callback, tf_board]

        # BATCH_SIZE was defined in the config cell but never passed on;
        # wire it through so changing the macro actually takes effect.
        model.fit(train_x,
                  train_y,
                  valid_x,
                  valid_y,
                  batch_size=BATCH_SIZE,
                  callbacks=callbacks,
                  epochs=EPOCHS)

        # Merge into the existing log file (if any) instead of overwriting
        # results of earlier runs. Context managers make sure the file handle
        # is closed (the previous bare `open()` leaked it).
        if os.path.exists(LOG_FILE_PATH):
            with open(LOG_FILE_PATH, 'r') as f:
                logs = json.load(f)
        else:
            logs = {}

        logs[run_name] = eval_callback.logs

        with open(LOG_FILE_PATH, 'w') as f:
            json.dump(logs, f, indent=2)

Preparing text vocab dict: 100%|██████████| 200/200 [00:00<00:00, 56337.19it/s]
Model: "model_41"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 128)    256         Input-Segment[0][0]              
___________

KeyboardInterrupt: 

In [None]:
# Show the path of the ALBERT-tiny config file.
# (`albert_tiny_489k_config` is not defined anywhere in this notebook; the
# config path variable is `albert_tiny_config`.)
print(albert_tiny_config)

In [None]:
!cat /Users/brikerman/Desktop/kashgari-demo/embeddings/albert_tiny_489k/albert_config_tiny.json

In [9]:
!pip install transformer

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[31mERROR: Could not find a version that satisfies the requirement transformer (from versions: none)[0m
[31mERROR: No matching distribution found for transformer[0m
You should consider upgrading via the '/Users/brikerman/Desktop/python/Kashgari2/venv/bin/python -m pip install --upgrade pip' command.[0m
