# Load & Extract

## Download Pretrained Weights

In [1]:
!pip install -q keras-bert

In [2]:
!wget -q https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip

In [3]:
# Unpack the archive into ./chinese_L-12_H-768_A-12. -o overwrites existing
# files without prompting, so the cell can be re-run safely.
!unzip -o chinese_L-12_H-768_A-12.zip

Archive:  chinese_L-12_H-768_A-12.zip
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: chinese_L-12_H-768_A-12/vocab.txt  
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: chinese_L-12_H-768_A-12/bert_config.json  


## Build Model & Dictionary

Set paths:

In [4]:
import os

# Directory produced by unpacking the pretrained archive.
pretrained_path = 'chinese_L-12_H-768_A-12'
# BERT configuration file shipped with the checkpoint.
config_path = os.path.join(pretrained_path, 'bert_config.json')
# Checkpoint *prefix*, not a single file: the archive contains
# bert_model.ckpt.index, bert_model.ckpt.meta and bert_model.ckpt.data-*.
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
# Vocabulary file; read line by line when the token dictionary is built.
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

Enable the `tf.keras` backend by setting the `TF_KERAS` environment variable. This must be done before `keras_bert` is imported:

In [5]:
# Switch keras-bert to the tf.keras backend.
# NOTE(review): keras-bert appears to read this variable when it is imported,
# so keep this cell ahead of the first `keras_bert` import — do not hoist
# those imports above it.
os.environ['TF_KERAS'] = '1'

Build the dictionary:

In [6]:
import codecs

# Map every vocabulary token to its id.
#
# The id is the token's zero-based line number in vocab.txt, because that line
# number is the row of the pretrained token-embedding matrix. Taking it from
# enumerate() instead of len(token_dict) keeps ids aligned with the file even
# if two lines strip to the same token (the later line wins, as before).
#
# The built-in open() splits only on real newlines; a codecs reader also breaks
# lines on other Unicode separators, which could split a single entry in two.
token_dict = {}
with open(vocab_path, 'r', encoding='utf-8') as reader:
    for index, line in enumerate(reader):
        token_dict[line.strip()] = index

Build the model:

In [7]:
# Imported here rather than at the top of the notebook so that the TF_KERAS
# environment variable is already set when keras_bert is loaded.
from keras_bert import load_trained_model_from_checkpoint

# Build the network from the config file and restore the pretrained weights
# from the checkpoint prefix.
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# Print the layer table; the wider line keeps long layer names readable.
model.summary(line_length=120)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
________________________________________________________________________________________________________________________
Layer (type)                           Output Shape               Param #       Connected to                            
Input-Token (InputLayer)               (None, 512)                0                                                     
________________________________________________________________________________________________________________________
Input-Segment (InputLayer)             (None, 512)                0                                                     
________________________________________________________________________________________________________________________
Embedding-Token (TokenEmbedding)       [(None, 512, 768), (21128, 16226304      Input-T

## Tokenization

In [8]:
from keras_bert import Tokenizer

# Tokenizer backed by the token -> id dictionary built from vocab.txt.
tokenizer = Tokenizer(token_dict)
text = '语言模型'
# Token strings for the text; the printed features below show that the list
# includes the [CLS] and [SEP] markers.
tokens = tokenizer.tokenize(text)
# Token ids and segment ids, zero-padded to 512 positions to match the
# (None, 512) input shape reported by model.summary().
indices, segments = tokenizer.encode(first=text, max_len=512)
# Show only the first ten positions; the rest is padding for this short text.
print(indices[:10])
print(segments[:10])

[101, 6427, 6241, 3563, 1798, 102, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Extract Features

In [9]:
import numpy as np

# The model takes batched inputs, so wrap each id list in a batch of one.
token_batch = np.array([indices])
segment_batch = np.array([segments])

# Run the forward pass and keep the single sample's per-position output.
predicts = model.predict([token_batch, segment_batch])[0]

# Print the first five feature values for each real token; positions past
# len(tokens) are padding and are not shown.
for position, token in enumerate(tokens):
    print(token, predicts[position][:5].tolist())

[CLS] [-0.6325103044509888, 0.20302410423755646, 0.07936538010835648, -0.03284265100955963, 0.5668085813522339]
语 [-0.7588362097740173, 0.0965188592672348, 1.0718743801116943, 0.005039289593696594, 0.6887993812561035]
言 [0.5477026104927063, -0.7921162843704224, 0.44435110688209534, -0.7112641930580139, 1.2048895359039307]
模 [-0.29242411255836487, 0.6052717566490173, 0.49968627095222473, -0.42457854747772217, 0.42855408787727356]
型 [-0.7473456263542175, 0.49431660771369934, 0.7185154557228088, -0.8723534941673279, 0.8349594473838806]
[SEP] [-0.8741379976272583, -0.2165030986070633, 1.33883798122406, -0.10587061941623688, 0.3960897624492645]
