# 加载模型 提取特征

- 预训练模型中只有 encoder，所以只能提取 encoder 的特征


In [1]:
import os
import codecs

In [2]:
# Define the locations of the pre-trained BERT files.
# The checkpoint directory can be overridden with the BERT_MODEL_PATH environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset the original local path is used, so existing behaviour is unchanged.
bert_model_path = os.environ.get(
    "BERT_MODEL_PATH",
    "/Users/zhouwencheng/Desktop/Grass/data/model/ImportModel/BERT/chinese_L-12_H-768_A-12",
)
# All three files are expected directly inside the checkpoint directory.
config_path = os.path.join(bert_model_path, 'bert_config.json')
checkpoint_path = os.path.join(bert_model_path, 'bert_model.ckpt')
vocab_path = os.path.join(bert_model_path, 'vocab.txt')

In [3]:
# Read the vocab file and load it into a dict mapping token -> integer id.

# Per the keras-bert README, TF_KERAS=1 makes keras_bert use tensorflow.python.keras.
# It is set here, ahead of the first `from keras_bert import ...` in this notebook.
os.environ['TF_KERAS'] = '1'
token_dict = {}
# Use the built-in open() in text mode: it splits lines on newline sequences only.
# A codecs.open() reader additionally splits on other Unicode line boundaries
# (e.g. U+2028, U+0085), which would shift ids away from the vocab line numbers
# if such a character ever appeared in the file.
with open(vocab_path, 'r', encoding='utf8') as reader:
    for line in reader:
        token = line.strip()
        # The id is the number of distinct tokens stored so far, i.e. the 0-based
        # line index as long as the file contains no duplicate lines.
        token_dict[token] = len(token_dict)

In [4]:
# Vocabulary size: the number of distinct tokens loaded into token_dict.
print(len(token_dict)) 

21128


In [6]:
# 载入模型
from keras_bert import load_trained_model_from_checkpoint

In [7]:
# Build the Keras model described by the BERT config file and load the
# pre-trained checkpoint into it (default arguments of the loader are used).
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# Print the layer table; line_length=120 widens the printed rows so the
# layer names and shapes are truncated less.
model.summary(line_length=120)

Model: "model_1"
________________________________________________________________________________________________________________________
Layer (type)                           Output Shape               Param #       Connected to                            
Input-Token (InputLayer)               [(None, 512)]              0                                                     
________________________________________________________________________________________________________________________
Input-Segment (InputLayer)             [(None, 512)]              0                                                     
________________________________________________________________________________________________________________________
Embedding-Token (TokenEmbedding)       [(None, 512, 768), (21128, 16226304      Input-Token[0][0]                       
________________________________________________________________________________________________________________________
Embedding-Segme

In [8]:
# Bare expression: shows the model object's repr as the cell output
# (a quick check that the model object exists).
model

<tensorflow.python.keras.engine.training.Model at 0x15b466320>

In [9]:
# Scratch values.
# NOTE(review): in the cells visible here, `text` is reassigned in the
# tokenization cell before it is ever read, and `indice` is never read.
# Confirm nothing else depends on them before removing this cell.
text = "我们"
indice = ['100', '120']

## Tokenization

In [10]:
from keras_bert import Tokenizer
# Tokenizer built on the vocabulary dict (token -> id) loaded earlier.
tokenizer = Tokenizer(token_dict)
# Sample sentence to run through the model.
text = '语言模型'
# Token strings for display; per the recorded output they include the
# [CLS] and [SEP] markers around the individual characters.
tokens = tokenizer.tokenize(text)
# Token ids and segment ids for a single sentence. max_len=512 matches the
# (None, 512) input shape shown in the model summary; the recorded output
# shows the ids followed by 0s after [SEP].
indices, segments = tokenizer.encode(first=text, max_len=512)

In [11]:
print("tokens:", tokens)
print("indices:", indices[:10])
print("segments:", segments[:10])

tokens: ['[CLS]', '语', '言', '模', '型', '[SEP]']
indices: [101, 6427, 6241, 3563, 1798, 102, 0, 0, 0, 0]
segments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Extract Feature (提取特征)

In [12]:
import numpy as np

# The model takes two inputs (token ids, segment ids); wrapping each list in
# an outer list adds the batch axis, giving a batch containing one sentence.
predicts = model.predict([np.array([ids]) for ids in (indices, segments)])
print("predicts.shape:", predicts.shape)

# Drop the batch axis so that row i is the vector for position i.
predicts = predicts[0]

# Show the first five components of the vector for each real token
# (positions beyond len(tokens) are not printed).
for i, token in enumerate(tokens):
    print(token, predicts[i, :5].tolist())

predicts.shape: (1, 512, 768)
[CLS] [-0.6325103044509888, 0.20302362740039825, 0.07936552911996841, -0.03284244239330292, 0.5668082237243652]
语 [-0.7588359713554382, 0.09651876986026764, 1.0718752145767212, 0.005038648843765259, 0.6887993812561035]
言 [0.5477023124694824, -0.7921169400215149, 0.44435206055641174, -0.7112643718719482, 1.2048896551132202]
模 [-0.2924240231513977, 0.6052718162536621, 0.49968674778938293, -0.4245801568031311, 0.4285529851913452]
型 [-0.747345507144928, 0.4943161904811859, 0.7185164093971252, -0.8723527789115906, 0.8349593877792358]
[SEP] [-0.8741376399993896, -0.21650390326976776, 1.338839054107666, -0.1058710515499115, 0.3960898220539093]


In [18]:
# Shape of the vectors for the first 6 positions: the sample text produced
# 6 tokens ([CLS], four characters, [SEP]), so these are the non-padding rows.
predicts[:6].shape

(6, 768)

In [20]:
from keras_bert import extract_embeddings

# `bert_model_path` is defined once in the path-definition cell near the top of
# the notebook; it is reused here instead of repeating the absolute path
# literal, so the checkpoint location cannot drift between cells.

# Sentences to embed (a list, one entry per text).
texts = ['其实也是不错的']

# Called with a checkpoint directory and a list of texts; see the keras-bert
# README for the structure of the returned embeddings.
embeddings = extract_embeddings(bert_model_path, texts)
# NOTE(review): the output saved with this cell (768) is not produced by any
# statement shown here — the cell was presumably edited after it was last
# run; re-run it or add the intended display expression.

768