# Embedding BERT

**摘要: BERT的出现是令人兴奋的，它足以让NLP迈上一个新台阶。BERT作为语言模型，在刚出现时的表现可谓惊艳，在一段时间内统领了NLP的多个领域，而它的预训练模型正是促成这一切的主要原因。BERT本质上是一个语言模型，于是用它来做向量化也就在情理之中了。本次探索加载预训练模型(chinese_L-12_H-768_A-12)进行文字的向量化。**

[参考源码地址==========](https://github.com/yongzhuo/Keras-TextClassification)

BERT的强大就不做赘述了，不过BERT需要耗费的算力也是非常让人头疼的事情！以至于我直到现在也没能真正好好使用BERT等一些比较大型的网络，实在是有些遗憾。等哥以后有GPU了(可以用到GPU资源，不是说自己买哈-------=-=-=)哥一定好好去跑这些大型网络，想想还有点开心。

这里记录下今天（2019-11-27）探索BERT Embedding的实现代码，以便于以后有需要的时候能快速实现。

In [2]:
from keras.layers import Add, Embedding
from gensim.models import KeyedVectors
from keras.models import Input, Model

import numpy as np
import codecs
import os
import re

# Directory of the pre-trained BERT checkpoint; the code below reads
# bert_config.json, bert_model.ckpt and vocab.txt from it.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
path_embedding_bert = (
    "/Users/zhouwencheng/Desktop/Grass/data/model"
    "/ImportModel/BERT/chinese_L-12_H-768_A-12"
)

In [3]:
from __future__ import print_function, division
from keras.engine import Layer


class NonMaskingLayer(Layer):
    """Identity layer that stops mask propagation.

    The layer accepts a (possibly masked) input, returns it unchanged and
    reports no mask to downstream layers, so layers that cannot consume a
    mask can be stacked after it.

    fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978
    thanks for https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        super(NonMaskingLayer, self).__init__(**kwargs)
        # Set the flag *after* the base constructor: the standalone Keras 2.x
        # ``Layer.__init__`` assigns its own default to ``supports_masking``,
        # which would silently overwrite a value set beforehand.
        self.supports_masking = True

    def build(self, input_shape):
        # The layer has no weights, so there is nothing to create here.
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # Pass the tensor through untouched; the mask argument is ignored.
        return x

    def compute_output_shape(self, input_shape):
        # Identity transform: the output shape equals the input shape.
        return input_shape

In [6]:
class BertEmbedding(object):
    """Expose a pre-trained BERT checkpoint as a Keras embedding model.

    On construction the checkpoint found under ``path_mode`` is loaded with
    ``keras_bert``, the requested layer output(s) are selected, and the
    vocabulary is read to build a tokenizer.  Afterwards:

    * ``self.model`` maps ``[token_ids, segment_ids]`` to the selected
      output, with the mask removed by ``NonMaskingLayer``.
    * ``self.tokenizer`` / ``self.token_dict`` come from ``vocab.txt``.
    * ``self.vocab_size`` is the number of entries in ``self.token_dict``
      (the constructor argument is only a placeholder).
    * ``self.embedding_size`` is the last dimension of the model output.

    :param len_max: maximum sequence length (25-50 suggested).
    :param embed_size: embedding size; stored on the instance only.
    :param vocab_size: placeholder, overwritten once the vocabulary is read.
    :param trainable: whether the loaded BERT weights are trainable.
    :param path_mode: directory holding ``bert_config.json``,
        ``bert_model.ckpt`` and ``vocab.txt``.
    :param layer_indexes: which layer outputs to use, numbered 1..13.
        ``None`` (default) means ``[24]``; any value outside 1..13 falls
        back to the last layer.  An empty list uses ``model.output``.  With
        several indexes the selected outputs are summed element-wise with
        ``Add`` (they are NOT concatenated).
    """

    # Characters kept by ``extract_chinese``: CJK ideographs in
    # U+4E00..U+9FA5, ASCII letters and digits, and the symbols ``@ . _``.
    _KEEP_CHARS = re.compile(u"([\u4e00-\u9fa5A-Za-z0-9@._])")

    def __init__(self,
                 len_max=50,  # maximum text length, 25-50 suggested
                 embed_size=300,  # embedding size
                 vocab_size=30000,  # vocabulary size; placeholder, reset in build()
                 trainable=True,  # whether the BERT weights are trainable
                 path_mode=path_embedding_bert,
                 layer_indexes=None  # defaults to [24] -> last layer output
                 ):
        self.len_max = len_max
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.trainable = trainable
        self.path_mode = path_mode
        # Copy so that instances never share (or mutate) a common default list.
        self.layer_indexes = [24] if layer_indexes is None else list(layer_indexes)

        self.input = None
        self.output = None
        self.model = None
        self.build()

    def build(self):
        """Load the checkpoint, build ``self.model`` and the tokenizer."""
        # Imported lazily so the module can be imported without keras_bert.
        import keras_bert

        config_path = os.path.join(self.path_mode, 'bert_config.json')
        check_point_path = os.path.join(self.path_mode, 'bert_model.ckpt')
        dict_path = os.path.join(self.path_mode, 'vocab.txt')
        print('load bert model start!')
        model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                              checkpoint_file=check_point_path,
                                                              seq_len=self.len_max,
                                                              trainable=self.trainable)
        print('load bert model end!')

        encoder_layer = self._select_encoder_output(model)
        # Strip the mask so downstream layers that reject masks can be used.
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(inputs=self.input, outputs=self.output)
        self.embedding_size = self.model.output_shape[-1]

        self.token_dict = self._load_vocab(dict_path)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)

    def _select_encoder_output(self, model):
        """Return the tensor selected by ``self.layer_indexes`` from ``model``."""
        # Keras layer indexes for the 13 selectable outputs:
        # [6, 15, 23, ..., 103].
        # NOTE(review): presumably index 6 is the embedding output and the
        # rest are the 12 encoder blocks of the keras_bert graph -- confirm
        # against the installed keras_bert version.
        layer_dict = [6] + [7 + 8 * block for block in range(1, 13)]
        print(layer_dict)

        valid_indexes = range(1, 14)

        def output_of(lay):
            # Requests outside 1..13 fall back to the last layer.
            position = lay - 1 if lay in valid_indexes else -1
            return model.get_layer(index=layer_dict[position]).output

        # No index given: use the model's own output.
        if not self.layer_indexes:
            return model.output
        # A single index: take that layer's output directly.
        if len(self.layer_indexes) == 1:
            return output_of(self.layer_indexes[0])
        # Several indexes: sum the selected outputs element-wise.
        return Add()([output_of(lay) for lay in self.layer_indexes])

    @staticmethod
    def _load_vocab(dict_path):
        """Read ``vocab.txt`` into a ``{token: index}`` dict."""
        token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                # Index is the dict size at insertion time; a repeated token
                # is re-assigned the current size without growing the dict.
                token_dict[line.strip()] = len(token_dict)
        return token_dict

    def sentence2idx(self, text, second_text=None):
        """Encode text into ``[token_ids, segment_ids]`` for ``self.model``.

        ``text`` is converted to ``str``, filtered by ``extract_chinese`` and
        upper-cased before tokenisation; ``second_text`` is passed to the
        tokenizer as given.

        :param text: first sentence.
        :param second_text: optional second sentence.
        :return: list ``[input_id, input_type_id]`` as produced by
            ``self.tokenizer.encode`` with ``max_len=self.len_max``.
        """
        text = self.extract_chinese(str(text)).upper()
        input_id, input_type_id = self.tokenizer.encode(first=text,
                                                        second=second_text,
                                                        max_len=self.len_max)
        return [input_id, input_type_id]

    def extract_chinese(self, text):
        """Keep only Chinese characters, ASCII letters/digits and ``@ . _``.

        :param text: str, input sentence
        :return: str, the kept characters joined in their original order
        """
        return ''.join(self._KEEP_CHARS.findall(text))

In [7]:
# Demo: embed two short sentences with the default BertEmbedding settings.
texts = ["今天天气不错", "明天天气也不错"]
eb = BertEmbedding()

# Encode every sentence once, then split the pairs into the two model inputs.
encoded = [eb.sentence2idx(sentence) for sentence in texts]
x = np.array([token_ids for token_ids, _ in encoded])
x_type = np.array([segment_ids for _, segment_ids in encoded])

print(x.shape)
print(x_type.shape)

# Run the two encoded inputs through the model and show the result.
model = eb.model
p = model.predict([x, x_type])
print(p.shape)
print(p)
print(p.shape)

load bert model start!




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

load bert model end!
[6, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103]
(2, 50)
(2, 50)
(2, 50, 768)
[[[-0.2086307   0.35827795  0.21853793 ... -0.44025412  0.4094602
   -0.17016348]
  [-0.39738995  0.36013675  0.7867802  ... -1.0404619  -0.18768609
   -0.14245623]
  [ 0.30535394 -0.4651899  -0.37177834 ...  0.28120545  0.8466301
    0.05893405]
  ...
  [ 0.46516663 -0.4488659  -0.32406497 ...  0.310403   -0.03735422
   -0.20383228]
  [ 0.40643167 -0.45044908  0.05603563 ...  0.08266892 -0.00854655
   -0.45705166]
  [ 0.24296917  0.0176946  -0.1882883  ...  0.00265872 -0.22536862
   -0.23967592]]

 [[-0.07990564  0.23048158  0.50818104 ... -0.60418904  0.3972589
   -0.3813861 ]
  [-0.44916582  0.0599215   0.53094774 ... -0.83169353 -0.35385585
    0.20767196]
  [-0.11684791 -0.95862097 -0.12063565 ...  0.9308959   0.7715939
    0.07150119]
  .