## 数据及环境准备

In [None]:
# 自带数据且已经解压，忽略前两个代码块
from modelarts.session import Session
session = Session()

if session.region_name == 'cn-north-4':
    bucket_path = 'professional-construction/NLP/kbqa.zip'
else:
    print("请更换地区到北京四")
    
session.download_data(bucket_path=bucket_path, path='./kbqa.zip')

In [9]:
! unzip kbqa.zip

Archive:  kbqa.zip
   creating: kbqa/
   creating: kbqa/bert/
   creating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/
   creating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/assets/
  inflating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/assets/vocab.txt  
  inflating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/saved_model.pb  
   creating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/variables/
  inflating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/variables/variables.data-00000-of-00001  
  inflating: kbqa/bert/bert_zh_L-12_H-768_A-12_2/variables/variables.index  
   creating: kbqa/Data/
  inflating: kbqa/Data/construct_dataset_attribute.py  
  inflating: kbqa/Data/construct_dataset_ner.py  
   creating: kbqa/Data/NER_Data/
  inflating: kbqa/Data/NER_Data/test.txt  
  inflating: kbqa/Data/NER_Data/train.txt  
   creating: kbqa/Data/NLPCC2016KBQA/
  inflating: kbqa/Data/NLPCC2016KBQA/nlpcc-iccpol-2016.kbqa.kb  
  inflating: kbqa/Data/NLPCC2016KBQA/nlpcc-iccpol-2016.kbqa.testing-data  
  inflating: kbqa/Data/NLPCC2016KBQA/nlpc

In [1]:
! pip install tensorflow-hub
! pip install bert-for-tf2
! pip install seqeval

Collecting tensorflow-hub
  Using cached tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.13.0
Collecting bert-for-tf2
  Using cached bert-for-tf2-0.14.9.tar.gz (41 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py-params>=0.9.6
  Using cached py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting params-flow>=0.8.0
  Using cached params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25ldone
[?25h  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30516 sha256=b9c8695146fc8181ba81d3814d1e81714a8f7c71d930ec1f1c52d697c57cff22
  Stored in directory: /home/fangguian_i2023/.cache/pip/wheel

In [2]:
# 装好上述内容之后没有TensorFlow 2.1.0，需要手动安装 查询不到2.1.0，只能安装2.11.0，后续可能因为版本出现问题
! pip install tensorflow



# 知识库问答

## 实体识别

In [1]:
import os
import json
import numpy as np
import pandas as pd
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # TensorFlow的warning信息太多，这里屏蔽掉
import tensorflow as tf
import tensorflow_hub as hub
from bert import bert_tokenization
from tensorflow.keras.models import load_model
from seqeval.metrics.sequence_labeling import get_entities, classification_report

### 加载数据

In [2]:
def read_conll_format_file(file_path: str,
                           text_index: int = 0,
                           label_index: int = 1):
    """
    conll 格式数据读取
    Args:
        file_path: 文件路径
        text_index: 输入所在列的索引，默认是0，第一列
        label_index: 标签所在列的索引，默认是1，第二列
    Returns:
    """
    x_data, y_data = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
        x, y = [], []
        for line in lines:
            rows = line.split()
            if len(rows) == 0: # 如果当前行为空格，则表示上一句结束，把上一句的序列数据加入到数据中
                x_data.append(x)
                y_data.append(y)
                x = []
                y = []
            else: # 添加输入和标签
                x.append(rows[text_index])
                y.append(rows[label_index])
                
    return x_data, y_data

In [5]:
train_x, train_y = read_conll_format_file('./kbqa/Data/NER_Data/train.txt')
test_x, test_y = read_conll_format_file('./kbqa/Data/NER_Data/test.txt')

print(train_x[1])
print(train_y[1])

['《', '高', '等', '数', '学', '》', '是', '哪', '个', '出', '版', '社', '出', '版', '的', '？']
['O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### 参数配置

In [3]:
class BERT_NER_Config():
    max_seq_length = 64  # 输入序列的最大长度，用于把输入padding成统一长度
    bert_dir = './kbqa/bert/bert_zh_L-12_H-768_A-12_2' # bert预训练模型的路径
    epochs = 8 # 训练轮数
    batch_size = 256 
    
bert_ner_config = BERT_NER_Config()

### 预处理（文字->id）

In [4]:
class NER_Preprocessor(object):
    """
    预处理类，字->id, 标签->id
    """
    def __init__(self, config):
        """
        初始化，加载分词器，此处是按字符切词
        """
        self.config = config
        self.label_dict = {'O':0, 'B-ENT':1, 'I-ENT':2}
        self.idx2label = {0: 'O', 1: 'B-ENT', 2: 'I-ENT'}
        self._tokenizer = bert_tokenization.FullTokenizer(vocab_file=config.bert_dir+'/assets/vocab.txt', do_lower_case=False)
        
    def get_masks(self, tokens):
        """获取 BERT mask id 输入"""

        return [1]*len(tokens)

    def get_segments(self, tokens):
        """获取 BERT segments id 输入，第一句用0表示，第二句用1表示"""

        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            if token == "[SEP]":
                current_segment_id = 1
        return segments

    def get_ids(self, tokens):
        """获取 BERT 字符id 输入"""
        token_ids = [self._tokenizer.vocab.get(token, self._tokenizer.vocab['[UNK]']) for token in tokens]

        return token_ids
    
    def transform(self, stokens):
        """将词序列转换成BERT需要的字符id、mask id、segments id"""
        stokens = ["[CLS]"] + stokens + ["[SEP]"]

        input_ids = self.get_ids(stokens)
        input_masks = self.get_masks(stokens)
        input_segments = self.get_segments(stokens)
        
        return input_ids, input_masks, input_segments
    
    def conll_transform(self, x_train, y_train=None):
        """训练数据批量转换"""
        all_input_ids, all_input_masks, all_input_segments = [], [], []
        for stokens in x_train:
            input_ids, input_masks, input_segments = self.transform(stokens)
            all_input_ids.append(input_ids)
            all_input_masks.append(input_masks)
            all_input_segments.append(input_segments)
            
        ## 输入数据统一padding成统一长度，以矩阵的形式送入模型    
        all_input_ids = tf.keras.preprocessing.sequence.pad_sequences(all_input_ids, maxlen=self.config.max_seq_length, padding='post')
        all_input_masks = tf.keras.preprocessing.sequence.pad_sequences(all_input_masks, maxlen=self.config.max_seq_length, padding='post')
        all_input_segments = tf.keras.preprocessing.sequence.pad_sequences(all_input_segments, maxlen=self.config.max_seq_length, padding='post')
            
        if y_train:
            all_label_ids = []
            for slabels in y_train:
                slabels = ['O'] + slabels + ['O']
                slabel_ids = [self.label_dict[label] for label in slabels]
                all_label_ids.append(slabel_ids)
                
            all_label_ids = tf.keras.preprocessing.sequence.pad_sequences(all_label_ids, maxlen=self.config.max_seq_length, padding='post')
                
            return [all_input_ids, all_input_masks, all_input_segments], all_label_ids
        else:
            return [all_input_ids, all_input_masks, all_input_segments]
              
    def online_transform(self, text):
        """单个文本在线转换"""
        stokens = self._tokenizer.tokenize(text)
        input_ids, input_masks, input_segments = self.transform(stokens)
        
        ## 输入数据padding成统一长度
        input_ids = tf.keras.preprocessing.sequence.pad_sequences([input_ids], maxlen=self.config.max_seq_length, padding='post')
        input_masks = tf.keras.preprocessing.sequence.pad_sequences([input_masks], maxlen=self.config.max_seq_length, padding='post')
        input_segments = tf.keras.preprocessing.sequence.pad_sequences([input_segments], maxlen=self.config.max_seq_length, padding='post')
        
        return [input_ids, input_masks, input_segments]
    
    def label_inverse_transform(self, label_ids, lengths=None):
        """标签id转换回标签名"""
        result = []
        for index, seq in enumerate(label_ids):
            labels_ = []
            for idx in seq:
                labels_.append(self.idx2label[idx])
            if lengths is not None:
                labels_ = labels_[1:lengths[index] + 1]
            else:
                labels_ = labels_[1:-1]
            result.append(labels_)
        return result

In [6]:
ner_preprocessor = NER_Preprocessor(bert_ner_config)
ids, masks, segments = ner_preprocessor.online_transform("《高等数学》？")
print("id: {}".format(ids))
print("masks: {}".format(masks))
print("segments: {}".format(segments))

id: [[ 101  517 7770 5023 3144 2110  518 8043  102    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
masks: [[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
segments: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### 模型构建

In [7]:
class BERT_LSTM_NER():
    def __init__(self, config):
        """初始化预处理器"""
        self._config = config # 配置参数
        self._preprocessor = NER_Preprocessor(config) # 预处理器
    
    def build_model(self):
        """搭建模型架构"""
        # 定义bert模型输入，包括字符id, mask id, segment id
        max_seq_length = self._config.max_seq_length
        input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                               name="input_word_ids")
        input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                           name="input_mask")
        segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                            name="segment_ids")
        bert_layer = hub.KerasLayer(self._config.bert_dir, trainable=False) # tensorflow hub加载bert并转化为keras层
        pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
        # 取bert的序列输出sequence_output, 后接bilstm
        bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128,return_sequences=True))(sequence_output)
        
        dense = tf.keras.layers.Dense(units=64, activation='tanh', name='dense_layer_1')(bilstm)
        
        num_class = len(self._preprocessor.label_dict)
        output = tf.keras.layers.Dense(units=num_class, activation='softmax', name='output')(dense)
        

        model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output)
        
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        self.model = model
        
    def fit(self, x_train, y_train, x_valid=None, y_valid=None):
        """训练"""
        
        self.build_model()
        self.model.summary()
        x_train_processed, y_train_processed = self._preprocessor.conll_transform(x_train, y_train)
        
        if x_valid is not None and y_valid is not None: # 验证集存在
            x_valid_preprocessed, y_valid_preprocessed = self._preprocessor.conll_transform(x_valid, y_valid)
            self.model.fit(x_train_processed, y_train_processed, 
                           validation_data=(x_valid_preprocessed, y_valid_preprocessed),
                           epochs=self._config.epochs,
                           batch_size=self._config.batch_size,
                           shuffle=True)
        else: # 验证集不存在
            self.model.fit(x_train_processed, y_train_processed, 
                           epochs=self._config.epochs,
                           batch_size=self._config.batch_size,
                           shuffle=True)
            
    def evaluate(self, x_test, y_test):
        """测试评估"""
        x_test_preprocessed = self._preprocessor.conll_transform(x_test)
        y_preds = self.model.predict(x_test_preprocessed)
        y_preds = np.argmax(y_preds, -1)
        lengths = [len(seq) for seq in y_test]
        y_preds = self._preprocessor.label_inverse_transform(y_preds, lengths) #预测的id转换回标签名
        
        new_y_trues, new_y_preds = [], []
        for y_true_seq, y_pred_seq in zip(y_test, y_preds):
            new_y_true_seq, new_y_pred_seq = [], []
            for y_true, y_pred in zip(y_true_seq, y_pred_seq):
                new_y_true_seq.append(str(y_true))
                new_y_pred_seq.append(str(y_pred))
            new_y_trues.append(new_y_true_seq)
            new_y_preds.append(new_y_pred_seq)
            
        print(classification_report(new_y_trues, new_y_preds)) # 输出precision, recall, f1
        
    def inference(self, text):
        """推理"""
        inputs = self._preprocessor.online_transform(text)
        y_preds = self.model.predict(inputs)
        y_preds = np.argmax(y_preds, -1)
        y_preds = self._preprocessor.label_inverse_transform(y_preds)
        print(y_preds)
        entities = get_entities(y_preds[0])
        format_entites = []
        for entity in entities:
            value = text[entity[1]:entity[2] + 1]

            # 输入实体类型， 开始位置，结束位置，实体值
            format_entites.append({
                "entity": entity[0],
                "start": entity[1],
                "end": entity[2],
                "value": value,
            })

        return {"text": text, "entities": format_entites}
            
    def save(self, model_save_dir):
        """保存模型"""
        model_path = os.path.join(model_save_dir, "model.h5")
        self.model.save_weights(model_path)
    
    def restore(self, model_dir):
        """ 加载模型"""
        self.build_model()
        self.model.load_weights(os.path.join(model_dir, "model.h5"))

In [None]:
#改进模型
#!pip install keras_nlp
from transformers import TFRobertaModel
from tensorflow.keras.layers import Input, Dense, TimeDistributed, LSTM, Bidirectional, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras_nlp import layers
from keras_nlp import metrics
import tensorflow as tf

class Roberta_BiLSTM_CRF():
    def __init__(self, config):
        """初始化预处理器"""
        self._config = config # 配置参数
        self._preprocessor = NER_Preprocessor(config) # 预处理器
    
    def build_model(self):
        """搭建模型架构"""
        # 定义roberta模型输入，包括字符id, mask id, segment id
        max_seq_length = self._config.max_seq_length
        input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
        input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
        segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
        roberta = TFRobertaModel.from_pretrained('roberta-base')
        sequence_output = roberta([input_word_ids, input_mask, segment_ids]).last_hidden_state
        # 取roberta的序列输出sequence_output, 后接bilstm
        bilstm = Bidirectional(LSTM(units=128, return_sequences=True))(sequence_output)
        output = TimeDistributed(Dense(len(self._preprocessor.label_dict), activation='softmax'))(bilstm)
        
        model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output)
        
        model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy()])
        
        self.model = model
        
    def fit(self, x_train, y_train, x_valid=None, y_valid=None):
        """训练"""
        
        self.build_model()
        self.model.summary()
        x_train_processed, y_train_processed = self._preprocessor.conll_transform(x_train, y_train)
        y_train_processed = tf.one_hot(y_train_processed, depth=len(self._preprocessor.label_dict))
        self.model.fit(x_train_processed, y_train_processed, epochs=8, batch_size=32)

            
    def evaluate(self, x_test, y_test):
        """测试评估"""
        x_test_preprocessed = self._preprocessor.conll_transform(x_test)
        y_preds = self.model.predict(x_test_preprocessed)
        y_preds = np.argmax(y_preds, -1)
        lengths = [len(seq) for seq in y_test]
        y_preds = self._preprocessor.label_inverse_transform(y_preds, lengths)
        y_test = self._preprocessor.label_inverse_transform(y_test, lengths)
        print(classification_report(y_test, y_preds)) # 输出precision, recall, f1

    def save(self, model_dir):
        """保存模型"""
        self.model.save_weights(model_dir + 'model.h5')
        with open(model_dir + 'preprocessor.pkl', 'wb') as f:
            pickle.dump(self._preprocessor, f)

    def restore(self, model_dir):
        """加载模型"""
        with open(model_dir + 'preprocessor.pkl', 'rb') as f:
            self._preprocessor = pickle.load(f)
        self.build_model()
        self.model.load_weights(model_dir + 'model.h5')


### 模型训练，评估

In [8]:
import warnings
warnings.filterwarnings('ignore') # 注：放的位置也会影响效果，真是奇妙的代码
#bert_ner = BERT_LSTM_NER(bert_ner_config) #改进模型
bert_ner = Roberta_BiLSTM_CRF(bert_ner_config)
bert_ner.fit(train_x, train_y, test_x, test_y)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 64)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 64)]         0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_word_ids[0][0]',         
 el)                            thPoolingAndCrossAt               'input_mask[0][0]',         

In [9]:
bert_ner.evaluate(test_x, test_y)

              precision    recall  f1-score   support

         ENT       0.96      0.97      0.97      9030

   micro avg       0.96      0.97      0.97      9030
   macro avg       0.96      0.97      0.97      9030
weighted avg       0.96      0.97      0.97      9030



### 模型保存

In [10]:
bert_ner.save('kbqa/output_ner/')

### 模型加载，推理

In [16]:
bert_ner = BERT_LSTM_NER(bert_ner_config)
bert_ner.restore('kbqa/output_ner/')
bert_ner.inference("《高等数学》的价格多少？")

[['O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


{'text': '《高等数学》的价格多少？',
 'entities': [{'entity': 'ENT', 'start': 1, 'end': 4, 'value': '高等数学'}]}

## 语义匹配

In [9]:
def load_data(file_path):
    """加载"""
    data_df = pd.read_csv(file_path, sep='\t', header=None, names=['question', 'attribute', 'label'])
    sent_pairs = data_df.apply(lambda row: (row.question, row.attribute), axis=1).tolist()
    labels = data_df.apply(lambda row: int(row.label), axis=1).tolist()

    return sent_pairs, np.asarray(labels)

In [10]:
x_train, y_train = load_data("./kbqa/Data/Sim_Data/train.txt")
x_valid, y_valid = load_data("./kbqa/Data/Sim_Data/dev.txt")
x_test, y_test = load_data("./kbqa/Data/Sim_Data/test.txt")

x_train[0], y_train[0]

(('请问有没有其他出版社出版了东京暗鸦？', '版权信息'), 0)

In [11]:
class BERT_Sim_Config():
    bert_dir = "./kbqa/bert/bert_zh_L-12_H-768_A-12_2" #预训练bert模型的路径
    max_seq_length = 128 # 最大序列长度，用于padding成统一的长度
    epochs = 5
    batch_size = 256
    
bert_sim_config = BERT_Sim_Config()

In [12]:
class Sim_Preprocessor(object):
    """
    预处理类，用于处理转换用户输入，把文字转换成对应的id
    """
    def __init__(self, config):
        """
        初始化，加载分词器，此处是按字符切词
        """
        self.config = config
        self._tokenizer = bert_tokenization.FullTokenizer(vocab_file=config.bert_dir+'/assets/vocab.txt', do_lower_case=False)
        
    def tokenize(self, sent1, sent2):
        """
        句对分词
        Args:
            sent1: 句子1， e.g: "我爱中国"
            sent2: 句子2， e.g.: "我爱杭州"
        Return:
            ['[CLS]', '我', '爱', '中', '国', '[SEP]', '我', '爱', '杭', '州', '[SEP]']
        """
        stokens_1 = self._tokenizer.tokenize(sent1)
        stokens_2 = self._tokenizer.tokenize(sent2)
        return ['[CLS]'] + stokens_1 + ['[SEP]'] + stokens_2 + ['[SEP]']
        
    def get_masks(self, tokens):
        """获取 BERT mask id 输入"""

        return [1]*len(tokens)

    def get_segments(self, tokens):
        """获取 BERT segments id 输入，第一句用0表示，第二句用1表示"""

        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            if token == "[SEP]":
                current_segment_id = 1
        return segments

    def get_ids(self, tokens):
        """获取 BERT 字符id 输入"""
        token_ids = [self._tokenizer.vocab.get(token, self._tokenizer.vocab['[UNK]']) for token in tokens]

        return token_ids
    
    def transform(self, sent1, sent2):
        """将句对转换成BERT需要的字符id、mask id、segments id"""
        
        stokens = self.tokenize(sent1, sent2)

        input_ids = self.get_ids(stokens)
        input_masks = self.get_masks(stokens)
        input_segments = self.get_segments(stokens)
        
        return input_ids, input_masks, input_segments
        
    def batch_transform(self, sent_pair_list, max_len=None):
        """批量转换"""

        batch_input_ids, batch_input_masks, batch_input_segments = [], [], []
        for sent1, sent2 in sent_pair_list:
            input_ids, input_masks, input_segments = self.transform(sent1, sent2)
            batch_input_ids.append(input_ids)
            batch_input_masks.append(input_masks)
            batch_input_segments.append(input_segments)
        
        batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, maxlen=self.config.max_seq_length, padding='post')
        batch_input_masks = tf.keras.preprocessing.sequence.pad_sequences(batch_input_masks, maxlen=self.config.max_seq_length, padding='post')
        batch_input_segments = tf.keras.preprocessing.sequence.pad_sequences(batch_input_segments, maxlen=self.config.max_seq_length, padding='post')
        
            
        return [batch_input_ids, batch_input_masks, batch_input_segments]

In [13]:
sim_preprocessor = Sim_Preprocessor(bert_sim_config)

sent1 = "我爱中国"
sent2 = "我爱杭州"
print("按字符切分：")
print(sim_preprocessor.tokenize(sent1, sent2))
print("把字符编码成对应的字符id、mask id以及句段id：")
print(sim_preprocessor.transform(sent1, sent2))

按字符切分：
['[CLS]', '我', '爱', '中', '国', '[SEP]', '我', '爱', '杭', '州', '[SEP]']
把字符编码成对应的字符id、mask id以及句段id：
([101, 2769, 4263, 704, 1744, 102, 2769, 4263, 3343, 2336, 102], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])


In [14]:
class BertSim(object):
    """模型类定义"""
    def __init__(self, config):
        """
        初始化
        Args:
            config: 配置参数
        """
        self._config = config
        self._preprocessor = Sim_Preprocessor(config)
        
    def build_model(self):
        """搭建模型架构"""
        max_seq_length = self._config.max_seq_length
        input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                               name="input_word_ids")
        input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                           name="input_mask")
        segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                            name="segment_ids")
        bert_layer = hub.KerasLayer(self._config.bert_dir, trainable=False)
        pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
        
        dense = tf.keras.layers.Dense(units=128, activation='relu')(pooled_output)
        output = tf.keras.layers.Dense(units=1, activation='sigmoid')(dense)
        
        model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        self.model = model
          
    def fit(self, x_train, y_train, x_valid=None, y_valid=None, **kwargs):
        """
        模型训练
        Args:
            x_data: list[(sent1, sent2)]
            y_data: list[label]
        """
        self.build_model() # 搭建模型
        self.model.summary()
        x_train = self._preprocessor.batch_transform(x_train) #输入预处理
        if x_valid is not None and y_valid is not None:
            x_valid = self._preprocessor.batch_transform(x_valid)
            self.model.fit(
                x_train,
                y_train,
                validation_data=[x_valid, y_valid],
                epochs=self._config.epochs,
                batch_size=self._config.batch_size,
                shuffle=True,
                **kwargs) #训练
            
        else:
            self.model.fit(
                x_train,
                y_train,
                epochs=self._config.epochs,
                batch_size=self._config.batch_size,
                shuffle=True,
                **kwargs) #训练
            
    def evaluate(self, x_test, y_test):
        """测试集评估"""
        x_test = self._preprocessor.batch_transform(x_test)
        self.model.evaluate(x_test, y_test, batch_size=self._config.batch_size)
 
    def predict_similarity(self, sent1, sent2):
        """
        预测两个句子的相似度
        """
        input_ids, input_masks, input_segments = self._preprocessor.transform(sent1, sent2)
        
        ## 输入数据padding成统一长度
        input_ids = tf.keras.preprocessing.sequence.pad_sequences([input_ids], maxlen=self._config.max_seq_length, padding='post')
        input_masks = tf.keras.preprocessing.sequence.pad_sequences([input_masks], maxlen=self._config.max_seq_length, padding='post')
        input_segments = tf.keras.preprocessing.sequence.pad_sequences([input_segments], maxlen=self._config.max_seq_length, padding='post')
        
        result = self.model.predict([input_ids, input_masks, input_segments])[0][0]
        
        return result
        
    def save(self, model_save_dir):
        """
        保存模型
        """
        
        model_path = os.path.join(model_save_dir, "model.h5")
        self.model.save(model_path)
    
    @classmethod
    def restore(self, model_dir):
        """
        加载模型
        """
        self.model=load_model(os.path.join(model_dir, "model.h5"), custom_objects={'KerasLayer':hub.KerasLayer})

In [18]:
bert_sim = BertSim(bert_sim_config)
bert_sim.fit(x_train, y_train, x_valid, y_valid)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     [(None, 768),        102267649   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',       

In [20]:
bert_sim.evaluate(x_test, y_test)



In [19]:
bert_sim.save('kbqa/output_sim/')

In [15]:
bert_sim = BertSim(bert_sim_config)
bert_sim.restore('kbqa/output_sim/') #只有restore()方法没有load()方法
bert_sim.predict_similarity("《机械设计基础》这本书的作者是谁？", "作者")



0.99578923

### 知识库问答

In [17]:
class KBQA(object):
    def __init__(self, kb_path, ner_model, semantic_model):
        """
        初始化知识库，实体识别模型，语义匹配模型
        Args:
            kb_path: 知识库文件路径
            ner_model: 实体识别模型
            semantic_model: 语义匹配模型
        """
        data_kb_list = []
        data_kb = pd.read_csv(kb_path)

        for row in data_kb.index:
            question = data_kb.loc[row,"q_str"]
            entity = data_kb.loc[row,"t_str"].split("|||")[0].split(">")[1].strip()
            attribute = data_kb.loc[row, "t_str"].split("|||")[1].strip()
            answer = data_kb.loc[row, "t_str"].split("|||")[2].strip()
            
            question = question.replace("#", "").replace("[UNK]", "%").replace("\n", "")
            entity = entity.replace("#", "").replace("[UNK]", "%").replace("\n", "")
            attribute = attribute.replace("#", "").replace("[UNK]", "%").replace("\n", "")
            answer = answer.replace("#", "").replace("[UNK]", "%").replace("\n", "")

            data_kb_list.append([question, entity, attribute, answer])

        self._data_kb_list = data_kb_list
        self._ner_model = ner_model
        self._semantic_model = semantic_model

    def query(self, question, method="kb"):
        """
        查询
        Args:
            question: 查询语句
            method：kb:直接匹配知识库三元组, faq: 匹配知识库中自带的问句，如果匹配，返回对应的答案
        """
        if len(question) == 0:
            print("再见啦！")
            return

        print('\n你的问题是:{}'.format(question))

        ner_res = self._ner_model.inference(question)
        if not ner_res['entities']:
            print("未找到实体，请检查问句是否包含实体词，或调整实体识别模型，提升识别性能")
            return
        
        entity_value = ner_res['entities'][0]['value']
        print('识别的实体是：{}'.format(entity_value))

        ans_range = []
        for j in range(len(self._data_kb_list)):
            if self._data_kb_list[j][1] == entity_value:
                print("结果可能来自：", self._data_kb_list[j])
                ans_range.append(self._data_kb_list[j])
                
        ans = None
        ans_base = None
        score = 0

        for k in range(len(ans_range)):
            if method=='faq':
                print("句子_{}: {}".format(k+1, ans_range[k][0]))
            else:
                print("\n知识三元组%d："%(k+1),ans_range[k][1],ans_range[k][2],ans_range[k][3])

            #非语义匹配
            if ans_range[k][2] in question:
                print("属性“",ans_range[k][2],"”在问题中")
                ans_ = 1

            #语义匹配
            else:
                if method=="faq":
                    ans_ = self._semantic_model.predict_similarity(question, ans_range[k][0])
                    print("问句-问句相似度为：", ans_)
                else:                
                    #ans_ = self._semantic_model.predict_similarity(question, ans_range[k][1]+ans_range[k][2]+ans_range[k][3])
                    ans_ = self._semantic_model.predict_similarity(question, ans_range[k][2])
                    print("问句--属性匹配度为：", ans_)

            if score < ans_:
                score = ans_
                ans = ans_range[k][3]
                ans_base = ans_range[k]
        
        threshold = 0.8 if method=='faq' else 0.7

        if score < threshold:
            print("\n\033[1;31m答案不确定\033[0m")
        else:
            print("\n\033[1;31m答案是：{}\033[0m".format(ans))
            print("答案来自三元组：",ans_base[1],ans_base[2],ans_base[3])


In [18]:
# 加载实体识别模型
bert_ner_config = BERT_NER_Config()
bert_ner = BERT_LSTM_NER(bert_ner_config)
bert_ner.restore('kbqa/output_ner/')

# 加载属性映射模型
bert_sim_config = BERT_Sim_Config()
bert_sim = BertSim(bert_sim_config)
bert_sim.restore('kbqa/output_sim/')

# 实例化问答对象
kbqa = KBQA(kb_path="./kbqa/Data/test.csv", ner_model=bert_ner, semantic_model=bert_sim)

In [19]:
kbqa.query('《机械设计基础》的价格多少', method='kb')


你的问题是:《机械设计基础》的价格多少
[['O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
识别的实体是：机械设计基础
结果可能来自： ['机械设计基础的isbn码是什么？', '机械设计基础', 'isbn', '9787040192094']
结果可能来自： ['机械设计基础的定价是多少？', '机械设计基础', '定价', '24.50元']

知识三元组1： 机械设计基础 isbn 9787040192094
问句--属性匹配度为： 0.16088037

知识三元组2： 机械设计基础 定价 24.50元
问句--属性匹配度为： 0.9485226

[1;31m答案是：24.50元[0m
答案来自三元组： 机械设计基础 定价 24.50元


In [20]:
kbqa.query('《机械设计基础》的价格多少', method='faq')


你的问题是:《机械设计基础》的价格多少
[['O', 'B-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'I-ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
识别的实体是：机械设计基础
结果可能来自： ['机械设计基础的isbn码是什么？', '机械设计基础', 'isbn', '9787040192094']
结果可能来自： ['机械设计基础的定价是多少？', '机械设计基础', '定价', '24.50元']
句子_1: 机械设计基础的isbn码是什么？
问句-问句相似度为： 0.9989245
句子_2: 机械设计基础的定价是多少？
问句-问句相似度为： 0.9994774

[1;31m答案是：24.50元[0m
答案来自三元组： 机械设计基础 定价 24.50元
