# BERT + Keras 对新闻标题分类

日期：2020年4月3日

此方法与 PyTorch 的前半部分基本一致。

In [4]:
import os
import re
import time

import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences  # padding句子用
from tqdm.notebook import tqdm
from transformers import create_optimizer
from transformers import TFBertModel, BertTokenizer, TFBertForSequenceClassification

In [5]:
print(tf.__version__)

2.0.0


In [6]:
print(transformers.__version__)

2.5.1


In [7]:
# 查看可用GPU数量
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# 指定GPU
print(tf.device('/device:gpu:0'))

Num GPUs Available:  2
<tensorflow.python.eager.context._EagerDeviceContext object at 0x7fb9f3182590>


In [8]:
class Config(object):
    """配置参数"""
    def __init__(self):
        self.model_name = 'BERT_SPBM'           # 模型名称
        self.bert_path = './bert-chinese/'     # BERT 文件路径

        self.batch_size = 128                   # mini-batch大小
        self.epsilon = 1e-08                    # adam参数
        self.hidden_size = 768                  # 隐藏层神经元
        self.hidden_dropout_prob = 0.1          # dropout比率
        self.learning_rate = 2e-5               # 学习率
        self.max_len = 32                       # 句子的最长padding长度
        self.num_classes = 240                  # 类别数
        self.num_epoch = 2                      # epoch

In [9]:
config = Config()

## 读取数据

首先读取新闻数据。这部分可以参考 Pytorch 部分。

In [10]:
file = "train.txt"

with open(file, encoding="utf-8") as f:
    sentences_and_labels = [line for line in f.readlines()]
f.close()

In [11]:
# 前几句
sentences_and_labels[0:10]

['中华女子学院：本科层次仅1专业招男生\t3\n',
 '两天价网站背后重重迷雾：做个网站究竟要多少钱\t4\n',
 '东5环海棠公社230-290平2居准现房98折优惠\t1\n',
 '卡佩罗：告诉你德国脚生猛的原因 不希望英德战踢点球\t7\n',
 '82岁老太为学生做饭扫地44年获授港大荣誉院士\t5\n',
 '记者回访地震中可乐男孩：将受邀赴美国参观\t5\n',
 '冯德伦徐若瑄隔空传情 默认其是女友\t9\n',
 '传郭晶晶欲落户香港战伦敦奥运 装修别墅当婚房\t1\n',
 '《赤壁OL》攻城战诸侯战硝烟又起\t8\n',
 '“手机钱包”亮相科博会\t4\n']

In [12]:
seq, label = sentences_and_labels[2].split('\t')
print(seq)
print(label)

东5环海棠公社230-290平2居准现房98折优惠
1



In [13]:
sentences = []
labels = []

for sentence_with_label in sentences_and_labels:
    sentence, label = sentence_with_label.split('\t')
    sentences.append(sentence)
    labels.append(label)

In [14]:
print(sentences[0:3])
print(labels[0:3])

['中华女子学院：本科层次仅1专业招男生', '两天价网站背后重重迷雾：做个网站究竟要多少钱', '东5环海棠公社230-290平2居准现房98折优惠']
['3\n', '4\n', '1\n']


## 输入准备

### Tokenizer

In [15]:
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

In [16]:
# 这句话的input_ids
print(f"Tokenize 前的第一句话：\n{sentences[0]}\n")
print(f"Tokenize 后的第一句话: \n{tokenized_texts[0]}")

Tokenize 前的第一句话：
中华女子学院：本科层次仅1专业招男生

Tokenize 后的第一句话: 
[101, 704, 1290, 1957, 2094, 2110, 7368, 8038, 3315, 4906, 2231, 3613, 788, 122, 683, 689, 2875, 4511, 4495, 102]


In [17]:
print (len(tokenized_texts))  # 180000句话

180000


### Padding

In [18]:
# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=config.max_len, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [19]:
print(f"Tokenize 前的第一句话：\n\n{sentences[0]}\n\n")
print(f"Tokenize 后的第一句话: \n\n{tokenized_texts[0]}\n\n")
print(f"Padding 后的第一句话： \n\n{input_ids[0]}")

Tokenize 前的第一句话：

中华女子学院：本科层次仅1专业招男生


Tokenize 后的第一句话: 

[101, 704, 1290, 1957, 2094, 2110, 7368, 8038, 3315, 4906, 2231, 3613, 788, 122, 683, 689, 2875, 4511, 4495, 102]


Padding 后的第一句话： 

[ 101  704 1290 1957 2094 2110 7368 8038 3315 4906 2231 3613  788  122
  683  689 2875 4511 4495  102    0    0    0    0    0    0    0    0
    0    0    0    0]


In [20]:
# 转换回来
raw_texts = [tokenizer.decode(input_ids[0])]
print(raw_texts)
print(len(raw_texts))

['[CLS] 中 华 女 子 学 院 ： 本 科 层 次 仅 1 专 业 招 男 生 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']
1


In [21]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [22]:
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


### Labels

In [23]:
print(len(labels))
print(labels[0:10])

180000
['3\n', '4\n', '1\n', '7\n', '5\n', '5\n', '9\n', '1\n', '8\n', '4\n']


In [24]:
clean_labels = []
for label in labels:
    clean_labels.append(int(label.strip('\n')))

print(clean_labels[0:10])

[3, 4, 1, 7, 5, 5, 9, 1, 8, 4]


In [25]:
input_ids = np.asarray(input_ids)
clean_labels = np.asarray(clean_labels)
attention_masks = np.asarray(attention_masks)

In [26]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, clean_labels, 
                                                            random_state=2019, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2019, test_size=0.1)

In [27]:
print(train_labels[0:10])
print(len(set(train_labels)))

[4 3 4 3 8 8 1 8 7 7]
10


In [28]:
print(f"      标签总数：", len(labels))
print(f"训练集标签总数：", len(train_labels))
print(f"验证集标签总数：", len(validation_labels))

      标签总数： 180000
训练集标签总数： 162000
验证集标签总数： 18000


In [29]:
# 用于测试额外拼接
train_add = np.random.randn(len(train_labels))
print(len(train_add))
print(type(train_add))
print(train_add[0:10])

162000
<class 'numpy.ndarray'>
[ 1.01630025 -1.76619566  0.68270153  0.69482675 -0.75537917 -1.78704829
  0.71261622 -0.71917118  1.81378373  0.97758765]


Keras 可以直接接受 `np.array()` 形式，所以不用转换为 `Tensor`。

In [30]:
train_X = [train_inputs, train_masks, train_add]
train_y = train_labels

##  构建模型

In [31]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
optimizer = Adam(learning_rate=config.learning_rate, epsilon=config.epsilon)

In [34]:
def build_model():
    """构建模型"""
    token_inputs = tf.keras.layers.Input((config.max_len), dtype=tf.int32, name='Input_word_ids')
    mask_inputs = tf.keras.layers.Input((config.max_len,), dtype=tf.int32, name='Input_masks')
    #seg_inputs = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

    # going with pooled output since seq_output results in ResourceExhausted Error even with GPU
    # 导入 BERT 模型
    bert_model = TFBertModel.from_pretrained(config.bert_path)
    _, pooled_output = bert_model(inputs=token_inputs, 
                                   attention_mask=mask_inputs, 
                                   token_type_ids=None,
                                   position_ids=None)
    # X = GlobalAveragePooling1D()(pooled_output)
    X = tf.keras.layers.Dropout(0.1)(pooled_output)
    output_ = tf.keras.layers.Dense(10, activation='softmax', name='output')(X)

    # 输入输出确定
    bert_model2 = tf.keras.models.Model([token_inputs, mask_inputs], output_)

    print(bert_model2.summary())

    # 编译模型
    bert_model2.compile(optimizer=optimizer, loss=loss_function)

    return bert_model2

如果需要修改模型结构，参考如下，比如多了一个 `add_inputs`：

```python
def build_model():
    """构建模型"""
    token_inputs = tf.keras.layers.Input((config.max_len), dtype=tf.int32, name='Input_word_ids')
    mask_inputs = tf.keras.layers.Input((config.max_len,), dtype=tf.int32, name='Input_masks')
    add_inputs = tf.keras.layers.Input((1,), dtype=tf.float32, name='Random_add')
    #seg_inputs = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

    # going with pooled output since seq_output results in ResourceExhausted Error even with GPU
    # 导入 BERT 模型
    bert_model = TFBertModel.from_pretrained(config.bert_path)
    _, pooled_output = bert_model(inputs=token_inputs, 
                                   attention_mask=mask_inputs, 
                                   token_type_ids=None,
                                   position_ids=None)
    # X = GlobalAveragePooling1D()(pooled_output)
    X = tf.keras.layers.Dropout(0.1)(pooled_output)
    X = tf.keras.layers.Concatenate()([X, add_inputs])
    output_ = tf.keras.layers.Dense(10, activation='softmax', name='output')(X)

    # 输入输出确定
    bert_model2 = tf.keras.models.Model([token_inputs, mask_inputs, add_inputs], output_)
    
    print(bert_model2.summary())
    
    # 编译模型
    bert_model2.compile(optimizer=optimizer, loss=loss_function)
    
    return bert_model2
```

### 参考：

https://medium.com/analytics-vidhya/bert-in-keras-tensorflow-2-0-using-tfhub-huggingface-81c08c5f81d8

https://www.kaggle.com/stitch/albert-in-keras-tf2-using-huggingface-explained

In [35]:
K.clear_session()
model = build_model()







Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_word_ids (InputLayer)     [(None, 32)]         0                                            
__________________________________________________________________________________________________
Input_masks (InputLayer)        [(None, 32)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 32, 768), (N 102267648   Input_word_ids[0][0]             
__________________________________________________________________________________________________
dropout_37 (Dropout)            (None, 768)          0           tf_bert_model[0][1]              
______________________________________________________________________________________________

In [36]:
model.fit(train_X, train_y, epochs=2, batch_size = 128)

Train on 162000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fb732db1ad0>

In [37]:
valid_X = [validation_inputs, validation_masks]
valid_y = validation_labels

In [38]:
# 模型预测
result = model.predict(valid_X)
pred_flat = np.argmax(result, axis=1).flatten()

In [39]:
print(pred_flat[0:10])

[1 5 9 5 8 8 7 9 4 3]


In [40]:
print(len(pred_flat))

18000


In [None]:
# 保存模型
print(config.model_name)

model.save_weights(config.model_name, overwrite=True)

# 读取模型
model.load_weights(config.model_name)