In [1]:
import json
import numpy as np
import pandas as pd
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam,extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import Dense,Lambda
from keras.models import Model
from tqdm import tqdm
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical

# Register progress_apply / progress_map on pandas objects.
# The label has to be passed as `desc=`: tqdm.pandas() forwards keyword
# arguments to the progress bar, and a bare positional string is not used
# as the bar description.
tqdm.pandas(desc='demo')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Basic settings
# NOTE(review): the visible training code passes epochs=10 and
# learning_rate=1e-4 as literals, so `epochs` and `learning_rate` below are
# not read by it -- confirm which values are intended.
maxlen = 512
epochs = 20
batch_size = 8
learning_rate = 2e-5

# Pretrained model files; all three live in the same checkpoint directory.
pretrain_model = '/home/david/pretrain_model/zhiyi_pretrain_model/'
config_path, checkpoint_path, dict_path = (
    pretrain_model + 'chinese_roberta_L-6_H-384_A-12/' + file_name
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# Data files
train_data_path = './data/labeled_data.csv'
test_data_path = './data/test_data.csv'

# News category -> risk level (high risk / medium risk / low risk / public).
# Not referenced by the rest of the visible notebook.
maps = {
    category: risk_level
    for risk_level, categories in (
        ('高风险', ('财经', '时政')),
        ('中风险', ('房产', '科技')),
        ('低风险', ('教育', '时尚', '游戏')),
        ('可公开', ('家居', '体育', '娱乐')),
    )
    for category in categories
}

def load_train(filename):
    """Read the labelled CSV and encode its class labels as integer ids.

    Each distinct value of the ``class_label`` column gets an id, numbered
    in the order ``Series.unique()`` returns the values.

    Returns a tuple ``(text, label, label2id, id2label)``:
    the ``content`` column as a list, the matching integer ids as a list,
    and the two lookup dicts (label -> id, id -> label).
    """
    frame = pd.read_csv(filename)
    id2label = dict(enumerate(frame['class_label'].unique()))
    label2id = {name: idx for idx, name in id2label.items()}

    # progress_map is the progress-bar variant of Series.map that
    # tqdm.pandas() attaches to pandas objects.
    frame['label'] = frame['class_label'].progress_map(lambda name: label2id[name])
    return list(frame['content']), list(frame['label']), label2id, id2label

def load_test(filename):
    """Return the ``content`` column of the CSV at ``filename`` as a list."""
    return list(pd.read_csv(filename)['content'])

# Load the labelled training data and the unlabelled test texts
train_text, train_label, label2id, id2label = load_train(train_data_path)
test_text = load_test(test_data_path)

# Hold out 10% of the labelled samples for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    train_text,
    train_label,
    test_size=0.1,
    random_state=42,
)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class data_generator(DataGenerator):
    """Data generator.

    Encodes each (text, label) sample with the module-level ``tokenizer``
    (truncated to ``maxlen``) and yields padded batches of the form
    ``([token_ids, segment_ids], labels)``.
    """
    def __iter__(self, random=False):
        token_rows, segment_rows, label_rows = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            token_rows.append(token_ids)
            segment_rows.append(segment_ids)
            # Each label is wrapped in a one-element list; evaluate() reads
            # the labels back with y_true[:, 0].
            label_rows.append([label])

            batch_full = len(token_rows) == self.batch_size
            if not (batch_full or is_end):
                continue

            # Emit the batch, then start collecting the next one.
            yield (
                [sequence_padding(token_rows), sequence_padding(segment_rows)],
                sequence_padding(label_rows),
            )
            token_rows, segment_rows, label_rows = [], [], []



100%|██████████| 7000/7000 [00:00<00:00, 411754.13it/s]


In [3]:
# Display the label -> id mapping returned by load_train.
label2id

{'家居': 6, '房产': 1, '教育': 5, '时尚': 4, '时政': 0, '科技': 3, '财经': 2}

In [4]:
# Load the pretrained transformer from config_path / checkpoint_path.
# return_keras_model=False: the returned object is used below through its
# `.model` and `.initializer` attributes.
bert = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False,
)

# Classification head: take index 0 along the second axis of the transformer
# output (layer named 'CLS-token') and feed it to a softmax layer with one
# unit per entry in label2id.
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(
    units=len(label2id),
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)

model = keras.models.Model( bert.model.input, output)
model.summary()

# Adam wrapped with a piecewise-linear learning-rate schedule.
AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

# NOTE(review): learning_rate is hard-coded to 1e-4 here although the
# configuration cell defines learning_rate = 2e-5 -- confirm which is intended.
# lr_schedule presumably maps training step -> learning-rate multiplier
# (ramp to 1x by step 1000, down to 0.1x by step 2000) -- TODO confirm
# against the bert4keras documentation.
model.compile(
    loss='sparse_categorical_crossentropy',
    # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    optimizer=AdamLR(learning_rate=1e-4, lr_schedule={
        1000: 1,
        2000: 0.1
    }),
    metrics=['accuracy'],
)
# Pair every text with its label id; test texts carry a placeholder label of 0
# because data_generator expects (text, label) samples.
trains = list(zip(X_train, y_train))
valids = list(zip(X_val, y_val))
tests = [(text, 0) for text in test_text]

# One batch generator per split, all using the configured batch size.
train_generator = data_generator(trains, batch_size)
valid_generator = data_generator(valids, batch_size)
test_generator = data_generator(tests, batch_size)

def evaluate(data):
    """Return the accuracy of the module-level ``model`` over ``data``.

    ``data`` yields ``(inputs, labels)`` batches; the true class ids are
    read from column 0 of ``labels``.
    """
    seen = 0.
    hits = 0.
    for inputs, targets in data:
        predicted = model.predict(inputs).argmax(axis=1)
        expected = targets[:, 0]
        hits += (expected == predicted).sum()
        seen += len(expected)
    return hits / seen

def predict(data):
    """Return the predicted class id for every sample in ``data``.

    ``data`` yields ``(inputs, labels)`` batches; the labels are ignored.
    Predictions come from the module-level ``model`` and are returned as a
    flat list of Python ints, in iteration order.
    """
    y_hat = []
    for x_true, _ in data:
        # argmax over the class axis gives a 1-D array with one id per
        # sample, so its elements are scalars and must not be indexed again
        # (the previous `i[0]` raised IndexError on the first prediction).
        y_pred = model.predict(x_true).argmax(axis=1)
        y_hat.extend(int(i) for i in y_pred)
    return y_hat

class Evaluator(keras.callbacks.Callback):
    """Evaluate and save.

    After every epoch, measures accuracy on ``valid_generator`` and writes
    the model weights to 'best_model.weights' whenever the accuracy improves
    on the best value seen so far.
    """
    def __init__(self):
        # Run the base Callback constructor so the attributes Keras expects
        # on every callback exist before training starts.
        super(Evaluator, self).__init__()
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        print(
            u'val_acc: %.5f, best_val_acc: %.5f\n' %
            (val_acc, self.best_val_acc)
        )


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 128)    256         Input-Segment[0][0]              
____________________________________________________________________________________________

In [None]:
# Inspect the first (text, label id) training pair.
trains[0]

('街头：时尚达人们的早春装扮(组图)有人说要想知道服饰的潮流动态，走到街头看帅哥美女的时尚装扮就能领略流行带给我们的变化与影响，看看下面的街头达人们装扮，先从视觉上震撼你的灵魂。美丽的笑容像绽放的花朵，看了让人舒心，斗篷款式的灰白色毛衣外套因为有了流苏边的装饰更有了一丝灵动。帆布鞋个性独特的选择了左右不同的颜色，使得整身的黑色调中有了夺人眼球的亮点，加上可爱的小辫子，不得不承认年轻就要有活力和创意。暗红色的格纹衬衫长度刚刚好，恰到好处的露出短裤，映衬着女孩美丽的笑容，仿佛整个人心情也跟着好了起来。刚送走了一位可人儿，又来了一位酷女郎，红豹纹的鞋子非常抢眼，皮草的外套加蓝色金丝绒长裙的组合高贵优雅，配合上这个pose和大大的墨镜又多了些神秘感。',
 4)

In [None]:
# Callback that evaluates on the validation set after every epoch.
evaluator = Evaluator()

# NOTE(review): epochs is hard-coded to 10 here although the configuration
# cell defines epochs = 20 -- confirm which value is intended.
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=10,
    callbacks=[evaluator]
)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10

In [None]:
# Restore the weights written by the Evaluator callback, then predict the
# class id of every test sample.
model.load_weights('best_model.weights')
y_hat = predict(test_generator)