In [1]:
from utils import helper

# define macros
ORIGINAL_DATA_PATH = '/Users/brikerman/Downloads/殆知阁古代文献藏书/史藏/编年'
TARGET_DATA_PATH = '/Users/brikerman/Desktop/daizhige/史藏/编年'

MAX_SEQUENCE_LENGTH = 100
EPOCHS = 5
BATCH_SIZE = 200

Using TensorFlow backend.


## 数据预处理

### 提取有标点的数据集

原数据没有区分那些数据是有标点，那些数据是没有标点。所以我们的第一部是区分有标点和没有标点的数据集。
目前这里采取了很简单粗暴的方案，看一篇文章中汉字和标点的比例，即 标点符号数量 / (汉字数量 + 标点符号数量) 当比例大于 0.1 则判断为有标点符号。

In [None]:
from utils import split_data
split_data.split_marked_unmarked_files(ORIGINAL_DATA_PATH, TARGET_DATA_PATH)

### 数据 token 化
这里我自己写了个 Tokenizer，支持直接 tokenize 数据或者利用预训练的词向量模型，关于如何预训练词向量我晚些时候补上。

In [4]:
import os
from utils.tokenizer import Tokenizer

# 构建 tokenizer，只需要构建一次
tokenizer = Tokenizer()
# tokenizer.build(corpus_path=ORIGINAL_DATA_PATH, tokenizer_path='./data/embedding/')

# 加载 tokenizer 数据
tokenizer.load('./data/embedding/')

In [None]:
# token 化数据
marked_file_path = os.path.join(TARGET_DATA_PATH, 'marked')
tokenizer.max_length = MAX_SEQUENCE_LENGTH
tokenizer.tokenize_files(files_path=marked_file_path,
                         data_path='./data')

### 构建模型

In [5]:
from random import random
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional



model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word2idx),
                    output_dim=100,
                    input_length=tokenizer.max_length,
                    mask_zero=True))
model.add(Bidirectional(LSTM(50, return_sequences=True)))  
model.add(TimeDistributed(Dense(len(tokenizer.labels2idx), activation='sigmoid')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1056700   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 100)          60400     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 17)           1717      
Total params: 1,118,817
Trainable params: 1,118,817
Non-trainable params: 0
_________________________________________________________________


In [20]:
from typing import List
import numpy as np
from keras.utils import to_categorical

def h5f_generator(h5path: str,
                  indices: List[int],
                  label_count: int,
                  batch_size: int=128):

    db = h5py.File(h5path, "r")

    while True:
        np.random.shuffle(indices)
        for i in range(0, len(indices), batch_size):
            batch_indices = indices[i:i+batch_size]
            batch_indices.sort()
            by = db["x"][batch_indices, :]
            bx = to_categorical(db["y"][batch_indices, :],
                                num_classes=label_count,
                                dtype=np.int)
            yield (bx, by)

In [21]:
import h5py
from sklearn.model_selection import train_test_split

dataset = h5py.File('./data/dataset.h5', 'r')

train_idx, test_idx = train_test_split(range(len(dataset['x'])), test_size=0.15)

train_generator = h5f_generator(h5path='./data/dataset.h5',
                                       indices=train_idx,
                                       label_count=len(tokenizer.labels2idx),
                                       batch_size=200)

test_generator = h5f_generator(h5path='./data/dataset.h5',
                                      indices=test_idx,
                                      label_count=len(tokenizer.labels2idx),
                                      batch_size=200)

model.fit_generator(train_generator, 
                    steps_per_epoch=len(train_idx) // BATCH_SIZE,
                    epochs=EPOCHS,
                    verbose=1,
                    callbacks=[],
                    validation_data=test_generator,
                    validation_steps=len(test_idx) // BATCH_SIZE)


Epoch 1/5


ValueError: Error when checking input: expected embedding_2_input to have 2 dimensions, but got array with shape (200, 100, 17)

In [2]:
import re
import os
import keras
import numpy as np
from keras import backend as K
from keras.preprocessing import sequence

def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """

    weights = K.variable(weights)

    def loss(y_true, y_pred):
        # scale predictions so that the class probas of each sample sum to 1
        y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
        # clip to prevent NaN's and Inf's
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # calc
        loss = y_true * K.log(y_pred) * weights
        loss = -K.sum(loss, -1)
        return loss

    return loss


values = [100] * 22
values[0] = 1
values[1] = 70
loss_f = weighted_categorical_crossentropy(np.array(values))

## 使用模型预测

In [10]:
import keras
model = keras.models.load_model('/Users/brikerman/Desktop/ailab/wenyanwen/data/model/model.model',  # type: keras.models.Sequential
                                           custom_objects={'loss': loss_f})
for text in [
    '余谓拯饥之政富郑公在青州为百世师仿其意而行之在后贤焉兴赈田'
                         '之利除赈籴之害今日事矣除害于今未必不生害于后特举其槩与诸大'
                         '夫国人议之或曰黔之天时繁阴雨山溜洒泉可资濯溉不若中原赤地千'
                         '里之田也'
]:
    r = helper.predict_with_model(tokenizer=tokenizer, model=model, text=text)
    print(r)

余谓拯饥之政富郑公在青州为百世师仿其意而行之在后贤焉兴赈田之利除赈籴之害“今日事矣除害于；今未必不生害于；后特举其槩与诸大夫国人议之或曰黔之天时繁“阴雨山溜“洒泉可资濯溉不若中”原赤地千里之“田也
