diff --git a/.coveragerc b/.coveragerc index 4b37f0b1..ae01d6a2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -12,6 +12,8 @@ exclude_lines = # Don't complain if tests don't hit defensive assertion code: raise AssertionError raise NotImplementedError + raise ValueError + except Exception as e: # Don't complain if non-runnable code isn't run: if __name__ == .__main__.: diff --git a/.travis.yml b/.travis.yml index 44754af3..2046c5a6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,14 +4,21 @@ python: - "3.6" cache: pip # command to install dependencies +env: + - TEST_FILE=tests/test_classifier_models.py + - TEST_FILE=tests/test_seq_labeling_models.py + - TEST_FILE=tests/test_corpus.py + - TEST_FILE=tests/test_embeddings.py before_install: - export BOTO_CONFIG=/dev/null install: - pip install python-coveralls - pip install -r requirements.txt - pip install . + - pip install coverage + - pip install nose # command to run tests script: - - sh test.sh + - nosetests --with-coverage --cover-html --cover-html-dir=htmlcov --cover-package="kashgari" $TEST_FILE after_success: - coveralls \ No newline at end of file diff --git a/README.md b/README.md index 1515dd86..41fb076f 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Issues](https://img.shields.io/github/issues/BrikerMan/Kashgari.svg)](https://github.com/BrikerMan/Kashgari/issues) [![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md) ![](https://img.shields.io/pypi/l/kashgari.svg?style=flat) -[![](https://img.shields.io/pypi/dw/kashgari.svg)](https://pypi.org/project/kashgari/) +[![](https://img.shields.io/pypi/dm/kashgari.svg)](https://pypi.org/project/kashgari/) Simple and powerful NLP framework, build your state-of-art model in 5 minutes for named entity recognition (NER), part-of-speech tagging (PoS) and text classification tasks. 
@@ -25,14 +25,20 @@ Kashgare is:
 * Embedding support
   * Classic word2vec embedding
   * BERT embedding
-* Text Classification Models
-  * CNN Classification Model
-  * CNN LSTM Classification Model
-  * Bidirectional LSTM Classification Model
-* Text Labeling Models (NER, PoS)
-  * Bidirectional LSTM Labeling Model
-  * Bidirectional LSTM CRF Labeling Model
-  * CNN LSTM Labeling Model
+* Sequence (Text) Classification Models
+  * CNNModel
+  * BLSTMModel
+  * CNNLSTMModel
+  * AVCNNModel
+  * KMaxCNNModel
+  * RCNNModel
+  * AVRNNModel
+  * DropoutBGRUModel
+  * DropoutAVRNNModel
+* Sequence (Text) Labeling Models (NER, PoS)
+  * CNNLSTMModel
+  * BLSTMModel
+  * BLSTMCRFModel
 * Model Training
 * Model Evaluate
 * GPU Support
diff --git a/examples/run_flask_api.py b/examples/run_flask_api.py
new file mode 100644
index 00000000..1d894321
--- /dev/null
+++ b/examples/run_flask_api.py
@@ -0,0 +1,37 @@
+# encoding: utf-8
+"""
+@author: BrikerMan
+@contact: eliyar917@gmail.com
+@blog: https://eliyar.biz
+
+@version: 1.0
+@license: Apache Licence
+@file: run_flask_api
+@time: 2019-02-24
+
+"""
+import random
+from flask import Flask, jsonify
+from kashgari.tasks.classification import KMaxCNNModel
+from kashgari.corpus import SMP2017ECDTClassificationCorpus
+
+train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()
+
+model = KMaxCNNModel()
+model.fit(train_x, train_y)
+
+
+app = Flask(__name__)
+
+
+@app.route('/predict', methods=['GET'])
+def get_tasks():
+    x = random.choice(train_x)
+    y = model.predict(x, output_dict=True)
+    return jsonify({'x': x, 'y': y})
+
+
+if __name__ == '__main__':
+    # must run predict once before `app.run` to prevent a predict error
+    model.predict(train_x[10])
+    app.run(debug=True, port=8080)
diff --git a/kashgari/__init__.py b/kashgari/__init__.py
index 17333ee9..47ddbef8 100644
--- a/kashgari/__init__.py
+++ b/kashgari/__init__.py
@@ -17,6 +17,8 @@
 from kashgari.tasks import classification
 from kashgari.tasks import seq_labeling
 
+from kashgari.macros import config
+
 if __name__ == "__main__":
     print("Hello world")
diff --git a/kashgari/embeddings/embeddings.py b/kashgari/embeddings/embeddings.py
index 6dc943f2..59cdf393 100644
--- a/kashgari/embeddings/embeddings.py
+++ b/kashgari/embeddings/embeddings.py
@@ -133,7 +133,7 @@ def tokenize(self,
         def tokenize_sentence(text: TextSeqType) -> TokenSeqType:
             tokens = [self.token2idx.get(token, self.token2idx[k.UNK]) for token in text]
             if add_bos_eos:
-                tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.BOS]]
+                tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.EOS]]
             return tokens
 
         if is_list:
diff --git a/kashgari/layers.py b/kashgari/layers.py
new file mode 100644
index 00000000..fcaf5426
--- /dev/null
+++ b/kashgari/layers.py
@@ -0,0 +1,160 @@
+# encoding: utf-8
+"""
+@author: BrikerMan
+@contact: eliyar917@gmail.com
+@blog: https://eliyar.biz
+
+@version: 1.0
+@license: Apache Licence
+@file: layers
+@time: 2019-02-23
+
+"""
+from __future__ import absolute_import, division
+import logging
+
+import tensorflow as tf
+from keras.layers import Flatten
+from keras.layers import GRU, LSTM
+from keras.layers import CuDNNGRU, CuDNNLSTM
+from keras import initializers
+from keras.engine import InputSpec, Layer
+from keras import backend as K
+
+from kashgari.macros import config
+
+if config.use_CuDNN_cell:
+    GRULayer = CuDNNGRU
+    LSTMLayer = CuDNNLSTM
+else:
+    GRULayer = GRU
+    LSTMLayer = LSTM
+
+
+class AttentionWeightedAverage(Layer):
+    '''
+    Computes a weighted average of the different channels across 
timesteps.
+    Uses one parameter per channel to compute the attention value for a single timestep.
+    '''
+
+    def __init__(self, return_attention=False, **kwargs):
+        self.init = initializers.get('uniform')
+        self.supports_masking = True
+        self.return_attention = return_attention
+        super(AttentionWeightedAverage, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.input_spec = [InputSpec(ndim=3)]
+        assert len(input_shape) == 3
+
+        self.W = self.add_weight(shape=(input_shape[2], 1),
+                                 name='{}_w'.format(self.name),
+                                 initializer=self.init)
+        self.trainable_weights = [self.W]
+        super(AttentionWeightedAverage, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        # computes a probability distribution over the timesteps
+        # uses 'max trick' for numerical stability
+        # reshape is done to avoid issue with Tensorflow
+        # and 1-dimensional weights
+        logits = K.dot(x, self.W)
+        x_shape = K.shape(x)
+        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
+        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
+
+        # masked timesteps have zero weight
+        if mask is not None:
+            mask = K.cast(mask, K.floatx())
+            ai = ai * mask
+        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
+        weighted_input = x * K.expand_dims(att_weights)
+        result = K.sum(weighted_input, axis=1)
+        if self.return_attention:
+            return [result, att_weights]
+        return result
+
+    def get_output_shape_for(self, input_shape):
+        return self.compute_output_shape(input_shape)
+
+    def compute_output_shape(self, input_shape):
+        output_len = input_shape[2]
+        if self.return_attention:
+            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
+        return (input_shape[0], output_len)
+
+    def compute_mask(self, input, input_mask=None):
+        if isinstance(input_mask, list):
+            return [None] * len(input_mask)
+        else:
+            return None
+
+
+class KMaxPooling(Layer):
+    '''
+    K-max pooling layer that extracts the k highest activations from a sequence (2nd dimension).
+    TensorFlow backend.
+
+    # Arguments
+        k: An int scalar,
+            indicates the k max steps of features to pool.
+        sorted: A bool,
+            whether the output is sorted (default) or not.
+        data_format: A string,
+            one of `channels_last` (default) or `channels_first`.
+            The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape
+            `(batch, steps, features)` while `channels_first`
+            corresponds to inputs with shape
+            `(batch, features, steps)`. 
+    # Input shape
+        - If `data_format='channels_last'`:
+            3D tensor with shape:
+            `(batch_size, steps, features)`
+        - If `data_format='channels_first'`:
+            3D tensor with shape:
+            `(batch_size, features, steps)`
+    # Output shape
+        3D tensor with shape:
+        `(batch_size, top-k-steps, features)`
+    '''
+
+    def __init__(self, k=1, sorted=True, data_format='channels_last', **kwargs):
+        super(KMaxPooling, self).__init__(**kwargs)
+        self.input_spec = InputSpec(ndim=3)
+        self.k = k
+        self.sorted = sorted
+        self.data_format = K.normalize_data_format(data_format)
+
+    # def build(self, input_shape):
+    #     assert len(input_shape) == 3
+    #     super(KMaxPooling, self).build(input_shape)
+
+    def compute_output_shape(self, input_shape):
+        if self.data_format == 'channels_first':
+            return (input_shape[0], self.k, input_shape[1])
+        else:
+            return (input_shape[0], self.k, input_shape[2])
+
+    def call(self, inputs):
+        if self.data_format == 'channels_last':
+            # swap last two dimensions since top_k will be applied along the last dimension
+            shifted_input = tf.transpose(inputs, [0, 2, 1])
+
+            # extract top_k, returns two tensors [values, indices]
+            top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=self.sorted)[0]
+        else:
+            top_k = tf.nn.top_k(inputs, k=self.k, sorted=self.sorted)[0]
+        # return flattened output
+        return tf.transpose(top_k, [0, 2, 1])
+
+    def get_config(self):
+        config = {'k': self.k,
+                  'sorted': self.sorted,
+                  'data_format': self.data_format}
+        base_config = super(KMaxPooling, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+if __name__ == '__main__':
+    print("hello, world")
diff --git a/kashgari/macros.py b/kashgari/macros.py
index 16f1014e..45f7938a 100644
--- a/kashgari/macros.py
+++ b/kashgari/macros.py
@@ -36,6 +36,14 @@
 pathlib.Path(PROCESSED_CORPUS_PATH).mkdir(parents=True, exist_ok=True)
 
 
+class _Config(object):
+    def __init__(self):
+        self.use_CuDNN_cell = False
+
+
+config = _Config()
+
+
 class CustomEmbedding(object):
     def __init__(self, embedding_size=100):
         self.embedding_size = embedding_size
diff --git a/kashgari/tasks/base/base_model.py b/kashgari/tasks/base/base_model.py
index 4abb30a0..970d1361 100644
--- a/kashgari/tasks/base/base_model.py
+++ b/kashgari/tasks/base/base_model.py
@@ -12,17 +12,23 @@
 """
 import os
 import json
+import pickle
 import pathlib
+import traceback
 import logging
+logger = logging.getLogger(__name__)
 
 import numpy as np
 from typing import Dict
 
 import keras
 from keras.models import Model
+from keras import backend as K
+
 from kashgari.utils import helper
 from kashgari.embeddings import CustomEmbedding, BaseEmbedding
 from kashgari.utils.crf import CRF, crf_loss, crf_accuracy
 from keras_bert.bert import get_custom_objects as get_bert_custom_objects
+from kashgari.layers import AttentionWeightedAverage, KMaxPooling
 
 
 class BaseModel(object):
@@ -72,8 +78,24 @@ def save(self, model_path: str):
         with open(os.path.join(model_path, 'model.json'), 'w', encoding='utf-8') as f:
             f.write(json.dumps(model_info, indent=2, ensure_ascii=False))
 
+        with open(os.path.join(model_path, 'struct.json'), 'w', encoding='utf-8') as f:
+            f.write(self.model.to_json())
+
+        # self.model.save_weights(os.path.join(model_path, 'weights.h5'))
+        optimizer_weight_values = None
+        try:
+            symbolic_weights = getattr(self.model.optimizer, 'weights')
+            optimizer_weight_values = K.batch_get_value(symbolic_weights)
+        except Exception as e:
+            logger.warning('error occurred: {}'.format(e))
+            traceback.print_tb(e.__traceback__)
+            logger.warning('No optimizer weights found.')
+        if optimizer_weight_values is not None:
+            with open(os.path.join(model_path, 'optimizer.pkl'), 'wb') as f:
+                pickle.dump(optimizer_weight_values, f)
+
         self.model.save(os.path.join(model_path, 'model.model'))
-        logging.info('model saved to {}'.format(os.path.abspath(model_path)))
+        logger.info('model saved to {}'.format(os.path.abspath(model_path)))
 
     @staticmethod
     def create_custom_objects(model_info):
@@ -94,7 +116,8 @@ def create_custom_objects(model_info):
         if embedding and embedding['embedding_type'] == 'bert':
             custom_objects['NonMaskingLayer'] = helper.NonMaskingLayer
             custom_objects.update(get_bert_custom_objects())
-
+        custom_objects['AttentionWeightedAverage'] = AttentionWeightedAverage
+        custom_objects['KMaxPooling'] = KMaxPooling
         return custom_objects
 
     @classmethod
@@ -111,15 +134,66 @@ def load_model(cls, model_path: str):
 
         custom_objects = cls.create_custom_objects(model_info)
         if custom_objects:
-            logging.debug('prepared custom objects: {}'.format(custom_objects))
-
-        agent.model = keras.models.load_model(os.path.join(model_path, 'model.model'),
-                                              custom_objects=custom_objects)
+            logger.debug('prepared custom objects: {}'.format(custom_objects))
+
+        try:
+            agent.model = keras.models.load_model(os.path.join(model_path, 'model.model'),
+                                                  custom_objects=custom_objects)
+        except Exception as e:
+            logger.warning('Error `{}` occurred while loading the model directly. Trying to rebuild.'.format(e))
+            logger.debug('Load model structure from json.')
+            with open(os.path.join(model_path, 'struct.json'), 'r', encoding='utf-8') as f:
+                model_struct = f.read()
+            agent.model = keras.models.model_from_json(model_struct,
+                                                       custom_objects=custom_objects)
+            logger.debug('Build optimizer with model info.')
+            optimizer_conf = model_info['hyper_parameters'].get('optimizer', None)
+            optimizer = 'adam'  # default
+            if optimizer_conf is not None and isinstance(optimizer_conf, dict):
+                module_str = optimizer_conf.get('module', 'None')
+                name_str = optimizer_conf.get('name', 'None')
+                params = optimizer_conf.get('params', None)
+                invalid_set = [None, 'None', '', {}]
+                if not any([module_str.strip() in invalid_set,
+                            name_str.strip() in invalid_set,
+                            params in invalid_set]):
+                    try:
+                        optimizer = getattr(eval(module_str), name_str)(**params)
+                    except Exception:
+                        logger.warning('Invalid optimizer configuration in model info. Using `adam` as default.')
+            else:
+                logger.warning('No optimizer configuration found in model info. Using `adam` as default.')
+
+            default_compile_params = {'loss': 'categorical_crossentropy', 'metrics': ['accuracy']}
+            compile_params = model_info['hyper_parameters'].get('compile_params', default_compile_params)
+            logger.debug('Compile model from scratch.')
+            try:
+                agent.model.compile(optimizer=optimizer, **compile_params)
+            except Exception:
+                logger.warning('Failed to compile the model. Compile params seem incorrect.')
+                logger.warning('Using default options `{}` to compile.'.format(default_compile_params))
+                agent.model.compile(optimizer=optimizer, **default_compile_params)
+            logger.debug('Load model weights.')
+            agent.model.summary()
+            agent.model.load_weights(os.path.join(model_path, 'model.model'))
+            agent.model._make_train_function()
+            optimizer_weight_values = None
+            logger.debug('Load optimizer weights.')
+            try:
+                with open(os.path.join(model_path, 'optimizer.pkl'), 'rb') as f:
+                    optimizer_weight_values = pickle.load(f)
+            except Exception as e:
+                logger.warning('Tried to load optimizer weights, but no optimizer weights file was found.')
+            if optimizer_weight_values is not None:
+                agent.model.optimizer.set_weights(optimizer_weight_values)
+            else:
+                logger.warning('Model rebuilt, but optimizer weights are missing; retraining is needed.')
+            logger.info('Model rebuild finished.')
 
         agent.embedding.update(model_info.get('embedding', {}))
         agent.model.summary()
         agent.label2idx = label2idx
         agent.embedding.token2idx = token2idx
-        logging.info('loaded model from {}'.format(os.path.abspath(model_path)))
+        logger.info('loaded model from {}'.format(os.path.abspath(model_path)))
         return agent
diff --git a/kashgari/tasks/classification/__init__.py b/kashgari/tasks/classification/__init__.py
index 8b171209..a24d9593 100644
--- a/kashgari/tasks/classification/__init__.py
+++ b/kashgari/tasks/classification/__init__.py
@@ -11,6 +11,6 @@
 """
 from .base_model import ClassificationModel
-from .blstm_model import BLSTMModel
-from .cnn_lstm_model import CNNLSTMModel
-from .cnn_model import CNNModel
+from kashgari.tasks.classification.models import BLSTMModel, CNNLSTMModel, CNNModel
+from kashgari.tasks.classification.models import AVCNNModel, KMaxCNNModel, RCNNModel, AVRNNModel
+from kashgari.tasks.classification.models import DropoutBGRUModel, DropoutAVRNNModel
diff --git a/kashgari/tasks/classification/blstm_model.py b/kashgari/tasks/classification/blstm_model.py
deleted file mode 100644
index d2d20328..00000000
--- a/kashgari/tasks/classification/blstm_model.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# encoding: utf-8
-"""
-@author: BrikerMan
-@contact: eliyar917@gmail.com
-@blog: https://eliyar.biz
-
-@version: 1.0
-@license: Apache Licence
-@file: blstm_model.py
-@time: 2019-01-21 17:37
-
-"""
-import logging
-from keras.layers import Dense, Bidirectional
-from keras.layers.recurrent import LSTM
-from keras.models import Model
-
-from kashgari.tasks.classification.base_model import ClassificationModel
-
-
-class BLSTMModel(ClassificationModel):
-    __architect_name__ = 'BLSTMModel'
-    __base_hyper_parameters__ = {
-        'lstm_layer': {
-            'units': 256,
-            'return_sequences': False
-        }
-    }
-
-    def build_model(self):
-        base_model = self.embedding.model
-        blstm_layer = Bidirectional(LSTM(**self.hyper_parameters['lstm_layer']))(base_model.output)
-        dense_layer = Dense(len(self.label2idx), activation='sigmoid')(blstm_layer)
-        output_layers = [dense_layer]
-
-        model = Model(base_model.inputs, output_layers)
-        model.compile(loss='categorical_crossentropy',
-                      optimizer='adam',
-                      metrics=['accuracy'])
-        self.model = model
-        self.model.summary()
-
-
-if __name__ == "__main__":
-    from kashgari.utils.logger import init_logger
-    from kashgari.corpus import TencentDingdangSLUCorpus
-
-    init_logger()
-
-    x_data, y_data = TencentDingdangSLUCorpus.get_classification_data()
-    classifier = BLSTMModel()
-    classifier.fit(x_data, y_data, epochs=1)
-    classifier.save('./classifier_saved2')
-
-    model = ClassificationModel.load_model('./classifier_saved2')
- logging.info(model.predict('我要听音乐')) diff --git a/kashgari/tasks/classification/cnn_lstm_model.py b/kashgari/tasks/classification/cnn_lstm_model.py deleted file mode 100644 index a674c9ed..00000000 --- a/kashgari/tasks/classification/cnn_lstm_model.py +++ /dev/null @@ -1,67 +0,0 @@ -# encoding: utf-8 -""" -@author: BrikerMan -@contact: eliyar917@gmail.com -@blog: https://eliyar.biz - -@version: 1.0 -@license: Apache Licence -@file: cnn_lstm_model.py -@time: 2019-01-19 11:52 - -""" -import logging - -from keras.layers import Dense, Conv1D, MaxPooling1D -from keras.layers.recurrent import LSTM -from keras.models import Model - -from kashgari.tasks.classification.base_model import ClassificationModel - - -class CNNLSTMModel(ClassificationModel): - __architect_name__ = 'CNNLSTMModel' - __base_hyper_parameters__ = { - 'conv_layer': { - 'filters': 32, - 'kernel_size': 3, - 'padding': 'same', - 'activation': 'relu' - }, - 'max_pool_layer': { - 'pool_size': 2 - }, - 'lstm_layer': { - 'units': 100 - } - } - - def build_model(self): - base_model = self.embedding.model - conv_layer = Conv1D(**self.hyper_parameters['conv_layer'])(base_model.output) - max_pool_layer = MaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv_layer) - lstm_layer = LSTM(**self.hyper_parameters['lstm_layer'])(max_pool_layer) - dense_layer = Dense(len(self.label2idx), activation='sigmoid')(lstm_layer) - output_layers = [dense_layer] - - model = Model(base_model.inputs, output_layers) - model.compile(loss='categorical_crossentropy', - optimizer='adam', - metrics=['accuracy']) - self.model = model - self.model.summary() - - -if __name__ == "__main__": - from kashgari.utils.logger import init_logger - from kashgari.corpus import TencentDingdangSLUCorpus - - init_logger() - - x_data, y_data = TencentDingdangSLUCorpus.get_classification_data() - classifier = CNNLSTMModel() - classifier.fit(x_data, y_data, epochs=1) - classifier.save('./classifier_saved2') - - model = ClassificationModel.load_model('./classifier_saved2') - logging.info(model.predict('我要听音乐')) diff --git a/kashgari/tasks/classification/cnn_model.py b/kashgari/tasks/classification/cnn_model.py deleted file mode 100644 index 9512d0df..00000000 --- a/kashgari/tasks/classification/cnn_model.py +++ /dev/null @@ -1,62 +0,0 @@ -# encoding: utf-8 -""" -@author: BrikerMan -@contact: eliyar917@gmail.com -@blog: https://eliyar.biz - -@version: 1.0 -@license: Apache Licence -@file: cnn_model.py -@time: 2019-01-21 17:49 - -""" -import logging -from keras.layers import Dense, Conv1D, GlobalMaxPooling1D -from keras.models import Model - -from kashgari.tasks.classification.base_model import ClassificationModel - - -class CNNModel(ClassificationModel): - __architect_name__ = 'CNNModel' - __base_hyper_parameters__ = { - 'conv1d_layer': { - 'filters': 128, - 'kernel_size': 5, - 'activation': 'relu' - }, - 'max_pool_layer': {}, - 'dense_1_layer': { - 'units': 64, - 'activation': 'relu' - } - } - - def build_model(self): - base_model = self.embedding.model - conv1d_layer = Conv1D(**self.hyper_parameters['conv1d_layer'])(base_model.output) - max_pool_layer = GlobalMaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv1d_layer) - dense_1_layer = Dense(**self.hyper_parameters['dense_1_layer'])(max_pool_layer) - dense_2_layer = Dense(len(self.label2idx), activation='sigmoid')(dense_1_layer) - - model = Model(base_model.inputs, dense_2_layer) - model.compile(loss='categorical_crossentropy', - optimizer='adam', - metrics=['accuracy']) - self.model = model - self.model.summary() 
- - -if __name__ == "__main__": - from kashgari.utils.logger import init_logger - from kashgari.corpus import TencentDingdangSLUCorpus - - init_logger() - - x_data, y_data = TencentDingdangSLUCorpus.get_classification_data() - classifier = CNNModel() - classifier.fit(x_data, y_data, epochs=1) - classifier.save('./classifier_saved2') - - model = ClassificationModel.load_model('./classifier_saved2') - logging.info(model.predict('我要听音乐')) diff --git a/kashgari/tasks/classification/models.py b/kashgari/tasks/classification/models.py new file mode 100644 index 00000000..410864b0 --- /dev/null +++ b/kashgari/tasks/classification/models.py @@ -0,0 +1,750 @@ +# encoding: utf-8 +""" +@author: Alex +@contact: ialexwwang@gmail.com + +@version: 0.1 +@license: Apache Licence +@file: deep_models.py +@time: 2019-02-21 17:54 + +@Reference: https://github.com/zake7749/DeepToxic/blob/master/sotoxic/models/keras/model_zoo.py +""" +from __future__ import absolute_import, division + +import logging + +import keras +#from keras import optimizers + +from keras.models import Model +from keras.layers import Dense, Lambda, Flatten, Reshape +from keras.layers import Dropout, SpatialDropout1D +from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, MaxPooling1D +from keras.layers import Bidirectional, Conv1D +from keras.layers import concatenate + +from kashgari.layers import AttentionWeightedAverage, KMaxPooling, LSTMLayer, GRULayer + +from kashgari.tasks.classification.base_model import ClassificationModel + + +class CNNModel(ClassificationModel): + __architect_name__ = 'CNNModel' + __base_hyper_parameters__ = { + 'conv1d_layer': { + 'filters': 128, + 'kernel_size': 5, + 'activation': 'relu' + }, + 'max_pool_layer': {}, + 'dense_1_layer': { + 'units': 64, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 0.0 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + conv1d_layer = Conv1D(**self.hyper_parameters['conv1d_layer'])(base_model.output) + max_pool_layer = GlobalMaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv1d_layer) + dense_1_layer = Dense(**self.hyper_parameters['dense_1_layer'])(max_pool_layer) + dense_2_layer = Dense(len(self.label2idx), **self.hyper_parameters['activation_layer'])(dense_1_layer) + + model = Model(base_model.inputs, dense_2_layer) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class BLSTMModel(ClassificationModel): + __architect_name__ = 'BLSTMModel' + __base_hyper_parameters__ = { + 'lstm_layer': { + 'units': 256, + 'return_sequences': False + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 0.0 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + blstm_layer = Bidirectional(LSTMLayer(**self.hyper_parameters['lstm_layer']))(base_model.output) + dense_layer = Dense(len(self.label2idx), 
**self.hyper_parameters['activation_layer'])(blstm_layer) + output_layers = [dense_layer] + + model = Model(base_model.inputs, output_layers) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class CNNLSTMModel(ClassificationModel): + __architect_name__ = 'CNNLSTMModel' + __base_hyper_parameters__ = { + 'conv_layer': { + 'filters': 32, + 'kernel_size': 3, + 'padding': 'same', + 'activation': 'relu' + }, + 'max_pool_layer': { + 'pool_size': 2 + }, + 'lstm_layer': { + 'units': 100 + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 0.0 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + conv_layer = Conv1D(**self.hyper_parameters['conv_layer'])(base_model.output) + max_pool_layer = MaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv_layer) + lstm_layer = LSTMLayer(**self.hyper_parameters['lstm_layer'])(max_pool_layer) + dense_layer = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(lstm_layer) + output_layers = [dense_layer] + + model = Model(base_model.inputs, output_layers) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class AVCNNModel(ClassificationModel): + __architect_name__ = 'AVCNNModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.25 + }, + 'conv_0': { + 'filters': 300, + 'kernel_size': 1, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_1': { + 'filters': 300, + 'kernel_size': 2, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_2': { + 'filters': 300, + 'kernel_size': 3, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_3': { + 'filters': 300, + 'kernel_size': 4, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + # --- + 'attn_0': {}, + 'avg_0': {}, + 'maxpool_0': {}, + # --- + 'maxpool_1': {}, + 'attn_1': {}, + 'avg_1': {}, + # --- + 'maxpool_2': {}, + 'attn_2': {}, + 'avg_2': {}, + # --- + 'maxpool_3': {}, + 'attn_3': {}, + 'avg_3': {}, + # --- + 'v0_col': { + # 'mode': 'concat', + 'axis': 1 + }, + 'v1_col': { + # 'mode': 'concat', + 'axis': 1 + }, + 'v2_col': { + # 'mode': 'concat', + 'axis': 1 + }, + 'merged_tensor': { + # 'mode': 'concat', + 'axis': 1 + }, + 'dropout': { + 'rate': 0.7 + }, + 'dense': { + 'units': 144, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 1e-7 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + conv_0 = 
Conv1D(**self.hyper_parameters['conv_0'])(embedded_seq) + conv_1 = Conv1D(**self.hyper_parameters['conv_1'])(embedded_seq) + conv_2 = Conv1D(**self.hyper_parameters['conv_2'])(embedded_seq) + conv_3 = Conv1D(**self.hyper_parameters['conv_3'])(embedded_seq) + + maxpool_0 = GlobalMaxPooling1D()(conv_0) + attn_0 = AttentionWeightedAverage()(conv_0) + avg_0 = GlobalAveragePooling1D()(conv_0) + + maxpool_1 = GlobalMaxPooling1D()(conv_1) + attn_1 = AttentionWeightedAverage()(conv_1) + avg_1 = GlobalAveragePooling1D()(conv_1) + + maxpool_2 = GlobalMaxPooling1D()(conv_2) + attn_2 = AttentionWeightedAverage()(conv_2) + avg_2 = GlobalAveragePooling1D()(conv_2) + + maxpool_3 = GlobalMaxPooling1D()(conv_3) + attn_3 = AttentionWeightedAverage()(conv_3) + avg_3 = GlobalAveragePooling1D()(conv_3) + + v0_col = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], + **self.hyper_parameters['v0_col']) + v1_col = concatenate([attn_0, attn_1, attn_2, attn_3], + **self.hyper_parameters['v1_col']) + v2_col = concatenate([avg_1, avg_2, avg_0, avg_3], + **self.hyper_parameters['v2_col']) + merged_tensor = concatenate([v0_col, v1_col, v2_col], + **self.hyper_parameters['merged_tensor']) + output = Dropout(**self.hyper_parameters['dropout'])(merged_tensor) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class KMaxCNNModel(ClassificationModel): + __architect_name__ = 'KMaxCNNModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.2 + }, + 'conv_0': { + 'filters': 180, + 'kernel_size': 1, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_1': { + 'filters': 180, + 'kernel_size': 2, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_2': { + 'filters': 180, + 'kernel_size': 3, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'conv_3': { + 'filters': 180, + 'kernel_size': 4, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu' + }, + 'maxpool_0': { + 'k': 3 + }, + 'maxpool_1': { + 'k': 3 + }, + 'maxpool_2': { + 'k': 3 + }, + 'maxpool_3': { + 'k': 3 + }, + 'merged_tensor': { + # 'mode': 'concat', + 'axis': 1 + }, + 'dropout': { + 'rate': 0.6 + }, + 'dense': { + 'units': 144, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 1e-7 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + conv_0 = Conv1D(**self.hyper_parameters['conv_0'])(embedded_seq) + conv_1 = Conv1D(**self.hyper_parameters['conv_1'])(embedded_seq) + conv_2 = Conv1D(**self.hyper_parameters['conv_2'])(embedded_seq) + conv_3 = Conv1D(**self.hyper_parameters['conv_3'])(embedded_seq) + + maxpool_0 = KMaxPooling(**self.hyper_parameters['maxpool_0'])(conv_0) + # maxpool_0f = 
Reshape((-1,))(maxpool_0) + maxpool_0f = Flatten()(maxpool_0) + maxpool_1 = KMaxPooling(**self.hyper_parameters['maxpool_1'])(conv_1) + # maxpool_1f = Reshape((-1,))(maxpool_1) + maxpool_1f = Flatten()(maxpool_1) + maxpool_2 = KMaxPooling(**self.hyper_parameters['maxpool_2'])(conv_2) + # maxpool_2f = Reshape((-1,))(maxpool_2) + maxpool_2f = Flatten()(maxpool_2) + maxpool_3 = KMaxPooling(**self.hyper_parameters['maxpool_3'])(conv_3) + # maxpool_3f = Reshape((-1,))(maxpool_3) + maxpool_3f = Flatten()(maxpool_3) + # maxpool_0 = GlobalMaxPooling1D()(conv_0) + # maxpool_1 = GlobalMaxPooling1D()(conv_1) + # maxpool_2 = GlobalMaxPooling1D()(conv_2) + # maxpool_3 = GlobalMaxPooling1D()(conv_3) + + # merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], + # **self.hyper_parameters['merged_tensor']) + merged_tensor = concatenate([maxpool_0f, maxpool_1f, maxpool_2f, maxpool_3f], + **self.hyper_parameters['merged_tensor']) + # flatten = Reshape((-1,))(merged_tensor) + # output = Dropout(**self.hyper_parameters['dropout'])(flatten) + output = Dropout(**self.hyper_parameters['dropout'])(merged_tensor) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class RCNNModel(ClassificationModel): + __architect_name__ = 'RCNNModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.2 + }, + 'rnn_0': { + 'units': 64, + 'return_sequences': True + }, + 'conv_0': { + 'filters': 128, + 'kernel_size': 2, + 'kernel_initializer': 'normal', + 'padding': 'valid', + 'activation': 'relu', + 'strides': 1 + }, + 'maxpool': {}, + 'attn': {}, + 'average': {}, + 'concat': { + 'axis': 1 + }, + 'dropout': { + 'rate': 0.5 + }, + 'dense': { + 'units': 120, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'clipvalue': 5, + 'decay': 1e-5 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) + conv_0 = Conv1D(**self.hyper_parameters['conv_0'])(rnn_0) + maxpool = GlobalMaxPooling1D()(conv_0) + attn = AttentionWeightedAverage()(conv_0) + average = GlobalAveragePooling1D()(conv_0) + + concatenated = concatenate([maxpool, attn, average], + **self.hyper_parameters['concat']) + output = Dropout(**self.hyper_parameters['dropout'])(concatenated) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + 
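+# NOTE: every model in this module resolves its optimizer the same way:
+#   getattr(eval(conf['module']), conf['name'])(**conf['params'])
+# e.g. {'module': 'keras.optimizers', 'name': 'Adam', 'params': {'lr': 1e-3}}
+# yields keras.optimizers.Adam(lr=1e-3); `compile_params` is passed straight
+# through to model.compile().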
+class AVRNNModel(ClassificationModel): + __architect_name__ = 'AVRNNModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.25 + }, + 'rnn_0': { + 'units': 60, + 'return_sequences': True + }, + 'rnn_1': { + 'units': 60, + 'return_sequences': True + }, + 'concat_rnn': { + 'axis': 2 + }, + 'last': {}, + 'maxpool': {}, + 'attn': {}, + 'average': {}, + 'all_views': { + 'axis': 1 + }, + 'dropout': { + 'rate': 0.5 + }, + 'dense': { + 'units': 144, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'clipvalue': 5, + 'decay': 1e-6 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) + rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(rnn_0) + concat_rnn = concatenate([rnn_0, rnn_1], + **self.hyper_parameters['concat_rnn']) + + last = Lambda(lambda t: t[:, -1], name='last')(concat_rnn) + maxpool = GlobalMaxPooling1D()(concat_rnn) + attn = AttentionWeightedAverage()(concat_rnn) + average = GlobalAveragePooling1D()(concat_rnn) + + all_views = concatenate([last, maxpool, attn, average], + **self.hyper_parameters['all_views']) + output = Dropout(**self.hyper_parameters['dropout'])(all_views) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class DropoutBGRUModel(ClassificationModel): + __architect_name__ = 'DropoutBGRUModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.15 + }, + 'rnn_0': { + 'units': 64, + 'return_sequences': True + }, + 'dropout_rnn': { + 'rate': 0.35 + }, + 'rnn_1': { + 'units': 64, + 'return_sequences': True + }, + 'last': {}, + 'maxpool': {}, + 'average': {}, + 'all_views': { + 'axis': 1 + }, + 'dropout': { + 'rate': 0.5 + }, + 'dense': { + 'units': 72, + 'activation': 'relu' + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'decay': 0.0 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) + dropout_rnn = Dropout(**self.hyper_parameters['dropout_rnn'])(rnn_0) + rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(dropout_rnn) + last = Lambda(lambda t: t[:, -1], name='last')(rnn_1) + maxpool = GlobalMaxPooling1D()(rnn_1) + # attn = AttentionWeightedAverage()(rnn_1) + average = GlobalAveragePooling1D()(rnn_1) + + all_views = concatenate([last, maxpool, average], + **self.hyper_parameters['all_views']) + output = 
Dropout(**self.hyper_parameters['dropout'])(all_views) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +class DropoutAVRNNModel(ClassificationModel): + __architect_name__ = 'DropoutAVRNNModel' + __base_hyper_parameters__ = { + 'spatial_dropout': { + 'rate': 0.25 + }, + 'rnn_0': { + 'units': 56, + 'return_sequences': True + }, + 'rnn_dropout': { + 'rate': 0.3 + }, + 'rnn_1': { + 'units': 56, + 'return_sequences': True + }, + 'last': {}, + 'maxpool': {}, + 'attn': {}, + 'average': {}, + 'all_views': { + 'axis': 1 + }, + 'dropout_0': { + 'rate': 0.5 + }, + 'dense': { + 'units': 128, + 'activation': 'relu' + }, + 'dropout_1': { + 'rate': 0.25 + }, + 'activation_layer': { + 'activation': 'softmax' + }, + 'optimizer': { + 'module': 'keras.optimizers', + 'name': 'Adam', + 'params': { + 'lr': 1e-3, + 'clipvalue': 5, + 'decay': 1e-7 + } + }, + 'compile_params': { + 'loss': 'categorical_crossentropy', + # 'optimizer': 'adam', + 'metrics': ['accuracy'] + } + } + + def build_model(self): + base_model = self.embedding.model + embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) + rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) + rnn_dropout = SpatialDropout1D(**self.hyper_parameters['rnn_dropout'])(rnn_0) + rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(rnn_dropout) + + last = Lambda(lambda t: t[:, -1], name='last')(rnn_1) + maxpool = GlobalMaxPooling1D()(rnn_1) + attn = AttentionWeightedAverage()(rnn_1) + average = GlobalAveragePooling1D()(rnn_1) + + all_views = concatenate([last, maxpool, attn, average], + **self.hyper_parameters['all_views']) + output = Dropout(**self.hyper_parameters['dropout_0'])(all_views) + output = Dense(**self.hyper_parameters['dense'])(output) + output = Dropout(**self.hyper_parameters['dropout_1'])(output) + output = Dense(len(self.label2idx), + **self.hyper_parameters['activation_layer'])(output) + + model = Model(base_model.inputs, output) + optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), + self.hyper_parameters['optimizer']['name'])( + **self.hyper_parameters['optimizer']['params']) + model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) + self.model = model + self.model.summary() + + +if __name__ == '__main__': + from kashgari.corpus import TencentDingdangSLUCorpus + from kashgari.embeddings import WordEmbeddings, BERTEmbedding + + train_x, train_y = TencentDingdangSLUCorpus.get_classification_data() + + w2v = WordEmbeddings('sgns.weibo.bigram', + sequence_length=15, + limit=5000) + bert = BERTEmbedding('bert-base-chinese', sequence_length=15) + t_model = CNNModel(bert) + t_model.fit(train_x, train_y, epochs=1) diff --git a/kashgari/tasks/seq_labeling/base_model.py b/kashgari/tasks/seq_labeling/base_model.py index 42d434c2..565551f1 100644 --- a/kashgari/tasks/seq_labeling/base_model.py +++ b/kashgari/tasks/seq_labeling/base_model.py @@ -59,6 +59,8 @@ def build_token2id_label2id_dict(self, y_train: List[List[str]], x_validate: List[List[str]] = None, y_validate: List[List[str]] = None): + for 
index in range(len(x_train)):
+            assert len(x_train[index]) == len(y_train[index])
         x_data = x_train
         y_data = y_train
         if x_validate:
diff --git a/kashgari/tasks/seq_labeling/blstm_crf_model.py b/kashgari/tasks/seq_labeling/blstm_crf_model.py
index a3a14383..c9796d99 100644
--- a/kashgari/tasks/seq_labeling/blstm_crf_model.py
+++ b/kashgari/tasks/seq_labeling/blstm_crf_model.py
@@ -41,7 +41,7 @@ def build_model(self):
         model = Model(base_model.inputs, crf_layer)
         model.compile(loss=crf_loss,
                       optimizer='adam',
-                      metrics=[crf_accuracy, 'acc'])
+                      metrics=[crf_accuracy])
         self.model = model
         self.model.summary()
diff --git a/kashgari/utils/helper.py b/kashgari/utils/helper.py
index f5df1089..531ac2c9 100644
--- a/kashgari/utils/helper.py
+++ b/kashgari/utils/helper.py
@@ -27,47 +27,28 @@
 from kashgari.macros import STORAGE_HOST
 
 
-def h5f_generator(h5path: str,
-                  # indices: List[int],
-                  num_classes: int,
-                  batch_size: int = 128):
-    """
-    fit generator for h5 file
-    :param h5path: target f5file
-    :param num_classes: label counts to covert y label to one hot array
-    :param batch_size:
-    :return:
-    """
-
-    db = h5py.File(h5path, "r")
-    while True:
-        page_list = list(range(len(db['x']) // batch_size + 1))
-        random.shuffle(page_list)
-        for page in page_list:
-            x = db["x"][page: (page + 1) * batch_size]
-            y = to_categorical(db["y"][page: (page + 1) * batch_size],
-                               num_classes=num_classes,
-                               dtype=np.int)
-            yield (x, y)
-
-
-def classification_list_generator(x_data: List,
-                                  y_data: List,
-                                  sequence_lenght: int,
-                                  num_classes: int,
-                                  batch_size: int = 128):
-    assert len(x_data) == len(y_data)
-    while True:
-        page_list = list(range(len(x_data) // batch_size + 1))
-        random.shuffle(page_list)
-        for page in page_list:
-            x = x_data[page: (page + 1) * batch_size]
-            x = sequence.pad_sequences(x,
-                                       maxlen=sequence_lenght)
-            y = to_categorical(y_data[page: (page + 1) * batch_size],
-                               num_classes=num_classes,
-                               dtype=np.int)
-            yield (x, y)
+# def h5f_generator(h5path: str,
+#                   # indices: List[int],
+#                   num_classes: int,
+#                   batch_size: int = 128):
+#     """
+#     fit generator for h5 file
+#     :param h5path: target f5file
+#     :param num_classes: label counts to covert y label to one hot array
+#     :param batch_size:
+#     :return:
+#     """
+#
+#     db = h5py.File(h5path, "r")
+#     while True:
+#         page_list = list(range(len(db['x']) // batch_size + 1))
+#         random.shuffle(page_list)
+#         for page in page_list:
+#             x = db["x"][page: (page + 1) * batch_size]
+#             y = to_categorical(db["y"][page: (page + 1) * batch_size],
+#                                num_classes=num_classes,
+#                                dtype=np.int)
+#             yield (x, y)
 
 
 def unison_shuffled_copies(a, b):
diff --git a/setup.py b/setup.py
index 696c7cf1..29185a20 100644
--- a/setup.py
+++ b/setup.py
@@ -12,12 +12,17 @@
 """
 import pathlib
-from version import __version__
+
 from setuptools import find_packages, setup
 
+from version import __version__
+
 # Package meta-data.
 NAME = 'kashgari'
-DESCRIPTION = 'simple and powerful state-of-the-art NLP framework with pre-trained word2vec and bert embedding.'
+DESCRIPTION = 'Simple and powerful NLP framework, ' \
+              'build your state-of-the-art model in 5 minutes for ' \
+              'named entity recognition (NER), part-of-speech ' \
+              'tagging (PoS) and text classification tasks.' 
URL = 'https://github.com/BrikerMan/Kashgari' EMAIL = 'eliyar917@gmail.com' AUTHOR = 'BrikerMan' diff --git a/tests/test_classifier_models.py b/tests/test_classifier_models.py index e20ed1a2..4653fb3f 100644 --- a/tests/test_classifier_models.py +++ b/tests/test_classifier_models.py @@ -18,7 +18,12 @@ import unittest from kashgari.embeddings import WordEmbeddings, BERTEmbedding -from kashgari.tasks.classification import BLSTMModel, CNNModel, CNNLSTMModel, ClassificationModel + +from kashgari.tasks.classification import BLSTMModel, CNNLSTMModel, CNNModel +from kashgari.tasks.classification import AVCNNModel, KMaxCNNModel, RCNNModel, AVRNNModel +from kashgari.tasks.classification import DropoutBGRUModel, DropoutAVRNNModel + + from kashgari.utils.logger import init_logger init_logger() @@ -54,9 +59,7 @@ class EmbeddingManager(object): @classmethod def get_bert(cls): if cls.bert_embedding is None: - dir_path = os.path.dirname(os.path.realpath(__file__)) - bert_path = os.path.join(dir_path, 'data', 'test_bert_checkpoint') - cls.bert_embedding = BERTEmbedding(bert_path, sequence_length=15) + cls.bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=15) logging.info('bert_embedding seq len: {}'.format(cls.bert_embedding.sequence_length)) return cls.bert_embedding @@ -67,13 +70,54 @@ def get_w2v(cls): return cls.word2vec_embedding -class TestBLSTMModelModel(unittest.TestCase): - model: ClassificationModel = None +class TestBLSTMModelModelBasic(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.epochs = 2 + cls.model_class = BLSTMModel + cls.model = cls.model_class() + + def test_fit(self): + self.model.fit(train_x, train_y, eval_x, eval_y, epochs=self.epochs) + + def test_save_and_load(self): + self.test_fit() + model_path = os.path.join(tempfile.gettempdir(), 'kashgari_model', str(time.time())) + self.model.save(model_path) + new_model = BLSTMModel.load_model(model_path) + assert new_model is not None + sentence = list('语言学包含了几种分支领域。') + result = new_model.predict(sentence) + assert isinstance(result, str) + + def test_w2v_embedding(self): + embedding = EmbeddingManager.get_w2v() + w2v_model = self.model_class(embedding) + w2v_model.fit(train_x, train_y, epochs=1) + assert len(w2v_model.label2idx) == 4 + assert len(w2v_model.token2idx) > 4 + + sentence = list('语言学包含了几种分支领域。') + assert isinstance(w2v_model.predict(sentence), str) + assert isinstance(w2v_model.predict([sentence]), list) + logging.info('test predict: {} -> {}'.format(sentence, self.model.predict(sentence))) + w2v_model.predict(sentence, output_dict=True) + w2v_model.predict(sentence, output_dict=False) + + @classmethod + def tearDownClass(cls): + del cls.model + logging.info('tearDownClass {}'.format(cls)) + + +class TestAllCNNModelModel(unittest.TestCase): @classmethod def setUpClass(cls): - cls.epochs = 3 - cls.model = BLSTMModel() + cls.epochs = 2 + cls.model_class = CNNModel + cls.model = cls.model_class() def test_build(self): self.model.fit(train_x, train_y, epochs=1) @@ -119,86 +163,107 @@ def test_save_and_load(self): result = new_model.predict(sentence) assert isinstance(result, str) + # def test_bert_embedding(self): + # embedding = EmbeddingManager.get_bert() + # bert_model = self.model_class(embedding) + # bert_model.fit(train_x, train_y, epochs=1) + # assert len(bert_model.label2idx) == 4 + # assert len(bert_model.token2idx) > 4 + # + # sentence = list('语言学包含了几种分支领域。') + # assert isinstance(bert_model.predict(sentence), str) + # assert isinstance(bert_model.predict([sentence]), list) + # 
logging.info('test predict: {} -> {}'.format(sentence, self.model.predict(sentence)))
+    #     bert_model.predict(sentence, output_dict=True)
+    #     bert_model.predict(sentence, output_dict=False)
+
+    def test_w2v_embedding(self):
+        embedding = EmbeddingManager.get_w2v()
+        w2v_model = self.model_class(embedding)
+        w2v_model.fit(train_x, train_y, epochs=1)
+        assert len(w2v_model.label2idx) == 4
+        assert len(w2v_model.token2idx) > 4
+
+        sentence = list('语言学包含了几种分支领域。')
+        assert isinstance(w2v_model.predict(sentence), str)
+        assert isinstance(w2v_model.predict([sentence]), list)
+        logging.info('test predict: {} -> {}'.format(sentence, self.model.predict(sentence)))
+        w2v_model.predict(sentence, output_dict=True)
+        w2v_model.predict(sentence, output_dict=False)
+
     @classmethod
     def tearDownClass(cls):
         del cls.model
         logging.info('tearDownClass {}'.format(cls))
 
 
-class TestBLSTMModelWithWord2Vec(TestBLSTMModelModel):
+class TestCNNLSTMModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 3
-        embedding = EmbeddingManager.get_w2v()
-        cls.model = BLSTMModel(embedding)
+        cls.epochs = 2
+        cls.model_class = CNNLSTMModel
+        cls.model = cls.model_class()
 
 
-class TestBLSTMModelWithBERT(TestBLSTMModelModel):
+class TestCNNModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 1
-        embedding = EmbeddingManager.get_bert()
-        cls.model = BLSTMModel(embedding)
-
-    def test_save_and_load(self):
-        super(TestBLSTMModelWithBERT, self).test_save_and_load()
+        cls.epochs = 2
+        cls.model_class = CNNModel
+        cls.model = cls.model_class()
 
 
-class TestCNNModel(TestBLSTMModelModel):
+class TestAVCNNModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 3
-        TestCNNModel.model = CNNModel()
-
-    def test_fit(self):
-        super(TestCNNModel, self).test_fit()
+        cls.epochs = 2
+        cls.model_class = AVCNNModel
+        cls.model = cls.model_class()
 
 
-class TestCNNModelWithWord2Vec(TestBLSTMModelModel):
+class TestKMaxCNNModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 3
-        embedding = EmbeddingManager.get_w2v()
-        cls.model = CNNModel(embedding)
+        cls.epochs = 2
+        cls.model_class = KMaxCNNModel
+        cls.model = cls.model_class()
+
+class TestRCNNModelBasic(TestBLSTMModelModelBasic):
 
-class TestCNNModelWithBERT(TestBLSTMModelModel):
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 1
-        embedding = EmbeddingManager.get_bert()
-        TestCNNModelWithBERT.model = CNNModel(embedding)
+        cls.epochs = 2
+        cls.model_class = RCNNModel
+        cls.model = cls.model_class()
 
 
-class TestLSTMCNNModel(TestBLSTMModelModel):
+class TestAVRNNModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 3
-        cls.model = CNNLSTMModel()
+        cls.epochs = 2
+        cls.model_class = AVRNNModel
+        cls.model = cls.model_class()
 
 
-class TestLSTMCNNModelWithWord2Vec(TestBLSTMModelModel):
+class TestDropoutBGRUModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 3
-        embedding = EmbeddingManager.get_w2v()
-        cls.model = CNNLSTMModel(embedding)
+        cls.epochs = 2
+        cls.model_class = DropoutBGRUModel
+        cls.model = cls.model_class()
 
 
-class TestLSTMCNNModelWithBERT(TestBLSTMModelModel):
+class TestDropoutAVRNNModelBasic(TestBLSTMModelModelBasic):
 
     @classmethod
     def setUpClass(cls):
-        cls.epochs = 1
-        embedding = EmbeddingManager.get_bert()
-        cls.model = CNNLSTMModel(embedding)
-
-
-if __name__ == "__main__":
-    unittest.main()
+        cls.epochs = 2
+        cls.model_class = DropoutAVRNNModel
+        cls.model = cls.model_class()
diff --git 
a/version.py b/version.py index b9eed60e..44da0019 100644 --- a/version.py +++ b/version.py @@ -11,4 +11,4 @@ """ -__version__ = '0.1.8' +__version__ = '0.1.9'
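
A minimal usage sketch of the API introduced above. The model class, corpus helper, save layout, and the struct.json/optimizer.pkl rebuild path in `load_model` all come from this diff; the save path and epoch count are illustrative only.

    # encoding: utf-8
    from kashgari.corpus import SMP2017ECDTClassificationCorpus
    from kashgari.tasks.classification import ClassificationModel, KMaxCNNModel

    train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()

    model = KMaxCNNModel()
    model.fit(train_x, train_y, epochs=2)
    # save() now also writes struct.json (network topology) and optimizer.pkl
    # (optimizer weights) next to model.model
    model.save('./kmax_cnn_saved')

    # load_model() tries keras.models.load_model first; if that raises, it rebuilds
    # the network from struct.json, reloads the weights, and restores the optimizer
    # state from optimizer.pkl so training can resume
    loaded = ClassificationModel.load_model('./kmax_cnn_saved')
    print(loaded.predict(list('语言学包含了几种分支领域。')))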