Merge pull request #23 from BrikerMan/develop

v0.1.9
BrikerMan · Feb 28, 2019 · 26f85b0 · 26f85b0
2 parents 692e2c2 + 9fc6c2e
commit 26f85b0
Show file tree

Hide file tree

Showing 20 changed files with 1,213 additions and 299 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -12,6 +12,8 @@ exclude_lines =
     # Don't complain if tests don't hit defensive assertion code:
     raise AssertionError
     raise NotImplementedError
+    raise ValueError
+    except Exception as e:
 
     # Don't complain if non-runnable code isn't run:
     if __name__ == .__main__.:

diff --git a/.travis.yml b/.travis.yml
@@ -4,14 +4,21 @@ python:
   - "3.6"
 cache: pip
 # command to install dependencies
+env:
+  - TEST_FILE=tests/test_classifier_models.py
+  - TEST_FILE=tests/test_seq_labeling_models.py
+  - TEST_FILE=tests/test_corpus.py
+  - TEST_FILE=tests/test_embeddings.py
 before_install:
   - export BOTO_CONFIG=/dev/null
 install:
   - pip install python-coveralls
   - pip install -r requirements.txt
   - pip install .
+  - pip install coverage
+  - pip install nose
 # command to run tests
 script:
-  - sh test.sh
+  - nosetests --with-coverage --cover-html --cover-html-dir=htmlcov --cover-package="kashgari" $TEST_FILE
 after_success:
   - coveralls
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 [![Issues](https://img.shields.io/github/issues/BrikerMan/Kashgari.svg)](https://github.com/BrikerMan/Kashgari/issues)
 [![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md)
 ![](https://img.shields.io/pypi/l/kashgari.svg?style=flat)
-[![](https://img.shields.io/pypi/dw/kashgari.svg)](https://pypi.org/project/kashgari/)
+[![](https://img.shields.io/pypi/dm/kashgari.svg)](https://pypi.org/project/kashgari/)
 
 Simple and powerful NLP framework: build your state-of-the-art model in 5 minutes for named entity recognition (NER), part-of-speech tagging (PoS), and text classification tasks.
 
@@ -25,14 +25,20 @@ Kashgare is:
 * Embedding support
     * Classic word2vec embedding
     * BERT embedding
-* Text Classification Models
-    * CNN Classification Model
-    * CNN LSTM Classification Model
-    * Bidirectional LSTM Classification Model
-* Text Labeling Models (NER, PoS)
-    * Bidirectional LSTM Labeling Model
-    * Bidirectional LSTM CRF Labeling Model
-    * CNN LSTM Labeling Model
+* Sequence(Text) Classification Models
+    * CNNModel
+    * BLSTMModel
+    * CNNLSTMModel
+    * AVCNNModel
+    * KMaxCNNModel
+    * RCNNModel
+    * AVRNNModel
+    * DropoutBGRUModel
+    * DropoutAVRNNModel
+* Sequence(Text) Labeling Models (NER, PoS)
+    * CNNLSTMModel
+    * BLSTMModel
+    * BLSTMCRFModel
 * Model Training
 * Model Evaluation
 * GPU Support

diff --git a/examples/run_flask_api.py b/examples/run_flask_api.py
@@ -0,0 +1,37 @@
+# encoding: utf-8
+"""
+@author: BrikerMan
+@contact: eliyar917@gmail.com
+@blog: https://eliyar.biz
+
+@version: 1.0
+@license: Apache Licence
+@file: run_flask_api
+@time: 2019-02-24
+
+"""
+import random
+from flask import Flask, jsonify
+from kashgari.tasks.classification import KMaxCNNModel
+from kashgari.corpus import SMP2017ECDTClassificationCorpus
+
+# Load the classification corpus and fit the model once, at import time,
+# so the fitted `model` and `train_x` are available as module globals.
+train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()
+
+model = KMaxCNNModel()
+model.fit(train_x, train_y)
+
+
+app = Flask(__name__)
+
+
+@app.route('/predict', methods=['GET'])
+def get_tasks():
+    """Predict on one randomly chosen training sample and return it as JSON.
+
+    Returns:
+        A JSON response with key ``x`` (the sample drawn from ``train_x``)
+        and key ``y`` (the value of ``model.predict(x, output_dict=True)``).
+    """
+    x = random.choice(train_x)
+    y = model.predict(x, output_dict=True)
+    return jsonify({'x': x, 'y': y})
+
+
+if __name__ == '__main__':
+    # must run predict once before `app.run` to prevent predict error
+    model.predict(train_x[10])
+    # NOTE(review): debug=True turns on Flask's debug mode; keep this to local example use.
+    app.run(debug=True, port=8080)
diff --git a/kashgari/__init__.py b/kashgari/__init__.py
@@ -17,6 +17,8 @@
 from kashgari.tasks import classification
 from kashgari.tasks import seq_labeling
 
+from kashgari.macros import config
+
 
 if __name__ == "__main__":
     print("Hello world")
diff --git a/kashgari/embeddings/embeddings.py b/kashgari/embeddings/embeddings.py
@@ -133,7 +133,7 @@ def tokenize(self,
         def tokenize_sentence(text: TextSeqType) -> TokenSeqType:
             tokens = [self.token2idx.get(token, self.token2idx[k.UNK]) for token in text]
             if add_bos_eos:
-                tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.BOS]]
+                tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.EOS]]
             return tokens
 
         if is_list:

diff --git a/kashgari/layers.py b/kashgari/layers.py
@@ -0,0 +1,160 @@
+# encoding: utf-8
+"""
+@author: BrikerMan
+@contact: eliyar917@gmail.com
+@blog: https://eliyar.biz
+
+@version: 1.0
+@license: Apache Licence
+@file: layers
+@time: 2019-02-23
+
+"""
+from __future__ import absolute_import, division
+import logging
+
+import tensorflow as tf
+from keras.layers import Flatten
+from keras.layers import GRU, LSTM
+from keras.layers import CuDNNGRU, CuDNNLSTM
+from keras import initializers
+from keras.engine import InputSpec, Layer
+from keras import backend as K
+
+from kashgari.macros import config
+
+# Choose the recurrent layer classes exported as `GRULayer` / `LSTMLayer`:
+# the CuDNN variants when `config.use_CuDNN_cell` is truthy, otherwise the
+# regular Keras GRU / LSTM.
+# NOTE(review): this runs once when the module is imported, so changing
+# `config.use_CuDNN_cell` afterwards does not rebind these two names.
+if config.use_CuDNN_cell:
+    GRULayer = CuDNNGRU
+    LSTMLayer = CuDNNLSTM
+else:
+    GRULayer = GRU
+    LSTMLayer = LSTM
+
+
+class AttentionWeightedAverage(Layer):
+    '''
+    Computes a weighted average of the different channels across timesteps.
+    Uses 1 parameter per channel to compute the attention value for a single timestep.
+
+    # Arguments
+        return_attention: A bool,
+            if True, `call` returns `[result, att_weights]` instead of
+            only `result` (default False).
+
+    NOTE(review): this class defines no `get_config`, so `return_attention`
+    is not added to the layer config here; confirm whether saving and
+    reloading models needs it.
+    '''
+
+    def __init__(self, return_attention=False, **kwargs):
+        self.init = initializers.get('uniform')
+        self.supports_masking = True
+        self.return_attention = return_attention
+        super(AttentionWeightedAverage, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        '''Create the weight `W` of shape `(input_shape[2], 1)`; the input must be 3D.'''
+        self.input_spec = [InputSpec(ndim=3)]
+        assert len(input_shape) == 3
+
+        self.W = self.add_weight(shape=(input_shape[2], 1),
+                                 name='{}_w'.format(self.name),
+                                 initializer=self.init)
+        self.trainable_weights = [self.W]
+        super(AttentionWeightedAverage, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        '''Return the attention-weighted sum of `x` over axis 1.
+
+        # Arguments
+            x: 3D input tensor (rank is enforced in `build`).
+            mask: optional mask; it is cast to `K.floatx()` and multiplied
+                into the unnormalised weights.
+        # Returns
+            `result`, or the list `[result, att_weights]` when
+            `self.return_attention` is True.
+        '''
+        # computes a probability distribution over the timesteps
+        # uses 'max trick' for numerical stability
+        # reshape is done to avoid issue with Tensorflow
+        # and 1-dimensional weights
+        logits = K.dot(x, self.W)
+        x_shape = K.shape(x)
+        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
+        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
+
+        # masked timesteps have zero weight
+        if mask is not None:
+            mask = K.cast(mask, K.floatx())
+            ai = ai * mask
+        # K.epsilon() keeps the denominator non-zero
+        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
+        weighted_input = x * K.expand_dims(att_weights)
+        result = K.sum(weighted_input, axis=1)
+        if self.return_attention:
+            return [result, att_weights]
+        return result
+
+    def get_output_shape_for(self, input_shape):
+        '''Delegate to `compute_output_shape` (presumably the Keras 1.x method name; TODO confirm).'''
+        return self.compute_output_shape(input_shape)
+
+    def compute_output_shape(self, input_shape):
+        '''Return `(input_shape[0], input_shape[2])`, plus `(input_shape[0], input_shape[1])` for the weights when `return_attention` is set.'''
+        output_len = input_shape[2]
+        if self.return_attention:
+            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
+        return (input_shape[0], output_len)
+
+    def compute_mask(self, input, input_mask=None):
+        '''Pass no mask on: return None, or a list of None with one entry per input mask.'''
+        if isinstance(input_mask, list):
+            return [None] * len(input_mask)
+        else:
+            return None
+
+
+class KMaxPooling(Layer):
+    '''
+    K-max pooling layer that extracts the k-highest activation from a sequence (2nd dimension).
+    TensorFlow backend.
+
+    # Arguments
+        k: An int scalar,
+            indicate k max steps of features to pool.
+        sorted: A bool,
+            if output is sorted (default) or not.
+        data_format: A string,
+            one of `channels_last` (default) or `channels_first`.
+            The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape
+            `(batch, steps, features)` while `channels_first`
+            corresponds to inputs with shape
+            `(batch, features, steps)`.
+    # Input shape
+        - If `data_format='channels_last'`:
+            3D tensor with shape:
+            `(batch_size, steps, features)`
+        - If `data_format='channels_first'`:
+            3D tensor with shape:
+            `(batch_size, features, steps)`
+    # Output shape
+        3D tensor with shape:
+        `(batch_size, top-k-steps, features)`
+    '''
+
+    def __init__(self, k=1, sorted=True, data_format='channels_last', **kwargs):
+        super(KMaxPooling, self).__init__(**kwargs)
+        self.input_spec = InputSpec(ndim=3)
+        self.k = k
+        self.sorted = sorted
+        self.data_format = K.normalize_data_format(data_format)
+
+    # def build(self, input_shape):
+    #     assert len(input_shape) == 3
+    #     super(KMaxPooling, self).build(input_shape)
+
+    def compute_output_shape(self, input_shape):
+        '''Return `(batch, k, features)`; features come from axis 1 for `channels_first`, axis 2 otherwise.'''
+        if self.data_format == 'channels_first':
+            return (input_shape[0], self.k, input_shape[1])
+        else:
+            return (input_shape[0], self.k, input_shape[2])
+
+    def call(self, inputs):
+        '''Take the top `self.k` values along the steps axis and return them as `(batch, k, features)`.'''
+        if self.data_format == 'channels_last':
+            # swap last two dimensions since top_k will be applied along the last dimension
+            shifted_input = tf.transpose(inputs, [0, 2, 1])
+
+            # extract top_k, returns two tensors [values, indices]
+            top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=self.sorted)[0]
+        else:
+            # channels_first input already has steps on the last axis
+            top_k = tf.nn.top_k(inputs, k=self.k, sorted=self.sorted)[0]
+        # swap the last two axes back: (batch, features, k) -> (batch, k, features)
+        return tf.transpose(top_k, [0, 2, 1])
+
+    def get_config(self):
+        '''Return the base layer config extended with `k`, `sorted` and `data_format`.'''
+        # local dict; inside this method it shadows the module-level `config` import
+        config = {'k': self.k,
+                  'sorted': self.sorted,
+                  'data_format': self.data_format}
+        base_config = super(KMaxPooling, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+# Placeholder entry point: only prints a greeting when the module is run directly.
+if __name__ == '__main__':
+    print("hello, world")
diff --git a/kashgari/macros.py b/kashgari/macros.py
@@ -36,6 +36,14 @@
 pathlib.Path(PROCESSED_CORPUS_PATH).mkdir(parents=True, exist_ok=True)
 
 
+class _Config(object):
+    """Holder for runtime settings; one module-level instance is exposed as `config`."""
+
+    def __init__(self):
+        # Read by kashgari/layers.py to choose between the CuDNN and the
+        # regular Keras GRU/LSTM layer classes; off by default.
+        self.use_CuDNN_cell = False
+
+
+config = _Config()
+
+
 class CustomEmbedding(object):
     def __init__(self, embedding_size=100):
         self.embedding_size = embedding_size