Skip to content

Commit

Permalink
Merge pull request #23 from BrikerMan/develop
Browse files Browse the repository at this point in the history
v0.1.9
  • Loading branch information
BrikerMan committed Feb 28, 2019
2 parents 692e2c2 + 9fc6c2e commit 26f85b0
Show file tree
Hide file tree
Showing 20 changed files with 1,213 additions and 299 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ exclude_lines =
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
raise ValueError
except Exception as e:

# Don't complain if non-runnable code isn't run:
if __name__ == .__main__.:
Expand Down
9 changes: 8 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,21 @@ python:
- "3.6"
cache: pip
# command to install dependencies
env:
- TEST_FILE=tests/test_classifier_models.py
- TEST_FILE=tests/test_seq_labeling_models.py
- TEST_FILE=tests/test_corpus.py
- TEST_FILE=tests/test_embeddings.py
before_install:
- export BOTO_CONFIG=/dev/null
install:
- pip install python-coveralls
- pip install -r requirements.txt
- pip install .
- pip install coverage
- pip install nose
# command to run tests
script:
- sh test.sh
- nosetests --with-coverage --cover-html --cover-html-dir=htmlcov --cover-package="kashgari" $TEST_FILE
after_success:
- coveralls
24 changes: 15 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
[![Issues](https://img.shields.io/github/issues/BrikerMan/Kashgari.svg)](https://github.com/BrikerMan/Kashgari/issues)
[![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md)
![](https://img.shields.io/pypi/l/kashgari.svg?style=flat)
[![](https://img.shields.io/pypi/dw/kashgari.svg)](https://pypi.org/project/kashgari/)
[![](https://img.shields.io/pypi/dm/kashgari.svg)](https://pypi.org/project/kashgari/)

A simple and powerful NLP framework: build your state-of-the-art model in 5 minutes for named entity recognition (NER), part-of-speech tagging (PoS) and text classification tasks.

Expand All @@ -25,14 +25,20 @@ Kashgare is:
* Embedding support
* Classic word2vec embedding
* BERT embedding
* Text Classification Models
* CNN Classification Model
* CNN LSTM Classification Model
* Bidirectional LSTM Classification Model
* Text Labeling Models (NER, PoS)
* Bidirectional LSTM Labeling Model
* Bidirectional LSTM CRF Labeling Model
* CNN LSTM Labeling Model
* Sequence(Text) Classification Models
* CNNModel
* BLSTMModel
* CNNLSTMModel
* AVCNNModel
* KMaxCNNModel
* RCNNModel
* AVRNNModel
* DropoutBGRUModel
* DropoutAVRNNModel
* Sequence(Text) Labeling Models (NER, PoS)
* CNNLSTMModel
* BLSTMModel
* BLSTMCRFModel
* Model Training
* Model Evaluate
* GPU Support
Expand Down
37 changes: 37 additions & 0 deletions examples/run_flask_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# encoding: utf-8
"""
@author: BrikerMan
@contact: eliyar917@gmail.com
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: run_flask_api
@time: 2019-02-24
"""
import random
from flask import Flask, jsonify
from kashgari.tasks.classification import KMaxCNNModel
from kashgari.corpus import SMP2017ECDTClassificationCorpus

# Module-level setup: everything below runs once, when this file is executed
# or imported, before any request is served.

# Fetch the (x, y) pair returned by the corpus helper.
train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()

# Build the classifier with its default arguments and fit it on the corpus.
model = KMaxCNNModel()
model.fit(train_x, train_y)


# Flask application object that the route decorator below registers against.
app = Flask(__name__)


@app.route('/predict', methods=['GET'])
def get_tasks():
    """
    Demo endpoint: classify one randomly chosen training sample.

    Picks a random element of ``train_x``, runs it through the module-level
    ``model`` with ``output_dict=True`` and returns both the sample and the
    prediction as a JSON response with keys ``x`` and ``y``.
    """
    sample = random.choice(train_x)
    prediction = model.predict(sample, output_dict=True)
    payload = {'x': sample, 'y': prediction}
    return jsonify(payload)


if __name__ == '__main__':
    # Warm-up call: per the original author's note, `predict` has to be
    # invoked once before `app.run`, otherwise predictions made while
    # serving fail with an error.
    model.predict(train_x[10])
    # Start the development server on port 8080 with debug mode enabled.
    app.run(port=8080, debug=True)
2 changes: 2 additions & 0 deletions kashgari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from kashgari.tasks import classification
from kashgari.tasks import seq_labeling

from kashgari.macros import config


# Smoke-test entry point; only runs when the file is executed directly.
if __name__ == "__main__":
    print("Hello world")
2 changes: 1 addition & 1 deletion kashgari/embeddings/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def tokenize(self,
def tokenize_sentence(text: TextSeqType) -> TokenSeqType:
tokens = [self.token2idx.get(token, self.token2idx[k.UNK]) for token in text]
if add_bos_eos:
tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.BOS]]
tokens = [self.token2idx[k.BOS]] + tokens + [self.token2idx[k.EOS]]
return tokens

if is_list:
Expand Down
160 changes: 160 additions & 0 deletions kashgari/layers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# encoding: utf-8
"""
@author: BrikerMan
@contact: eliyar917@gmail.com
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: layers
@time: 2019-02-23
"""
from __future__ import absolute_import, division
import logging

import tensorflow as tf
from keras.layers import Flatten
from keras.layers import GRU, LSTM
from keras.layers import CuDNNGRU, CuDNNLSTM
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K

from kashgari.macros import config

# Choose the recurrent layer implementations exposed by this module.
# NOTE: this is evaluated once, when the module is imported, so the flag must
# already hold the desired value at that point.
GRULayer, LSTMLayer = (
    (CuDNNGRU, CuDNNLSTM) if config.use_CuDNN_cell else (GRU, LSTM)
)


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single
    timestep.

    # Arguments
        return_attention: A bool,
            if True, `call` returns `[result, att_weights]` instead of only
            `result`.
    # Input shape
        3D tensor (rank is asserted in `build`); axis 1 is the one that is
        averaged over and axis 2 is the one the weight vector is applied to.
    # Output shape
        `(input_shape[0], input_shape[2])`, plus a second tensor of shape
        `(input_shape[0], input_shape[1])` holding the attention weights
        when `return_attention` is True.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the `(input_shape[2], 1)` attention weight matrix."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_w'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """
        Return the attention-weighted sum of `x` over axis 1.

        When `mask` is given, masked timesteps receive zero weight before
        normalisation.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to `compute_output_shape`."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return the output shape(s) matching what `call` produces."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Drop the mask: the averaged axis is gone from the output."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """
        Return the layer configuration including `return_attention`.

        Without this entry a layer rebuilt from its config would fall back
        to `return_attention=False` and produce a single output.
        """
        layer_config = {'return_attention': self.return_attention}
        base_config = super(AttentionWeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(layer_config.items()))


class KMaxPooling(Layer):
    """
    K-max pooling: keeps the k highest activations along the steps axis of
    a 3D sequence tensor. Requires the TensorFlow backend.

    # Arguments
        k: An int,
            number of top steps to keep for each feature.
        sorted: A bool,
            whether the kept values are returned sorted (default) or not.
        data_format: A string,
            one of `channels_last` (default) or `channels_first`.
            `channels_last` means inputs shaped `(batch, steps, features)`;
            `channels_first` means inputs shaped `(batch, features, steps)`.
    # Input shape
        - If `data_format='channels_last'`:
            3D tensor with shape:
            `(batch_size, steps, features)`
        - If `data_format='channels_first'`:
            3D tensor with shape:
            `(batch_size, features, steps)`
    # Output shape
        3D tensor with shape:
        `(batch_size, top-k-steps, features)`
    """

    def __init__(self, k=1, sorted=True, data_format='channels_last', **kwargs):
        super(KMaxPooling, self).__init__(**kwargs)
        self.k = k
        self.sorted = sorted
        self.data_format = K.normalize_data_format(data_format)
        self.input_spec = InputSpec(ndim=3)

    def compute_output_shape(self, input_shape):
        """Return `(batch, k, features)` for either data format."""
        features_axis = 1 if self.data_format == 'channels_first' else 2
        return (input_shape[0], self.k, input_shape[features_axis])

    def call(self, inputs):
        """Select the top-k values per feature and return them steps-major."""
        if self.data_format == 'channels_last':
            # top_k works on the last axis, so move the steps axis there
            inputs = tf.transpose(inputs, [0, 2, 1])

        # top_k yields (values, indices); only the values are needed
        values, _ = tf.nn.top_k(inputs, k=self.k, sorted=self.sorted)

        # swap the last two axes back so the k kept steps sit on axis 1
        return tf.transpose(values, [0, 2, 1])

    def get_config(self):
        """Return the base layer config extended with this layer's arguments."""
        merged = dict(super(KMaxPooling, self).get_config())
        merged.update({
            'k': self.k,
            'sorted': self.sorted,
            'data_format': self.data_format,
        })
        return merged


# Smoke-test entry point; only runs when the file is executed directly.
if __name__ == '__main__':
    print("hello, world")
8 changes: 8 additions & 0 deletions kashgari/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@
pathlib.Path(PROCESSED_CORPUS_PATH).mkdir(parents=True, exist_ok=True)


class _Config:
    """Container for package-wide runtime switches."""

    def __init__(self):
        # Flag read by other modules to decide whether CuDNN-backed
        # recurrent layers should be used; off by default.
        self.use_CuDNN_cell = False


# Shared settings instance that the rest of the package imports.
config = _Config()


class CustomEmbedding(object):
def __init__(self, embedding_size=100):
self.embedding_size = embedding_size
Expand Down

0 comments on commit 26f85b0

Please sign in to comment.