Skip to content

Commit

Permalink
Merge 17e2cbf into cae052d
Browse files Browse the repository at this point in the history
  • Loading branch information
BrikerMan committed Dec 11, 2019
2 parents cae052d + 17e2cbf commit 3f7c1c3
Show file tree
Hide file tree
Showing 19 changed files with 618 additions and 176 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Expand Up @@ -110,4 +110,5 @@ venv.bak/
.vscode
venv-tf/*
.pytype/
mkdocs/site
mkdocs/site
node_modules
7 changes: 6 additions & 1 deletion .travis.yml
Expand Up @@ -5,6 +5,8 @@ env:
global:
- COVERALLS_PARALLEL=true
matrix:
# Scoring
- TEST_FILE=tests/scoring
# Labeling
- TEST_FILE=tests/labeling/
# classification part 1
Expand All @@ -17,6 +19,7 @@ env:
- TEST_FILE=tests/test_custom_multi_output_classification.py
# Embedding
- TEST_FILE=tests/embedding/
# Tokenizer
- TEST_FILE=tests/test_tokenizer.py

python:
Expand Down Expand Up @@ -44,6 +47,7 @@ install:
- pip install nose
- python -c "import kashgari;print(f'kashgari version {kashgari.__version__}')"
- git fetch --unshallow --quiet
- export PYTHONPATH=`pwd`

script: nosetests --with-coverage --cover-html --cover-html-dir=htmlcov
--cover-xml --cover-xml-file=coverage.xml --with-xunit
Expand All @@ -68,8 +72,9 @@ jobs:
- stage: Document
python: "3.6"
install:
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- pip install mkdocs mkdocs-material pymdown-extensions
script:
- cp README.md mkdocs/docs/index.md
- cd mkdocs
- mkdocs gh-deploy --force --clean
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -67,7 +67,8 @@ Here is a set of quick tutorials to get you started with the library:

- [Tutorial 1: Text Classification](https://kashgari.bmio.net/tutorial/text-classification/)
- [Tutorial 2: Text Labeling](https://kashgari.bmio.net/tutorial/text-labeling/)
- [Tutorial 3: Language Embedding](https://kashgari.bmio.net/embeddings/)
- [Tutorial 3: Text Scoring](https://kashgari.bmio.net/tutorial/text-scoring/)
- [Tutorial 4: Language Embedding](https://kashgari.bmio.net/embeddings/)

There are also articles and posts that illustrate how to use Kashgari:

Expand Down
1 change: 1 addition & 0 deletions kashgari/__init__.py
Expand Up @@ -23,6 +23,7 @@
custom_objects = keras_bert.get_custom_objects()
CLASSIFICATION = TaskType.CLASSIFICATION
LABELING = TaskType.LABELING
SCORING = TaskType.SCORING

from kashgari.version import __version__

Expand Down
6 changes: 4 additions & 2 deletions kashgari/embeddings/base_embedding.py
Expand Up @@ -16,7 +16,7 @@
from tensorflow import keras

import kashgari
from kashgari.processors import ClassificationProcessor, LabelingProcessor
from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor
from kashgari.processors.base_processor import BaseProcessor

L = keras.layers
Expand Down Expand Up @@ -74,8 +74,10 @@ def __init__(self,
self.processor = ClassificationProcessor()
elif task == kashgari.LABELING:
self.processor = LabelingProcessor()
elif task == kashgari.SCORING:
self.processor = ScoringProcessor()
else:
raise ValueError()
raise ValueError('Need to set the processor param, value: {labeling, classification, scoring}')
else:
self.processor = processor

Expand Down
1 change: 1 addition & 0 deletions kashgari/macros.py
Expand Up @@ -23,6 +23,7 @@
class TaskType(object):
    """String identifiers for the task kinds the library distinguishes."""

    # Each value is the plain-string name of one task kind.
    CLASSIFICATION = 'classification'
    LABELING = 'labeling'
    SCORING = 'scoring'


class Config(object):
Expand Down
1 change: 1 addition & 0 deletions kashgari/processors/__init__.py
Expand Up @@ -10,3 +10,4 @@

from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.processors.labeling_processor import LabelingProcessor
from kashgari.processors.scoring_processor import ScoringProcessor
100 changes: 100 additions & 0 deletions kashgari/processors/scoring_processor.py
@@ -0,0 +1,100 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: scoring_processor.py
# time: 11:10 上午

from typing import List, Optional

import numpy as np

import kashgari
from kashgari import utils
from kashgari.processors.base_processor import BaseProcessor


def is_numeric(obj):
    """
    Duck-typed check for number-like objects.

    Args:
        obj: any object.

    Returns:
        True when ``obj`` exposes all of ``__add__``, ``__sub__``, ``__mul__``,
        ``__truediv__`` and ``__pow__``; False otherwise.
    """
    for required in ('__add__', '__sub__', '__mul__', '__truediv__', '__pow__'):
        if not hasattr(obj, required):
            return False
    return True


class ScoringProcessor(BaseProcessor):
    """
    Corpus pre-processor for scoring tasks.

    Labels are numeric scores, so no label vocabulary is built: the label
    (de)numerization methods return their input unchanged and only the number
    of score values per sample (``output_dim``) is tracked.
    """

    def __init__(self, output_dim=None, **kwargs):
        """
        Args:
            output_dim: number of score values per sample. When ``None`` it is
                inferred from the first label by ``_build_label_dict``.
            **kwargs: forwarded unchanged to ``BaseProcessor.__init__``.
        """
        super(ScoringProcessor, self).__init__(**kwargs)
        self.output_dim = output_dim

    def info(self):
        """Return the parent processor's info dict with ``task`` set to scoring."""
        info = super(ScoringProcessor, self).info()
        info['task'] = kashgari.SCORING
        return info

    def _build_label_dict(self,
                          label_list: List[List[float]]):
        """
        Infer ``output_dim`` from the first label when it was not set explicitly.

        Despite the inherited name, no label dict is built here.

        Args:
            label_list: corpus labels; each item is a number, a list/tuple of
                numbers, or a 0-D/1-D numpy array.

        Raises:
            ValueError: if ``label_list`` is empty, or the first label is not a
                number, a list/tuple, or a numpy array with at most one dimension.
        """
        if self.output_dim is not None:
            # An explicitly configured width always wins.
            return

        if len(label_list) == 0:
            raise ValueError('Scoring label list is empty, can not infer output_dim')

        label_sample = label_list[0]
        if isinstance(label_sample, np.ndarray):
            # ndarrays of any rank satisfy ``is_numeric``, so the rank has to be
            # checked here; otherwise a 2-D sample would silently become
            # ``output_dim = 1``.
            if label_sample.ndim > 1:
                raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')
            self.output_dim = label_sample.shape[0] if label_sample.ndim == 1 else 1
        elif is_numeric(label_sample):
            self.output_dim = 1
        elif isinstance(label_sample, (list, tuple)):
            self.output_dim = len(label_sample)
        else:
            raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')
        # NOTE: label values are not range-checked.

    def process_y_dataset(self,
                          data: List[List[float]],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        """
        Convert labels to a numpy array.

        Args:
            data: corpus labels.
            max_len: unused for scoring labels.
            subset: optional indices selecting which labels to keep.

        Returns:
            ``np.array`` of the selected labels (all labels when ``subset`` is None).
        """
        if subset is not None:
            target = utils.get_list_subset(data, subset)
        else:
            target = data
        # np.array always copies, so the caller's data is never aliased.
        return np.array(target)

    def numerize_token_sequences(self,
                                 sequences: List[List[str]]):
        """
        Map token sequences to index sequences.

        Each sequence is wrapped with ``token_bos``/``token_eos`` when
        ``add_bos_eos`` is set; tokens missing from ``token2idx`` map to the
        index of ``token_unk``.

        Args:
            sequences: tokenized sentences.

        Returns:
            List of index lists, one per input sequence.
        """
        result = []
        for seq in sequences:
            if self.add_bos_eos:
                seq = [self.token_bos] + seq + [self.token_eos]
            unk_index = self.token2idx[self.token_unk]
            result.append([self.token2idx.get(token, unk_index) for token in seq])
        return result

    def numerize_label_sequences(self,
                                 sequences: List[List[float]]) -> List[List[float]]:
        """Return ``sequences`` unchanged; scores are already numeric."""
        return sequences

    def reverse_numerize_label_sequences(self,
                                         sequences,
                                         lengths=None):
        """Return ``sequences`` unchanged; ``lengths`` is ignored."""
        return sequences


if __name__ == "__main__":
    # Manual demo: analyze three corpus samples against hand-written scores
    # and print the processed label array.
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()
    x, y = x[:3], [0.2, 0.3, 0.2]

    processor = ScoringProcessor()
    processor.analyze_corpus(x, y)
    processed_labels = processor.process_y_dataset(y)
    print(processed_labels)
12 changes: 8 additions & 4 deletions kashgari/tasks/base_model.py
Expand Up @@ -414,12 +414,16 @@ def predict(self,
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
if self.task == 'scoring':
t_pred = pred
else:
t_pred = pred.argmax(-1)
res = self.embedding.reverse_numerize_label_sequences(t_pred,
lengths)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
print('input: {}'.format(tensor))
print('output: {}'.format(pred))
print('output argmax: {}'.format(t_pred))
return res

def evaluate(self,
Expand Down
14 changes: 14 additions & 0 deletions kashgari/tasks/scoring/__init__.py
@@ -0,0 +1,14 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: __init__.py
# time: 11:36 上午


from kashgari.tasks.scoring.models import BiLSTM_Model

# Running this module directly does nothing.
if __name__ == "__main__":
    pass
92 changes: 92 additions & 0 deletions kashgari/tasks/scoring/base_model.py
@@ -0,0 +1,92 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: base_model.py
# time: 11:36 上午


from typing import Callable
from typing import Dict, Any

import numpy as np
from sklearn import metrics

from kashgari.tasks.base_model import BaseModel


class BaseScoringModel(BaseModel):
    """Base class for scoring models."""

    __task__ = 'scoring'

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        """Return the default hyper-parameters; concrete models must override this."""
        raise NotImplementedError

    def compile_model(self, **kwargs):
        """
        Compile the model, filling in defaults for any setting left unset.

        ``loss`` defaults to ``'mse'``, ``optimizer`` to ``'rmsprop'`` and
        ``metrics`` to ``['mae']``. A key passed explicitly as ``None`` is
        treated as unset.

        Args:
            **kwargs: forwarded to the parent ``compile_model``.
        """
        if kwargs.get('loss') is None:
            kwargs['loss'] = 'mse'
        if kwargs.get('optimizer') is None:
            kwargs['optimizer'] = 'rmsprop'
        if kwargs.get('metrics') is None:
            kwargs['metrics'] = ['mae']
        super(BaseScoringModel, self).compile_model(**kwargs)

    def evaluate(self,
                 x_data,
                 y_data,
                 batch_size=None,
                 should_round: bool = False,
                 round_func: Callable = None,
                 digits=4,
                 debug_info=False) -> Dict:
        """
        Evaluate predictions against ``y_data`` and print the result.

        Without rounding, the mean squared error and R2 score are reported.
        With rounding, each prediction is passed through ``round_func`` and a
        classification report is produced instead.

        Args:
            x_data: input samples, passed to ``predict``.
            y_data: expected scores.
            batch_size: passed to ``predict``.
            should_round: round predictions and report classification metrics.
            round_func: rounding callable applied to each prediction;
                defaults to ``np.round``. Only used when ``should_round`` is set.
            digits: ``digits`` argument of the classification report. Only used
                when ``should_round`` is set.
            debug_info: currently unused.

        Returns:
            The classification report as a dict when ``should_round`` is set,
            otherwise a dict with ``mean_squared_error`` and ``r2_score``.

        Raises:
            ValueError: if ``should_round`` is set and the processor's
                ``output_dim`` is not 1.
        """
        # Validate before predicting so a misconfigured call fails without
        # paying for a full prediction pass.
        if should_round and self.processor.output_dim != 1:
            raise ValueError('Evaluate with round function only accept 1D output')

        y_pred = self.predict(x_data, batch_size=batch_size)

        if should_round:
            if round_func is None:
                round_func = np.round
            y_pred = [round_func(i) for i in y_pred]
            # The text form is printed; the dict form is what callers receive.
            report = metrics.classification_report(y_data,
                                                   y_pred,
                                                   digits=digits)

            report_dic = metrics.classification_report(y_data,
                                                       y_pred,
                                                       output_dict=True,
                                                       digits=digits)
            print(report)
        else:
            mean_squared_error = metrics.mean_squared_error(y_data, y_pred)
            r2_score = metrics.r2_score(y_data, y_pred)
            report_dic = {
                'mean_squared_error': mean_squared_error,
                'r2_score': r2_score
            }
            print(f"mean_squared_error : {mean_squared_error}\n"
                  f"r2_score : {r2_score}")
        return report_dic


# Running this module directly does nothing.
if __name__ == "__main__":
    pass
57 changes: 57 additions & 0 deletions kashgari/tasks/scoring/models.py
@@ -0,0 +1,57 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: models.py
# time: 11:38 上午


import logging
from typing import Dict, Any

from tensorflow import keras

from kashgari.tasks.scoring.base_model import BaseScoringModel
from kashgari.layers import L


class BiLSTM_Model(BaseScoringModel):
    """Scoring model: a bidirectional LSTM over the embedding output, then a dense layer."""

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        """Return the default keyword arguments for the LSTM and dense layers."""
        bi_lstm_defaults = {
            'units': 128,
            'return_sequences': False
        }
        dense_defaults = {
            'activation': 'linear'
        }
        return {
            'layer_bi_lstm': bi_lstm_defaults,
            'layer_dense': dense_defaults
        }

    def build_model_arc(self):
        """Build the network and store it in ``self.tf_model``."""
        output_dim = self.processor.output_dim
        hyper = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # Layers are constructed in the same order as before (LSTM wrapper, then dense).
        recurrent = L.Bidirectional(L.LSTM(**hyper['layer_bi_lstm']))
        scorer = L.Dense(output_dim, **hyper['layer_dense'])

        encoded = recurrent(embed_model.output)
        scores = scorer(encoded)

        self.tf_model = keras.Model(embed_model.inputs, scores)


if __name__ == "__main__":
    # Manual demo: fit on the 'valid' split with random 4-value targets,
    # then print predictions for the first ten samples.
    from kashgari.corpus import SMP2018ECDTCorpus
    import numpy as np

    x, y = SMP2018ECDTCorpus.load_data('valid')
    sample_count = len(x)
    y = np.random.random((sample_count, 4))

    model = BiLSTM_Model()
    model.fit(x, y)

    predictions = model.predict(x[:10])
    print(predictions)

0 comments on commit 3f7c1c3

Please sign in to comment.