Skip to content

Commit

Permalink
Merge 1590e53 into 7714342
Browse files Browse the repository at this point in the history
  • Loading branch information
BrikerMan committed Dec 6, 2019
2 parents 7714342 + 1590e53 commit a9a5e76
Show file tree
Hide file tree
Showing 12 changed files with 331 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Expand Up @@ -67,7 +67,7 @@ jobs:
- stage: Document
python: "3.6"
install:
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- pip install mkdocs mkdocs-material pymdown-extensions
script:
- cd mkdocs
Expand Down
1 change: 1 addition & 0 deletions kashgari/__init__.py
Expand Up @@ -23,6 +23,7 @@
custom_objects = keras_bert.get_custom_objects()
# Re-export task-type constants at package level so callers can write
# e.g. kashgari.SCORING instead of importing TaskType.
CLASSIFICATION = TaskType.CLASSIFICATION
LABELING = TaskType.LABELING
SCORING = TaskType.SCORING

from kashgari.version import __version__

Expand Down
6 changes: 4 additions & 2 deletions kashgari/embeddings/base_embedding.py
Expand Up @@ -16,7 +16,7 @@
from tensorflow import keras

import kashgari
from kashgari.processors import ClassificationProcessor, LabelingProcessor
from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor
from kashgari.processors.base_processor import BaseProcessor

L = keras.layers
Expand Down Expand Up @@ -74,8 +74,10 @@ def __init__(self,
self.processor = ClassificationProcessor()
elif task == kashgari.LABELING:
self.processor = LabelingProcessor()
elif task == kashgari.SCORING:
self.processor = ScoringProcessor()
else:
raise ValueError()
raise ValueError('Need to set the processor param, value: {labeling, classification, scoring}')
else:
self.processor = processor

Expand Down
1 change: 1 addition & 0 deletions kashgari/macros.py
Expand Up @@ -23,6 +23,7 @@
class TaskType(object):
    # String identifiers for the supported task types; used to select a
    # processor in the embeddings and matched against models' __task__.
    CLASSIFICATION = 'classification'
    LABELING = 'labeling'
    SCORING = 'scoring'


class Config(object):
Expand Down
1 change: 1 addition & 0 deletions kashgari/processors/__init__.py
Expand Up @@ -10,3 +10,4 @@

from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.processors.labeling_processor import LabelingProcessor
from kashgari.processors.scoring_processor import ScoringProcessor
100 changes: 100 additions & 0 deletions kashgari/processors/scoring_processor.py
@@ -0,0 +1,100 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: scoring_processor.py
# time: 11:10 上午

import numbers
from typing import List, Optional

import numpy as np

import kashgari
from kashgari import utils
from kashgari.processors.base_processor import BaseProcessor


def is_numeric(obj):
    """Return True if *obj* is a scalar numeric value (int, float, numpy scalar).

    Uses the ``numbers.Number`` ABC (which numpy scalar types register with)
    instead of duck-typing on arithmetic dunders: the attribute check wrongly
    classified ``np.ndarray`` as a scalar — arrays also implement
    ``__add__``/``__sub__``/etc. — which made ``_build_label_dict`` infer
    ``output_dim = 1`` for vector labels.
    """
    return isinstance(obj, numbers.Number)


class ScoringProcessor(BaseProcessor):
    """Corpus pre-processor for regression/scoring tasks.

    Labels are raw numeric scores (one float per sample, or one fixed-width
    float vector per sample), so no label<->index vocabulary is needed:
    label "numerization" is the identity mapping.
    """

    def __init__(self, output_dim=None, **kwargs):
        super(ScoringProcessor, self).__init__(**kwargs)
        # Width of the regression target; inferred from the first label
        # sample in _build_label_dict() when not given explicitly.
        self.output_dim = output_dim

    def info(self):
        processor_info = super(ScoringProcessor, self).info()
        processor_info['task'] = kashgari.SCORING
        return processor_info

    def _build_label_dict(self,
                          label_list: List[List[float]]):
        """
        Infer ``output_dim`` from the first label sample.
        Args:
            label_list: corpus label list
        """
        if self.output_dim is not None:
            return
        sample = label_list[0]
        if is_numeric(sample):
            self.output_dim = 1
        elif isinstance(sample, list):
            self.output_dim = len(sample)
        elif isinstance(sample, np.ndarray) and len(sample.shape) == 1:
            self.output_dim = sample.shape[0]
        else:
            raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')

    def process_y_dataset(self,
                          data: List[List[str]],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        # Scores need no padding or index mapping -- just select and stack.
        if subset is None:
            target = data[:]
        else:
            target = utils.get_list_subset(data, subset)
        return np.array(target)

    def numerize_token_sequences(self,
                                 sequences: List[List[str]]):
        unk_index = self.token2idx[self.token_unk]
        numerized = []
        for sequence in sequences:
            tokens = list(sequence)
            if self.add_bos_eos:
                tokens = [self.token_bos] + tokens + [self.token_eos]
            numerized.append([self.token2idx.get(token, unk_index) for token in tokens])
        return numerized

    def numerize_label_sequences(self,
                                 sequences: List[List[str]]) -> List[List[int]]:
        # Scores are already numeric -- nothing to map.
        return sequences

    def reverse_numerize_label_sequences(self,
                                         sequences,
                                         lengths=None):
        # Identity: model outputs are returned to the caller unchanged.
        return sequences


if __name__ == "__main__":
    # Ad-hoc smoke test: scalar scores for three SMP2018 samples.
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()
    x = x[:3]
    y = [0.2, 0.3, 0.2]
    p = ScoringProcessor()
    p.analyze_corpus(x, y)
    print(p.process_y_dataset(y))
12 changes: 8 additions & 4 deletions kashgari/tasks/base_model.py
Expand Up @@ -414,12 +414,16 @@ def predict(self,
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
if self.task == 'scoring':
t_pred = pred
else:
t_pred = pred.argmax(-1)
res = self.embedding.reverse_numerize_label_sequences(t_pred,
lengths)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
print('input: {}'.format(tensor))
print('output: {}'.format(pred))
print('output argmax: {}'.format(t_pred))
return res

def evaluate(self,
Expand Down
14 changes: 14 additions & 0 deletions kashgari/tasks/scoring/__init__.py
@@ -0,0 +1,14 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: __init__.py
# time: 11:36 上午


from kashgari.tasks.scoring.models import BiLSTM_Model

if __name__ == "__main__":
pass
67 changes: 67 additions & 0 deletions kashgari/tasks/scoring/base_model.py
@@ -0,0 +1,67 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: base_model.py
# time: 11:36 上午


from typing import Dict, Any, Tuple

from sklearn import metrics

from kashgari.tasks.base_model import BaseModel


class BaseScoringModel(BaseModel):
    """Base scoring (regression) model."""

    __task__ = 'scoring'

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        # Subclasses must supply their per-layer hyper-parameters.
        raise NotImplementedError

    def compile_model(self, **kwargs):
        # Regression defaults: MSE loss, RMSprop optimizer, MAE metric.
        # Any of them can be overridden through kwargs.
        if kwargs.get('loss') is None:
            kwargs['loss'] = 'mse'
        if kwargs.get('optimizer') is None:
            kwargs['optimizer'] = 'rmsprop'
        if kwargs.get('metrics') is None:
            kwargs['metrics'] = ['mae']
        super(BaseScoringModel, self).compile_model(**kwargs)

    def evaluate(self,
                 x_data,
                 y_data,
                 batch_size=None,
                 digits=4,
                 debug_info=False) -> Tuple[float, float, Dict]:
        """
        Compute regression metrics (MSE and R^2) on a labelled dataset.
        Args:
            x_data: input samples
            y_data: ground-truth scores, one per sample
            batch_size: prediction batch size
            digits: unused in this implementation; kept for signature parity
            debug_info: unused in this implementation; kept for signature parity
        Returns:
            (mean_squared_error, r2_score, dict containing both values)
        """
        y_pred = self.predict(x_data, batch_size=batch_size)
        # NOTE(review): this truncation pattern looks copied from the
        # sequence-labeling task; it is a no-op for fixed-width vector
        # scores but would raise on scalar float labels (floats are not
        # sliceable) -- TODO confirm the intended label shape.
        y_true = [seq[:len(y_pred[index])] for index, seq in enumerate(y_data)]
        mean_squared_error = metrics.mean_squared_error(y_true, y_pred)
        r2_score = metrics.r2_score(y_true, y_pred)
        data = {
            'mean_squared_error': mean_squared_error,
            'r2_score': r2_score
        }
        return mean_squared_error, r2_score, data


if __name__ == "__main__":
pass
57 changes: 57 additions & 0 deletions kashgari/tasks/scoring/models.py
@@ -0,0 +1,57 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: models.py
# time: 11:38 上午


import logging
from typing import Dict, Any

from tensorflow import keras

from kashgari.tasks.scoring.base_model import BaseScoringModel
from kashgari.layers import L


class BiLSTM_Model(BaseScoringModel):
    """Scoring model: bidirectional LSTM encoder followed by a dense head."""

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        """Default per-layer hyper-parameters, keyed by layer name."""
        bi_lstm_config = {
            'units': 128,
            'return_sequences': False
        }
        dense_config = {
            'activation': 'linear'
        }
        return {
            'layer_bi_lstm': bi_lstm_config,
            'layer_dense': dense_config
        }

    def build_model_arc(self):
        """Assemble embedding -> BiLSTM -> Dense(output_dim) into tf_model."""
        hyper = self.hyper_parameters
        embed_model = self.embedding.embed_model
        score_dim = self.processor.output_dim

        encoder = L.Bidirectional(L.LSTM(**hyper['layer_bi_lstm']))
        regressor = L.Dense(score_dim, **hyper['layer_dense'])

        encoded = encoder(embed_model.output)
        scores = regressor(encoded)

        self.tf_model = keras.Model(embed_model.inputs, scores)


if __name__ == "__main__":
    # Ad-hoc smoke test: fit on the SMP2018 validation split using random
    # 4-dimensional score vectors as regression targets.
    from kashgari.corpus import SMP2018ECDTCorpus
    import numpy as np

    x, y = SMP2018ECDTCorpus.load_data('valid')
    y = np.random.random((len(x), 4))
    model = BiLSTM_Model()
    model.fit(x, y)
    print(model.predict(x[:10]))

47 changes: 47 additions & 0 deletions tests/scoring/test_bi_lstm_model.py
@@ -0,0 +1,47 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: blstm_model.py
# time: 12:17 下午
import os
import tempfile
import time
import unittest
import kashgari
import numpy as np

from tests.corpus import NERCorpus
from kashgari.tasks.scoring import BiLSTM_Model


class TestBiLSTM_Model(unittest.TestCase):
    """End-to-end check of the scoring BiLSTM model: train, predict,
    save/convert/reload, then continue training on the reloaded model."""

    @classmethod
    def setUpClass(cls):
        cls.model_class = BiLSTM_Model

    def test_basic_use_build(self):
        x, _ = NERCorpus.load_corpus()
        # Random fixed-width score vectors as regression targets.
        y = np.random.random((len(x), 4))
        model = self.model_class()
        model.fit(x, y)
        res = model.predict(x[:20])
        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(model_path)

        # Also exercise the SavedModel export path.
        pd_model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        kashgari.utils.convert_to_saved_model(model,
                                              pd_model_path)

        # Reloaded model must reproduce the original predictions exactly.
        new_model = kashgari.utils.load_model(model_path)
        new_res = new_model.predict(x[:20])
        assert np.array_equal(new_res, res)

        new_model.compile_model()
        # Fix: continue training the *reloaded* model. The original fitted
        # the old `model` instance here, leaving the freshly recompiled
        # `new_model` untested.
        new_model.fit(x, y, x, y, epochs=1)


if __name__ == "__main__":
pass

0 comments on commit a9a5e76

Please sign in to comment.