Merge pull request #153 from BrikerMan/develop
Release v0.5.1
BrikerMan committed Jul 15, 2019
2 parents 6f6ff86 + bd6c159 commit 34ece9f
Showing 14 changed files with 325 additions and 89 deletions.
8 changes: 8 additions & 0 deletions kashgari/embeddings/base_embedding.py
@@ -80,6 +80,7 @@ def __init__(self,

self.sequence_length: Union[int, str] = sequence_length
self.embed_model: Optional[keras.Model] = None
self._tokenizer = None

@property
def token_count(self) -> int:
@@ -109,6 +110,13 @@ def token2idx(self) -> Dict[str, int]:
"""
return self.processor.token2idx

@property
def tokenizer(self):
if self._tokenizer:
return self._tokenizer
else:
raise ValueError('This embedding does not support a built-in tokenizer')

@sequence_length.setter
def sequence_length(self, val: Union[int, str]):
if isinstance(val, str):
1 change: 1 addition & 0 deletions kashgari/embeddings/bert_embedding.py
@@ -94,6 +94,7 @@ def _build_token2idx_from_bert(self):
token2idx[token] = len(token2idx)

self.bert_token2idx = token2idx
self._tokenizer = keras_bert.Tokenizer(token2idx)
self.processor.token2idx = self.bert_token2idx
self.processor.idx2token = dict([(value, key) for key, value in token2idx.items()])

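The new `tokenizer` property on the base embedding is backed by the `_tokenizer` set here for BERT. A minimal usage sketch, assuming a local BERT checkpoint folder (the path and constructor arguments below are placeholders, not part of this diff):

```python
import kashgari
from kashgari.embeddings import BERTEmbedding

# '<bert-checkpoint-folder>' is a placeholder for a local BERT model directory
embedding = BERTEmbedding('<bert-checkpoint-folder>',
                          task=kashgari.CLASSIFICATION,
                          sequence_length=100)

# The property returns the underlying keras_bert.Tokenizer;
# embeddings without a built-in tokenizer raise ValueError instead.
tokens = embedding.tokenizer.tokenize('hello world')
print(tokens)  # token list including the [CLS] / [SEP] markers
```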
24 changes: 18 additions & 6 deletions kashgari/tasks/base_model.py
@@ -48,6 +48,18 @@ def info(self):
'kashgari_version': tf.__version__
}

@property
def task(self):
return self.embedding.task

@property
def token2idx(self) -> Dict[str, int]:
return self.embedding.token2idx

@property
def label2idx(self) -> Dict[str, int]:
return self.embedding.label2idx

def __init__(self,
embedding: Optional[Embedding] = None,
hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
@@ -81,10 +93,6 @@ def __init__(self,
if hyper_parameters:
self.hyper_parameters.update(hyper_parameters)

@property
def task(self):
return self.embedding.task

def build_model(self,
x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
y_train: Union[List[List[str]], List[str]],
@@ -365,7 +373,8 @@ def compile_model(self, **kwargs):
def predict(self,
x_data,
batch_size=32,
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""
Generates output predictions for the input samples.
@@ -375,17 +384,20 @@ def predict(self,
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
batch_size: Integer. If unspecified, it will default to 32.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
"""
if predict_kwargs is None:
predict_kwargs = {}
with utils.custom_object_scope():
if isinstance(x_data, tuple):
lengths = [len(sen) for sen in x_data[0]]
else:
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
lengths)
if debug_info:
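A hedged sketch of how the new model-level properties and the `predict_kwargs` pass-through might be used; `model` and `x_test` are placeholders for a trained Kashgari model and its input data:

```python
# Properties now proxied from the embedding
print(model.task)                    # task type of the underlying embedding
print(len(model.token2idx))          # vocabulary size
print(list(model.label2idx.keys()))  # known labels

# Extra keyword arguments are forwarded to tf.keras.Model.predict()
predictions = model.predict(x_test,
                            batch_size=64,
                            predict_kwargs={'verbose': 1})
```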
97 changes: 89 additions & 8 deletions kashgari/tasks/classification/base_model.py
@@ -10,7 +10,7 @@
import random
import logging
import kashgari
from typing import Dict, Any, Tuple, Optional
from typing import Dict, Any, Tuple, Optional, List
from kashgari.tasks.base_model import BaseModel, BareEmbedding

from kashgari.embeddings.base_embedding import Embedding
@@ -39,11 +39,17 @@ def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
def build_model_arc(self):
raise NotImplementedError

def compile_model(self, **kwargs):
if kwargs.get('loss') is None and self.embedding.processor.multi_label:
kwargs['loss'] = 'binary_crossentropy'
super(BaseClassificationModel, self).compile_model(**kwargs)

def predict(self,
x_data,
batch_size=32,
multi_label_threshold: float = 0.5,
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""
Generates output predictions for the input samples.
@@ -54,15 +60,12 @@ def predict(self,
batch_size: Integer. If unspecified, it will default to 32.
multi_label_threshold: Float, probability threshold for assigning a label in multi-label mode, default 0.5.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
"""
with kashgari.utils.custom_object_scope():
if isinstance(x_data, tuple):
lengths = [len(sen) for sen in x_data[0]]
else:
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size)
if self.embedding.processor.multi_label:
@@ -74,14 +77,92 @@
else:
pred = pred.argmax(-1)

res = self.embedding.reverse_numerize_label_sequences(pred,
lengths)
res = self.embedding.reverse_numerize_label_sequences(pred)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
return res

def predict_top_k_class(self,
x_data,
top_k=5,
batch_size=32,
debug_info=False,
predict_kwargs: Dict = None) -> List[Dict]:
"""
Generates output predictions with confidence for the input samples.
Computation is done in batches.
Args:
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
top_k: Integer, number of top candidate labels to return, default 5.
batch_size: Integer. If unspecified, it will default to 32.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
single-label classification:
[
{
"label": "chat",
"confidence": 0.5801531,
"candidates": [
{ "label": "cookbook", "confidence": 0.1886314 },
{ "label": "video", "confidence": 0.13805099 },
{ "label": "health", "confidence": 0.013852648 },
{ "label": "translation", "confidence": 0.012913573 }
]
}
]
multi-label classification:
[
{
"candidates": [
{ "confidence": 0.9959336, "label": "toxic" },
{ "confidence": 0.9358089, "label": "obscene" },
{ "confidence": 0.6882098, "label": "insult" },
{ "confidence": 0.13540423, "label": "severe_toxic" },
{ "confidence": 0.017219543, "label": "identity_hate" }
]
}
]
"""
if predict_kwargs is None:
predict_kwargs = {}
with kashgari.utils.custom_object_scope():
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
new_results = []

for sample_prob in pred:
sample_res = zip(self.label2idx.keys(), sample_prob)
sample_res = sorted(sample_res, key=lambda k: k[1], reverse=True)
data = {}
for label, confidence in sample_res[:top_k]:
if 'candidates' not in data:
if self.embedding.processor.multi_label:
data['candidates'] = []
else:
data['label'] = label
data['confidence'] = confidence
data['candidates'] = []
continue
data['candidates'].append({
'label': label,
'confidence': confidence
})

new_results.append(data)

if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
return new_results

def evaluate(self,
x_data,
y_data,
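The two classification changes above can be seen together in a minimal sketch; `x_train` and `y_train` are placeholder corpus variables, and the wiring mirrors the demo block added to `models.py` below:

```python
import kashgari
from kashgari.embeddings import BareEmbedding
from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.tasks.classification import BiLSTM_Model

# multi_label=True makes compile_model() default to binary_crossentropy
processor = ClassificationProcessor(multi_label=True)
embedding = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=30,
                          processor=processor)

model = BiLSTM_Model(embedding)
model.fit(x_train, y_train, epochs=2)  # y_train: list of label lists in multi-label mode

# Top-k predictions with per-label confidence, shaped as in the docstring above
for item in model.predict_top_k_class(x_train[:5], top_k=3):
    print(item)
```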
15 changes: 11 additions & 4 deletions kashgari/tasks/classification/models.py
@@ -688,8 +688,15 @@ def build_model_arc(self):

x, y = SMP2018ECDTCorpus.load_data()

m = BiLSTM_Model()
m.build_model(x, y)
m.fit(x, y, epochs=5)
m.evaluate(x, y)
import kashgari
from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.embeddings import BareEmbedding

processor = ClassificationProcessor(multi_label=False)
embed = BareEmbedding(task=kashgari.CLASSIFICATION, sequence_length=30, processor=processor)
m = BiLSTM_Model(embed)
# m.build_model(x, y)
m.fit(x, y, epochs=2)
print(m.predict(x[:10]))
# m.evaluate(x, y)
print(m.predict_top_k_class(x[:10]))
6 changes: 4 additions & 2 deletions kashgari/tasks/labeling/base_model.py
@@ -31,14 +31,16 @@ def predict_entities(self,
x_data,
batch_size=None,
join_chunk=' ',
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""Gets entities from sequence.
Args:
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
batch_size: Integer. If unspecified, it will default to 32.
join_chunk: str or False,
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
list: list of entity.
@@ -47,7 +49,7 @@
text_seq = x_data[0]
else:
text_seq = x_data
res = self.predict(x_data, batch_size, debug_info)
res = self.predict(x_data, batch_size, debug_info, predict_kwargs)
new_res = [get_entities(seq) for seq in res]
final_res = []
for index, seq in enumerate(new_res):
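The labeling task forwards the same option through `predict_entities()`; a short sketch with placeholder `ner_model` and `sentences`:

```python
# Extra keyword arguments reach tf.keras.Model.predict() via predict_kwargs
entities = ner_model.predict_entities(sentences,
                                      batch_size=64,
                                      join_chunk=' ',
                                      predict_kwargs={'verbose': 1})
print(entities)
```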
7 changes: 6 additions & 1 deletion mkdocs/docs/about/release-notes.md
@@ -20,7 +20,11 @@ pip show kashgari-tf

- 📝 Rewrite documents with mkdocs
- 📝 Add Chinese documents
- 🚸 Add `label2idx`, `token2idx` properties to Embeddings
- 🚸 Add `label2idx`, `token2idx` properties to Embeddings and Models
- ✨ Add `predict_top_k_class` for classification models to get prediction probabilities ([#146](https://github.com/BrikerMan/Kashgari/issues/146))
- 🚸 Add `tokenizer` property for BERT Embedding. ([#136](https://github.com/BrikerMan/Kashgari/issues/136))
- 🚸 Add `predict_kwargs` for models' `predict()` function
- ⚡️ Change multi-label classification's default loss function to binary_crossentropy ([#151](https://github.com/BrikerMan/Kashgari/issues/151))

### [0.5.0] - 2019.07.11

@@ -88,6 +92,7 @@ pip show kashgari-tf
- fix classification model evaluate result output
- change test settings

[0.5.1]: https://github.com/BrikerMan/Kashgari/compare/v0.5.0...develop
[0.5.0]: https://github.com/BrikerMan/Kashgari/compare/milestone/tf.keras...v0.5.0
[0.2.6]: https://github.com/BrikerMan/Kashgari/compare/v0.2.4...v0.2.6
[0.2.4]: https://github.com/BrikerMan/Kashgari/compare/v0.2.1...v0.2.4
19 changes: 19 additions & 0 deletions mkdocs/docs/api/callbacks.md
@@ -2,12 +2,31 @@

## class EvalCallBack

### \_\_init\_\_

Evaluation callback that calculates precision, recall and F1 score every `step` epochs.

```python
def __init__(self,
kash_model: BaseModel,
valid_x,
valid_y,
step=5,
batch_size=256):
```

__Args__:

- **kash_model**: the kashgari model to evaluate
- **valid_x**: feature data for evaluation
- **valid_y**: label data for evaluation
- **step**: run the evaluation every `step` epochs, default 5
- **batch_size**: batch size, default 256

### Methods

#### on\_epoch\_end

```python
def on_epoch_end(self, epoch, logs=None):
```
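
A usage sketch for the callback documented above, assuming the import path `kashgari.callbacks` and that `fit()` accepts a `callbacks` list as in `tf.keras`; the model and data variables are placeholders:

```python
from kashgari.callbacks import EvalCallBack

eval_callback = EvalCallBack(kash_model=model,
                             valid_x=valid_x,
                             valid_y=valid_y,
                             step=5,
                             batch_size=256)

model.fit(train_x, train_y,
          x_validate=valid_x, y_validate=valid_y,
          epochs=20,
          callbacks=[eval_callback])
```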
