Merge pull request #153 from BrikerMan/develop
Release v0.5.1
BrikerMan committed Jul 15, 2019
2 parents 6f6ff86 + bd6c159 commit 34ece9f
Showing 14 changed files with 325 additions and 89 deletions.
8 changes: 8 additions & 0 deletions kashgari/embeddings/base_embedding.py
@@ -80,6 +80,7 @@ def __init__(self,

self.sequence_length: Union[int, str] = sequence_length
self.embed_model: Optional[keras.Model] = None
self._tokenizer = None

@property
def token_count(self) -> int:
@@ -109,6 +110,13 @@ def token2idx(self) -> Dict[str, int]:
"""
return self.processor.token2idx

@property
def tokenizer(self):
if self._tokenizer:
return self._tokenizer
else:
raise ValueError('This embedding does not support a built-in tokenizer')

@sequence_length.setter
def sequence_length(self, val: Union[int, str]):
if isinstance(val, str):
1 change: 1 addition & 0 deletions kashgari/embeddings/bert_embedding.py
@@ -94,6 +94,7 @@ def _build_token2idx_from_bert(self):
token2idx[token] = len(token2idx)

self.bert_token2idx = token2idx
self._tokenizer = keras_bert.Tokenizer(token2idx)
self.processor.token2idx = self.bert_token2idx
self.processor.idx2token = dict([(value, key) for key, value in token2idx.items()])

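The new `tokenizer` property on the base embedding is backed by the `_tokenizer` set here for BERT. A minimal usage sketch, assuming a local BERT checkpoint folder (the path and constructor arguments below are placeholders, not part of this diff):

```python
import kashgari
from kashgari.embeddings import BERTEmbedding

# '<bert-checkpoint-folder>' is a placeholder for a local BERT model directory
embedding = BERTEmbedding('<bert-checkpoint-folder>',
                          task=kashgari.CLASSIFICATION,
                          sequence_length=100)

# The property returns the underlying keras_bert.Tokenizer;
# embeddings without a built-in tokenizer raise ValueError instead.
tokens = embedding.tokenizer.tokenize('hello world')
print(tokens)  # token list including the [CLS] / [SEP] markers
```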
24 changes: 18 additions & 6 deletions kashgari/tasks/base_model.py
@@ -48,6 +48,18 @@ def info(self):
'kashgari_version': tf.__version__
}

@property
def task(self):
return self.embedding.task

@property
def token2idx(self) -> Dict[str, int]:
return self.embedding.token2idx

@property
def label2idx(self) -> Dict[str, int]:
return self.embedding.label2idx

def __init__(self,
embedding: Optional[Embedding] = None,
hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
@@ -81,10 +93,6 @@ def __init__(self,
if hyper_parameters:
self.hyper_parameters.update(hyper_parameters)

@property
def task(self):
return self.embedding.task

def build_model(self,
x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
y_train: Union[List[List[str]], List[str]],
@@ -365,7 +373,8 @@ def compile_model(self, **kwargs):
def predict(self,
x_data,
batch_size=32,
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""
Generates output predictions for the input samples.
@@ -375,17 +384,20 @@ def predict(self,
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
batch_size: Integer. If unspecified, it will default to 32.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
"""
if predict_kwargs is None:
predict_kwargs = {}
with utils.custom_object_scope():
if isinstance(x_data, tuple):
lengths = [len(sen) for sen in x_data[0]]
else:
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
lengths)
if debug_info:
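A hedged sketch of how the new model-level properties and the `predict_kwargs` pass-through might be used; `model` and `x_test` are placeholders for a trained Kashgari model and its input data:

```python
# Properties now proxied from the embedding
print(model.task)                    # task type of the underlying embedding
print(len(model.token2idx))          # vocabulary size
print(list(model.label2idx.keys()))  # known labels

# Extra keyword arguments are forwarded to tf.keras.Model.predict()
predictions = model.predict(x_test,
                            batch_size=64,
                            predict_kwargs={'verbose': 1})
```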
97 changes: 89 additions & 8 deletions kashgari/tasks/classification/base_model.py
@@ -10,7 +10,7 @@
import random
import logging
import kashgari
from typing import Dict, Any, Tuple, Optional
from typing import Dict, Any, Tuple, Optional, List
from kashgari.tasks.base_model import BaseModel, BareEmbedding

from kashgari.embeddings.base_embedding import Embedding
@@ -39,11 +39,17 @@ def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
def build_model_arc(self):
raise NotImplementedError

def compile_model(self, **kwargs):
if kwargs.get('loss') is None and self.embedding.processor.multi_label:
kwargs['loss'] = 'binary_crossentropy'
super(BaseClassificationModel, self).compile_model(**kwargs)

def predict(self,
x_data,
batch_size=32,
multi_label_threshold: float = 0.5,
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""
Generates output predictions for the input samples.
@@ -54,15 +60,12 @@ def predict(self,
batch_size: Integer. If unspecified, it will default to 32.
multi_label_threshold: Float, probability threshold for assigning a label in multi-label mode, default 0.5.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
"""
with kashgari.utils.custom_object_scope():
if isinstance(x_data, tuple):
lengths = [len(sen) for sen in x_data[0]]
else:
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size)
if self.embedding.processor.multi_label:
@@ -74,14 +77,92 @@
else:
pred = pred.argmax(-1)

res = self.embedding.reverse_numerize_label_sequences(pred,
lengths)
res = self.embedding.reverse_numerize_label_sequences(pred)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
return res

def predict_top_k_class(self,
x_data,
top_k=5,
batch_size=32,
debug_info=False,
predict_kwargs: Dict = None) -> List[Dict]:
"""
Generates output predictions with confidence for the input samples.
Computation is done in batches.
Args:
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
top_k: Integer, number of top candidate labels to return, default 5.
batch_size: Integer. If unspecified, it will default to 32.
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
array(s) of predictions.
single-label classification:
[
{
"label": "chat",
"confidence": 0.5801531,
"candidates": [
{ "label": "cookbook", "confidence": 0.1886314 },
{ "label": "video", "confidence": 0.13805099 },
{ "label": "health", "confidence": 0.013852648 },
{ "label": "translation", "confidence": 0.012913573 }
]
}
]
multi-label classification:
[
{
"candidates": [
{ "confidence": 0.9959336, "label": "toxic" },
{ "confidence": 0.9358089, "label": "obscene" },
{ "confidence": 0.6882098, "label": "insult" },
{ "confidence": 0.13540423, "label": "severe_toxic" },
{ "confidence": 0.017219543, "label": "identity_hate" }
]
}
]
"""
if predict_kwargs is None:
predict_kwargs = {}
with kashgari.utils.custom_object_scope():
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
new_results = []

for sample_prob in pred:
sample_res = zip(self.label2idx.keys(), sample_prob)
sample_res = sorted(sample_res, key=lambda k: k[1], reverse=True)
data = {}
for label, confidence in sample_res[:top_k]:
if 'candidates' not in data:
if self.embedding.processor.multi_label:
data['candidates'] = []
else:
data['label'] = label
data['confidence'] = confidence
data['candidates'] = []
continue
data['candidates'].append({
'label': label,
'confidence': confidence
})

new_results.append(data)

if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
return new_results

def evaluate(self,
x_data,
y_data,
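The two classification changes above can be seen together in a minimal sketch; `x_train` and `y_train` are placeholder corpus variables, and the wiring mirrors the demo block added to `models.py` below:

```python
import kashgari
from kashgari.embeddings import BareEmbedding
from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.tasks.classification import BiLSTM_Model

# multi_label=True makes compile_model() default to binary_crossentropy
processor = ClassificationProcessor(multi_label=True)
embedding = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=30,
                          processor=processor)

model = BiLSTM_Model(embedding)
model.fit(x_train, y_train, epochs=2)  # y_train: list of label lists in multi-label mode

# Top-k predictions with per-label confidence, shaped as in the docstring above
for item in model.predict_top_k_class(x_train[:5], top_k=3):
    print(item)
```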
15 changes: 11 additions & 4 deletions kashgari/tasks/classification/models.py
@@ -688,8 +688,15 @@ def build_model_arc(self):

x, y = SMP2018ECDTCorpus.load_data()

m = BiLSTM_Model()
m.build_model(x, y)
m.fit(x, y, epochs=5)
m.evaluate(x, y)
import kashgari
from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.embeddings import BareEmbedding

processor = ClassificationProcessor(multi_label=False)
embed = BareEmbedding(task=kashgari.CLASSIFICATION, sequence_length=30, processor=processor)
m = BiLSTM_Model(embed)
# m.build_model(x, y)
m.fit(x, y, epochs=2)
print(m.predict(x[:10]))
# m.evaluate(x, y)
print(m.predict_top_k_class(x[:10]))
6 changes: 4 additions & 2 deletions kashgari/tasks/labeling/base_model.py
@@ -31,14 +31,16 @@ def predict_entities(self,
x_data,
batch_size=None,
join_chunk=' ',
debug_info=False):
debug_info=False,
predict_kwargs: Dict = None):
"""Gets entities from sequence.
Args:
x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
batch_size: Integer. If unspecified, it will default to 32.
join_chunk: str or False,
debug_info: Bool, Should print out the logging info.
predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``
Returns:
list: list of entity.
@@ -47,7 +49,7 @@
text_seq = x_data[0]
else:
text_seq = x_data
res = self.predict(x_data, batch_size, debug_info)
res = self.predict(x_data, batch_size, debug_info, predict_kwargs)
new_res = [get_entities(seq) for seq in res]
final_res = []
for index, seq in enumerate(new_res):
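The labeling task forwards the same option through `predict_entities()`; a short sketch with placeholder `ner_model` and `sentences`:

```python
# Extra keyword arguments reach tf.keras.Model.predict() via predict_kwargs
entities = ner_model.predict_entities(sentences,
                                      batch_size=64,
                                      join_chunk=' ',
                                      predict_kwargs={'verbose': 1})
print(entities)
```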
7 changes: 6 additions & 1 deletion mkdocs/docs/about/release-notes.md
@@ -20,7 +20,11 @@ pip show kashgari-tf

- 📝 Rewrite documents with mkdocs
- 📝 Add Chinese documents
- 🚸 Add `label2idx`, `token2idx` properties to Embeddings
- 🚸 Add `label2idx`, `token2idx` properties to Embeddings and Models
- ✨ Add `predict_top_k_class` for classification models to get prediction probabilities ([#146](https://github.com/BrikerMan/Kashgari/issues/146))
- 🚸 Add `tokenizer` property for BERT Embedding. ([#136](https://github.com/BrikerMan/Kashgari/issues/136))
- 🚸 Add `predict_kwargs` for models' `predict()` function
- ⚡️ Change multi-label classification's default loss function to binary_crossentropy ([#151](https://github.com/BrikerMan/Kashgari/issues/151))

### [0.5.0] - 2019.07.11

@@ -88,6 +92,7 @@ pip show kashgari-tf
- fix classification model evaluate result output
- change test settings

[0.5.1]: https://github.com/BrikerMan/Kashgari/compare/v0.5.0...develop
[0.5.0]: https://github.com/BrikerMan/Kashgari/compare/milestone/tf.keras...v0.5.0
[0.2.6]: https://github.com/BrikerMan/Kashgari/compare/v0.2.4...v0.2.6
[0.2.4]: https://github.com/BrikerMan/Kashgari/compare/v0.2.1...v0.2.4
19 changes: 19 additions & 0 deletions mkdocs/docs/api/callbacks.md
@@ -2,12 +2,31 @@

## class EvalCallBack

### \_\_init\_\_

Evaluation callback that calculates precision, recall and F1 score every `step` epochs.

```python
def __init__(self,
kash_model: BaseModel,
valid_x,
valid_y,
step=5,
batch_size=256):
```

__Args__:

- **kash_model**: the kashgari model to evaluate
- **valid_x**: feature data for evaluation
- **valid_y**: label data for evaluation
- **step**: run the evaluation every `step` epochs, default 5
- **batch_size**: batch size, default 256

### Methods

#### on\_epoch\_end

```python
def on_epoch_end(self, epoch, logs=None):
```
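
A usage sketch for the callback documented above, assuming the import path `kashgari.callbacks` and that `fit()` accepts a `callbacks` list as in `tf.keras`; the model and data variables are placeholders:

```python
from kashgari.callbacks import EvalCallBack

eval_callback = EvalCallBack(kash_model=model,
                             valid_x=valid_x,
                             valid_y=valid_y,
                             step=5,
                             batch_size=256)

model.fit(train_x, train_y,
          x_validate=valid_x, y_validate=valid_y,
          epochs=20,
          callbacks=[eval_callback])
```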
