Skip to content

Commit

Permalink
Merge pull request #38 from BrikerMan/feature/sequence_labeling_model
Browse files Browse the repository at this point in the history
Feature/sequence labeling model
  • Loading branch information
BrikerMan committed Mar 3, 2019
2 parents 26f85b0 + da7a149 commit 1ecf64d
Show file tree
Hide file tree
Showing 11 changed files with 315 additions and 289 deletions.
65 changes: 39 additions & 26 deletions kashgari/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,50 @@

class _Config(object):
    """
    Runtime switches shared across the package.

    The module-level ``config`` instance created below this class is the
    object other modules are expected to import and read.
    """

    def __init__(self):
        # Both switches start disabled. The backing fields are written
        # directly; going through the property setters first would only
        # assign the same value twice.
        self._use_CuDNN_cell = False
        self._sequence_labeling_tokenize_add_bos_eos = False

    @property
    def use_CuDNN_cell(self):
        """
        If true, the `cuDNNLSTM` and `cuDNNGRU` layers are used instead of the
        `LSTM` and `GRU` layers, which speeds up training when using a GPU.
        :return: current value of the switch
        """
        return self._use_CuDNN_cell

    @use_CuDNN_cell.setter
    def use_CuDNN_cell(self, value):
        """
        If true, the `cuDNNLSTM` and `cuDNNGRU` layers are used instead of the
        `LSTM` and `GRU` layers, which speeds up training when using a GPU.
        :param value: new value of the switch, stored without validation
        :return:
        """
        self._use_CuDNN_cell = value

    @property
    def sequence_labeling_tokenize_add_bos_eos(self):
        """
        If true, BOS and EOS labels are added to the sequence labeling result.
        :return: current value of the switch
        """
        return self._sequence_labeling_tokenize_add_bos_eos

    @sequence_labeling_tokenize_add_bos_eos.setter
    def sequence_labeling_tokenize_add_bos_eos(self, value):
        """
        If true, BOS and EOS labels are added to the sequence labeling result.
        :param value: new value of the switch, stored without validation
        :return:
        """
        self._sequence_labeling_tokenize_add_bos_eos = value


config = _Config()


class CustomEmbedding(object):
    """Plain value object carrying an ``embedding_size`` setting."""

    def __init__(self, embedding_size=100):
        """
        :param embedding_size: stored unchanged on ``self.embedding_size``; defaults to 100
        """
        self.embedding_size = embedding_size


class TaskType(Enum):
classification = 'classification'
tagging = 'tagging'
Expand All @@ -71,26 +104,6 @@ class SegmenterType(Enum):
}


def download_file(file: str):
    """
    Pass the remote address and local destination of ``file`` to ``download.download``.

    :param file: path fragment appended to ``STORAGE_HOST`` for the remote
        address and joined onto ``DATA_PATH`` for the local destination
    """
    destination = os.path.join(DATA_PATH, file)
    download.download(STORAGE_HOST + file, destination)


def download_if_not_existed(file_path: str) -> str:
    """
    Return the local path of the decompressed copy of ``file_path``,
    downloading and decompressing the archive first when that copy is missing.

    The decompressed file is stored at the archive path with its last four
    characters removed.
    NOTE(review): this presumably strips a ``.bz2`` suffix -- confirm that
    every caller passes a path ending in one.

    :param file_path: archive path relative to ``DATA_PATH``
    :return: path of the decompressed file
    """
    archive_path = os.path.join(DATA_PATH, file_path)
    extracted_path = archive_path[:-4]
    if os.path.exists(extracted_path):
        return extracted_path

    download_file(file_path)
    # Decompress into a scratch file and rename it only once it is complete.
    # Writing straight to `extracted_path` would leave an empty or truncated
    # file behind if decompression fails, and the existence check above would
    # then accept that broken file on every later call.
    partial_path = extracted_path + '.part'
    with open(archive_path, 'rb') as source, open(partial_path, 'wb') as dest:
        dest.write(bz2.decompress(source.read()))
    os.replace(partial_path, extracted_path)
    return extracted_path


def get_model_path(file: str) -> str:
    """
    Look ``file`` up in ``URL_MAP`` and return the result of
    ``download_if_not_existed`` for the resolved path.

    :param file: a key of ``URL_MAP``; a value that is not a key is used
        unchanged as the file path
    :return: whatever ``download_if_not_existed`` returns for the resolved path
    """
    return download_if_not_existed(URL_MAP.get(file, file))


if __name__ == "__main__":
from kashgari.utils.logger import init_logger
init_logger()
59 changes: 48 additions & 11 deletions kashgari/tasks/classification/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from keras.utils import to_categorical
from sklearn import metrics
from sklearn.utils import class_weight as class_weight_calculte
from sklearn.preprocessing import MultiLabelBinarizer

from kashgari import macros as k
from kashgari.tasks.base import BaseModel
Expand All @@ -28,8 +29,21 @@

class ClassificationModel(BaseModel):

def __init__(self,
             embedding: BaseEmbedding = None,
             hyper_parameters: Dict = None,
             multi_label: bool = False,
             **kwargs):
    """
    :param embedding: passed unchanged to the parent initializer
    :param hyper_parameters: passed unchanged to the parent initializer
    :param multi_label: stored on ``self.multi_label``; when true, each
        sample's label is handled as a collection of labels instead of a
        single label
    :param kwargs: passed unchanged to the parent initializer
    """
    super(ClassificationModel, self).__init__(embedding, hyper_parameters, **kwargs)
    self.multi_label = multi_label
    # Left as None here; assigned when the label dictionary is built.
    self.multi_label_binarizer: MultiLabelBinarizer = None

@property
def label2idx(self) -> Dict[str, int]:
Expand Down Expand Up @@ -63,14 +77,20 @@ def build_token2id_label2id_dict(self,
y_data += y_validate
self.embedding.build_token2idx_dict(x_data, 3)

label_set = set(y_data)
label2idx = {
k.PAD: 0,
}
if self.multi_label:
label_set = []
for i in y_data:
label_set += list(i)
label_set = set(label_set)
else:
label_set = set(y_data)

label2idx = {}
for label in label_set:
label2idx[label] = len(label2idx)
self._label2idx = label2idx
self._idx2label = dict([(val, key) for (key, val) in label2idx.items()])
self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))

def convert_label_to_idx(self, label: Union[List[str], str]) -> Union[List[int], int]:
if isinstance(label, str):
Expand Down Expand Up @@ -102,14 +122,18 @@ def get_data_generator(self,
target_y = y_data[0: batch_size]

tokenized_x = self.embedding.tokenize(target_x)
tokenized_y = self.convert_label_to_idx(target_y)

padded_x = sequence.pad_sequences(tokenized_x,
maxlen=self.embedding.sequence_length,
padding='post')
padded_y = to_categorical(tokenized_y,
num_classes=len(self.label2idx),
dtype=np.int)

if self.multi_label:
padded_y = self.multi_label_binarizer.fit_transform(target_y)
else:
tokenized_y = self.convert_label_to_idx(target_y)
padded_y = to_categorical(tokenized_y,
num_classes=len(self.label2idx),
dtype=np.int)
if is_bert:
padded_x_seg = np.zeros(shape=(len(padded_x), self.embedding.sequence_length))
x_input_data = [padded_x, padded_x_seg]
Expand Down Expand Up @@ -203,12 +227,14 @@ def predict(self,
sentence: Union[List[str], List[List[str]]],
batch_size=None,
output_dict=False,
multi_label_threshold=0.6,
debug_info=False) -> Union[List[str], str, List[Dict], Dict]:
"""
predict with model
:param sentence: single sentence as List[str] or list of sentence as List[List[str]]
:param batch_size: predict batch_size
:param output_dict: return dict with result with confidence
:param multi_label_threshold:
:param debug_info: print debug info using logging.debug when True
:return:
"""
Expand All @@ -227,7 +253,15 @@ def predict(self,
else:
x = padded_tokens
res = self.model.predict(x, batch_size=batch_size)
predict_result = res.argmax(-1)

if self.multi_label:
if debug_info:
logging.info('raw output: {}'.format(res))
res[res >= multi_label_threshold] = 1
res[res < multi_label_threshold] = 0
predict_result = res
else:
predict_result = res.argmax(-1)

if debug_info:
logging.info('input: {}'.format(x))
Expand All @@ -247,7 +281,10 @@ def predict(self,
else:
return results[0]
else:
results = self.convert_idx_to_label(predict_result)
if self.multi_label:
results = self.multi_label_binarizer.inverse_transform(predict_result)
else:
results = self.convert_idx_to_label(predict_result)
if is_list:
return results
else:
Expand Down
18 changes: 10 additions & 8 deletions kashgari/tasks/classification/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,15 @@
"""
from __future__ import absolute_import, division

import logging

import keras
#from keras import optimizers

from keras.models import Model
from keras.layers import Dense, Lambda, Flatten, Reshape
from keras.layers import Bidirectional, Conv1D
from keras.layers import Dense, Lambda, Flatten
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers import Bidirectional, Conv1D
from keras.layers import concatenate
from keras.models import Model

from kashgari.layers import AttentionWeightedAverage, KMaxPooling, LSTMLayer, GRULayer

from kashgari.tasks.classification.base_model import ClassificationModel


Expand Down Expand Up @@ -61,6 +56,13 @@ class CNNModel(ClassificationModel):
}

def build_model(self):

# TODO: maybe refactor this
if self.multi_label:
self.hyper_parameters['compile_params']['loss'] = 'binary_crossentropy'
self.hyper_parameters['compile_params']['metrics'] = ['categorical_accuracy']
self.hyper_parameters['activation_layer']['activation'] = 'sigmoid'

base_model = self.embedding.model
conv1d_layer = Conv1D(**self.hyper_parameters['conv1d_layer'])(base_model.output)
max_pool_layer = GlobalMaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv1d_layer)
Expand Down
5 changes: 1 addition & 4 deletions kashgari/tasks/seq_labeling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@
"""

from .blstm_model import BLSTMModel
from .blstm_crf_model import BLSTMCRFModel
from .cnn_lstm_model import CNNLSTMModel
from .base_model import SequenceLabelingModel
from .models import BLSTMModel, BLSTMCRFModel, CNNLSTMModel


if __name__ == '__main__':
Expand Down
17 changes: 10 additions & 7 deletions kashgari/tasks/seq_labeling/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
@time: 2019-01-21
"""
import random
import logging
import random
from typing import Tuple, Dict

import numpy as np
Expand All @@ -21,11 +21,11 @@
from seqeval.metrics.sequence_labeling import get_entities

import kashgari.macros as k
from kashgari.utils import helper
from kashgari.type_hints import *

from kashgari.tasks.base import BaseModel
from kashgari.embeddings import BaseEmbedding
from kashgari.tasks.base import BaseModel
from kashgari.type_hints import *
from kashgari.utils import helper
from kashgari.macros import config


class SequenceLabelingModel(BaseModel):
Expand All @@ -47,7 +47,7 @@ def label2idx(self, value):
self._label2idx = value
self._idx2label = dict([(val, key) for (key, val) in value.items()])

def build_model(self, loss_f=None, optimizer=None, metrics=None, **kwargs):
def build_model(self):
"""
build model function
:return:
Expand Down Expand Up @@ -92,7 +92,10 @@ def convert_labels_to_idx(self,
def tokenize_tokens(seq: List[str]):
tokens = [self._label2idx[i] for i in seq]
if add_eos_bos:
tokens = [self._label2idx[k.BOS]] + tokens + [self._label2idx[k.EOS]]
if config.sequence_labeling_tokenize_add_bos_eos:
tokens = [self._label2idx[k.BOS]] + tokens + [self._label2idx[k.EOS]]
else:
tokens = [self._label2idx[k.NO_TAG]] + tokens + [self._label2idx[k.NO_TAG]]
return tokens

if isinstance(label[0], str):
Expand Down
67 changes: 0 additions & 67 deletions kashgari/tasks/seq_labeling/blstm_crf_model.py

This file was deleted.

Loading

0 comments on commit 1ecf64d

Please sign in to comment.