🚨 Removing linter warnings.

BrikerMan committed May 21, 2019
1 parent e6640cd · commit 72d9042

Showing 7 changed files with 133 additions and 31 deletions.
6 changes: 3 additions & 3 deletions kashgari/embeddings/bare_embedding.py
@@ -43,9 +43,9 @@ def __init__(self,
                          embedding_size=embedding_size,
                          processor=processor)
         if processor:
-            self.build_model()
+            self._build_model()

-    def build_model(self, **kwargs):
+    def _build_model(self, **kwargs):
         if self.token_count == 0:
             logging.debug('need to build after build_word2idx')
         else:
@@ -82,7 +82,7 @@ def build_model(self, **kwargs):

     embedding = BareEmbedding(task=kashgari.CLASSIFICATION,
                               sequence_length=12, processor=p)
-    embedding.build_model()
+    embedding._build_model()
     embedding.embed_model.summary()
     r = embedding.embed(x[:2])
     print(r)
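This rename makes the builder private; outside the demo above (which wires in a pre-built processor `p` from an earlier part of the file), the supported entry point is `analyze_corpus`, which builds the vocabulary and then calls `_build_model()` itself. A minimal usage sketch under that assumption, with an inline toy corpus:

import kashgari
from kashgari.embeddings import BareEmbedding  # assumed public import path

# Toy corpus, purely illustrative.
x = [['Hello', 'world'], ['Hello', 'Kashgari']]
y = ['greeting', 'greeting']

embedding = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=12,
                          embedding_size=100)
embedding.analyze_corpus(x, y)  # builds token2idx, then calls _build_model()
vectors = embedding.embed(x)    # no direct _build_model() call needed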
50 changes: 47 additions & 3 deletions kashgari/embeddings/base_embedding.py
@@ -47,10 +47,16 @@ def __init__(self,

     @property
     def token_count(self) -> int:
+        """
+        corpus token count
+        """
         return len(self.processor.token2idx)

     @property
     def sequence_length(self) -> Tuple[int, ...]:
+        """
+        model sequence length
+        """
         return self._sequence_length

     @sequence_length.setter
@@ -66,12 +72,12 @@ def sequence_length(self, val: Union[Tuple[int], str]):
             val = (val,)
         self._sequence_length = val

-    def build_model(self, **kwargs):
+    def _build_model(self, **kwargs):
         raise NotImplementedError

     def analyze_corpus(self,
                        x: Union[Tuple[List[List[str]], ...], List[List[str]]],
-                       y: List[List[str]]):
+                       y: Union[List[List[str]], List[str]]):
         """
         Prepare embedding layer and pre-processor for labeling task
@@ -87,13 +93,31 @@ def analyze_corpus(self,
         self.processor.analyze_corpus(x, y)
         if self.sequence_length == 'auto':
             self.sequence_length = self.processor.dataset_info['RECOMMEND_LEN']
-        self.build_model()
+        self._build_model()

     def embed_one(self, sentence: List[str]) -> np.array:
+        """
+        Convert one sentence to vector
+
+        Args:
+            sentence: target sentence, list of str
+
+        Returns:
+            vectorized sentence
+        """
         return self.embed([sentence])[0]

     def embed(self,
               sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]]) -> np.ndarray:
+        """
+        batch embed sentences
+
+        Args:
+            sentence_list: Sentence list to embed
+
+        Returns:
+            vectorized sentence list
+        """
         if len(sentence_list) == 1 or isinstance(sentence_list, list):
             sentence_list = (sentence_list,)
         x = self.processor.process_x_dataset(sentence_list,
@@ -108,11 +132,31 @@ def embed(self,
     def process_x_dataset(self,
                           data: Tuple[List[List[str]], ...],
                           subset: Optional[List[int]] = None) -> Tuple[np.ndarray, ...]:
+        """
+        batch process feature data while training
+
+        Args:
+            data: target dataset
+            subset: subset index list
+
+        Returns:
+            vectorized feature tensor
+        """
         return self.processor.process_x_dataset(data, self.sequence_length, subset)

     def process_y_dataset(self,
                           data: Tuple[List[List[str]], ...],
                           subset: Optional[List[int]] = None) -> Tuple[np.ndarray, ...]:
+        """
+        batch process label data while training
+
+        Args:
+            data: target dataset
+            subset: subset index list
+
+        Returns:
+            vectorized label tensor
+        """
         return self.processor.process_y_dataset(data, self.sequence_length, subset)
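The newly documented `embed_one` is just sugar over `embed`, as the body `self.embed([sentence])[0]` shows. A short sketch of the relationship (tokens and shapes are illustrative):

# embed_one() wraps the sentence in a batch of one and returns row 0,
# so these two calls yield the same vector.
sentence = ['Hello', 'world']
vec_single = embedding.embed_one(sentence)   # shape: (sequence_length, embedding_size)
vec_batch = embedding.embed([sentence])[0]   # identical result
assert (vec_single == vec_batch).all()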
30 changes: 25 additions & 5 deletions kashgari/embeddings/bert_embedding.py
@@ -25,7 +25,7 @@


 class BertEmbedding(Embedding):
-    """Pre-trained word2vec embedding"""
+    """Pre-trained BERT embedding"""

     def __init__(self,
                  bert_path: str,
@@ -65,7 +65,7 @@ def __init__(self,
         self.bert_path = bert_path
         if processor:
             self._build_token2idx_from_bert()
-            self.build_model()
+            self._build_model()

     def _build_token2idx_from_bert(self):
         token2idx = {
@@ -85,7 +85,7 @@ def _build_token2idx_from_bert(self):
         self.processor.token2idx = self.bert_token2idx
         self.processor.idx2token = dict([(value, key) for key, value in token2idx.items()])

-    def build_model(self, **kwargs):
+    def _build_model(self, **kwargs):
         if self.token_count == 0:
             logging.debug('need to build after build_word2idx')
         else:
@@ -103,6 +103,16 @@ def build_model(self, **kwargs):
     def analyze_corpus(self,
                        x: Union[Tuple[List[List[str]], ...], List[List[str]]],
                        y: Union[List[List[Any]], List[Any]]):
+        """
+        Prepare embedding layer and pre-processor for labeling task
+
+        Args:
+            x:
+            y:
+
+        Returns:
+
+        """
         x = utils.wrap_as_tuple(x)
         y = utils.wrap_as_tuple(y)
         if len(self.processor.token2idx) == 0:
@@ -111,6 +121,16 @@

     def embed(self,
               sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]]) -> np.ndarray:
+        """
+        batch embed sentences
+
+        Args:
+            sentence_list: Sentence list to embed
+
+        Returns:
+            vectorized sentence list
+        """
+
         if len(sentence_list) == 1 or isinstance(sentence_list, list):
             sentence_list = (sentence_list,)
         x = self.processor.process_x_dataset(sentence_list,
@@ -137,8 +157,8 @@ def embed(self,

     bert_path = os.path.join(utils.get_project_path(), 'tests/test-data/bert')

-    b = BertEmbedding(kashgari.CLASSIFICATION,
-                      bert_path,
+    b = BertEmbedding(task=kashgari.CLASSIFICATION,
+                      bert_path=bert_path,
                       sequence_length=(12, 12))

     from kashgari.corpus import SMP2018ECDTCorpus
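The demo's switch to keyword arguments is more than lint hygiene: `bert_path` is the first positional parameter of `BertEmbedding.__init__`, so the old positional call bound `kashgari.CLASSIFICATION` to `bert_path`. Keyword form keeps every value on its intended parameter:

# Old form: BertEmbedding(kashgari.CLASSIFICATION, bert_path, ...) passed the
# task constant where the checkpoint path belongs; keywords prevent that.
b = BertEmbedding(task=kashgari.CLASSIFICATION,
                  bert_path=bert_path,
                  sequence_length=(12, 12))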
16 changes: 13 additions & 3 deletions kashgari/embeddings/word_embedding.py
@@ -58,7 +58,7 @@ def __init__(self,
                          processor=processor)
         if processor:
             self._build_token2idx_from_w2v()
-            self.build_model()
+            self._build_model()

     def _build_token2idx_from_w2v(self):
         w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
@@ -92,7 +92,7 @@ def _build_token2idx_from_w2v(self):
         logging.debug('Top 50 word : {}'.format(self.w2v_top_words))
         logging.debug('------------------------------------------------')

-    def build_model(self, **kwargs):
+    def _build_model(self, **kwargs):
         if self.token_count == 0:
             logging.debug('need to build after build_word2idx')
         else:
@@ -122,6 +122,16 @@ def build_model(self, **kwargs):
     def analyze_corpus(self,
                        x: Union[Tuple[List[List[str]], ...], List[List[str]]],
                        y: Union[List[List[Any]], List[Any]]):
+        """
+        Prepare embedding layer and pre-processor for labeling task
+
+        Args:
+            x:
+            y:
+
+        Returns:
+
+        """
         x = utils.wrap_as_tuple(x)
         y = utils.wrap_as_tuple(y)
         if not self.w2v_model_loaded:
@@ -145,7 +155,7 @@ def analyze_corpus(self,
                               w2v_path=w2v_path,
                               sequence_length=(12, 20))
     embedding.analyze_corpus((train_x, train_x), train_y)
-    embedding.build_model()
+    embedding._build_model()
     embedding.embed_model.summary()
     r = embedding.embed((train_x[:2], train_x[:2]))
     print(r)
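Since `_build_token2idx_from_w2v` loads vectors through gensim's `KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)`, the `w2v_kwargs` dict is forwarded straight to gensim. A hedged sketch, assuming `w2v_kwargs` is accepted by the constructor (import path, task constant, and file name are illustrative):

import kashgari
from kashgari.embeddings import WordEmbedding  # assumed public import path

embedding = WordEmbedding(task=kashgari.LABELING,
                          w2v_path='sample_w2v.txt',    # illustrative path
                          w2v_kwargs={'limit': 10000},  # forwarded to gensim
                          sequence_length=12)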
6 changes: 3 additions & 3 deletions kashgari/pre_processors/base_processor.py
@@ -47,7 +47,7 @@ def __init__(self):

     def analyze_corpus(self,
                        corpus: Union[Tuple[List[List[str]], ...], List[List[str]]],
-                       labels: List[str]):
+                       labels: Union[Tuple[List[List[str]], ...], Tuple[List[str], ...]]):
         corpus = utils.wrap_as_tuple(corpus)
         rec_seq_len = []
         for cor in corpus:
@@ -83,7 +83,7 @@ def load_cached_processor(cls, cache_dir: str):

         return processor

-    def _build_token_dict(self, corpus: List[List[str]]):
+    def _build_token_dict(self, corpus: Tuple):
         """
         Build token index dictionary using corpus
@@ -119,7 +119,7 @@ def _build_token_dict(self, corpus: List[List[str]]):
         logging.debug(f"build token2idx dict finished, contains {len(self.token2idx)} tokens.")
         self.dataset_info['token_count'] = len(self.token2idx)

-    def _build_label_dict(self, corpus: List[List[str]]):
+    def _build_label_dict(self, corpus: Tuple):
         raise NotImplementedError

     def process_x_dataset(self,
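The widened `corpus` and `labels` annotations codify the processor's tuple-first convention: inputs are normalized with `utils.wrap_as_tuple` before analysis, so single- and multi-input data share one loop. A sketch of the assumed behavior, inferred from the call sites rather than from kashgari's actual implementation:

# Inferred stand-in for kashgari.utils.wrap_as_tuple:
# bare lists become a one-element tuple; tuples pass through unchanged.
def wrap_as_tuple(data):
    if isinstance(data, tuple):
        return data
    return (data,)

corpus = [['hello', 'world'], ['foo', 'bar']]  # single-input corpus
for cor in wrap_as_tuple(corpus):              # analyze_corpus iterates per input
    print(len(cor), 'sentences')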
35 changes: 24 additions & 11 deletions kashgari/tasks/labeling/base_model.py
@@ -8,7 +8,7 @@
 # time: 2019-05-20 13:07


-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, Union, Tuple

 import numpy as np
 from tensorflow import keras
@@ -68,6 +68,18 @@ def get_data_generator(self,
                            x_data,
                            y_data,
                            batch_size: int = 64):
+        """
+        data generator for fit_generator
+
+        Args:
+            x_data: Array of feature data (if the model has a single input),
+                or tuple of feature data array (if the model has multiple inputs)
+            y_data: Array of label data
+            batch_size: Number of samples per gradient update, default to 64.
+
+        Returns:
+            data generator
+        """

         index_list = np.arange(len(x_data[0]))
         page_count = len(x_data) // batch_size + 1
@@ -89,9 +101,9 @@ def get_data_generator(self,
             yield (x_tensor, y_tensor)

     def fit(self,
-            x_train: List[List[str]],
+            x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
             y_train: List[List[str]],
-            x_validate: List[List[str]] = None,
+            x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
             y_validate: List[List[str]] = None,
             batch_size: int = 64,
             epochs: int = 5,
@@ -101,15 +113,16 @@ def fit(self,
         Trains the model for a given number of epochs (iterations on a dataset).

         Args:
-            x_train: Array of training data
-            y_train: Array of training data
-            x_validate: Array of validation data
-            y_validate: Array of validation data
+            x_train: Array of train feature data (if the model has a single input),
+                or tuple of train feature data array (if the model has multiple inputs)
+            y_train: Array of train label data
+            x_validate: Array of validation feature data (if the model has a single input),
+                or tuple of validation feature data array (if the model has multiple inputs)
+            y_validate: Array of validation label data
             batch_size: Number of samples per gradient update, default to 64.
             epochs: Integer. Number of epochs to train the model. default 5.
             fit_kwargs: additional arguments passed to ``fit_generator()`` function from
-                ``tensorflow.keras.Model`` -
-                https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit_generator
+                ``tensorflow.keras.Model`` - https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit_generator
             **kwargs:

         Returns:
@@ -130,7 +143,7 @@ def fit(self,
         self.embedding.analyze_corpus(x_all, y_all)

         if self.tf_model is None:
-            self.prepare_model_arc()
+            self.build_model_arc()
             self.compile_model()

         train_generator = self.get_data_generator(x_train,
@@ -176,7 +189,7 @@ def compile_model(self, **kwargs):
         self.tf_model.compile(**kwargs)
         self.tf_model.summary()

-    def prepare_model_arc(self):
+    def build_model_arc(self):
         raise NotImplementedError
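For orientation, the public path these renames serve: `fit` analyzes the corpus, builds the network on first use via `build_model_arc`, compiles it, then trains through `get_data_generator` and `fit_generator`. A minimal sketch with inline, illustrative data:

from kashgari.tasks.labeling import BLSTMModel  # assumed import path

train_x = [['Hello', 'world'], ['Kashgari', 'is', 'here']]
train_y = [['O', 'O'], ['B-LIB', 'O', 'O']]

model = BLSTMModel()
# The first fit() call runs embedding.analyze_corpus(), build_model_arc()
# and compile_model() before training begins.
model.fit(train_x, train_y, batch_size=2, epochs=1)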
21 changes: 18 additions & 3 deletions kashgari/tasks/labeling/models.py
@@ -23,6 +23,11 @@ class BLSTMModel(BaseLabelingModel):

     @classmethod
     def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
+        """
+        Get hyper parameters of model
+        Returns:
+            hyper parameters dict
+        """
         return {
             'layer_blstm': {
                 'units': 128,
@@ -36,8 +41,10 @@ def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
             }
         }

-    def prepare_model_arc(self):
-
+    def build_model_arc(self):
+        """
+        Build model architecture
+        """
         output_dim = len(self.pre_processor.label2idx)
         config = self.hyper_parameters
         embed_model = self.embedding.embed_model
@@ -65,6 +72,11 @@ class CNNLSTMModel(BaseLabelingModel):

     @classmethod
     def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
+        """
+        Get hyper parameters of model
+        Returns:
+            hyper parameters dict
+        """
         return {
             'layer_conv': {
                 'filters': 32,
@@ -84,7 +96,10 @@ def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
             }
         }

-    def prepare_model_arc(self):
+    def build_model_arc(self):
+        """
+        Build model architecture
+        """
         output_dim = len(self.pre_processor.label2idx)
         config = self.hyper_parameters
         embed_model = self.embedding.embed_model
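Because the defaults now come from a documented classmethod, a subclass can tune one layer without re-declaring `build_model_arc`. A minimal sketch, assuming the base model reads these defaults at construction time (the subclass is hypothetical):

from typing import Any, Dict

class WideBLSTMModel(BLSTMModel):
    """Hypothetical variant that only widens the BLSTM layer."""

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        hyper = super().get_default_hyper_parameters()
        hyper['layer_blstm']['units'] = 256  # default is 128 above
        return hyper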
