Skip to content

Commit

Permalink
Use a slightly optimized, simpler FeatureGenerator by default.
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Jul 1, 2021
1 parent 07a7612 commit d6494e8
Showing 1 changed file with 35 additions and 2 deletions.
37 changes: 35 additions & 2 deletions delta/corpus.py
Expand Up @@ -283,6 +283,39 @@ def metadata(self):
return Metadata(features='words', lower_case=self.lower_case)


class _NamedCounter(collections.Counter):
    """
    A ``collections.Counter`` that additionally carries a ``name`` attribute.

    Args:
        iterable: Initial contents, passed on unchanged to
            ``collections.Counter`` (an iterable of items or a mapping of
            counts).
        _name: Label stored as ``self.name``; defaults to the empty string.
        **kwds: Further keyword counts, passed on to ``collections.Counter``.
    """

    def __init__(self, iterable=None, _name='', **kwds):
        super().__init__(iterable, **kwds)
        self.name = _name

    def copy(self):
        """Return a shallow copy that keeps ``name``."""
        # Counter.copy() would call self.__class__(self) and thereby reset
        # the name to its default.
        return self.__class__(self, self.name)

    def __reduce__(self):
        """Support pickling and the ``copy`` module without losing ``name``."""
        # Counter.__reduce__ only passes dict(self) to the constructor, which
        # would drop the name on every pickle round trip.
        return self.__class__, (dict(self), self.name)


class SimpleFeatureGenerator(FeatureGenerator):
    """
    A leaner, faster variant of the FeatureGenerator.

    It is meant to yield the same features as FeatureGenerator while offering
    fewer hooks for subclasses: each file is read in one go instead of line by
    line, and the per-file result is a named counter, never a pd.Series().
    """

    def preprocess_text(self, text):
        """Return ``text`` lower-cased if ``self.lower_case`` is set, else unchanged."""
        return text.lower() if self.lower_case else text

    def postprocess_tokens(self, tokens):
        """Combine ``tokens`` into n-grams if ``self.ngrams`` is set, else pass them through."""
        if not self.ngrams:
            return tokens
        return ngrams(tokens, n=self.ngrams, sep=" ")

    def process_file(self, filename):
        """
        Read ``filename`` completely and count its features.

        The file is opened with ``self.encoding``. Its whole content is
        preprocessed, tokenized as a single chunk, postprocessed and counted.
        The resulting counter is named after ``self.get_name(filename)``.
        """
        with open(filename, encoding=self.encoding) as source:
            raw_text = source.read()
        prepared = self.preprocess_text(raw_text)
        # tokenize() is handed a one-element list holding the whole text
        features = self.postprocess_tokens(self.tokenize([prepared]))
        return _NamedCounter(features, self.get_name(filename))


class CorpusNotComplete(ValueError):
    """A ``ValueError`` whose message defaults to "Corpus not complete anymore"."""

    def __init__(self, msg="Corpus not complete anymore"):
        super(CorpusNotComplete, self).__init__(msg)
Expand Down Expand Up @@ -368,13 +401,13 @@ def __init__(self, source=None, *, subdir=None, file=None, corpus=None,
# initialize data
if subdir is not None:
if feature_generator is None: # generate default feature generator from matching args
fg_sig_arguments = signature(FeatureGenerator).parameters
fg_sig_arguments = signature(SimpleFeatureGenerator).parameters
fg_actual_args = {}
for key, value in kwargs.copy().items():
if key in fg_sig_arguments:
fg_actual_args[key] = value
del kwargs[key] # if they belong in metadata, FeatureGenerator will put them there
feature_generator = FeatureGenerator(**fg_actual_args)
feature_generator = SimpleFeatureGenerator(**fg_actual_args)

logger.info(
"Creating corpus by reading %s using %s",
Expand Down

0 comments on commit d6494e8

Please sign in to comment.