Skip to content

Commit

Permalink
Allow corpus to be sorted by default
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Jun 28, 2021
1 parent 01d6c5d commit 65857ef
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions delta/corpus.py
Expand Up @@ -66,8 +66,8 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
token_pattern=LETTERS_PATTERN,
max_tokens=None,
ngrams=None,
parallel=False
):
parallel=False,
sort='documents'):
"""
Creates a customized default feature generator.
Expand All @@ -85,10 +85,15 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
regular expressions*) that contains at least one letter.
max_tokens (int): If set, stop reading each file after that many words.
ngrams (int): Count token ngrams instead of single tokens
parallel(bool|int|Parallel): If truish, read and parse files in parallel. The actual argument may be
parallel (bool, int, Parallel): If truthy, read and parse files in parallel. The actual argument may be
- None or False for no special processing
- an int for the required number of jobs
- a dictionary with Parallel arguments for finer control
sort (str): Sort the final feature matrix by index before returning. Possible values:
- ``documents``, ``index``: Sort by document names
- ``features``, ``columns``: Sort by feature labels (i.e., words)
- ``both``: sort along both axes
- None or the empty string: Do not sort
"""
self.lower_case = lower_case
self.encoding = encoding
Expand All @@ -99,6 +104,7 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
self.ngrams = ngrams
self.logger = logging.getLogger(__name__)
self.parallel = parallel
self.sort = sort

def __repr__(self):
return type(self).__name__ + '(' + \
Expand Down Expand Up @@ -248,10 +254,16 @@ def _get_parallel_executor(self) -> Parallel:
def __call__(self, directory):
"""
Runs the feature extraction using :meth:`process_directory` for the
given directory and returns a simple, unsorted pd.DataFrame for that.
"""
df = pd.DataFrame(self.process_directory(directory))
return df.T
given directory and returns a simple pd.DataFrame for that. The resulting
dataframe will be sorted according to the `sort` attribute.
"""
df = pd.DataFrame(self.process_directory(directory)).T
if self.sort:
if self.sort.lower() in {'documents', 'index', 'both'}:
df = df.sort_index(axis=0)
if self.sort.lower() in {'features', 'columns', 'both'}:
df = df.sort_index(axis=1)
return df

@property
def metadata(self):
Expand Down

0 comments on commit 65857ef

Please sign in to comment.