Skip to content

Commit

Permalink
Merge pull request #542 from EducationalTestingService/feature/add-ig…
Browse files Browse the repository at this point in the history
…nore-blanks

Add `ignore_blanks` and `replace_blanks_with` arguments
  • Loading branch information
desilinguist committed Oct 4, 2019
2 parents 42a9a99 + 76e1cb4 commit aba9003
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 44 deletions.
11 changes: 10 additions & 1 deletion doc/run_experiment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,16 @@ A simple comma or tab-delimited format with the following restrictions:
* If the data has instance IDs, there should be a column with the name
specified by :ref:`id_col <id_col>` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``id``. If there is no such column, IDs will be generated automatically.
* All other columns contain feature values, and every feature value
must be specified (making this a poor choice for sparse data).
must be specified (making this a poor choice for sparse data).

.. warning::

SKLL will raise an error if there are blank values in *any* of the
columns. You must either drop all rows with blank values in any column
or replace the blanks with a value you specify. To drop or replace via
the command line, use the :ref:`filter_features <filter_features>` script.
   You can also drop/replace via the SKLL Reader API, specifically :py:class:`skll.data.readers.CSVReader` and :py:class:`skll.data.readers.TSVReader`.


.. _ndj:

Expand Down
18 changes: 17 additions & 1 deletion doc/utilities.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,33 @@ Optional Arguments

Instead of keeping features and/or examples in lists, remove them.

.. option:: --id_col <id_col>

Name of the column which contains the instance IDs in ARFF, CSV, or TSV files.
(default: ``id``)

.. option:: -L <label <label ...>>, --label <label <label ...>>

A label in the feature file you would like to keep. If unspecified, no
instances are removed based on their labels.

.. option:: -l label_col, --label_col label_col
.. option:: -l <label_col>, --label_col <label_col>

Name of the column which contains the class labels in ARFF, CSV, or TSV
files. For ARFF files, this must be the final column to count as the label.
(default: ``y``)

.. option:: -db, --drop-blanks

Drop all lines/rows that have any blank values.
(default: ``False``)

.. option:: -rb <replacement>, --replace-blanks-with <replacement>

Specifies a new value with which to replace blank values in all columns in the
file. To replace blanks differently in each column, use the SKLL Reader API directly.
(default: ``None``)

.. option:: -q, --quiet

Suppress printing of ``"Loading..."`` messages.
Expand Down
108 changes: 92 additions & 16 deletions skll/data/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,12 @@ def feat_dict_generator():

return ids, labels, features

def _parse_dataframe(self, df, id_col, label_col, features=None):
def _parse_dataframe(self,
df,
id_col,
label_col,
replace_blanks_with=None,
drop_blanks=False):
"""
Parse the data frame into ids, labels, and features.
For `Reader` objects that rely on `pandas`, this function
Expand All @@ -308,11 +313,19 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
The id column.
label_col : str or None
The label column.
features : list of dict or None
The features, if they already exist;
if not, then they will be extracted
from the data frame.
Defaults to None.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- ``dict`` = A dictionary specifying the replacement value for each column.
- ``None`` = Blank values will be left as blanks, and not replaced.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
values.
Defaults to ``False``.
Returns
-------
Expand All @@ -327,6 +340,22 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
raise ValueError("No features found in possibly "
"empty file '{}'.".format(self.path_or_list))

if drop_blanks and replace_blanks_with is not None:
raise ValueError("You cannot both drop blanks and replace them. "
"'replace_blanks_with' can only have a value when "
"'drop_blanks' is `False`.")

# should we replace blank values with something?
if replace_blanks_with is not None:
self.logger.info('Blank values in all rows/lines will be replaced with '
'user-specified value(s).')
df = df.fillna(replace_blanks_with)

# should we remove lines that have any NaNs?
if drop_blanks:
self.logger.info('Rows/lines with any blank values will be dropped.')
df = df.dropna().reset_index(drop=True)

# if the id column exists,
# get them from the data frame and
# delete the column; otherwise, just
Expand Down Expand Up @@ -354,8 +383,7 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
# map the new classes to the labels;
# otherwise, just convert them to floats
if self.class_map is not None:
labels = labels.apply(safe_float,
replace_dict=self.class_map)
labels = labels.apply(safe_float, replace_dict=self.class_map)
else:
labels = labels.apply(safe_float)
labels = labels.values
Expand All @@ -364,10 +392,8 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
labels = np.array([None] * df.shape[0])

# convert the remaining features to
# a list of dictionaries, if no
# features argument was passed
if features is None:
features = df.to_dict(orient='records')
# a list of dictionaries
features = df.to_dict(orient='records')

return ids, labels, features

Expand Down Expand Up @@ -762,6 +788,21 @@ class CSVReader(Reader):
----------
path_or_list : str
The path to a comma-delimited file.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- dict = A dictionary specifying the replacement value for each column.
- None = Blank values will be left as blanks, and not replaced.
The replacement occurs after the data set is read into a `pd.DataFrame`.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
        values. These lines/rows are removed after the
        data set is read into a `pd.DataFrame`.
Defaults to ``False``.
pandas_kwargs : dict or None, optional
Arguments that will be passed directly
to the `pandas` I/O reader.
Expand All @@ -770,8 +811,15 @@ class CSVReader(Reader):
Other arguments to the Reader object.
"""

def __init__(self, path_or_list, pandas_kwargs=None, **kwargs):
def __init__(self,
path_or_list,
replace_blanks_with=None,
drop_blanks=False,
pandas_kwargs=None,
**kwargs):
super(CSVReader, self).__init__(path_or_list, **kwargs)
self._replace_blanks_with = replace_blanks_with
self._drop_blanks = drop_blanks
self._pandas_kwargs = {} if pandas_kwargs is None else pandas_kwargs
self._sep = self._pandas_kwargs.pop('sep', str(','))
self._engine = self._pandas_kwargs.pop('engine', 'c')
Expand All @@ -794,7 +842,11 @@ def _sub_read(self, file):
The features for the features set.
"""
df = pd.read_csv(file, sep=self._sep, engine=self._engine, **self._pandas_kwargs)
return self._parse_dataframe(df, self.id_col, self.label_col)
return self._parse_dataframe(df,
self.id_col,
self.label_col,
replace_blanks_with=self._replace_blanks_with,
drop_blanks=self._drop_blanks)


class TSVReader(CSVReader):
Expand All @@ -810,6 +862,21 @@ class TSVReader(CSVReader):
----------
path_or_list : str
        The path to a tab-delimited file.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- dict = A dictionary specifying the replacement value for each column.
- None = Blank values will be left as blanks, and not replaced.
The replacement occurs after the data set is read into a `pd.DataFrame`.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
        values. These lines/rows are removed after the
        data set is read into a `pd.DataFrame`.
Defaults to ``False``.
pandas_kwargs : dict or None, optional
Arguments that will be passed directly
to the `pandas` I/O reader.
Expand All @@ -818,8 +885,17 @@ class TSVReader(CSVReader):
Other arguments to the Reader object.
"""

def __init__(self, path_or_list, pandas_kwargs=None, **kwargs):
super(TSVReader, self).__init__(path_or_list, pandas_kwargs, **kwargs)
def __init__(self,
             path_or_list,
             replace_blanks_with=None,
             drop_blanks=False,
             pandas_kwargs=None,
             **kwargs):
    """
    Initialize a reader for tab-delimited files.

    Parameters
    ----------
    path_or_list : str
        The path to a tab-delimited file.
    replace_blanks_with : value, ``dict``, or ``None``, optional
        Replacement value(s) for blanks; forwarded unchanged to
        ``CSVReader``.
        Defaults to ``None``.
    drop_blanks : bool, optional
        If ``True``, drop rows/lines with any blank values;
        forwarded unchanged to ``CSVReader``.
        Defaults to ``False``.
    pandas_kwargs : dict or None, optional
        Arguments passed directly to the ``pandas`` I/O reader.
        Defaults to ``None``.
    kwargs : dict, optional
        Other arguments to the Reader object.
    """
    # All reading/parsing behavior is inherited from CSVReader;
    # only the field separator differs.
    super(TSVReader, self).__init__(path_or_list,
                                    replace_blanks_with=replace_blanks_with,
                                    drop_blanks=drop_blanks,
                                    pandas_kwargs=pandas_kwargs,
                                    **kwargs)
    # Override the comma separator that CSVReader.__init__ set
    # with a tab character.
    self._sep = str('\t')


Expand Down
75 changes: 50 additions & 25 deletions skll/utilities/filter_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os
import sys

from skll.data.readers import EXT_TO_READER
from skll.data.readers import EXT_TO_READER, safe_float
from skll.data.writers import ARFFWriter, CSVWriter, TSVWriter, EXT_TO_WRITER
from skll.version import __version__

Expand All @@ -31,43 +31,52 @@ def main(argv=None):

# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes an input feature file and removes any instances or\
features that do not match the specified patterns.",
description="Takes an input feature file and removes any instances or "
"features that do not match the specified patterns.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('infile',
help='input feature file (ends in .arff, .csv, \
.jsonlines, .megam, .ndj, or .tsv)')
help='input feature file (ends in .arff, .csv, '
'.jsonlines, .megam, .ndj, or .tsv)')
parser.add_argument('outfile',
help='output feature file (must have same extension as\
input file)')
help='output feature file (must have same extension as '
'input file)')
parser.add_argument('-f', '--feature',
help='A feature in the feature file you would like to \
keep. If unspecified, no features are removed.',
help='A feature in the feature file you would like to '
'keep. If unspecified, no features are removed.',
nargs='*')
parser.add_argument('-I', '--id',
help='An instance ID in the feature file you would \
like to keep. If unspecified, no instances are \
removed based on their IDs.',
help='An instance ID in the feature file you would '
'like to keep. If unspecified, no instances are '
'removed based on their IDs.',
nargs='*')
parser.add_argument('--id_col',
help='Name of the column which contains the instance \
IDs in ARFF, CSV, or TSV files.',
help='Name of the column which contains the instance '
'IDs in ARFF, CSV, or TSV files.',
default='id')
parser.add_argument('-i', '--inverse',
help='Instead of keeping features and/or examples in \
lists, remove them.',
help='Instead of keeping features and/or examples in '
'lists, remove them.',
action='store_true')
parser.add_argument('-L', '--label',
help='A label in the feature file you would like to \
keep. If unspecified, no instances are removed \
based on their labels.',
help='A label in the feature file you would like to '
'keep. If unspecified, no instances are removed '
'based on their labels.',
nargs='*')
parser.add_argument('-l', '--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
help='Name of the column which contains the class '
'labels in ARFF, CSV, or TSV files. For ARFF '
'files, this must be the final column to count as '
'the label.',
default='y')
parser.add_argument('-rb', '--replace_blanks_with',
help='Specifies a new value with which to replace blank values '
'in all columns in the file. To replace blanks differently '
'in each column, use the SKLL Reader API directly.',
default=None)
parser.add_argument('-db', '--drop_blanks',
action='store_true',
help='Drop all lines/rows that have any blank values.',
default=False)
parser.add_argument('-q', '--quiet',
help='Suppress printing of "Loading..." messages.',
action='store_true')
Expand Down Expand Up @@ -104,10 +113,26 @@ def main(argv=None):
'file. You specified: {}').format(output_extension))
sys.exit(1)

if input_extension == '.csv' or input_extension == '.tsv':
replace_blanks_with = args.replace_blanks_with
drop_blanks = args.drop_blanks
if drop_blanks and replace_blanks_with is not None:
raise ValueError("You cannot both drop blanks and replace them. "
"'replace_blanks_with' can only have a value when "
"'drop_blanks' is `False`.")
replace_blanks_with = (None if replace_blanks_with is None
else safe_float(replace_blanks_with))
kwargs = {'replace_blanks_with': replace_blanks_with,
'drop_blanks': drop_blanks}
else:
kwargs = {}

# Read input file
reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet,
reader = EXT_TO_READER[input_extension](args.infile,
quiet=args.quiet,
label_col=args.label_col,
id_col=args.id_col)
id_col=args.id_col,
**kwargs)
feature_set = reader.read()

# Do the actual filtering
Expand Down

0 comments on commit aba9003

Please sign in to comment.