Skip to content

Commit

Permalink
Merge pull request #542 from EducationalTestingService/feature/add-ig…
Browse files Browse the repository at this point in the history
…nore-blanks

Add `ignore_blanks` and `replace_blanks_with` arguments
  • Loading branch information
desilinguist committed Oct 4, 2019
2 parents 42a9a99 + 76e1cb4 commit aba9003
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 44 deletions.
11 changes: 10 additions & 1 deletion doc/run_experiment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,16 @@ A simple comma or tab-delimited format with the following restrictions:
* If the data has instance IDs, there should be a column with the name
specified by :ref:`id_col <id_col>` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``id``. If there is no such column, IDs will be generated automatically.
* All other columns contain feature values, and every feature value
must be specified (making this a poor choice for sparse data).
must be specified (making this a poor choice for sparse data).

.. warning::

SKLL will raise an error if there are blank values in *any* of the
columns. You must either drop all rows with blank values in any column
or replace the blanks with a value you specify. To drop or replace via
the command line, use the :ref:`filter_features <filter_features>` script.
   You can also drop/replace via the SKLL Reader API, specifically :py:class:`skll.data.readers.CSVReader` and :py:class:`skll.data.readers.TSVReader`.


.. _ndj:

Expand Down
18 changes: 17 additions & 1 deletion doc/utilities.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,33 @@ Optional Arguments

Instead of keeping features and/or examples in lists, remove them.

.. option:: --id_col <id_col>

Name of the column which contains the instance IDs in ARFF, CSV, or TSV files.
(default: ``id``)

.. option:: -L <label <label ...>>, --label <label <label ...>>

A label in the feature file you would like to keep. If unspecified, no
instances are removed based on their labels.

.. option:: -l label_col, --label_col label_col
.. option:: -l <label_col>, --label_col <label_col>

Name of the column which contains the class labels in ARFF, CSV, or TSV
files. For ARFF files, this must be the final column to count as the label.
(default: ``y``)

.. option:: -db, --drop-blanks

Drop all lines/rows that have any blank values.
(default: ``False``)

.. option:: -rb <replacement>, --replace-blanks-with <replacement>

Specifies a new value with which to replace blank values in all columns in the
file. To replace blanks differently in each column, use the SKLL Reader API directly.
(default: ``None``)

.. option:: -q, --quiet

Suppress printing of ``"Loading..."`` messages.
Expand Down
108 changes: 92 additions & 16 deletions skll/data/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,12 @@ def feat_dict_generator():

return ids, labels, features

def _parse_dataframe(self, df, id_col, label_col, features=None):
def _parse_dataframe(self,
df,
id_col,
label_col,
replace_blanks_with=None,
drop_blanks=False):
"""
Parse the data frame into ids, labels, and features.
For `Reader` objects that rely on `pandas`, this function
Expand All @@ -308,11 +313,19 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
The id column.
label_col : str or None
The label column.
features : list of dict or None
The features, if they already exist;
if not, then they will be extracted
from the data frame.
Defaults to None.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- ``dict`` = A dictionary specifying the replacement value for each column.
- ``None`` = Blank values will be left as blanks, and not replaced.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
values.
Defaults to ``False``.
Returns
-------
Expand All @@ -327,6 +340,22 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
raise ValueError("No features found in possibly "
"empty file '{}'.".format(self.path_or_list))

if drop_blanks and replace_blanks_with is not None:
raise ValueError("You cannot both drop blanks and replace them. "
"'replace_blanks_with' can only have a value when "
"'drop_blanks' is `False`.")

# should we replace blank values with something?
if replace_blanks_with is not None:
self.logger.info('Blank values in all rows/lines will be replaced with '
'user-specified value(s).')
df = df.fillna(replace_blanks_with)

# should we remove lines that have any NaNs?
if drop_blanks:
self.logger.info('Rows/lines with any blank values will be dropped.')
df = df.dropna().reset_index(drop=True)

# if the id column exists,
# get them from the data frame and
# delete the column; otherwise, just
Expand Down Expand Up @@ -354,8 +383,7 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
# map the new classes to the labels;
# otherwise, just convert them to floats
if self.class_map is not None:
labels = labels.apply(safe_float,
replace_dict=self.class_map)
labels = labels.apply(safe_float, replace_dict=self.class_map)
else:
labels = labels.apply(safe_float)
labels = labels.values
Expand All @@ -364,10 +392,8 @@ def _parse_dataframe(self, df, id_col, label_col, features=None):
labels = np.array([None] * df.shape[0])

# convert the remaining features to
# a list of dictionaries, if no
# features argument was passed
if features is None:
features = df.to_dict(orient='records')
# a list of dictionaries
features = df.to_dict(orient='records')

return ids, labels, features

Expand Down Expand Up @@ -762,6 +788,21 @@ class CSVReader(Reader):
----------
path_or_list : str
The path to a comma-delimited file.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- dict = A dictionary specifying the replacement value for each column.
- None = Blank values will be left as blanks, and not replaced.
The replacement occurs after the data set is read into a `pd.DataFrame`.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
        values. These lines/rows are removed after the
        data set is read into a `pd.DataFrame`.
Defaults to ``False``.
pandas_kwargs : dict or None, optional
Arguments that will be passed directly
to the `pandas` I/O reader.
Expand All @@ -770,8 +811,15 @@ class CSVReader(Reader):
Other arguments to the Reader object.
"""

def __init__(self, path_or_list, pandas_kwargs=None, **kwargs):
def __init__(self,
path_or_list,
replace_blanks_with=None,
drop_blanks=False,
pandas_kwargs=None,
**kwargs):
super(CSVReader, self).__init__(path_or_list, **kwargs)
self._replace_blanks_with = replace_blanks_with
self._drop_blanks = drop_blanks
self._pandas_kwargs = {} if pandas_kwargs is None else pandas_kwargs
self._sep = self._pandas_kwargs.pop('sep', str(','))
self._engine = self._pandas_kwargs.pop('engine', 'c')
Expand All @@ -794,7 +842,11 @@ def _sub_read(self, file):
The features for the features set.
"""
df = pd.read_csv(file, sep=self._sep, engine=self._engine, **self._pandas_kwargs)
return self._parse_dataframe(df, self.id_col, self.label_col)
return self._parse_dataframe(df,
self.id_col,
self.label_col,
replace_blanks_with=self._replace_blanks_with,
drop_blanks=self._drop_blanks)


class TSVReader(CSVReader):
Expand All @@ -810,6 +862,21 @@ class TSVReader(CSVReader):
----------
path_or_list : str
        The path to a tab-delimited file.
replace_blanks_with : value, ``dict``, or ``None``, optional
Specifies a new value with which to replace blank values.
Options are ::
- value = A (numeric) value with which to replace blank values.
- dict = A dictionary specifying the replacement value for each column.
- None = Blank values will be left as blanks, and not replaced.
The replacement occurs after the data set is read into a `pd.DataFrame`.
Defaults to ``None``.
drop_blanks : bool, optional
If ``True``, remove lines/rows that have any blank
        values. These lines/rows are removed after the
        data set is read into a `pd.DataFrame`.
Defaults to ``False``.
pandas_kwargs : dict or None, optional
Arguments that will be passed directly
to the `pandas` I/O reader.
Expand All @@ -818,8 +885,17 @@ class TSVReader(CSVReader):
Other arguments to the Reader object.
"""

def __init__(self, path_or_list, pandas_kwargs=None, **kwargs):
super(TSVReader, self).__init__(path_or_list, pandas_kwargs, **kwargs)
def __init__(self,
             path_or_list,
             replace_blanks_with=None,
             drop_blanks=False,
             pandas_kwargs=None,
             **kwargs):
    """
    Initialize a reader for tab-delimited files.

    Parameters
    ----------
    path_or_list : str
        The path to a tab-delimited file.
    replace_blanks_with : value, ``dict``, or ``None``, optional
        Replacement value(s) for blanks; forwarded unchanged to
        ``CSVReader``.
        Defaults to ``None``.
    drop_blanks : bool, optional
        If ``True``, drop rows/lines with any blank values;
        forwarded unchanged to ``CSVReader``.
        Defaults to ``False``.
    pandas_kwargs : dict or None, optional
        Arguments passed directly to the ``pandas`` I/O reader.
        Defaults to ``None``.
    kwargs : dict, optional
        Other arguments to the Reader object.
    """
    # All reading/parsing behavior is inherited from CSVReader;
    # only the field separator differs.
    super(TSVReader, self).__init__(path_or_list,
                                    replace_blanks_with=replace_blanks_with,
                                    drop_blanks=drop_blanks,
                                    pandas_kwargs=pandas_kwargs,
                                    **kwargs)
    # Override the comma separator that CSVReader.__init__ set
    # with a tab character.
    self._sep = str('\t')


Expand Down
75 changes: 50 additions & 25 deletions skll/utilities/filter_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os
import sys

from skll.data.readers import EXT_TO_READER
from skll.data.readers import EXT_TO_READER, safe_float
from skll.data.writers import ARFFWriter, CSVWriter, TSVWriter, EXT_TO_WRITER
from skll.version import __version__

Expand All @@ -31,43 +31,52 @@ def main(argv=None):

# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes an input feature file and removes any instances or\
features that do not match the specified patterns.",
description="Takes an input feature file and removes any instances or "
"features that do not match the specified patterns.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('infile',
help='input feature file (ends in .arff, .csv, \
.jsonlines, .megam, .ndj, or .tsv)')
help='input feature file (ends in .arff, .csv, '
'.jsonlines, .megam, .ndj, or .tsv)')
parser.add_argument('outfile',
help='output feature file (must have same extension as\
input file)')
help='output feature file (must have same extension as '
'input file)')
parser.add_argument('-f', '--feature',
help='A feature in the feature file you would like to \
keep. If unspecified, no features are removed.',
help='A feature in the feature file you would like to '
'keep. If unspecified, no features are removed.',
nargs='*')
parser.add_argument('-I', '--id',
help='An instance ID in the feature file you would \
like to keep. If unspecified, no instances are \
removed based on their IDs.',
help='An instance ID in the feature file you would '
'like to keep. If unspecified, no instances are '
'removed based on their IDs.',
nargs='*')
parser.add_argument('--id_col',
help='Name of the column which contains the instance \
IDs in ARFF, CSV, or TSV files.',
help='Name of the column which contains the instance '
'IDs in ARFF, CSV, or TSV files.',
default='id')
parser.add_argument('-i', '--inverse',
help='Instead of keeping features and/or examples in \
lists, remove them.',
help='Instead of keeping features and/or examples in '
'lists, remove them.',
action='store_true')
parser.add_argument('-L', '--label',
help='A label in the feature file you would like to \
keep. If unspecified, no instances are removed \
based on their labels.',
help='A label in the feature file you would like to '
'keep. If unspecified, no instances are removed '
'based on their labels.',
nargs='*')
parser.add_argument('-l', '--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
help='Name of the column which contains the class '
'labels in ARFF, CSV, or TSV files. For ARFF '
'files, this must be the final column to count as '
'the label.',
default='y')
parser.add_argument('-rb', '--replace_blanks_with',
help='Specifies a new value with which to replace blank values '
'in all columns in the file. To replace blanks differently '
'in each column, use the SKLL Reader API directly.',
default=None)
parser.add_argument('-db', '--drop_blanks',
action='store_true',
help='Drop all lines/rows that have any blank values.',
default=False)
parser.add_argument('-q', '--quiet',
help='Suppress printing of "Loading..." messages.',
action='store_true')
Expand Down Expand Up @@ -104,10 +113,26 @@ def main(argv=None):
'file. You specified: {}').format(output_extension))
sys.exit(1)

if input_extension == '.csv' or input_extension == '.tsv':
replace_blanks_with = args.replace_blanks_with
drop_blanks = args.drop_blanks
if drop_blanks and replace_blanks_with is not None:
raise ValueError("You cannot both drop blanks and replace them. "
"'replace_blanks_with' can only have a value when "
"'drop_blanks' is `False`.")
replace_blanks_with = (None if replace_blanks_with is None
else safe_float(replace_blanks_with))
kwargs = {'replace_blanks_with': replace_blanks_with,
'drop_blanks': drop_blanks}
else:
kwargs = {}

# Read input file
reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet,
reader = EXT_TO_READER[input_extension](args.infile,
quiet=args.quiet,
label_col=args.label_col,
id_col=args.id_col)
id_col=args.id_col,
**kwargs)
feature_set = reader.read()

# Do the actual filtering
Expand Down

0 comments on commit aba9003

Please sign in to comment.