Merge pull request #566 from EducationalTestingService/pos-label-str-…

…tests-and-fixes Major fixes for `pos_label_str`
EducationalTestingService · Oct 21, 2019 · e6da3ca · e6da3ca
2 parents b450a9f + 8e8bd11
commit e6da3ca
Show file tree

Hide file tree

Showing 8 changed files with 717 additions and 573 deletions.
diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst
@@ -1043,9 +1043,16 @@ SVR
 pos_label_str *(Optional)*
 """"""""""""""""""""""""""
 
-The string label for the positive class in the binary
-classification setting. If unspecified, an arbitrary class is
-picked.
+A string denoting the label of the class to be
+treated as the positive class in a binary classification
+setting. If unspecified, the class represented by the label
+that appears second when sorted is chosen as the positive
+class. For example, if the two labels in data are "A" and
+"B" and ``pos_label_str`` is not specified, "B" will be chosen
+as the positive class.
+
+Note that ``pos_label_str`` will be ignored if you have more
+than two classes.
 
 .. _output:
 

diff --git a/doc/utilities.rst b/doc/utilities.rst
@@ -121,7 +121,15 @@ generate_predictions
 
 Loads a trained model and outputs predictions based on input feature files.
 Useful if you want to reuse a trained model as part of a larger system without
-creating configuration files.
+creating configuration files. Offers the following modes of operation:
+
+- For non-probabilistic classification and regression, generate the predictions.
+- For probabilistic classification, generate either the most likely labels 
+  or the probabilities for each class label.
+- For binary probabilistic classification, generate the positive class label
+  only if its probability exceeds the given threshold. The positive class
+  label is either read from the model file or inferred the same way as 
+  a SKLL learner would.
 
 Positional Arguments
 ^^^^^^^^^^^^^^^^^^^^
@@ -136,11 +144,6 @@ Positional Arguments
 
 Optional Arguments
 ^^^^^^^^^^^^^^^^^^
-.. option:: -a, --all_probabilities
-
-    Flag indicating whether to output the probabilities of all labels instead of just
-    the probability of the positive label.
-
 .. option:: -i <id_col>, --id_col <id_col>
 
     Name of the column which contains the instance IDs in ARFF, CSV, or TSV files.
@@ -152,22 +155,26 @@ Optional Arguments
     For ARFF files, this must be the final column to count as the label. 
     (default: ``y``)
 
-.. option:: -p <positive_label>, --positive_label <positive_label>
-
-    If the model is only being used to predict the probability of a particular
-    label, this specifies the index of the label we're predicting. 1 = second
-    label, which is default for binary classification. Keep in mind that labels
-    are sorted lexicographically. 
-    (default: 1)
+.. option:: -o <path>, --output_file <path>
+
+    Path to output TSV file. If not specified, predictions will be printed
+    to stdout. For probabilistic binary classification, the probability of
+    the positive class will always be in the last column.
+
+.. option:: -p, --predict_labels
+
+    If the model does probabilistic classification, output the class label
+    with the highest probability instead of the class probabilities.
 
 .. option:: -q, --quiet
 
     Suppress printing of ``"Loading..."`` messages.
 
 .. option:: -t <threshold>, --threshold <threshold>
 
-    If the model we're using is generating probabilities of the positive label,
-    return 1 if it meets/exceeds the given threshold and 0 otherwise.
+    If the model does binary probabilistic classification,
+    return the positive class label only if its probability meets/exceeds
+    the given threshold and the other class label otherwise.
 
 .. option:: --version
 

diff --git a/skll/config.py b/skll/config.py
@@ -23,6 +23,7 @@
 from sklearn.metrics import SCORERS
 
 from skll import get_skll_logger
+from skll.data.readers import safe_float
 from skll.metrics import _PROBABILISTIC_METRICS
 
 _VALID_TASKS = frozenset(['cross_validate',
@@ -606,7 +607,7 @@ def _parse_config_file(config_path, log_level=logging.INFO):
                                                     "sampler_parameters"))
     fixed_sampler_parameters = yaml.safe_load(fixed_sampler_parameters)
     param_grid_list = yaml.safe_load(_fix_json(config.get("Tuning", "param_grids")))
-    pos_label_str = config.get("Tuning", "pos_label_str")
+    pos_label_str = safe_float(config.get("Tuning", "pos_label_str"))
 
     # ensure that feature_scaling is specified only as one of the
     # four available choices

diff --git a/skll/learner.py b/skll/learner.py
@@ -76,6 +76,7 @@
 
 from skll.data import FeatureSet
 from skll.data.dict_vectorizer import DictVectorizer
+from skll.data.readers import safe_float
 from skll.metrics import (_CLASSIFICATION_ONLY_METRICS,
                           _CORRELATION_METRICS,
                           _REGRESSION_ONLY_METRICS,
@@ -869,9 +870,13 @@ class Learner(object):
         initializer for the specified model.
         Defaults to ``None``.
     pos_label_str : str, optional
-        The string for the positive label in the binary
-        classification setting.  Otherwise, an arbitrary
-        label is picked.
+        A string denoting the label of the class to be
+        treated as the positive class in a binary classification
+        setting. If ``None``, the class represented by the label
+        that appears second when sorted is chosen as the positive
+        class. For example, if the two labels in data are "A"
+        and "B" and ``pos_label_str`` is not specified, "B" will
+        be chosen as the positive class.
         Defaults to ``None``.
     min_feature_count : int, optional
         The minimum number of examples a feature
@@ -910,7 +915,7 @@ def __init__(self, model_type, probability=False, pipeline=False,
         self.scaler = None
         self.label_dict = None
         self.label_list = None
-        self.pos_label_str = pos_label_str
+        self.pos_label_str = safe_float(pos_label_str) if pos_label_str is not None else pos_label_str
         self._model = None
         self._store_pipeline = pipeline
         self._feature_scaling = feature_scaling
@@ -1415,15 +1420,24 @@ def _create_label_dict(self, examples):
         if self.model_type._estimator_type == 'regressor':
             return
 
-        # extract list of unique labels if we are doing classification
+        # extract list of unique labels if we are doing classification;
+        # note that the output of np.unique() is sorted
         self.label_list = np.unique(examples.labels).tolist()
 
-        # if one label is specified as the positive class, make sure it's
-        # last
-        if self.pos_label_str:
-            self.label_list = sorted(self.label_list,
-                                     key=lambda x: (x == self.pos_label_str,
-                                                    x))
+        # for binary classification, if one label is specified as
+        # the positive class, re-sort the label list to make sure
+        # that it is last in the list; for multi-class classification
+        # raise a warning and set it back to None, since it does not
+        # make any sense anyway
+        if self.pos_label_str is not None:
+            if len(self.label_list) != 2:
+                self.logger.warning('Ignoring value of `pos_label_str` for '
+                                    'multi-class classification.')
+                self.pos_label_str = None
+            else:
+                self.label_list = sorted(self.label_list,
+                                         key=lambda x: (x == self.pos_label_str,
+                                                        x))
 
         # Given a list of all labels in the dataset and a list of the
         # unique labels in the set, convert the first list to an array of
@@ -1509,7 +1523,8 @@ def train(self, examples, param_grid=None, grid_search_folds=3,
             values.  This should only be done once per
             experiment, so when ``cross_validate`` calls
             ``train``, ``create_label_dict`` gets set to
-            ``False``.
+            ``False``. This option is only for internal
+            use.
             Defaults to ``True``.
 
         Returns