Merge pull request #453 from EducationalTestingService/unlabelled-readwrite-compatibility

Properly handle unlabeled data in multiple places in SKLL
desilinguist committed Feb 11, 2019
2 parents e911a0f + df20b46 commit 0b13413
Showing 5 changed files with 115 additions and 41 deletions.
4 changes: 3 additions & 1 deletion skll/data/featureset.py
@@ -413,7 +413,9 @@ def has_labels(self):
has_labels : bool
Whether or not this FeatureSet has any finite labels.
"""
if self.labels is not None:
# make sure that labels is not None or a list of Nones
if self.labels is not None and not all(label is None for label in self.labels):
# then check that they are not a list of NaNs
return not (np.issubdtype(self.labels.dtype, np.floating) and
np.isnan(np.min(self.labels)))
else:
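
A minimal sketch of the fixed check (hedged: it assumes the FeatureSet constructor exactly as used in the tests further down, and that the hidden else branch returns False):

from skll.data.featureset import FeatureSet

# Hypothetical unlabeled data: no labels passed at construction time.
features = [{"f1": 1.0}, {"f1": 2.0}, {"f1": 3.0}]
fs = FeatureSet("unlabeled", ["a", "b", "c"], features=features)
print(fs.has_labels)   # expected: False, since labels is None

# The added guard also covers a labels attribute that is a list of
# Nones, which the tests below construct explicitly.
fs.labels = [None] * 3
print(fs.has_labels)   # expected: False, since all labels are None
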
5 changes: 3 additions & 2 deletions skll/data/writers.py
@@ -458,7 +458,7 @@ def _write_header(self, feature_set, output_file, filter_features):
features to include in this file.
"""
fieldnames = self._get_fieldnames(filter_features)
if self.label_col in fieldnames:
if self.label_col and self.label_col in fieldnames:
fieldnames.remove(self.label_col)

# Add relation to header
@@ -480,7 +480,8 @@ def _write_header(self, feature_set, output_file, filter_features):
"{" + ','.join(map(str,
sorted(set(self.feat_set.labels)))) +
"}", file=output_file)
fieldnames.append(self.label_col)
if self.label_col:
fieldnames.append(self.label_col)

# Create CSV writer to handle missing values for lines in data section
# and to ignore the instance values for non-numeric attributes
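
A hedged sketch of the writer-side fix, based on the Writer.for_path calls in the tests below: with label_col=None, _write_header neither removes nor appends a label field, so unlabeled featuresets can be written without error.

from skll.data.featureset import FeatureSet
from skll.data.writers import Writer

features = [{"f1": 1}, {"f1": 2}]
fs = FeatureSet("unlabeled", ["ex1", "ex2"], features=features)

# label_col=None signals that there is no label column to write; the
# output file then contains only the id and feature columns.
Writer.for_path("unlabeled.csv", fs, label_col=None).write()
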
29 changes: 19 additions & 10 deletions skll/utilities/skll_convert.py
@@ -63,12 +63,18 @@ def main(argv=None):
help='Name of the column which contains the instance \
IDs in ARFF, CSV, or TSV files.',
default='id')
parser.add_argument('-l', '--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
default='y')
label_group = parser.add_mutually_exclusive_group(required=False)
label_group.add_argument('-l',
'--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
default='y')
label_group.add_argument('--no_labels',
action='store_true',
default=False,
help='Used to indicate that the input data has no labels.')
parser.add_argument('-q', '--quiet',
help='Suppress printing of "Loading..." messages.',
action='store_true')
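
A standalone sketch of the argparse pattern introduced above: either option alone parses fine, but combining them makes parse_args() raise SystemExit.

import argparse

parser = argparse.ArgumentParser()
label_group = parser.add_mutually_exclusive_group(required=False)
label_group.add_argument('-l', '--label_col', default='y')
label_group.add_argument('--no_labels', action='store_true', default=False)

print(parser.parse_args(['--no_labels']))
# -> Namespace(label_col='y', no_labels=True)

# Contradictory flags: argparse prints a usage error and exits.
parser.parse_args(['--no_labels', '--label_col', 't'])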
@@ -132,19 +138,22 @@ def main(argv=None):
feat_vectorizer = None
label_map = None

label_col = None if args.no_labels else args.label_col

# Iterate through input file and collect the information we need
reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet,
label_col=args.label_col,
reader = EXT_TO_READER[input_extension](args.infile,
quiet=args.quiet,
label_col=label_col,
id_col=args.id_col)
feature_set = reader.read()
# write out the file in the requested output format
writer_type = EXT_TO_WRITER[output_extension]
writer_args = {'quiet': args.quiet}
if writer_type is DelimitedFileWriter:
writer_args['label_col'] = args.label_col
writer_args['label_col'] = label_col
writer_args['id_col'] = args.id_col
elif writer_type is ARFFWriter:
writer_args['label_col'] = args.label_col
writer_args['label_col'] = label_col
writer_args['id_col'] = args.id_col
writer_args['regression'] = args.arff_regression
writer_args['relation'] = args.arff_relation
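
Putting the pieces together, a usage sketch (file names hypothetical) mirroring the new test added in tests/test_utilities.py below:

from skll.utilities import skll_convert

# Convert an unlabeled CSV to LibSVM format: --no_labels makes the
# script pass label_col=None to both the reader and the writer.
skll_convert.main(['--quiet', '--no_labels', 'unlabeled.csv', 'unlabeled.libsvm'])
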
107 changes: 80 additions & 27 deletions tests/test_featureset.py
@@ -761,7 +761,7 @@ def test_dict_list_reader():


# Tests related to converting featuresets
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
num_examples = 500
num_feats_per_file = 7

@@ -774,33 +774,49 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
# Create lists we will write files from
ids = []
features = []
labels = []
labels = [] if with_labels else None
for j in range(num_examples):
y = "dog" if j % 2 == 0 else "cat"
ex_id = "{}{}".format(y, j)
x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
# if we are not using labels, we do not want zero-valued features
# because it may be the case that some subset of features end up
# being all 0 and if this subset ends up being written out to a file
# below, then for some formats (e.g., megam) nothing will get written
# out which can cause issues when reading this file
lowest_feature_value = 0 if with_labels else 1
x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
in range(num_feat_files * num_feats_per_file)}
x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
ids.append(ex_id)
labels.append(y)
if with_labels:
labels.append(y)
features.append(x)

# Create vectorizers/maps for libsvm subset writing
feat_vectorizer = DictVectorizer()
feat_vectorizer.fit(features)
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
if with_labels:
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
else:
label_map = None

# get the feature name prefix
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Write out unmerged features in the `from_suffix` file format
for i in range(num_feat_files):
train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
i, from_suffix))
train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
i,
with_labels_part,
from_suffix))
sub_features = []
for example_num in range(num_examples):
feat_num = i * num_feats_per_file
@@ -814,26 +830,43 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
if from_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif from_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()

# Write out the merged features in the `to_suffix` file format
train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
to_suffix))
train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
with_labels_part,
to_suffix))
train_fs = FeatureSet('train', ids, labels=labels, features=features,
vectorizer=feat_vectorizer)

# we need to do this to get around the FeatureSet using NaNs
# instead of None when there are no labels which causes problems
# later when comparing featuresets
if not with_labels:
train_fs.labels = [None] * len(train_fs.labels)

if to_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif to_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()
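
The workaround above exists because NaN never compares equal to itself, so label arrays full of NaNs would make two otherwise-identical featuresets compare unequal; a two-line illustration:

import numpy as np

print(np.nan == np.nan)              # False: NaN is unequal even to itself
print([None, None] == [None, None])  # True: lists of Nones compare cleanly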


def check_convert_featureset(from_suffix, to_suffix):
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
num_feat_files = 5

# Create test data
make_conversion_data(num_feat_files, from_suffix, to_suffix)
make_conversion_data(num_feat_files,
from_suffix,
to_suffix,
with_labels=with_labels)

# the path to the unmerged feature files
dirpath = join(_my_dir, 'train', 'test_conversion')
@@ -842,30 +875,48 @@ def check_convert_featureset(from_suffix, to_suffix):
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Load each unmerged feature file in the `from_suffix` format and convert
# it to the `to_suffix` format
for feature in range(num_feat_files):
input_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature, to_suffix))
skll_convert.main(['--quiet', input_file_path, output_file_path])
input_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
to_suffix))
skll_convert_args = ['--quiet', input_file_path, output_file_path]
if not with_labels:
skll_convert_args.append('--no_labels')
skll_convert.main(skll_convert_args)

# now load and merge all unmerged, converted features in the `to_suffix`
# format
featureset = ['{}_{}'.format(feature_name_prefix, i) for i in
featureset = ['{}_{}{}'.format(feature_name_prefix, i, with_labels_part) for i in
range(num_feat_files)]
merged_exs = _load_featureset(dirpath, featureset, to_suffix,
label_col = 'y' if with_labels else None
merged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# Load pre-merged data in the `to_suffix` format
featureset = ['{}_all'.format(feature_name_prefix)]
premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
premerged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# make sure that the pre-generated merged data in the to_suffix format
# is the same as the converted, merged data in the to_suffix format

# first check the IDs
assert_array_equal(merged_exs.ids, premerged_exs.ids)
assert_array_equal(merged_exs.labels, premerged_exs.labels)
for (_, _, merged_feats), (_, _, premerged_feats) in zip(merged_exs,
@@ -877,11 +928,13 @@

def test_convert_featureset():
# Test the conversion from every format to every other format
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
# with and without labels
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
yield check_convert_featureset, from_suffix, to_suffix
yield check_convert_featureset, from_suffix, to_suffix, True
yield check_convert_featureset, from_suffix, to_suffix, False
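
For reference, nose runs each yielded tuple as a separate test case; the unlabeled variant is equivalent to calling the helper directly within this module, e.g.:

# One generated case: round-trip CSV to LibSVM without labels.
check_convert_featureset('.csv', '.libsvm', with_labels=False)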


def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
11 changes: 10 additions & 1 deletion tests/test_utilities.py
@@ -766,7 +766,7 @@ def check_skll_convert(from_suffix, to_suffix):


def test_skll_convert():
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
@@ -834,6 +834,15 @@ def test_skll_convert_libsvm_map():
eq_(orig_fs, converted_fs)


@raises(SystemExit)
def test_skll_convert_no_labels_with_label_col():
"""
Check that --no_labels/--label_col cannot both be specified for skll_convert
"""
skll_convert_cmd = ['--no_labels', '--label_col', 't', 'foo.tsv', 'foo.libsvm']
sk.main(argv=skll_convert_cmd)


def check_print_model_weights(task='classification'):

# create some simple classification or regression data
