Update test_convert_featureset to also test for unlabeled data
- Remove the standalone no-labels test and its fixture files, which this change makes redundant.
desilinguist committed Feb 5, 2019
1 parent faf511e commit a84f7dc
Showing 3 changed files with 83 additions and 57 deletions.
4 changes: 0 additions & 4 deletions tests/other/featureset_no_labels_expected.jsonlines

This file was deleted.

131 changes: 83 additions & 48 deletions tests/test_featureset.py
@@ -18,14 +18,14 @@
from os.path import abspath, dirname, exists, join

import numpy as np
from nose.tools import eq_, raises, assert_not_equal, assert_equal
from nose.tools import eq_, raises, assert_not_equal
from nose.plugins.attrib import attr
from numpy.testing import assert_array_equal
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.datasets.samples_generator import make_classification

import skll
from skll.data import FeatureSet, Writer, Reader, CSVReader, NDJWriter
from skll.data import FeatureSet, Writer, Reader
from skll.data.readers import DictListReader
from skll.experiments import _load_featureset
from skll.learner import _DEFAULT_PARAM_GRIDS
@@ -90,6 +90,7 @@ def test_empty_ids():
# create a feature set with ids set to None and raise ValueError
FeatureSet('test', None, features=features, labels=y)


@raises(ValueError)
def check_empty_file_read(filetype, reader_type):
empty_filepath = _create_empty_file(filetype)
@@ -755,7 +756,7 @@ def test_dict_list_reader():


# Tests related to converting featuresets
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
num_examples = 500
num_feats_per_file = 7

@@ -768,33 +769,49 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
# Create lists we will write files from
ids = []
features = []
labels = []
labels = [] if with_labels else None
for j in range(num_examples):
y = "dog" if j % 2 == 0 else "cat"
ex_id = "{}{}".format(y, j)
x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
# if we are not using labels, we do not want zero-valued features
# because it may be the case that some subset of features end up
# being all 0 and if this subset ends up being written out to a file
# below, then for some formats (e.g., megam) nothing will get written
# out which can cause issues when reading this file
lowest_feature_value = 0 if with_labels else 1
x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
in range(num_feat_files * num_feats_per_file)}
x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
ids.append(ex_id)
labels.append(y)
if with_labels:
labels.append(y)
features.append(x)

# Create vectorizers/maps for libsvm subset writing
feat_vectorizer = DictVectorizer()
feat_vectorizer.fit(features)
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
if with_labels:
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
else:
label_map = None

# get the feature name prefix
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Write out unmerged features in the `from_suffix` file format
for i in range(num_feat_files):
train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
i, from_suffix))
train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
i,
with_labels_part,
from_suffix))
sub_features = []
for example_num in range(num_examples):
feat_num = i * num_feats_per_file
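
The comment about zero-valued features above is the crux of the unlabeled setup: sparse formats such as .megam and .libsvm only write non-zero feature values, so an example whose features all happen to be zero serializes to an empty feature list and cannot survive a round trip. A minimal sketch of the effect, using a hypothetical to_libsvm_line helper that is not part of SKLL:

import numpy as np

# Hypothetical helper mimicking sparse serialization: only non-zero
# feature values get written, as the .libsvm/.megam writers do.
def to_libsvm_line(label, feats, feat_index):
    pairs = ["{}:{}".format(feat_index[name], value)
             for name, value in sorted(feats.items()) if value != 0]
    return "{} {}".format(label, " ".join(pairs))

feat_index = {"f000": 1, "f001": 2}
print(to_libsvm_line(1, {"f000": 0, "f001": 3}, feat_index))  # "1 2:3"
print(to_libsvm_line(1, {"f000": 0, "f001": 0}, feat_index))  # "1 " -- nothing to read back
# Drawing from [1, 5) instead of [0, 4) keeps the same spread while
# guaranteeing that every written feature value is non-zero:
print(np.random.randint(1, 5, size=3))  # e.g. [4 1 3]
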
Expand All @@ -808,26 +825,43 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
if from_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif from_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()

# Write out the merged features in the `to_suffix` file format
train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
to_suffix))
train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
with_labels_part,
to_suffix))
train_fs = FeatureSet('train', ids, labels=labels, features=features,
vectorizer=feat_vectorizer)

# we need to do this to get around the FeatureSet using NaNs
# instead of None when there are no labels which causes problems
# later when comparing featuresets
if not with_labels:
train_fs.labels = [None] * len(train_fs.labels)

if to_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif to_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()


def check_convert_featureset(from_suffix, to_suffix):
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
num_feat_files = 5

# Create test data
make_conversion_data(num_feat_files, from_suffix, to_suffix)
make_conversion_data(num_feat_files,
from_suffix,
to_suffix,
with_labels=with_labels)

# the path to the unmerged feature files
dirpath = join(_my_dir, 'train', 'test_conversion')
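
The labels workaround above guards against a NaN comparison pitfall: per the diff's own comment, FeatureSet stores NaN placeholders when no labels are given, and NaN never compares equal to itself. A short, self-contained illustration:

import numpy as np

# NaN is never equal to itself, so label arrays filled with NaN
# placeholders defeat straightforward equality checks.
labels = np.array([np.nan, np.nan])
print(labels == labels)                # [False False]
print(np.array_equal(labels, labels))  # False
# Swapping the placeholders for None restores sane comparisons:
as_none = [None] * len(labels)
print(as_none == as_none)              # True
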
@@ -836,30 +870,48 @@ def check_convert_featureset(from_suffix, to_suffix):
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Load each unmerged feature file in the `from_suffix` format and convert
# it to the `to_suffix` format
for feature in range(num_feat_files):
input_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature, to_suffix))
skll_convert.main(['--quiet', input_file_path, output_file_path])
input_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
to_suffix))
skll_convert_args = ['--quiet', input_file_path, output_file_path]
if not with_labels:
skll_convert_args.append('--no_labels')
skll_convert.main(skll_convert_args)

# now load and merge all unmerged, converted features in the `to_suffix`
# format
featureset = ['{}_{}'.format(feature_name_prefix, i) for i in
featureset = ['{}_{}{}'.format(feature_name_prefix, i, with_labels_part) for i in
range(num_feat_files)]
merged_exs = _load_featureset(dirpath, featureset, to_suffix,
label_col = 'y' if with_labels else None
merged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# Load pre-merged data in the `to_suffix` format
featureset = ['{}_all'.format(feature_name_prefix)]
premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
premerged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# make sure that the pre-generated merged data in the to_suffix format
# is the same as the converted, merged data in the to_suffix format

# first check the IDs
assert_array_equal(merged_exs.ids, premerged_exs.ids)
assert_array_equal(merged_exs.labels, premerged_exs.labels)
for (_, _, merged_feats), (_, _, premerged_feats) in zip(merged_exs,
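
For reference, the conversion loop above is the in-process equivalent of running SKLL's skll_convert utility once per feature file. A stand-alone sketch, assuming the same skll_convert module the test imports and using placeholder file names that follow the test's naming pattern:

from skll.utilities import skll_convert

# Convert one unlabeled feature file; --no_labels tells the converter
# that the input has no label column (file names are illustrative).
args = ['--quiet',
        'csv_to_jsonlines_0_unlabeled.csv',
        'csv_to_jsonlines_0_unlabeled.jsonlines']
args.append('--no_labels')
skll_convert.main(args)
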
Expand All @@ -871,11 +923,13 @@ def check_convert_featureset(from_suffix, to_suffix):

def test_convert_featureset():
# Test the conversion from every format to every other format
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
# with and without labels
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
yield check_convert_featureset, from_suffix, to_suffix
yield check_convert_featureset, from_suffix, to_suffix, True
yield check_convert_featureset, from_suffix, to_suffix, False


def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
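
Because test_convert_featureset is a nose test generator, every yielded (function, arguments) tuple runs as its own test case, so each ordered format pair is now exercised twice: once labeled and once unlabeled. A self-contained sketch of that expansion, with illustrative fake_check and fake_generator names:

import itertools

def fake_check(from_suffix, to_suffix, with_labels):
    mode = 'labeled' if with_labels else 'unlabeled'
    print('{} -> {} ({})'.format(from_suffix, to_suffix, mode))

def fake_generator():
    for a, b in itertools.permutations(['.csv', '.jsonlines'], 2):
        yield fake_check, a, b, True
        yield fake_check, a, b, False

# What the nose runner effectively does with each yielded tuple:
for func, *args in fake_generator():
    func(*args)
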
@@ -972,22 +1026,3 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer():
rtol=1e-6) and
np.all(np.isnan(expected.labels)) and
np.all(np.isnan(current.labels)))


def test_read_write_no_labels():
featureset_file = os.path.join(_my_dir, 'train',
'featureset_no_labels_input.csv')
reader = CSVReader(featureset_file, label_col=None)
fs = reader.read()
outpath = os.path.join(_my_dir, 'output', 'featureset_no_labels.jsonlines')
writer = NDJWriter(outpath, fs)
writer.write()
expected_path = os.path.join(_my_dir, "other",
"featureset_no_labels_expected.jsonlines")

with open(expected_path) as f_expected:
expected_output = f_expected.read()
with open(outpath) as f_actual:
actual_output = f_actual.read()

assert_equal(actual_output, expected_output)
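
The deleted test's round trip, reading a label-less CSV and writing it back out as .jsonlines against a checked-in expected file, is now covered by the parameterized conversion checks above. For reference, a minimal sketch of what it did, with placeholder file paths and the CSVReader/NDJWriter classes the old test used:

from skll.data import CSVReader, NDJWriter

# Read a CSV that has no label column, then write it out as .jsonlines.
fs = CSVReader('unlabeled_input.csv', label_col=None).read()
NDJWriter('unlabeled_output.jsonlines', fs).write()
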
5 changes: 0 additions & 5 deletions tests/train/featureset_no_labels_input.csv

This file was deleted.
