Merge pull request #440 from EducationalTestingService/possible-ndjwriter-bug

Fix NDJWriter bug
EducationalTestingService · Dec 5, 2018 · ac80bb0 · ac80bb0
2 parents 48e5f1f + d795a81
commit ac80bb0
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 10 deletions.
diff --git a/skll/data/writers.py b/skll/data/writers.py
@@ -237,24 +237,24 @@ class DelimitedFileWriter(Writer):
         type. For example ``/foo/.csv``.
     feature_set : skll.FeatureSet
         The ``FeatureSet`` instance to dump to the output file.
-    quiet : bool
+    quiet : bool, optional
         Do not print "Writing..." status message to stderr.
         Defaults to ``True``.
-    label_col : str
+    label_col : str, optional
         Name of the column which contains the class labels
         for ARFF/CSV/TSV files. If no column with that name
         exists, or ``None`` is specified, the data is
         considered to be unlabelled.
         Defaults to ``'y'``.
-    id_col : str
+    id_col : str, optional
         Name of the column which contains the instance IDs.
         If no column with that name exists, or ``None`` is
         specified, example IDs will be automatically generated.
         Defaults to ``'id'``.
-    dialect : str
-        Name of the column which contains the class labels for
-        CSV/TSV files.
-    logger : logging.Logger
+    dialect : str, optional
+        The dialect to use for writing out the delimited file.
+        Defaults to ``'excel-tab'``.
+    logger : logging.Logger, optional
         A logger instance to use to log messages instead of creating
         a new one by default.
         Defaults to ``None``.
@@ -585,9 +585,21 @@ def _write_line(self, id_, label_, feat_dict, output_file):
         """
         example_dict = {}
         # Don't try to add class column if this is label-less data
+        # Try to convert the label to a native Python scalar, assuming
+        # it's a numpy type (e.g., int64); if it has no ``.item()``
+        # method (e.g., a plain str or int), then use it as is
         if self.feat_set.has_labels:
-            example_dict['y'] = np.asscalar(label_)
-        example_dict['id'] = np.asscalar(id_)
+            try:
+                example_dict['y'] = label_.item()
+            except AttributeError:
+                example_dict['y'] = label_
+        # Try to convert the ID to a native Python scalar, assuming
+        # it's a numpy type (e.g., int64); if it has no ``.item()``
+        # method (e.g., a plain str or int), then use it as is
+        try:
+            example_dict['id'] = id_.item()
+        except AttributeError:
+            example_dict['id'] = id_
         example_dict["x"] = feat_dict
         print(json.dumps(example_dict, sort_keys=True), file=output_file)
 

diff --git a/tests/test_featureset.py b/tests/test_featureset.py
@@ -25,7 +25,7 @@
 from sklearn.datasets.samples_generator import make_classification
 
 import skll
-from skll.data import FeatureSet, Writer, Reader
+from skll.data import FeatureSet, Writer, Reader, NDJReader, NDJWriter
 from skll.data.readers import DictListReader
 from skll.experiments import _load_featureset
 from skll.learner import _DEFAULT_PARAM_GRIDS
@@ -62,6 +62,11 @@ def tearDown():
         if exists(filepath):
             os.unlink(filepath)
 
+    filepaths = [join(_my_dir, 'other', '{}.jsonlines'.format(x)) for x in ['test_string_ids', 'test_string_ids_df', 'test_string_labels_df']]
+    for filepath in filepaths:
+        if exists(filepath):
+            os.unlink(filepath)
+
 
 def _create_empty_file(filetype):
     filepath = join(_my_dir, 'other', 'empty.{}'.format(filetype))
@@ -90,6 +95,7 @@ def test_empty_ids():
     # create a feature set with ids set to None and raise ValueError
     FeatureSet('test', None, features=features, labels=y)
 
+
 @raises(ValueError)
 def check_empty_file_read(filetype, reader_type):
     empty_filepath = _create_empty_file(filetype)
@@ -972,3 +978,79 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer():
                         rtol=1e-6) and
             np.all(np.isnan(expected.labels)) and
             np.all(np.isnan(current.labels)))
+
+
+def test_writing_ndj_featureset_with_string_ids():
+    test_dict_vectorizer = DictVectorizer()
+    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+    fs_test = FeatureSet('test',
+                         ids=['1', '2'],
+                         labels=[1, 2],
+                         features=Xtest,
+                         vectorizer=test_dict_vectorizer)
+    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
+    test_writer = NDJWriter(output_path, fs_test)
+    test_writer.write()
+
+    # read in the written file into a featureset and confirm that the
+    # two featuresets are equal
+    fs_test2 = NDJReader.for_path(output_path).read()
+
+    assert fs_test == fs_test2
+
+
+@attr('have_pandas_and_seaborn')
+def test_featureset_creation_from_dataframe_with_string_ids():
+
+    import pandas
+
+    dftest = pandas.DataFrame({"id": ['1', '2'],
+                               "score": [1, 2],
+                               "text": ["a b", "b c"]})
+    dftest.set_index("id", inplace=True)
+    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+    test_dict_vectorizer = DictVectorizer()
+    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+    fs_test = FeatureSet('test',
+                         ids=dftest.index.values,
+                         labels=dftest['score'].values,
+                         features=Xtest,
+                         vectorizer=test_dict_vectorizer)
+    output_path = join(_my_dir, "other", "test_string_ids_df.jsonlines")
+    test_writer = NDJWriter(output_path, fs_test)
+    test_writer.write()
+
+    # read in the written file into a featureset and confirm that the
+    # two featuresets are equal
+    fs_test2 = NDJReader.for_path(output_path).read()
+
+    assert fs_test == fs_test2
+
+
+@attr('have_pandas_and_seaborn')
+def test_featureset_creation_from_dataframe_with_string_labels():
+
+    import pandas
+
+    dftest = pandas.DataFrame({"id": [1, 2],
+                               "score": ['yes', 'no'],
+                               "text": ["a b", "b c"]})
+    dftest.set_index("id", inplace=True)
+    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+    test_dict_vectorizer = DictVectorizer()
+    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+    fs_test = FeatureSet('test',
+                         ids=dftest.index.values,
+                         labels=dftest['score'].values,
+                         features=Xtest,
+                         vectorizer=test_dict_vectorizer)
+    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
+    test_writer = NDJWriter(output_path, fs_test)
+    test_writer.write()
+
+    # read in the written file into a featureset and confirm that the
+    # two featuresets are equal
+    fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()
+
+    assert fs_test == fs_test2