Skip to content

Commit

Permalink
Merge pull request #440 from EducationalTestingService/possible-ndjwr…
Browse files Browse the repository at this point in the history
…iter-bug

Fix NDJWriter bug
  • Loading branch information
desilinguist committed Dec 5, 2018
2 parents 48e5f1f + d795a81 commit ac80bb0
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 10 deletions.
30 changes: 21 additions & 9 deletions skll/data/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,24 +237,24 @@ class DelimitedFileWriter(Writer):
type. For example ``/foo/.csv``.
feature_set : skll.FeatureSet
The ``FeatureSet`` instance to dump to the output file.
quiet : bool
quiet : bool, optional
Do not print "Writing..." status message to stderr.
Defaults to ``True``.
label_col : str
label_col : str, optional
Name of the column which contains the class labels
for ARFF/CSV/TSV files. If no column with that name
exists, or ``None`` is specified, the data is
considered to be unlabelled.
Defaults to ``'y'``.
id_col : str
id_col : str, optional
Name of the column which contains the instance IDs.
If no column with that name exists, or ``None`` is
specified, example IDs will be automatically generated.
Defaults to ``'id'``.
dialect : str
Name of the column which contains the class labels for
CSV/TSV files.
logger : logging.Logger
dialect : str, optional
The dialect to use for writing out the delimited file.
Defaults to ``'excel-tab'``.
logger : logging.Logger, optional
A logger instance to use to log messages instead of creating
a new one by default.
Defaults to ``None``.
Expand Down Expand Up @@ -585,9 +585,21 @@ def _write_line(self, id_, label_, feat_dict, output_file):
"""
example_dict = {}
# Don't try to add class column if this is label-less data
# Try to convert the label to a native Python scalar, assuming
# it's a numpy scalar type (e.g., int64); if that doesn't work,
# use it as is
if self.feat_set.has_labels:
example_dict['y'] = np.asscalar(label_)
example_dict['id'] = np.asscalar(id_)
try:
example_dict['y'] = label_.item()
except AttributeError:
example_dict['y'] = label_
# Try to convert the ID to a native Python scalar, assuming
# it's a numpy scalar type (e.g., int64); if that doesn't work,
# use it as is
try:
example_dict['id'] = id_.item()
except AttributeError:
example_dict['id'] = id_
example_dict["x"] = feat_dict
print(json.dumps(example_dict, sort_keys=True), file=output_file)

Expand Down
84 changes: 83 additions & 1 deletion tests/test_featureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from sklearn.datasets.samples_generator import make_classification

import skll
from skll.data import FeatureSet, Writer, Reader
from skll.data import FeatureSet, Writer, Reader, NDJReader, NDJWriter
from skll.data.readers import DictListReader
from skll.experiments import _load_featureset
from skll.learner import _DEFAULT_PARAM_GRIDS
Expand Down Expand Up @@ -62,6 +62,11 @@ def tearDown():
if exists(filepath):
os.unlink(filepath)

filepaths = [join(_my_dir, 'other', '{}.jsonlines'.format(x)) for x in ['test_string_ids', 'test_string_ids_df', 'test_string_labels_df']]
for filepath in filepaths:
if exists(filepath):
os.unlink(filepath)


def _create_empty_file(filetype):
filepath = join(_my_dir, 'other', 'empty.{}'.format(filetype))
Expand Down Expand Up @@ -90,6 +95,7 @@ def test_empty_ids():
# create a feature set with ids set to None and raise ValueError
FeatureSet('test', None, features=features, labels=y)


@raises(ValueError)
def check_empty_file_read(filetype, reader_type):
empty_filepath = _create_empty_file(filetype)
Expand Down Expand Up @@ -972,3 +978,79 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer():
rtol=1e-6) and
np.all(np.isnan(expected.labels)) and
np.all(np.isnan(current.labels)))


def test_writing_ndj_featureset_with_string_ids():
    """
    Write a featureset with string IDs to a ``.jsonlines`` file and
    check that reading the file back yields an equal featureset.
    """
    # build a tiny two-example feature matrix from feature dictionaries
    vectorizer = DictVectorizer()
    feature_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    feature_matrix = vectorizer.fit_transform(feature_dicts)

    # the IDs are plain Python strings; the labels are plain Python ints
    original_fs = FeatureSet('test',
                             ids=['1', '2'],
                             labels=[1, 2],
                             features=feature_matrix,
                             vectorizer=vectorizer)

    # dump the featureset to disk ...
    jsonlines_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    NDJWriter(jsonlines_path, original_fs).write()

    # ... and load it back so the two featuresets can be compared
    reloaded_fs = NDJReader.for_path(jsonlines_path).read()

    assert original_fs == reloaded_fs


@attr('have_pandas_and_seaborn')
def test_featureset_creation_from_dataframe_with_string_ids():
    """
    Build a featureset whose IDs and labels come from a pandas data
    frame with a string-valued index, write it out as ``.jsonlines``
    and check that reading the file back yields an equal featureset.
    """
    import pandas

    # data frame indexed by string IDs, with integer scores as labels
    frame = pandas.DataFrame({"id": ['1', '2'],
                              "score": [1, 2],
                              "text": ["a b", "b c"]})
    frame.set_index("id", inplace=True)

    # vectorize the feature dictionaries into a feature matrix
    feature_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(feature_dicts)

    # IDs and labels are passed as the arrays backing the data frame
    original_fs = FeatureSet('test',
                             ids=frame.index.values,
                             labels=frame['score'].values,
                             features=feature_matrix,
                             vectorizer=vectorizer)

    # dump the featureset to disk ...
    jsonlines_path = join(_my_dir, "other", "test_string_ids_df.jsonlines")
    NDJWriter(jsonlines_path, original_fs).write()

    # ... and load it back so the two featuresets can be compared
    reloaded_fs = NDJReader.for_path(jsonlines_path).read()

    assert original_fs == reloaded_fs


@attr('have_pandas_and_seaborn')
def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Build a featureset whose IDs and labels come from a pandas data
    frame with string-valued labels, write it out as ``.jsonlines``
    and check that reading the file back yields an equal featureset.
    """
    import pandas

    # data frame indexed by integer IDs, with string scores as labels
    frame = pandas.DataFrame({"id": [1, 2],
                              "score": ['yes', 'no'],
                              "text": ["a b", "b c"]})
    frame.set_index("id", inplace=True)

    # vectorize the feature dictionaries into a feature matrix
    feature_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(feature_dicts)

    # IDs and labels are passed as the arrays backing the data frame
    original_fs = FeatureSet('test',
                             ids=frame.index.values,
                             labels=frame['score'].values,
                             features=feature_matrix,
                             vectorizer=vectorizer)

    # dump the featureset to disk ...
    jsonlines_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    NDJWriter(jsonlines_path, original_fs).write()

    # ... and load it back, asking the reader for ``ids_to_floats``
    # since the IDs written out here are numeric
    reader = NDJReader.for_path(jsonlines_path, ids_to_floats=True)
    reloaded_fs = reader.read()

    assert original_fs == reloaded_fs

0 comments on commit ac80bb0

Please sign in to comment.