Merge pull request #453 from EducationalTestingService/unlabelled-readwrite-compatibility

Properly handle unlabeled data in multiple places in SKLL
desilinguist committed Feb 11, 2019
2 parents e911a0f + df20b46 commit 0b13413
Showing 5 changed files with 115 additions and 41 deletions.
4 changes: 3 additions & 1 deletion skll/data/featureset.py
@@ -413,7 +413,9 @@ def has_labels(self):
has_labels : bool
Whether or not this FeatureSet has any finite labels.
"""
if self.labels is not None:
# make sure that labels is not None or a list of Nones
if self.labels is not None and not all(label is None for label in self.labels):
# then check that they are not a list of NaNs
return not (np.issubdtype(self.labels.dtype, np.floating) and
np.isnan(np.min(self.labels)))
else:
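
A minimal sketch of the fixed check (hedged: it assumes the FeatureSet constructor exactly as used in the tests further down, and that the hidden else branch returns False):

from skll.data.featureset import FeatureSet

# Hypothetical unlabeled data: no labels passed at construction time.
features = [{"f1": 1.0}, {"f1": 2.0}, {"f1": 3.0}]
fs = FeatureSet("unlabeled", ["a", "b", "c"], features=features)
print(fs.has_labels)   # expected: False, since labels is None

# The added guard also covers a labels attribute that is a list of
# Nones, which the tests below construct explicitly.
fs.labels = [None] * 3
print(fs.has_labels)   # expected: False, since all labels are None
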
5 changes: 3 additions & 2 deletions skll/data/writers.py
@@ -458,7 +458,7 @@ def _write_header(self, feature_set, output_file, filter_features):
features to include in this file.
"""
fieldnames = self._get_fieldnames(filter_features)
if self.label_col in fieldnames:
if self.label_col and self.label_col in fieldnames:
fieldnames.remove(self.label_col)

# Add relation to header
@@ -480,7 +480,8 @@ def _write_header(self, feature_set, output_file, filter_features):
"{" + ','.join(map(str,
sorted(set(self.feat_set.labels)))) +
"}", file=output_file)
fieldnames.append(self.label_col)
if self.label_col:
fieldnames.append(self.label_col)

# Create CSV writer to handle missing values for lines in data section
# and to ignore the instance values for non-numeric attributes
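
A hedged sketch of the writer-side fix, based on the Writer.for_path calls in the tests below: with label_col=None, _write_header neither removes nor appends a label field, so unlabeled featuresets can be written without error.

from skll.data.featureset import FeatureSet
from skll.data.writers import Writer

features = [{"f1": 1}, {"f1": 2}]
fs = FeatureSet("unlabeled", ["ex1", "ex2"], features=features)

# label_col=None signals that there is no label column to write; the
# output file then contains only the id and feature columns.
Writer.for_path("unlabeled.csv", fs, label_col=None).write()
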
29 changes: 19 additions & 10 deletions skll/utilities/skll_convert.py
@@ -63,12 +63,18 @@ def main(argv=None):
help='Name of the column which contains the instance \
IDs in ARFF, CSV, or TSV files.',
default='id')
parser.add_argument('-l', '--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
default='y')
label_group = parser.add_mutually_exclusive_group(required=False)
label_group.add_argument('-l',
'--label_col',
help='Name of the column which contains the class \
labels in ARFF, CSV, or TSV files. For ARFF \
files, this must be the final column to count as\
the label.',
default='y')
label_group.add_argument('--no_labels',
action='store_true',
default=False,
help='Used to indicate that the input data has no labels.')
parser.add_argument('-q', '--quiet',
help='Suppress printing of "Loading..." messages.',
action='store_true')
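
A standalone sketch of the argparse pattern introduced above: either option alone parses fine, but combining them makes parse_args() raise SystemExit.

import argparse

parser = argparse.ArgumentParser()
label_group = parser.add_mutually_exclusive_group(required=False)
label_group.add_argument('-l', '--label_col', default='y')
label_group.add_argument('--no_labels', action='store_true', default=False)

print(parser.parse_args(['--no_labels']))
# -> Namespace(label_col='y', no_labels=True)

# Contradictory flags: argparse prints a usage error and exits.
parser.parse_args(['--no_labels', '--label_col', 't'])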
@@ -132,19 +138,22 @@ def main(argv=None):
feat_vectorizer = None
label_map = None

label_col = None if args.no_labels else args.label_col

# Iterate through input file and collect the information we need
reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet,
label_col=args.label_col,
reader = EXT_TO_READER[input_extension](args.infile,
quiet=args.quiet,
label_col=label_col,
id_col=args.id_col)
feature_set = reader.read()
# write out the file in the requested output format
writer_type = EXT_TO_WRITER[output_extension]
writer_args = {'quiet': args.quiet}
if writer_type is DelimitedFileWriter:
writer_args['label_col'] = args.label_col
writer_args['label_col'] = label_col
writer_args['id_col'] = args.id_col
elif writer_type is ARFFWriter:
writer_args['label_col'] = args.label_col
writer_args['label_col'] = label_col
writer_args['id_col'] = args.id_col
writer_args['regression'] = args.arff_regression
writer_args['relation'] = args.arff_relation
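
Putting the pieces together, a usage sketch (file names hypothetical) mirroring the new test added in tests/test_utilities.py below:

from skll.utilities import skll_convert

# Convert an unlabeled CSV to LibSVM format: --no_labels makes the
# script pass label_col=None to both the reader and the writer.
skll_convert.main(['--quiet', '--no_labels', 'unlabeled.csv', 'unlabeled.libsvm'])
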
107 changes: 80 additions & 27 deletions tests/test_featureset.py
@@ -761,7 +761,7 @@ def test_dict_list_reader():


# Tests related to converting featuresets
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
num_examples = 500
num_feats_per_file = 7

@@ -774,33 +774,49 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
# Create lists we will write files from
ids = []
features = []
labels = []
labels = [] if with_labels else None
for j in range(num_examples):
y = "dog" if j % 2 == 0 else "cat"
ex_id = "{}{}".format(y, j)
x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
# if we are not using labels, we do not want zero-valued features
# because it may be the case that some subset of features end up
# being all 0 and if this subset ends up being written out to a file
# below, then for some formats (e.g., megam) nothing will get written
# out which can cause issues when reading this file
lowest_feature_value = 0 if with_labels else 1
x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
in range(num_feat_files * num_feats_per_file)}
x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
ids.append(ex_id)
labels.append(y)
if with_labels:
labels.append(y)
features.append(x)

# Create vectorizers/maps for libsvm subset writing
feat_vectorizer = DictVectorizer()
feat_vectorizer.fit(features)
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
if with_labels:
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
else:
label_map = None

# get the feature name prefix
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Write out unmerged features in the `from_suffix` file format
for i in range(num_feat_files):
train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
i, from_suffix))
train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
i,
with_labels_part,
from_suffix))
sub_features = []
for example_num in range(num_examples):
feat_num = i * num_feats_per_file
@@ -814,26 +830,43 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
if from_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif from_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()

# Write out the merged features in the `to_suffix` file format
train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
to_suffix))
train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
with_labels_part,
to_suffix))
train_fs = FeatureSet('train', ids, labels=labels, features=features,
vectorizer=feat_vectorizer)

# we need to do this to get around the FeatureSet using NaNs
# instead of None when there are no labels which causes problems
# later when comparing featuresets
if not with_labels:
train_fs.labels = [None] * len(train_fs.labels)

if to_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif to_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()
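
The workaround above exists because NaN never compares equal to itself, so label arrays full of NaNs would make two otherwise-identical featuresets compare unequal; a two-line illustration:

import numpy as np

print(np.nan == np.nan)              # False: NaN is unequal even to itself
print([None, None] == [None, None])  # True: lists of Nones compare cleanly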


def check_convert_featureset(from_suffix, to_suffix):
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
num_feat_files = 5

# Create test data
make_conversion_data(num_feat_files, from_suffix, to_suffix)
make_conversion_data(num_feat_files,
from_suffix,
to_suffix,
with_labels=with_labels)

# the path to the unmerged feature files
dirpath = join(_my_dir, 'train', 'test_conversion')
@@ -842,30 +875,48 @@ def check_convert_featureset(from_suffix, to_suffix):
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Load each unmerged feature file in the `from_suffix` format and convert
# it to the `to_suffix` format
for feature in range(num_feat_files):
input_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature, to_suffix))
skll_convert.main(['--quiet', input_file_path, output_file_path])
input_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
to_suffix))
skll_convert_args = ['--quiet', input_file_path, output_file_path]
if not with_labels:
skll_convert_args.append('--no_labels')
skll_convert.main(skll_convert_args)

# now load and merge all unmerged, converted features in the `to_suffix`
# format
featureset = ['{}_{}'.format(feature_name_prefix, i) for i in
featureset = ['{}_{}{}'.format(feature_name_prefix, i, with_labels_part) for i in
range(num_feat_files)]
merged_exs = _load_featureset(dirpath, featureset, to_suffix,
label_col = 'y' if with_labels else None
merged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# Load pre-merged data in the `to_suffix` format
featureset = ['{}_all'.format(feature_name_prefix)]
premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
premerged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# make sure that the pre-generated merged data in the to_suffix format
# is the same as the converted, merged data in the to_suffix format

# first check the IDs
assert_array_equal(merged_exs.ids, premerged_exs.ids)
assert_array_equal(merged_exs.labels, premerged_exs.labels)
for (_, _, merged_feats), (_, _, premerged_feats) in zip(merged_exs,
@@ -877,11 +928,13 @@

def test_convert_featureset():
# Test the conversion from every format to every other format
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
# with and without labels
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
yield check_convert_featureset, from_suffix, to_suffix
yield check_convert_featureset, from_suffix, to_suffix, True
yield check_convert_featureset, from_suffix, to_suffix, False
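
For reference, nose runs each yielded tuple as a separate test case; the unlabeled variant is equivalent to calling the helper directly within this module, e.g.:

# One generated case: round-trip CSV to LibSVM without labels.
check_convert_featureset('.csv', '.libsvm', with_labels=False)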


def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
11 changes: 10 additions & 1 deletion tests/test_utilities.py
@@ -766,7 +766,7 @@ def check_skll_convert(from_suffix, to_suffix):


def test_skll_convert():
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
@@ -834,6 +834,15 @@ def test_skll_convert_libsvm_map():
eq_(orig_fs, converted_fs)


@raises(SystemExit)
def test_skll_convert_no_labels_with_label_col():
"""
Check that --no_labels/--label_col cannot both be specified for skll_convert
"""
skll_convert_cmd = ['--no_labels', '--label_col', 't', 'foo.tsv', 'foo.libsvm']
sk.main(argv=skll_convert_cmd)


def check_print_model_weights(task='classification'):

# create some simple classification or regression data
