Update test_convert_featureset to also test for unlabeled data
- Remove the standalone no-labels test and its fixture files, which this change makes redundant.
desilinguist committed Feb 5, 2019
1 parent faf511e commit a84f7dc
Showing 3 changed files with 83 additions and 57 deletions.
4 changes: 0 additions & 4 deletions tests/other/featureset_no_labels_expected.jsonlines

This file was deleted.

131 changes: 83 additions & 48 deletions tests/test_featureset.py
@@ -18,14 +18,14 @@
from os.path import abspath, dirname, exists, join

import numpy as np
from nose.tools import eq_, raises, assert_not_equal, assert_equal
from nose.tools import eq_, raises, assert_not_equal
from nose.plugins.attrib import attr
from numpy.testing import assert_array_equal
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.datasets.samples_generator import make_classification

import skll
from skll.data import FeatureSet, Writer, Reader, CSVReader, NDJWriter
from skll.data import FeatureSet, Writer, Reader
from skll.data.readers import DictListReader
from skll.experiments import _load_featureset
from skll.learner import _DEFAULT_PARAM_GRIDS
@@ -90,6 +90,7 @@ def test_empty_ids():
# create a feature set with ids set to None and raise ValueError
FeatureSet('test', None, features=features, labels=y)


@raises(ValueError)
def check_empty_file_read(filetype, reader_type):
empty_filepath = _create_empty_file(filetype)
@@ -755,7 +756,7 @@ def test_dict_list_reader():


# Tests related to converting featuresets
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
num_examples = 500
num_feats_per_file = 7

@@ -768,33 +769,49 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
# Create lists we will write files from
ids = []
features = []
labels = []
labels = [] if with_labels else None
for j in range(num_examples):
y = "dog" if j % 2 == 0 else "cat"
ex_id = "{}{}".format(y, j)
x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
# if we are not using labels, we do not want zero-valued features
# because it may be the case that some subset of features end up
# being all 0 and if this subset ends up being written out to a file
# below, then for some formats (e.g., megam) nothing will get written
# out which can cause issues when reading this file
lowest_feature_value = 0 if with_labels else 1
x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
in range(num_feat_files * num_feats_per_file)}
x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
ids.append(ex_id)
labels.append(y)
if with_labels:
labels.append(y)
features.append(x)

# Create vectorizers/maps for libsvm subset writing
feat_vectorizer = DictVectorizer()
feat_vectorizer.fit(features)
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
if with_labels:
label_map = {label: num for num, label in
enumerate(sorted({label for label in labels if
not isinstance(label, (int, float))}))}
# Add fake item to vectorizer for None
label_map[None] = '00000'
else:
label_map = None

# get the feature name prefix
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Write out unmerged features in the `from_suffix` file format
for i in range(num_feat_files):
train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
i, from_suffix))
train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
i,
with_labels_part,
from_suffix))
sub_features = []
for example_num in range(num_examples):
feat_num = i * num_feats_per_file
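
The comment about zero-valued features above is the crux of the unlabeled setup: sparse formats such as .megam and .libsvm only write non-zero feature values, so an example whose features all happen to be zero serializes to an empty feature list and cannot survive a round trip. A minimal sketch of the effect, using a hypothetical to_libsvm_line helper that is not part of SKLL:

import numpy as np

# Hypothetical helper mimicking sparse serialization: only non-zero
# feature values get written, as the .libsvm/.megam writers do.
def to_libsvm_line(label, feats, feat_index):
    pairs = ["{}:{}".format(feat_index[name], value)
             for name, value in sorted(feats.items()) if value != 0]
    return "{} {}".format(label, " ".join(pairs))

feat_index = {"f000": 1, "f001": 2}
print(to_libsvm_line(1, {"f000": 0, "f001": 3}, feat_index))  # "1 2:3"
print(to_libsvm_line(1, {"f000": 0, "f001": 0}, feat_index))  # "1 " -- nothing to read back
# Drawing from [1, 5) instead of [0, 4) keeps the same spread while
# guaranteeing that every written feature value is non-zero:
print(np.random.randint(1, 5, size=3))  # e.g. [4 1 3]
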
Expand All @@ -808,26 +825,43 @@ def make_conversion_data(num_feat_files, from_suffix, to_suffix):
if from_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif from_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()

# Write out the merged features in the `to_suffix` file format
train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
to_suffix))
train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
with_labels_part,
to_suffix))
train_fs = FeatureSet('train', ids, labels=labels, features=features,
vectorizer=feat_vectorizer)

# we need to do this to get around the FeatureSet using NaNs
# instead of None when there are no labels which causes problems
# later when comparing featuresets
if not with_labels:
train_fs.labels = [None] * len(train_fs.labels)

if to_suffix == '.libsvm':
Writer.for_path(train_path, train_fs,
label_map=label_map).write()
elif to_suffix in ['.arff', '.csv', '.tsv']:
label_col = 'y' if with_labels else None
Writer.for_path(train_path, train_fs, label_col=label_col).write()
else:
Writer.for_path(train_path, train_fs).write()


def check_convert_featureset(from_suffix, to_suffix):
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
num_feat_files = 5

# Create test data
make_conversion_data(num_feat_files, from_suffix, to_suffix)
make_conversion_data(num_feat_files,
from_suffix,
to_suffix,
with_labels=with_labels)

# the path to the unmerged feature files
dirpath = join(_my_dir, 'train', 'test_conversion')
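
The labels workaround above guards against a NaN comparison pitfall: per the diff's own comment, FeatureSet stores NaN placeholders when no labels are given, and NaN never compares equal to itself. A short, self-contained illustration:

import numpy as np

# NaN is never equal to itself, so label arrays filled with NaN
# placeholders defeat straightforward equality checks.
labels = np.array([np.nan, np.nan])
print(labels == labels)                # [False False]
print(np.array_equal(labels, labels))  # False
# Swapping the placeholders for None restores sane comparisons:
as_none = [None] * len(labels)
print(as_none == as_none)              # True
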
@@ -836,30 +870,48 @@ def check_convert_featureset(from_suffix, to_suffix):
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
to_suffix.lstrip('.'))

# use '_unlabeled' as part of any file names when not using labels
with_labels_part = '' if with_labels else '_unlabeled'

# Load each unmerged feature file in the `from_suffix` format and convert
# it to the `to_suffix` format
for feature in range(num_feat_files):
input_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature, to_suffix))
skll_convert.main(['--quiet', input_file_path, output_file_path])
input_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
from_suffix))
output_file_path = join(dirpath, '{}_{}{}{}'.format(feature_name_prefix,
feature,
with_labels_part,
to_suffix))
skll_convert_args = ['--quiet', input_file_path, output_file_path]
if not with_labels:
skll_convert_args.append('--no_labels')
skll_convert.main(skll_convert_args)

# now load and merge all unmerged, converted features in the `to_suffix`
# format
featureset = ['{}_{}'.format(feature_name_prefix, i) for i in
featureset = ['{}_{}{}'.format(feature_name_prefix, i, with_labels_part) for i in
range(num_feat_files)]
merged_exs = _load_featureset(dirpath, featureset, to_suffix,
label_col = 'y' if with_labels else None
merged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# Load pre-merged data in the `to_suffix` format
featureset = ['{}_all'.format(feature_name_prefix)]
premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
premerged_exs = _load_featureset(dirpath,
featureset,
to_suffix,
label_col=label_col,
quiet=True)

# make sure that the pre-generated merged data in the to_suffix format
# is the same as the converted, merged data in the to_suffix format

# first check the IDs
assert_array_equal(merged_exs.ids, premerged_exs.ids)
assert_array_equal(merged_exs.labels, premerged_exs.labels)
for (_, _, merged_feats), (_, _, premerged_feats) in zip(merged_exs,
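
For reference, the conversion loop above is the in-process equivalent of running SKLL's skll_convert utility once per feature file. A stand-alone sketch, assuming the same skll_convert module the test imports and using placeholder file names that follow the test's naming pattern:

from skll.utilities import skll_convert

# Convert one unlabeled feature file; --no_labels tells the converter
# that the input has no label column (file names are illustrative).
args = ['--quiet',
        'csv_to_jsonlines_0_unlabeled.csv',
        'csv_to_jsonlines_0_unlabeled.jsonlines']
args.append('--no_labels')
skll_convert.main(args)
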
Expand All @@ -871,11 +923,13 @@ def check_convert_featureset(from_suffix, to_suffix):

def test_convert_featureset():
# Test the conversion from every format to every other format
for from_suffix, to_suffix in itertools.permutations(['.jsonlines', '.ndj',
# with and without labels
for from_suffix, to_suffix in itertools.permutations(['.jsonlines',
'.megam', '.tsv',
'.csv', '.arff',
'.libsvm'], 2):
yield check_convert_featureset, from_suffix, to_suffix
yield check_convert_featureset, from_suffix, to_suffix, True
yield check_convert_featureset, from_suffix, to_suffix, False


def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
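
Because test_convert_featureset is a nose test generator, every yielded (function, arguments) tuple runs as its own test case, so each ordered format pair is now exercised twice: once labeled and once unlabeled. A self-contained sketch of that expansion, with illustrative fake_check and fake_generator names:

import itertools

def fake_check(from_suffix, to_suffix, with_labels):
    mode = 'labeled' if with_labels else 'unlabeled'
    print('{} -> {} ({})'.format(from_suffix, to_suffix, mode))

def fake_generator():
    for a, b in itertools.permutations(['.csv', '.jsonlines'], 2):
        yield fake_check, a, b, True
        yield fake_check, a, b, False

# What the nose runner effectively does with each yielded tuple:
for func, *args in fake_generator():
    func(*args)
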
@@ -972,22 +1026,3 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer():
rtol=1e-6) and
np.all(np.isnan(expected.labels)) and
np.all(np.isnan(current.labels)))


def test_read_write_no_labels():
featureset_file = os.path.join(_my_dir, 'train',
'featureset_no_labels_input.csv')
reader = CSVReader(featureset_file, label_col=None)
fs = reader.read()
outpath = os.path.join(_my_dir, 'output', 'featureset_no_labels.jsonlines')
writer = NDJWriter(outpath, fs)
writer.write()
expected_path = os.path.join(_my_dir, "other",
"featureset_no_labels_expected.jsonlines")

with open(expected_path) as f_expected:
expected_output = f_expected.read()
with open(outpath) as f_actual:
actual_output = f_actual.read()

assert_equal(actual_output, expected_output)
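
The deleted test's round trip, reading a label-less CSV and writing it back out as .jsonlines against a checked-in expected file, is now covered by the parameterized conversion checks above. For reference, a minimal sketch of what it did, with placeholder file paths and the CSVReader/NDJWriter classes the old test used:

from skll.data import CSVReader, NDJWriter

# Read a CSV that has no label column, then write it out as .jsonlines.
fs = CSVReader('unlabeled_input.csv', label_col=None).read()
NDJWriter('unlabeled_output.jsonlines', fs).write()
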
5 changes: 0 additions & 5 deletions tests/train/featureset_no_labels_input.csv

This file was deleted.
