Skip to content

Commit

Permalink
Bug fix in feature set split method (#731)
Browse files Browse the repository at this point in the history
* rename split_by_ids to split, update contributing readme

---------

Co-authored-by: Tamar Lavee <tlavee@ets.org>
  • Loading branch information
tamarl08 and Tamar Lavee committed May 23, 2023
1 parent 9f26caf commit a3f45d5
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 20 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ How to contribute

3. Create an isolated environment for SKLL development. We recommend using the [conda](https://conda.io/en/latest/) package manager. To create a `conda` environment, run the following command in the root of the working directory:

$ conda create -n sklldev -c conda-forge --file conda_requirements.txt
$ conda create -n sklldev -c conda-forge --file requirements.txt python=3.10

4. Activate the conda environment

Expand Down
30 changes: 14 additions & 16 deletions skll/data/featureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,33 +564,33 @@ def __getitem__(
return self.ids[value], label, features

@staticmethod
def split_by_ids(
def split(
fs: "FeatureSet", ids_for_split1: List[int], ids_for_split2: Optional[List[int]] = None
) -> Tuple["FeatureSet", "FeatureSet"]:
"""
Split ``FeatureSet`` into two new ``FeatureSet`` instances.
The splitting is done based on the given IDs for the two splits.
The splitting is done based on the given indices for the two splits.
Parameters
----------
fs : skll.data.FeatureSet
The ``FeatureSet`` instance to split.
ids_for_split1 : List[int]
A list of example IDs which will be split out into
A list of example indices which will be split out into
the first ``FeatureSet`` instance. Note that the
FeatureSet instance will respect the order of the
specified IDs.
specified indices.
ids_for_split2 : Optional[List[int]], default=None
An optional list of example IDs which will be
An optional list of example indices which will be
split out into the second ``FeatureSet`` instance.
Note that the ``FeatureSet`` instance will respect
the order of the specified IDs. If this is
the order of the specified indices. If this is
not specified, then the second ``FeatureSet``
instance will contain the complement of the
first set of IDs sorted in ascending order.
first set of indices sorted in ascending order.
Returns
-------
Expand All @@ -609,16 +609,14 @@ def split_by_ids(
ids1 = fs.ids[ids_for_split1]
labels1 = fs.labels[ids_for_split1] if fs.labels is not None else None
features1 = fs.features[ids_for_split1] if fs.features is not None else None

# if ids_for_split2 is not given, it will be the complement of ids_split1
if ids_for_split2 is None:
ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)] if fs.labels is not None else None
features2 = (
fs.features[~np.in1d(fs.ids, ids_for_split1)] if fs.features is not None else None
)
else:
ids2 = fs.ids[ids_for_split2]
labels2 = fs.labels[ids_for_split2] if fs.labels is not None else None
features2 = fs.features[ids_for_split2] if fs.features is not None else None
ids_for_split2 = [ind for ind in range(len(fs.ids)) if ind not in ids_for_split1]

ids2 = fs.ids[ids_for_split2]
labels2 = fs.labels[ids_for_split2] if fs.labels is not None else None
features2 = fs.features[ids_for_split2] if fs.features is not None else None

fs1 = FeatureSet(
f"{fs.name}_1", ids1, labels=labels1, features=features1, vectorizer=fs.vectorizer
Expand Down
2 changes: 1 addition & 1 deletion skll/experiments/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import numpy as np
from sklearn.pipeline import Pipeline
from tabulate import tabulate
from tabulate import tabulate # type: ignore

from skll.utils.logging import get_skll_logger

Expand Down
2 changes: 1 addition & 1 deletion skll/learner/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,7 @@ def setup_cv_split_iterator(cv_folds, examples):

# create an iterator over train/test featuresets based on the
# cross-validation index iterator
featureset_iter = (FeatureSet.split_by_ids(examples, train, test) for train, test in cv_iter)
featureset_iter = (FeatureSet.split(examples, train, test) for train, test in cv_iter)

return featureset_iter, n_max_training_samples

Expand Down
60 changes: 60 additions & 0 deletions tests/test_featureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1228,3 +1228,63 @@ def test_drop_blanks_and_replace_blanks_with_raises_error():
csv_path = other_dir / "test_drop_blanks_error.csv"
_create_test_file(csv_path, test_csv)
CSVReader(csv_path, replace_blanks_with=4.5, drop_blanks=True).read()


def test_split_two_id_sets():
"""Test split() with two input id sets."""
fs, _ = make_classification_data(
num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0
)

# split by two id sets
ids_split1 = range(5)
ids_split2 = range(5, 10)

fs1, fs2 = FeatureSet.split(fs, ids_split1, ids_split2)

# verify that the ids, labels and features are split as expected
assert_array_equal(fs.ids[ids_split1], fs1.ids)
assert_array_equal(fs.ids[ids_split2], fs2.ids)
assert_array_equal(fs.labels[ids_split1], fs1.labels)
assert_array_equal(fs.labels[ids_split2], fs2.labels)
assert_array_equal(fs.features[ids_split1, :].data, fs1.features.data)
assert_array_equal(fs.features[ids_split2, :].data, fs2.features.data)


def test_split_one_id_set():
"""Test split() with one input id sets."""
fs, _ = make_classification_data(
num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0
)
# split by one id set
ids1_idx = [2, 3, 5, 6, 7]
ids2_idx = [0, 1, 4, 8, 9] # these are the expected ids for the second set

fs1, fs2 = FeatureSet.split(fs, ids1_idx)

# verify that the ids, labels and features are split as expected
assert_array_equal(fs.ids[ids1_idx], fs1.ids)
assert_array_equal(fs.ids[ids2_idx], fs2.ids)
assert_array_equal(fs.labels[ids1_idx], fs1.labels)
assert_array_equal(fs.labels[ids2_idx], fs2.labels)
assert_array_equal(fs.features[ids1_idx, :].data, fs1.features.data)
assert_array_equal(fs.features[ids2_idx, :].data, fs2.features.data)


def test_split_int_ids():
"""Test split() when ids are integers."""
fs, _ = make_classification_data(
num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0, id_type="integer"
)
ids1_idx = [1, 3, 5, 7, 9]
ids2_idx = [0, 2, 4, 6, 8]

fs1, fs2 = FeatureSet.split(fs, ids1_idx, ids2_idx)

# verify that the ids, labels and features are split as expected
assert_array_equal(fs.ids[ids1_idx], fs1.ids)
assert_array_equal(fs.ids[ids2_idx], fs2.ids)
assert_array_equal(fs.labels[ids1_idx], fs1.labels)
assert_array_equal(fs.labels[ids2_idx], fs2.labels)
assert_array_equal(fs.features[ids1_idx, :].data, fs1.features.data)
assert_array_equal(fs.features[ids2_idx, :].data, fs2.features.data)
2 changes: 1 addition & 1 deletion tests/test_voting_learners_api_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def get_argmax(row):
sklearn_model_kwargs["voting"] = voting_type
sklearn_vl = sklearn_model_type(**sklearn_model_kwargs)

train_fs_fold, test_fs_fold = FeatureSet.split_by_ids(featureset, train, test)
train_fs_fold, test_fs_fold = FeatureSet.split(featureset, train, test)
sklearn_vl.fit(train_fs_fold.features, train_fs_fold.labels)

# save the (argmax-ed) sklearn predictions into our data frame
Expand Down

0 comments on commit a3f45d5

Please sign in to comment.