Bug fix in feature set split method (#731)

* rename split_by_ids to split, update contributing readme --------- Co-authored-by: Tamar Lavee <tlavee@ets.org>
EducationalTestingService · May 23, 2023 · a3f45d5 · a3f45d5
1 parent 9f26caf
commit a3f45d5
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 20 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -19,7 +19,7 @@ How to contribute
 
 3. Create an isolated environment for SKLL development. We recommend using the [conda](https://conda.io/en/latest/) package manager. To create a `conda` environment, run the following command in the root of the working directory:
 
-         $ conda create -n sklldev -c conda-forge --file conda_requirements.txt
+         $ conda create -n sklldev -c conda-forge --file requirements.txt python=3.10
 
 4. Activate the conda environment
 

diff --git a/skll/data/featureset.py b/skll/data/featureset.py
@@ -564,33 +564,33 @@ def __getitem__(
             return self.ids[value], label, features
 
     @staticmethod
-    def split_by_ids(
+    def split(
         fs: "FeatureSet", ids_for_split1: List[int], ids_for_split2: Optional[List[int]] = None
     ) -> Tuple["FeatureSet", "FeatureSet"]:
         """
         Split ``FeatureSet`` into two new ``FeatureSet`` instances.
 
-        The splitting is done based on the given IDs for the two splits.
+        The splitting is done based on the given indices for the two splits.
 
         Parameters
         ----------
         fs : skll.data.FeatureSet
             The ``FeatureSet`` instance to split.
 
         ids_for_split1 : List[int]
-            A list of example IDs which will be split out into
+            A list of example indices which will be split out into
             the first ``FeatureSet`` instance. Note that the
             FeatureSet instance will respect the order of the
-            specified IDs.
+            specified indices.
 
         ids_for_split2 : Optional[List[int]], default=None
-            An optional list of example IDs which will be
+            An optional list of example indices which will be
             split out into the second ``FeatureSet`` instance.
             Note that the ``FeatureSet`` instance will respect
-            the order of the specified IDs. If this is
+            the order of the specified indices. If this is
             not specified, then the second ``FeatureSet``
             instance will contain the complement of the
-            first set of IDs sorted in ascending order.
+            first set of indices sorted in ascending order.
 
         Returns
         -------
@@ -609,16 +609,14 @@ def split_by_ids(
         ids1 = fs.ids[ids_for_split1]
         labels1 = fs.labels[ids_for_split1] if fs.labels is not None else None
         features1 = fs.features[ids_for_split1] if fs.features is not None else None
+
+        # if ids_for_split2 is not given, it will be the complement of ids_split1
         if ids_for_split2 is None:
-            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
-            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)] if fs.labels is not None else None
-            features2 = (
-                fs.features[~np.in1d(fs.ids, ids_for_split1)] if fs.features is not None else None
-            )
-        else:
-            ids2 = fs.ids[ids_for_split2]
-            labels2 = fs.labels[ids_for_split2] if fs.labels is not None else None
-            features2 = fs.features[ids_for_split2] if fs.features is not None else None
+            ids_for_split2 = [ind for ind in range(len(fs.ids)) if ind not in ids_for_split1]
+
+        ids2 = fs.ids[ids_for_split2]
+        labels2 = fs.labels[ids_for_split2] if fs.labels is not None else None
+        features2 = fs.features[ids_for_split2] if fs.features is not None else None
 
         fs1 = FeatureSet(
             f"{fs.name}_1", ids1, labels=labels1, features=features1, vectorizer=fs.vectorizer

diff --git a/skll/experiments/utils.py b/skll/experiments/utils.py
@@ -14,7 +14,7 @@
 
 import numpy as np
 from sklearn.pipeline import Pipeline
-from tabulate import tabulate
+from tabulate import tabulate  # type: ignore
 
 from skll.utils.logging import get_skll_logger
 

diff --git a/skll/learner/utils.py b/skll/learner/utils.py
@@ -995,7 +995,7 @@ def setup_cv_split_iterator(cv_folds, examples):
 
     # create an iterator over train/test featuresets based on the
     # cross-validation index iterator
-    featureset_iter = (FeatureSet.split_by_ids(examples, train, test) for train, test in cv_iter)
+    featureset_iter = (FeatureSet.split(examples, train, test) for train, test in cv_iter)
 
     return featureset_iter, n_max_training_samples
 

diff --git a/tests/test_featureset.py b/tests/test_featureset.py
@@ -1228,3 +1228,63 @@ def test_drop_blanks_and_replace_blanks_with_raises_error():
     csv_path = other_dir / "test_drop_blanks_error.csv"
     _create_test_file(csv_path, test_csv)
     CSVReader(csv_path, replace_blanks_with=4.5, drop_blanks=True).read()
+
+
+def test_split_two_id_sets():
+    """Test split() with two input id sets."""
+    fs, _ = make_classification_data(
+        num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0
+    )
+
+    # split by two id sets
+    ids_split1 = range(5)
+    ids_split2 = range(5, 10)
+
+    fs1, fs2 = FeatureSet.split(fs, ids_split1, ids_split2)
+
+    # verify that the ids, labels and features are split as expected
+    assert_array_equal(fs.ids[ids_split1], fs1.ids)
+    assert_array_equal(fs.ids[ids_split2], fs2.ids)
+    assert_array_equal(fs.labels[ids_split1], fs1.labels)
+    assert_array_equal(fs.labels[ids_split2], fs2.labels)
+    assert_array_equal(fs.features[ids_split1, :].data, fs1.features.data)
+    assert_array_equal(fs.features[ids_split2, :].data, fs2.features.data)
+
+
+def test_split_one_id_set():
+    """Test split() with one input id sets."""
+    fs, _ = make_classification_data(
+        num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0
+    )
+    # split by one id set
+    ids1_idx = [2, 3, 5, 6, 7]
+    ids2_idx = [0, 1, 4, 8, 9]  # these are the expected ids for the second set
+
+    fs1, fs2 = FeatureSet.split(fs, ids1_idx)
+
+    # verify that the ids, labels and features are split as expected
+    assert_array_equal(fs.ids[ids1_idx], fs1.ids)
+    assert_array_equal(fs.ids[ids2_idx], fs2.ids)
+    assert_array_equal(fs.labels[ids1_idx], fs1.labels)
+    assert_array_equal(fs.labels[ids2_idx], fs2.labels)
+    assert_array_equal(fs.features[ids1_idx, :].data, fs1.features.data)
+    assert_array_equal(fs.features[ids2_idx, :].data, fs2.features.data)
+
+
+def test_split_int_ids():
+    """Test split() when ids are integers."""
+    fs, _ = make_classification_data(
+        num_examples=10, num_features=4, num_labels=2, train_test_ratio=1.0, id_type="integer"
+    )
+    ids1_idx = [1, 3, 5, 7, 9]
+    ids2_idx = [0, 2, 4, 6, 8]
+
+    fs1, fs2 = FeatureSet.split(fs, ids1_idx, ids2_idx)
+
+    # verify that the ids, labels and features are split as expected
+    assert_array_equal(fs.ids[ids1_idx], fs1.ids)
+    assert_array_equal(fs.ids[ids2_idx], fs2.ids)
+    assert_array_equal(fs.labels[ids1_idx], fs1.labels)
+    assert_array_equal(fs.labels[ids2_idx], fs2.labels)
+    assert_array_equal(fs.features[ids1_idx, :].data, fs1.features.data)
+    assert_array_equal(fs.features[ids2_idx, :].data, fs2.features.data)
diff --git a/tests/test_voting_learners_api_5.py b/tests/test_voting_learners_api_5.py
@@ -191,7 +191,7 @@ def get_argmax(row):
             sklearn_model_kwargs["voting"] = voting_type
         sklearn_vl = sklearn_model_type(**sklearn_model_kwargs)
 
-        train_fs_fold, test_fs_fold = FeatureSet.split_by_ids(featureset, train, test)
+        train_fs_fold, test_fs_fold = FeatureSet.split(featureset, train, test)
         sklearn_vl.fit(train_fs_fold.features, train_fs_fold.labels)
 
         # save the (argmax-ed) sklearn predictions into our data frame