Merge pull request #658 from EducationalTestingService/656-fix-check-…

…input-formatting-for-dense-featuresets Fix `Learner._check_input_formatting()` for dense feature sets.
EducationalTestingService · Feb 3, 2021 · f1a1e7f · f1a1e7f
2 parents 32823c0 + a9321a4
commit f1a1e7f
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 4 deletions.
diff --git a/skll/learner/__init__.py b/skll/learner/__init__.py
@@ -661,8 +661,14 @@ def _check_input_formatting(self, examples):
                                     "labels.  Convert them to integers or "
                                     "floats.")
 
-        # make sure that feature values are not strings
-        for val in examples.features.data:
+        # make sure that feature values are not strings; to check this
+        # we need to get a flattened version of the feature array,
+        # whether it is sparse (more likely) or dense
+        if sp.issparse(examples.features):
+            flattened_features = examples.features.data
+        else:
+            flattened_features = examples.features.flat
+        for val in flattened_features:
             if isinstance(val, str):
                 raise TypeError("You have feature values that are strings.  "
                                 "Convert them to floats.")

diff --git a/tests/test_classification.py b/tests/test_classification.py
@@ -1820,3 +1820,12 @@ def test_predict_return_and_write():
     for (learner, class_labels) in product([learner1, learner2],
                                            [False, True]):
         yield check_predict_return_and_write, learner, test_fs, class_labels
+
+
+def test_train_non_sparse_featureset():
+    """Test that we can train a classifier on a non-sparse featureset"""
+    train_file = join(other_dir, 'examples_train.jsonlines')
+    train_fs = NDJReader.for_path(train_file, sparse=False).read()
+    learner = Learner('LogisticRegression')
+    learner.train(train_fs, grid_search=False)
+    ok_(hasattr(learner.model, "coef_"))
diff --git a/tests/test_regression.py b/tests/test_regression.py
@@ -18,14 +18,21 @@
 from pathlib import Path
 
 import numpy as np
-from nose.tools import assert_almost_equal, assert_greater, assert_less, eq_, raises
+from nose.tools import (
+    assert_almost_equal,
+    assert_greater,
+    assert_less,
+    eq_,
+    ok_,
+    raises,
+)
 from numpy.testing import assert_allclose
 from scipy.stats import pearsonr
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LogisticRegression
 
 from skll.config import _setup_config_parser
-from skll.data import FeatureSet, NDJWriter
+from skll.data import FeatureSet, NDJReader, NDJWriter
 from skll.experiments import run_configuration
 from skll.learner import Learner
 from skll.learner.utils import rescaled
@@ -718,3 +725,22 @@ def test_invalid_regression_metric():
         for metric in CLASSIFICATION_ONLY_METRICS:
             yield check_invalid_regression_metric, learner, metric, True
             yield check_invalid_regression_metric, learner, metric, False
+
+
+def test_train_non_sparse_featureset():
+    """Test that we can train a regressor on a non-sparse featureset"""
+    train_file = join(other_dir, 'test_int_labels_cv.jsonlines')
+    train_fs = NDJReader.for_path(train_file, sparse=False).read()
+    learner = Learner('LinearRegression')
+    learner.train(train_fs, grid_search=False)
+    ok_(hasattr(learner.model, "coef_"))
+
+
+@raises(TypeError)
+def test_train_string_labels():
+    """Test that regression on string labels raises TypeError"""
+    train_file = join(other_dir, 'test_int_labels_cv.jsonlines')
+    train_fs = NDJReader.for_path(train_file).read()
+    train_fs.labels = train_fs.labels.astype('str')
+    learner = Learner('LinearRegression')
+    learner.train(train_fs, grid_search=False)