Merge pull request #659 from EducationalTestingService/653-update-skl…

…earn-to-0-24-1 Update scikit-learn to 0.24.1
EducationalTestingService · Feb 4, 2021 · 07de429 · 07de429
2 parents f1a1e7f + 1b4b870
commit 07de429
Show file tree

Hide file tree

Showing 7 changed files with 395 additions and 10 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,6 @@
 language: python
 python:
-  - 3.6
+  - 3.7
 notifications:
   email: false
   slack:

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -5,6 +5,7 @@
 
 variables:
   MPLBACKEND: Agg
+  PYTHON_VERSION: 3.8
 
 trigger:
   branches:

diff --git a/conda_requirements.txt b/conda_requirements.txt
@@ -4,7 +4,7 @@ numpy
 pandas
 pre-commit
 ruamel.yaml
-scikit-learn==0.23.2
+scikit-learn==0.24.1
 scipy
 seaborn
 tabulate
diff --git a/requirements.txt b/requirements.txt
@@ -3,7 +3,7 @@ joblib>=0.8
 numpy
 pandas
 ruamel.yaml
-scikit-learn>=0.23.1,<=0.23.2
+scikit-learn>=0.24.1,<=0.24.2
 scipy
 seaborn
 tabulate
diff --git a/skll/learner/__init__.py b/skll/learner/__init__.py
@@ -217,9 +217,18 @@ def __init__(self,  # noqa: C901
         self._probability = None
         # Use setter to set self.probability
         self.probability = probability
+
+        # we need to use dense features under certain conditions:
+        # - if we are using any of the estimators that are _known_
+        #   to accept only dense features
+        # - if we are doing centering as part of feature scaling
+        # - if we are using non-negative least squares regression
         self._use_dense_features = \
             (issubclass(self._model_type, _REQUIRES_DENSE) or
-             self._feature_scaling in {'with_mean', 'both'})
+                self._feature_scaling in {'with_mean', 'both'} or
+                (issubclass(self._model_type, LinearRegression) and
+                    model_kwargs is not None and
+                    model_kwargs.get("positive", False)))
 
         # Set default keyword arguments for models that we have some for.
         if issubclass(self._model_type, SVC):
@@ -1003,13 +1012,16 @@ def train(self,  # noqa: C901
                 folds = kfold.split(examples.features, examples.labels, fold_groups)
 
             # limit the number of grid_jobs to be no higher than five or the
-            # number of cores for the machine, whichever is lower
+            # number of cores for the machine, whichever is lower; we set
+            # `error_score` to "raise" since we want scikit-learn to explicitly
+            # raise an exception if the estimator fails to fit for any reason
             grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)
             grid_searcher = GridSearchCV(estimator,
                                          param_grid,
                                          scoring=grid_objective,
                                          cv=folds,
                                          n_jobs=grid_jobs,
+                                         error_score="raise",
                                          pre_dispatch=grid_jobs)
 
             # run the grid search for hyperparameters

diff --git a/tests/test_regression.py b/tests/test_regression.py
@@ -20,16 +20,18 @@
 import numpy as np
 from nose.tools import (
     assert_almost_equal,
+    assert_false,
     assert_greater,
     assert_less,
+    assert_true,
     eq_,
     ok_,
     raises,
 )
-from numpy.testing import assert_allclose
+from numpy.testing import assert_allclose, assert_array_equal
 from scipy.stats import pearsonr
 from sklearn.exceptions import ConvergenceWarning
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LinearRegression, LogisticRegression
 
 from skll.config import _setup_config_parser
 from skll.data import FeatureSet, NDJReader, NDJWriter
@@ -163,9 +165,11 @@ def check_linear_models(name,
                                                              num_features=3)
 
     # create the learner
-    if use_rescaling:
-        name = f'Rescaled{name}'
-    learner = Learner(name)
+    model_kwargs = {} if name in ["BayesianRidge",
+                                  "Lars",
+                                  "LinearRegression"] else {"max_iter": 500}
+    name = f'Rescaled{name}' if use_rescaling else name
+    learner = Learner(name, model_kwargs=model_kwargs)
 
     # train it with the training feature set we created
     # make sure to set the grid objective to pearson
@@ -744,3 +748,32 @@ def test_train_string_labels():
     train_fs.labels = train_fs.labels.astype('str')
     learner = Learner('LinearRegression')
     learner.train(train_fs, grid_search=False)
+
+
+def test_non_negative_regression():
+    """Test that non-negative regression works as expected"""
+
+    # read in the Boston example training data into a featureset
+    train_path = join(train_dir, "test_non_negative.jsonlines")
+    train_fs = NDJReader.for_path(train_path).read()
+
+    # train a regular SKLL linear regerssion learner first
+    # and check that it gets some negative weights with this data
+    skll_estimator1 = Learner("LinearRegression", min_feature_count=0)
+    skll_estimator1.train(train_fs, grid_search=False)
+    assert_true(np.any(np.where(skll_estimator1.model.coef_ < 0, True, False)))
+
+    # now train a non-negative linear regression learner and check
+    # that _none_ of its weights are negative with the same data
+    skll_estimator2 = Learner("LinearRegression",
+                              model_kwargs={"positive": True},
+                              min_feature_count=0)
+    skll_estimator2.train(train_fs, grid_search=False)
+    assert_false(np.any(np.where(skll_estimator2.model.coef_ < 0, True, False)))
+
+    # just for good measure, train a non-negative learner directly
+    # in sklearn space and check that it has the same weights
+    sklearn_estimator = LinearRegression(positive=True)
+    X, y = train_fs.features.todense(), train_fs.labels
+    _ = sklearn_estimator.fit(X, y)
+    assert_array_equal(skll_estimator2.model.coef_, sklearn_estimator.coef_)