Skip to content

Commit

Permalink
Merge pull request #659 from EducationalTestingService/653-update-skl…
Browse files Browse the repository at this point in the history
…earn-to-0-24-1

Update scikit-learn to 0.24.1
  • Loading branch information
desilinguist committed Feb 4, 2021
2 parents f1a1e7f + 1b4b870 commit 07de429
Show file tree
Hide file tree
Showing 7 changed files with 395 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
language: python
python:
- 3.6
- 3.7
notifications:
email: false
slack:
Expand Down
1 change: 1 addition & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

variables:
MPLBACKEND: Agg
PYTHON_VERSION: 3.8

trigger:
branches:
Expand Down
2 changes: 1 addition & 1 deletion conda_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ numpy
pandas
pre-commit
ruamel.yaml
scikit-learn==0.23.2
scikit-learn==0.24.1
scipy
seaborn
tabulate
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ joblib>=0.8
numpy
pandas
ruamel.yaml
scikit-learn>=0.23.1,<=0.23.2
scikit-learn>=0.24.1,<=0.24.2
scipy
seaborn
tabulate
16 changes: 14 additions & 2 deletions skll/learner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,18 @@ def __init__(self, # noqa: C901
self._probability = None
# Use setter to set self.probability
self.probability = probability

# we need to use dense features under certain conditions:
# - if we are using any of the estimators that are _known_
# to accept only dense features
# - if we are doing centering as part of feature scaling
# - if we are using non-negative least squares regression
self._use_dense_features = \
(issubclass(self._model_type, _REQUIRES_DENSE) or
self._feature_scaling in {'with_mean', 'both'})
self._feature_scaling in {'with_mean', 'both'} or
(issubclass(self._model_type, LinearRegression) and
model_kwargs is not None and
model_kwargs.get("positive", False)))

# Set default keyword arguments for models that we have some for.
if issubclass(self._model_type, SVC):
Expand Down Expand Up @@ -1003,13 +1012,16 @@ def train(self, # noqa: C901
folds = kfold.split(examples.features, examples.labels, fold_groups)

# limit the number of grid_jobs to be no higher than five or the
# number of cores for the machine, whichever is lower
# number of cores for the machine, whichever is lower; we set
# `error_score` to "raise" since we want scikit-learn to explicitly
# raise an exception if the estimator fails to fit for any reason
grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)
grid_searcher = GridSearchCV(estimator,
param_grid,
scoring=grid_objective,
cv=folds,
n_jobs=grid_jobs,
error_score="raise",
pre_dispatch=grid_jobs)

# run the grid search for hyperparameters
Expand Down
43 changes: 38 additions & 5 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,18 @@
import numpy as np
from nose.tools import (
assert_almost_equal,
assert_false,
assert_greater,
assert_less,
assert_true,
eq_,
ok_,
raises,
)
from numpy.testing import assert_allclose
from numpy.testing import assert_allclose, assert_array_equal
from scipy.stats import pearsonr
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, LogisticRegression

from skll.config import _setup_config_parser
from skll.data import FeatureSet, NDJReader, NDJWriter
Expand Down Expand Up @@ -163,9 +165,11 @@ def check_linear_models(name,
num_features=3)

# create the learner
if use_rescaling:
name = f'Rescaled{name}'
learner = Learner(name)
model_kwargs = {} if name in ["BayesianRidge",
"Lars",
"LinearRegression"] else {"max_iter": 500}
name = f'Rescaled{name}' if use_rescaling else name
learner = Learner(name, model_kwargs=model_kwargs)

# train it with the training feature set we created
# make sure to set the grid objective to pearson
Expand Down Expand Up @@ -744,3 +748,32 @@ def test_train_string_labels():
train_fs.labels = train_fs.labels.astype('str')
learner = Learner('LinearRegression')
learner.train(train_fs, grid_search=False)


def test_non_negative_regression():
"""Test that non-negative regression works as expected"""

# read in the Boston example training data into a featureset
train_path = join(train_dir, "test_non_negative.jsonlines")
train_fs = NDJReader.for_path(train_path).read()

# train a regular SKLL linear regerssion learner first
# and check that it gets some negative weights with this data
skll_estimator1 = Learner("LinearRegression", min_feature_count=0)
skll_estimator1.train(train_fs, grid_search=False)
assert_true(np.any(np.where(skll_estimator1.model.coef_ < 0, True, False)))

# now train a non-negative linear regression learner and check
# that _none_ of its weights are negative with the same data
skll_estimator2 = Learner("LinearRegression",
model_kwargs={"positive": True},
min_feature_count=0)
skll_estimator2.train(train_fs, grid_search=False)
assert_false(np.any(np.where(skll_estimator2.model.coef_ < 0, True, False)))

# just for good measure, train a non-negative learner directly
# in sklearn space and check that it has the same weights
sklearn_estimator = LinearRegression(positive=True)
X, y = train_fs.features.todense(), train_fs.labels
_ = sklearn_estimator.fit(X, y)
assert_array_equal(skll_estimator2.model.coef_, sklearn_estimator.coef_)

0 comments on commit 07de429

Please sign in to comment.