Skip to content

Commit

Permalink
Merge pull request #658 from EducationalTestingService/656-fix-check-…
Browse files Browse the repository at this point in the history
…input-formatting-for-dense-featuresets

Fix `Learner._check_input_formatting()` for dense feature sets.
  • Loading branch information
desilinguist committed Feb 3, 2021
2 parents 32823c0 + a9321a4 commit f1a1e7f
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 4 deletions.
10 changes: 8 additions & 2 deletions skll/learner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,8 +661,14 @@ def _check_input_formatting(self, examples):
"labels. Convert them to integers or "
"floats.")

# make sure that feature values are not strings
for val in examples.features.data:
# make sure that feature values are not strings; to check this
# we need to get a flattened version of the feature array,
# whether it is sparse (more likely) or dense
if sp.issparse(examples.features):
flattened_features = examples.features.data
else:
flattened_features = examples.features.flat
for val in flattened_features:
if isinstance(val, str):
raise TypeError("You have feature values that are strings. "
"Convert them to floats.")
Expand Down
9 changes: 9 additions & 0 deletions tests/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1820,3 +1820,12 @@ def test_predict_return_and_write():
for (learner, class_labels) in product([learner1, learner2],
[False, True]):
yield check_predict_return_and_write, learner, test_fs, class_labels


def test_train_non_sparse_featureset():
"""Test that we can train a classifier on a non-sparse featureset"""
train_file = join(other_dir, 'examples_train.jsonlines')
train_fs = NDJReader.for_path(train_file, sparse=False).read()
learner = Learner('LogisticRegression')
learner.train(train_fs, grid_search=False)
ok_(hasattr(learner.model, "coef_"))
30 changes: 28 additions & 2 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,21 @@
from pathlib import Path

import numpy as np
from nose.tools import assert_almost_equal, assert_greater, assert_less, eq_, raises
from nose.tools import (
assert_almost_equal,
assert_greater,
assert_less,
eq_,
ok_,
raises,
)
from numpy.testing import assert_allclose
from scipy.stats import pearsonr
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression

from skll.config import _setup_config_parser
from skll.data import FeatureSet, NDJWriter
from skll.data import FeatureSet, NDJReader, NDJWriter
from skll.experiments import run_configuration
from skll.learner import Learner
from skll.learner.utils import rescaled
Expand Down Expand Up @@ -718,3 +725,22 @@ def test_invalid_regression_metric():
for metric in CLASSIFICATION_ONLY_METRICS:
yield check_invalid_regression_metric, learner, metric, True
yield check_invalid_regression_metric, learner, metric, False


def test_train_non_sparse_featureset():
"""Test that we can train a regressor on a non-sparse featureset"""
train_file = join(other_dir, 'test_int_labels_cv.jsonlines')
train_fs = NDJReader.for_path(train_file, sparse=False).read()
learner = Learner('LinearRegression')
learner.train(train_fs, grid_search=False)
ok_(hasattr(learner.model, "coef_"))


@raises(TypeError)
def test_train_string_labels():
"""Test that regression on string labels raises TypeError"""
train_file = join(other_dir, 'test_int_labels_cv.jsonlines')
train_fs = NDJReader.for_path(train_file).read()
train_fs.labels = train_fs.labels.astype('str')
learner = Learner('LinearRegression')
learner.train(train_fs, grid_search=False)

0 comments on commit f1a1e7f

Please sign in to comment.