Skip to content

Commit

Permalink
Merge pull request #737 from weixuanfu/fix_issue733
Browse files Browse the repository at this point in the history
Refine dataset validation so that features are checked in predict() and other post-fit functions
  • Loading branch information
weixuanfu committed Jul 31, 2018
2 parents f66b893 + af260a3 commit 15234fa
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 15 deletions.
15 changes: 15 additions & 0 deletions tests/tpot_tests.py
Expand Up @@ -1558,6 +1558,21 @@ def test_check_dataset_4():
assert_raises(ValueError, tpot_obj._check_dataset, training_features, training_target, test_sample_weight)


def test_check_dataset_5():
    """Check that _check_dataset returns the validated feature matrix when target is None."""
    clf = TPOTClassifier(
        config_dict='TPOT light',
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0
    )

    # With no target supplied, only the (possibly imputed/validated) features come back.
    checked_features = clf._check_dataset(training_features, target=None)
    assert np.allclose(checked_features, training_features)


def test_imputer():
"""Assert that the TPOT fit function will not raise a ValueError in a dataset where NaNs are present."""
tpot_obj = TPOTClassifier(
Expand Down
30 changes: 15 additions & 15 deletions tpot/base.py
Expand Up @@ -46,7 +46,7 @@
from copy import copy, deepcopy

from sklearn.base import BaseEstimator
from sklearn.utils import check_X_y, check_consistent_length
from sklearn.utils import check_X_y, check_consistent_length, check_array
from sklearn.externals.joblib import Parallel, delayed, Memory
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, Imputer
Expand Down Expand Up @@ -800,10 +800,7 @@ def predict(self, features):
if not self.fitted_pipeline_:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

features = features.astype(np.float64)

if np.any(np.isnan(features)):
features = self._impute_values(features)
features = self._check_dataset(features, target=None, sample_weight=None)

return self.fitted_pipeline_.predict(features)

Expand All @@ -830,6 +827,7 @@ def fit_predict(self, features, target, sample_weight=None, groups=None):
"""
self.fit(features, target, sample_weight=sample_weight, groups=groups)

return self.predict(features)

def score(self, testing_features, testing_target):
Expand All @@ -851,8 +849,7 @@ def score(self, testing_features, testing_target):
if self.fitted_pipeline_ is None:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

if np.any(np.isnan(testing_features)):
testing_features = self._impute_values(testing_features)
testing_features, testing_target = self._check_dataset(testing_features, testing_target, sample_weight=None)

# If the scoring function is a string, we must adjust to use the sklearn
# scoring interface
Expand Down Expand Up @@ -883,12 +880,10 @@ def predict_proba(self, features):
if not (hasattr(self.fitted_pipeline_, 'predict_proba')):
raise RuntimeError('The fitted pipeline does not have the predict_proba() function.')

features = features.astype(np.float64)

if np.any(np.isnan(features)):
features = self._impute_values(features)
#features = features.astype(np.float64)
features = self._check_dataset(features, target=None, sample_weight=None)

return self.fitted_pipeline_.predict_proba(features.astype(np.float64))
return self.fitted_pipeline_.predict_proba(features)

def set_params(self, **params):
"""Set the parameters of TPOT.
Expand Down Expand Up @@ -1024,7 +1019,7 @@ def _check_dataset(self, features, target, sample_weight=None):
----------
features: array-like {n_samples, n_features}
Feature matrix
target: array-like {n_samples}
target: array-like {n_samples} or None
List of class labels for prediction
sample_weight: array-like {n_samples} (optional)
List of weights indicating relative importance
Expand Down Expand Up @@ -1061,9 +1056,14 @@ def _check_dataset(self, features, target, sample_weight=None):
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)

try:
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
if target is not None:
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
else:
X = check_array(features, order="C", accept_sparse=True, dtype=np.float64)
return X
except (AssertionError, ValueError):
raise ValueError(
'Error: Input data is not in a valid format. Please confirm '
Expand Down

0 comments on commit 15234fa

Please sign in to comment.