Skip to content

Commit

Permalink
Merge pull request #737 from weixuanfu/fix_issue733
Browse files Browse the repository at this point in the history
Refine dataset validation so that features are checked in predict() and other post-fit functions
  • Loading branch information
weixuanfu committed Jul 31, 2018
2 parents f66b893 + af260a3 commit 15234fa
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 15 deletions.
15 changes: 15 additions & 0 deletions tests/tpot_tests.py
Expand Up @@ -1558,6 +1558,21 @@ def test_check_dataset_4():
assert_raises(ValueError, tpot_obj._check_dataset, training_features, training_target, test_sample_weight)


def test_check_dataset_5():
    """Check that _check_dataset returns the validated feature matrix when target is None."""
    clf = TPOTClassifier(
        config_dict='TPOT light',
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0
    )

    # With no target supplied, only the (possibly imputed/validated) features come back.
    checked_features = clf._check_dataset(training_features, target=None)
    assert np.allclose(checked_features, training_features)


def test_imputer():
"""Assert that the TPOT fit function will not raise a ValueError in a dataset where NaNs are present."""
tpot_obj = TPOTClassifier(
Expand Down
30 changes: 15 additions & 15 deletions tpot/base.py
Expand Up @@ -46,7 +46,7 @@
from copy import copy, deepcopy

from sklearn.base import BaseEstimator
from sklearn.utils import check_X_y, check_consistent_length
from sklearn.utils import check_X_y, check_consistent_length, check_array
from sklearn.externals.joblib import Parallel, delayed, Memory
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, Imputer
Expand Down Expand Up @@ -800,10 +800,7 @@ def predict(self, features):
if not self.fitted_pipeline_:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

features = features.astype(np.float64)

if np.any(np.isnan(features)):
features = self._impute_values(features)
features = self._check_dataset(features, target=None, sample_weight=None)

return self.fitted_pipeline_.predict(features)

Expand All @@ -830,6 +827,7 @@ def fit_predict(self, features, target, sample_weight=None, groups=None):
"""
self.fit(features, target, sample_weight=sample_weight, groups=groups)

return self.predict(features)

def score(self, testing_features, testing_target):
Expand All @@ -851,8 +849,7 @@ def score(self, testing_features, testing_target):
if self.fitted_pipeline_ is None:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

if np.any(np.isnan(testing_features)):
testing_features = self._impute_values(testing_features)
testing_features, testing_target = self._check_dataset(testing_features, testing_target, sample_weight=None)

# If the scoring function is a string, we must adjust to use the sklearn
# scoring interface
Expand Down Expand Up @@ -883,12 +880,10 @@ def predict_proba(self, features):
if not (hasattr(self.fitted_pipeline_, 'predict_proba')):
raise RuntimeError('The fitted pipeline does not have the predict_proba() function.')

features = features.astype(np.float64)

if np.any(np.isnan(features)):
features = self._impute_values(features)
#features = features.astype(np.float64)
features = self._check_dataset(features, target=None, sample_weight=None)

return self.fitted_pipeline_.predict_proba(features.astype(np.float64))
return self.fitted_pipeline_.predict_proba(features)

def set_params(self, **params):
"""Set the parameters of TPOT.
Expand Down Expand Up @@ -1024,7 +1019,7 @@ def _check_dataset(self, features, target, sample_weight=None):
----------
features: array-like {n_samples, n_features}
Feature matrix
target: array-like {n_samples}
target: array-like {n_samples} or None
List of class labels for prediction
sample_weight: array-like {n_samples} (optional)
List of weights indicating relative importance
Expand Down Expand Up @@ -1061,9 +1056,14 @@ def _check_dataset(self, features, target, sample_weight=None):
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)

try:
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
if target is not None:
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
else:
X = check_array(features, order="C", accept_sparse=True, dtype=np.float64)
return X
except (AssertionError, ValueError):
raise ValueError(
'Error: Input data is not in a valid format. Please confirm '
Expand Down

0 comments on commit 15234fa

Please sign in to comment.