Merge ca26160 into c92fea5

EpistasisLab · Jun 25, 2018 · 4aa9117 · 4aa9117
2 parents c92fea5 + ca26160
commit 4aa9117
Show file tree

Hide file tree

Showing 17 changed files with 329 additions and 32 deletions.
diff --git a/docs_sources/api.md b/docs_sources/api.md
@@ -116,7 +116,7 @@ How many minutes TPOT has to optimize the pipeline.
 If not None, this setting will override the <em>generations</em> parameter and allow TPOT to run until <em>max_time_mins</em> minutes elapse.
 </blockquote>
 
-<strong>max_eval_time_mins</strong>: integer, optional (default=5)
+<strong>max_eval_time_mins</strong>: float, optional (default=5)
 <blockquote>
 How many minutes TPOT has to evaluate a single pipeline.
 <br /><br />
@@ -588,7 +588,7 @@ How many minutes TPOT has to optimize the pipeline.
 If not None, this setting will override the <em>generations</em> parameter and allow TPOT to run until <em>max_time_mins</em> minutes elapse.
 </blockquote>
 
-<strong>max_eval_time_mins</strong>: integer, optional (default=5)
+<strong>max_eval_time_mins</strong>: float, optional (default=5)
 <blockquote>
 How many minutes TPOT has to evaluate a single pipeline.
 <br /><br />

diff --git a/docs_sources/using.md b/docs_sources/using.md
@@ -253,7 +253,7 @@ If provided, this setting will override the "generations" parameter and allow TP
 <tr>
 <td>-maxeval</td>
 <td>MAX_EVAL_MINS</td>
-<td>Any positive integer</td>
+<td>Any positive float</td>
 <td>How many minutes TPOT has to evaluate a single pipeline.
 <br /><br />
 Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.</td>

diff --git a/tests/export_tests.py b/tests/export_tests.py
@@ -505,9 +505,14 @@ def test_indent():
 
 def test_pipeline_score_save():
     """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
-    tpot_obj = TPOTClassifier(random_state=39)
+    tpot_obj = TPOTClassifier()
     tpot_obj._pbar = tqdm(total=1, disable=True)
-    pipeline = tpot_obj._toolbox.individual()
+    pipeline_string = (
+        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
+        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
+        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
+    )
+    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
     expected_code = """import numpy as np
 import pandas as pd
 from sklearn.feature_selection import SelectPercentile, f_classif
@@ -521,16 +526,15 @@ def test_pipeline_score_save():
 training_features, testing_features, training_target, testing_target = \\
             train_test_split(features, tpot_data['target'].values, random_state=42)
 
-# Score on the training set was:0.929813743
+# Average CV score on the training set was:0.929813743
 exported_pipeline = make_pipeline(
-    SelectPercentile(score_func=f_classif, percentile=65),
-    DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18)
+    SelectPercentile(score_func=f_classif, percentile=20),
+    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
 )
 
 exported_pipeline.fit(training_features, training_target)
 results = exported_pipeline.predict(testing_features)
 """
-
     assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743))
 
 

diff --git a/tests/feature_transformers_tests.py b/tests/feature_transformers_tests.py
@@ -0,0 +1,80 @@
+from sklearn.datasets import load_iris
+from tpot.builtins import CategoricalSelector, ContinuousSelector
+from nose.tools import assert_equal, assert_raises
+
+iris_data = load_iris().data
+
+def test_CategoricalSelector():
+    """Assert that CategoricalSelector works as expected."""
+    cs = CategoricalSelector()
+    X_transformed = cs.transform(iris_data[0:16, :])
+
+    assert_equal(X_transformed.shape[1],2)
+
+
+def test_CategoricalSelector_2():
+    """Assert that CategoricalSelector works as expected with threshold=5."""
+    cs = CategoricalSelector(threshold=5)
+    X_transformed = cs.transform(iris_data[0:16, :])
+
+    assert_equal(X_transformed.shape[1],1)
+
+
+def test_CategoricalSelector_3():
+    """Assert that CategoricalSelector works as expected with threshold=20."""
+    cs = CategoricalSelector(threshold=20)
+    X_transformed = cs.transform(iris_data[0:16, :])
+
+    assert_equal(X_transformed.shape[1],7)
+
+
+def test_CategoricalSelector_4():
+    """Assert that CategoricalSelector raises ValueError without categorical features."""
+    cs = CategoricalSelector()
+
+    assert_raises(ValueError, cs.transform, iris_data)
+
+
+def test_CategoricalSelector_fit():
+    """Assert that fit() in CategoricalSelector does nothing."""
+    op = CategoricalSelector()
+    ret_op = op.fit(iris_data)
+
+    assert ret_op==op
+
+
+def test_ContinuousSelector():
+    """Assert that ContinuousSelector works as expected."""
+    cs = ContinuousSelector(svd_solver='randomized')
+    X_transformed = cs.transform(iris_data[0:16, :])
+
+    assert_equal(X_transformed.shape[1],2)
+
+
+def test_ContinuousSelector_2():
+    """Assert that ContinuousSelector works as expected with threshold=5."""
+    cs = ContinuousSelector(threshold=5, svd_solver='randomized')
+    X_transformed = cs.transform(iris_data[0:16, :])
+    assert_equal(X_transformed.shape[1],3)
+
+
+def test_ContinuousSelector_3():
+    """Assert that ContinuousSelector works as expected with svd_solver='full'"""
+    cs = ContinuousSelector(threshold=10, svd_solver='full')
+    X_transformed = cs.transform(iris_data[0:16, :])
+    assert_equal(X_transformed.shape[1],2)
+
+
+def test_ContinuousSelector_4():
+    """Assert that ContinuousSelector raises ValueError without continuous features."""
+    cs = ContinuousSelector()
+
+    assert_raises(ValueError, cs.transform, iris_data[0:10,:])
+
+
+def test_ContinuousSelector_fit():
+    """Assert that fit() in ContinuousSelector does nothing."""
+    op = ContinuousSelector()
+    ret_op = op.fit(iris_data)
+
+    assert ret_op==op
diff --git a/tests/one_hot_encoder_tests.py b/tests/one_hot_encoder_tests.py
@@ -37,7 +37,7 @@
 from sklearn.model_selection import cross_val_score, KFold
 from nose.tools import assert_equal
 
-from tpot.builtins.one_hot_encoder import OneHotEncoder, _auto_select_categorical_features
+from tpot.builtins import OneHotEncoder, auto_select_categorical_features, _transform_selected
 
 
 iris_data = load_iris().data
@@ -144,8 +144,9 @@ def fit_then_transform_dense(expected, input,
 
 def test_auto_detect_categorical():
     """Assert that automatic selection of categorical features works as expected with a threshold of 10."""
-    selected = _auto_select_categorical_features(iris_data[0:16, :], threshold=10)
+    selected = auto_select_categorical_features(iris_data[0:16, :], threshold=10)
     expected = [False, False, True, True]
+
     assert_equal(selected, expected)
 
 
@@ -295,10 +296,33 @@ def test_transform():
     assert np.sum(output) == 3
 
 
+def test_transform_selected():
+    """Assert _transform_selected returns the original X when selected is an empty list."""
+    ohe = OneHotEncoder(categorical_features=[])
+    X = _transform_selected(
+            dense1,
+            ohe._fit_transform,
+            ohe.categorical_features,
+            copy=True
+        )
+    assert np.allclose(X, dense1)
+
+
+def test_transform_selected_2():
+    """Assert _transform_selected returns the original X when selected is a list of False values."""
+    ohe = OneHotEncoder(categorical_features=[False, False, False])
+    X = _transform_selected(
+            dense1,
+            ohe._fit_transform,
+            ohe.categorical_features,
+            copy=True
+        )
+    assert np.allclose(X, dense1)
+
+
 def test_k_fold_cv():
     """Test OneHotEncoder with categorical_features='auto'."""
     boston = load_boston()
-
     clf = make_pipeline(
         OneHotEncoder(
             categorical_features='auto',

diff --git a/tests/zero_count_tests.py b/tests/zero_count_tests.py
@@ -40,3 +40,11 @@ def test_ZeroCount():
 
     assert np.allclose(zero_col, X_transformed[:, 0])
     assert np.allclose(non_zero, X_transformed[:, 1])
+
+
+def test_ZeroCount_fit():
+    """Assert that fit() in ZeroCount does nothing."""
+    op = ZeroCount()
+    ret_op = op.fit(X)
+
+    assert ret_op==op
diff --git a/tpot/base.py b/tpot/base.py
@@ -174,7 +174,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
             How many minutes TPOT has to optimize the pipeline.
             If provided, this setting will override the "generations" parameter and allow
             TPOT to run until it runs out of time.
-        max_eval_time_mins: int, optional (default: 5)
+        max_eval_time_mins: float, optional (default: 5)
             How many minutes TPOT has to evaluate a single pipeline.
             Setting this parameter to higher values will allow TPOT to explore more
             complex pipelines, but will also allow TPOT to run longer.
@@ -863,6 +863,12 @@ def predict_proba(self, features):
         else:
             if not (hasattr(self.fitted_pipeline_, 'predict_proba')):
                 raise RuntimeError('The fitted pipeline does not have the predict_proba() function.')
+
+            features = features.astype(np.float64)
+
+            if np.any(np.isnan(features)):
+                features = self._impute_values(features)
+
             return self.fitted_pipeline_.predict_proba(features.astype(np.float64))
 
     def set_params(self, **params):

diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py
@@ -26,4 +26,5 @@
 from .zero_count import ZeroCount
 from .combine_dfs import CombineDFs
 from .stacking_estimator import StackingEstimator
-from .one_hot_encoder import OneHotEncoder
+from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected
+from .feature_transformers import CategoricalSelector, ContinuousSelector