Commit

Merge d33a238 into 815b0e2

8bit-pixies committed Jul 26, 2019
2 parents 815b0e2 + d33a238 commit d9eec3e
Showing 5 changed files with 239 additions and 0 deletions.
1 change: 1 addition & 0 deletions optional-requirements.txt
@@ -1,3 +1,4 @@
xgboost==0.6a2
scikit-mdr==0.4.4
skrebate==0.3.4
tensorflow>=1.12.0
1 change: 1 addition & 0 deletions setup.py
@@ -46,6 +46,7 @@ def calculate_version():
        'joblib>=0.10.3'],
    extras_require={
        'xgboost': ['xgboost==0.6a2'],
        'tensorflow': ['tensorflow>=1.12.0'],
        'skrebate': ['skrebate>=0.3.4'],
        'mdr': ['scikit-mdr>=0.4.4'],
        'dask': ['dask>=0.18.2',
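With this extra registered, the TensorFlow dependency can presumably be pulled in via pip's extras syntax, e.g. pip install tpot[tensorflow] (an assumption based on standard setuptools behavior, not stated in the commit itself).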
93 changes: 93 additions & 0 deletions tests/embedding_estimator_tests.py
@@ -0,0 +1,93 @@
from sklearn.datasets import make_classification, make_regression
from tpot.builtins import EmbeddingEstimator
from sklearn.neural_network import MLPClassifier, MLPRegressor


def test_EmbeddingClassifier_1():
    """Assert that Embedding for classification works as expected."""
    X, y = make_classification(random_state=1)
    cs = EmbeddingEstimator(MLPClassifier(random_state=1, tol=0.9))
    X_transformed = cs.fit_transform(X, y)

    # 20 features + 100 embedding size
    assert X_transformed.shape[1] == 120


def test_EmbeddingClassifier_2():
    """Assert that the correct embedding layer is selected (classifier)."""
    X, y = make_classification(random_state=1)
    cs = EmbeddingEstimator(
        MLPClassifier(hidden_layer_sizes=[20, 10], random_state=1, tol=0.9)
    )
    cs_2 = EmbeddingEstimator(
        MLPClassifier(hidden_layer_sizes=[20, 10], random_state=1, tol=0.9),
        embedding_layer=1,
    )
    X_transformed = cs.fit_transform(X, y)
    X_transformed_2 = cs_2.fit_transform(X, y)

    assert X_transformed.shape[1] == 30  # 20 features + 10 embedding size
    assert X_transformed_2.shape[1] == 40  # 20 features + 20 embedding size


def test_EmbeddingRegressor_1():
    """Assert that Embedding for regression works as expected."""
    X, y = make_regression(n_features=20, random_state=1)
    cs = EmbeddingEstimator(MLPRegressor(random_state=1, tol=1000))
    X_transformed = cs.fit_transform(X, y)

    # 20 features + 100 embedding size
    assert X_transformed.shape[1] == 120


def test_EmbeddingRegressor_2():
    """Assert that the correct embedding layer is selected (regressor)."""
    X, y = make_regression(n_features=20, random_state=1)
    cs = EmbeddingEstimator(
        MLPRegressor(hidden_layer_sizes=[20, 10], random_state=1, tol=1000)
    )
    cs_2 = EmbeddingEstimator(
        MLPRegressor(hidden_layer_sizes=[20, 10], random_state=1, tol=1000),
        embedding_layer=1,
    )
    X_transformed = cs.fit_transform(X, y)
    X_transformed_2 = cs_2.fit_transform(X, y)

    assert X_transformed.shape[1] == 30  # 20 features + 10 embedding size
    assert X_transformed_2.shape[1] == 40  # 20 features + 20 embedding size


def test_EmbeddingKeras():
    """Check that this also works for Keras models."""
    try:
        import tensorflow as tf
    except ImportError:
        tf = None
    if tf is None:
        return
    from tensorflow.keras import backend as K
    import tensorflow.keras as keras
    from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Activation

    def make_model(input_shape):
        model = Sequential()
        model.add(Dense(20, activation="relu", input_dim=input_shape))
        model.add(Dense(15, activation="relu"))
        model.add(Dense(2, activation="softmax"))
        model.compile(
            optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
        )
        return model

    X, y = make_classification(random_state=1)
    cs = EmbeddingEstimator(KerasClassifier(make_model), backend=K)
    cs_2 = EmbeddingEstimator(
        KerasClassifier(make_model), embedding_layer=-3, backend=K
    )
    X_transformed = cs.fit_transform(X, y, verbose=0)
    X_transformed_2 = cs_2.fit_transform(X, y, verbose=0)

    assert X_transformed.shape[1] == 35  # 20 features + 15 embedding size
    assert X_transformed_2.shape[1] == 40  # 20 features + 20 embedding size
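A note on the Keras layer indices above (inferred from the test itself, not stated in the commit): Sequential.layers does not include the input, so make_model yields [Dense(20), Dense(15), Dense(2)]. The default embedding_layer of -2 therefore selects the 15-unit layer (20 + 15 = 35 columns), while -3 selects the 20-unit layer (20 + 20 = 40 columns).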
1 change: 1 addition & 0 deletions tpot/builtins/__init__.py
Expand Up @@ -29,3 +29,4 @@
from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected
from .feature_transformers import CategoricalSelector, ContinuousSelector
from .feature_set_selector import FeatureSetSelector
from .embedding_estimator import EmbeddingEstimator
143 changes: 143 additions & 0 deletions tpot/builtins/embedding_estimator.py
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""This file is part of the TPOT library.
TPOT was primarily developed at the University of Pennsylvania by:
- Randal S. Olson (rso@randalolson.com)
- Weixuan Fu (weixuanf@upenn.edu)
- Daniel Angell (dpa34@drexel.edu)
- and many more generous open source contributors
TPOT is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
TPOT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.neural_network import MLPClassifier, MLPRegressor


class EmbeddingEstimator(TransformerMixin, BaseEstimator):
    """Meta-transformer for creating neural network embeddings as features."""

    def __init__(self, estimator, embedding_layer=None, backend=None):
        """Create an EmbeddingEstimator object.

        Parameters
        ----------
        estimator: neural network model, either scikit-learn or Keras-like.
            The estimator used to generate embeddings.
        embedding_layer: int, optional
            The layer whose activations are used as the embedding. By default
            the second-to-last layer is used. Layers are counted with the
            input layer as layer 0; negative indices are allowed.
        backend: optional
            The backend used to query the neural network. Not required when
            using the scikit-learn interface. Currently only a Keras-like
            interface (incl. tensorflow.keras) is supported.
        """
        second_last_layer = -2
        self.estimator = estimator
        self.embedding_layer = (
            second_last_layer if embedding_layer is None else embedding_layer
        )
        self.backend = backend

    def fit(self, X, y=None, **fit_params):
        """Fit the EmbeddingEstimator meta-transformer.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            The training input samples.
        y: array-like, shape (n_samples,)
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
        fit_params:
            Other estimator-specific parameters.

        Returns
        -------
        self: object
            Returns self, the fitted transformer.
        """
        if not isinstance(self.estimator, (MLPClassifier, MLPRegressor)):
            # Keras scikit-learn wrappers build their model lazily, so the
            # input shape is passed through sk_params before fitting.
            input_shape = X.shape[1]
            self.estimator.sk_params["input_shape"] = input_shape
            self.estimator.check_params(self.estimator.sk_params)
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Transform data by adding the embedding as features.

        Parameters
        ----------
        X: numpy ndarray, {n_samples, n_components}
            New data, where n_samples is the number of samples and
            n_components is the number of components.

        Returns
        -------
        X_transformed: array-like, shape (n_samples, n_features + embedding)
            The transformed feature set, where embedding is the size of the
            embedding layer.
        """
        X = check_array(X)
        X_transformed = np.copy(X)
        # prepend the embedding activations as synthetic features
        if isinstance(self.estimator, (MLPClassifier, MLPRegressor)):
            X_transformed = np.hstack(
                (self._embedding_mlp(self.estimator, X), X_transformed)
            )
        else:
            X_transformed = np.hstack(
                (self._embedding_keras(self.estimator, X), X_transformed)
            )

        return X_transformed

    def _embedding_mlp(self, estimator, X):
        # see also BaseMultilayerPerceptron._predict from
        # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/neural_network/multilayer_perceptron.py
        X = check_array(X, accept_sparse=["csr", "csc", "coo"])

        # Make sure hidden_layer_sizes is a list
        hidden_layer_sizes = estimator.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)

        layer_units = [X.shape[1]] + hidden_layer_sizes + [estimator.n_outputs_]

        # Initialize the activation storage for each layer
        activations = [X]

        for i in range(estimator.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate, then read off the requested layer
        estimator._forward_pass(activations)
        y_embedding = activations[self.embedding_layer]

        return y_embedding

    def _embedding_keras(self, estimator, X):
        X = check_array(X, accept_sparse=["csr", "csc", "coo"])
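        # Build a backend function from the model's input tensor to the
        # chosen layer's output; this assumes a Keras-like backend that
        # exposes backend.function (e.g. tensorflow.keras.backend).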
        get_embedding = self.backend.function(
            [estimator.model.layers[0].input],
            [estimator.model.layers[self.embedding_layer].output],
        )
        return get_embedding([X])[0]

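As an editorial illustration (not part of the commit), a minimal sketch of how the new transformer could slot into a scikit-learn pipeline; the make_pipeline layout and the LogisticRegression downstream model are assumptions, not TPOT API:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import EmbeddingEstimator

X, y = make_classification(random_state=1)

# Augment the 20 raw features with the 10-unit second-to-last hidden
# layer of a small MLP, then fit a downstream linear classifier on the
# widened (n_samples, 30) matrix.
pipeline = make_pipeline(
    EmbeddingEstimator(MLPClassifier(hidden_layer_sizes=[20, 10], random_state=1)),
    LogisticRegression(),
)
pipeline.fit(X, y)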