Add GMMRegression scikit-learn RegressorMixin #29

Closed
wants to merge 4 commits into from
2 changes: 1 addition & 1 deletion gmr/__init__.py
@@ -20,7 +20,7 @@
 if not __GMR_SETUP__:
     from . import gmm, mvn, utils

-    __all__ = ["gmm", "mvn", "utils"]
+    __all__ = ["gmm", "mvn", "utils", "sklearn"]

     from .mvn import MVN, plot_error_ellipse
     from .gmm import (GMM, plot_error_ellipses, kmeansplusplus_initialization,
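Since this is the only change to `__init__.py`, a short note on its effect: listing `"sklearn"` in `__all__` makes a star import pull in the new submodule as well; a direct import works either way. A minimal sketch, assuming scikit-learn is installed (otherwise `gmr/sklearn.py` below raises `ImportError`):

```python
from gmr import *                                  # now also imports gmr.sklearn
from gmr.sklearn import GaussianMixtureRegressor   # direct import works regardless
```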
87 changes: 87 additions & 0 deletions gmr/sklearn.py
@@ -0,0 +1,87 @@
import numpy as np

try:
    from sklearn.base import BaseEstimator, RegressorMixin, MultiOutputMixin
    from sklearn.utils import check_X_y
    from sklearn.utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
except ImportError:
    raise ImportError("Install scikit-learn (e.g. pip install scikit-learn) "
                      "to use this extension.")

from .gmm import GMM


class GaussianMixtureRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Gaussian mixture regression compatible with scikit-learn.

    Parameters
    ----------
    n_components : int
        Number of MVNs that compose the GMM.

    priors : array, shape (n_components,), optional
        Weights of the components.

    means : array, shape (n_components, n_features), optional
        Means of the components.

    covariances : array, shape (n_components, n_features, n_features), optional
        Covariances of the components.

    verbose : int, optional (default: 0)
        Verbosity level.

    random_state : int or RandomState, optional (default: global random state)
        If an integer is given, it fixes the seed. Defaults to the global numpy
        random number generator.

    R_diff : float, optional (default: 1e-4)
        Minimum allowed difference of responsibilities between successive
        EM iterations.

    n_iter : int, optional (default: 500)
        Maximum number of iterations.

    init_params : str, optional (default: 'random')
        Parameter initialization strategy. If means and covariances are
        given in the constructor, this parameter will have no effect.
        'random' will sample initial means randomly from the dataset
        and set covariances to identity matrices. This is the
        computationally cheap solution.
        'kmeans++' will use k-means++ initialization for means and
        initialize covariances to diagonal matrices with variances
        set based on the average distances of samples in each dimension.
        This is computationally more expensive but often gives much
        better results.
    """

    def __init__(self, n_components, priors=None, means=None, covariances=None,
                 verbose=0, random_state=None, R_diff=1e-4, n_iter=500,
                 init_params="random"):
        self.n_components = n_components
        self.priors = priors
        self.means = means
        self.covariances = covariances
        self.verbose = verbose
        self.random_state = random_state
        self.R_diff = R_diff
        self.n_iter = n_iter
        self.init_params = init_params

    def fit(self, X, y):
        """Fit a GMM to the joint distribution of inputs and outputs."""
        self.gmm_ = GMM(self.n_components, priors=self.priors, means=self.means,
                        covariances=self.covariances, verbose=self.verbose,
                        random_state=self.random_state)

        X, y = check_X_y(X, y, estimator=self.gmm_, dtype=FLOAT_DTYPES,
                         multi_output=True)
        if y.ndim == 1:
            y = np.expand_dims(y, 1)

        self.indices_ = np.arange(X.shape[1])

        self.gmm_.from_samples(
            np.hstack((X, y)), R_diff=self.R_diff, n_iter=self.n_iter,
            init_params=self.init_params)
        return self

    def predict(self, X):
        """Predict outputs by conditioning the GMM on the inputs."""
        check_is_fitted(self, ["gmm_", "indices_"])
        X = check_array(X, estimator=self.gmm_, dtype=FLOAT_DTYPES)

        return self.gmm_.predict(self.indices_, X)
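Because the class derives from `BaseEstimator`, `RegressorMixin`, and `MultiOutputMixin`, it inherits `get_params`/`set_params` and an R² `score` method, so it should drop into standard scikit-learn tooling. A minimal usage sketch (the toy data mirrors the tests below; `cross_val_score` and `GridSearchCV` are plain scikit-learn utilities, not part of this diff):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score

from gmr.sklearn import GaussianMixtureRegressor

# Two linear segments, the same kind of data the tests below use.
X = np.linspace(0, 2, 200)[:, np.newaxis]
y = np.hstack((3 * X[:100, 0] + 1, -3 * X[100:, 0] + 7))

reg = GaussianMixtureRegressor(n_components=2, random_state=0)
reg.fit(X, y)

# RegressorMixin supplies score(): the coefficient of determination R^2.
print(reg.score(X, y))

# BaseEstimator supplies get_params/set_params, so model selection works.
print(cross_val_score(reg, X, y, cv=3))
search = GridSearchCV(reg, {"n_components": [1, 2, 3]}, cv=3)
print(search.fit(X, y).best_params_)
```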
8 changes: 8 additions & 0 deletions gmr/tests/test_gmm.py
@@ -251,6 +251,14 @@ def test_regression_with_2d_input():
     pred = gmm.predict(np.array([0, 1]), np.hstack((x, x[::-1])))
     mse = np.sum((y - pred) ** 2) / n_samples

+    n_samples = 200
+    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
+    y1 = 3 * x[:n_samples // 2] + 1
+    y2 = -3 * x[n_samples // 2:] + 7
+    noise = random_state.randn(n_samples, 1) * 0.01
+    y = np.vstack((y1, y2)) + noise
+    samples = np.hstack((x, x[::-1], y))
+

 def test_regression_without_noise():
     """Test regression without noise."""
84 changes: 84 additions & 0 deletions gmr/tests/test_sklearn.py
@@ -0,0 +1,84 @@
import numpy as np
from nose.tools import assert_less
from numpy.testing import assert_array_almost_equal
from gmr.utils import check_random_state

from gmr.sklearn import GaussianMixtureRegressor


def test_sklearn_regression():
    """Test regression with GaussianMixtureRegressor."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    noise = random_state.randn(n_samples, 1) * 0.01
    y = np.vstack((y1, y2)) + noise

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)
    assert_array_almost_equal(gmr.gmm_.priors, 0.5 * np.ones(2), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[0], np.array([0.5, 2.5]), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[1], np.array([1.5, 2.5]), decimal=1)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_with_2d_input():
    """Test regression with GaussianMixtureRegressor and two-dimensional input."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    noise = random_state.randn(n_samples, 1) * 0.01
    y = np.vstack((y1, y2)) + noise

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_with_1d_output():
    """Test regression with GaussianMixtureRegressor and one-dimensional output."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y = 3 * x + 1
    y = y.flatten()

    gmr = GaussianMixtureRegressor(n_components=1, random_state=random_state)
    gmr.fit(x, y)

    pred = gmr.predict(x)
    # Flatten the (n_samples, 1) prediction so the difference does not broadcast.
    mse = np.sum((y - pred.ravel()) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_without_noise():
    """Test regression without noise."""
    random_state = 0

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    y = np.vstack((y1, y2))

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)
    assert_array_almost_equal(gmr.gmm_.priors, 0.5 * np.ones(2), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[0], np.array([1.5, 2.5]), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[1], np.array([0.5, 2.5]), decimal=1)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)
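For reference, these tests exercise the public API only; in terms of the plain gmr API, `fit`/`predict` implement the fit-then-condition pattern visible in the diff above: stack inputs and outputs column-wise, estimate a joint `GMM`, and condition on the input columns at prediction time. A sketch of that equivalence (variable names are illustrative; with matching defaults and seeds both paths should agree):

```python
import numpy as np

from gmr import GMM
from gmr.sklearn import GaussianMixtureRegressor

n_samples = 200
x = np.linspace(0, 2, n_samples)[:, np.newaxis]
y = np.vstack((3 * x[:n_samples // 2] + 1, -3 * x[n_samples // 2:] + 7))

# What GaussianMixtureRegressor.fit/predict do, spelled out:
gmm = GMM(n_components=2, random_state=0)
gmm.from_samples(np.hstack((x, y)))                  # joint density over (x, y)
pred_joint = gmm.predict(np.arange(x.shape[1]), x)   # condition on the x columns

# The estimator wraps exactly this pattern behind fit/predict:
reg = GaussianMixtureRegressor(n_components=2, random_state=0)
pred_wrapped = reg.fit(x, y).predict(x)
```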