Add GMMRegression scikit-learn RegressorMixin #29

Closed
wants to merge 4 commits into from
2 changes: 1 addition & 1 deletion gmr/__init__.py
@@ -20,7 +20,7 @@
 if not __GMR_SETUP__:
     from . import gmm, mvn, utils

-    __all__ = ["gmm", "mvn", "utils"]
+    __all__ = ["gmm", "mvn", "utils", "sklearn"]

     from .mvn import MVN, plot_error_ellipse
     from .gmm import (GMM, plot_error_ellipses, kmeansplusplus_initialization,
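Since this is the only change to `__init__.py`, a short note on its effect: listing `"sklearn"` in `__all__` makes a star import pull in the new submodule as well; a direct import works either way. A minimal sketch, assuming scikit-learn is installed (otherwise `gmr/sklearn.py` below raises `ImportError`):

```python
from gmr import *                                  # now also imports gmr.sklearn
from gmr.sklearn import GaussianMixtureRegressor   # direct import works regardless
```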
87 changes: 87 additions & 0 deletions gmr/sklearn.py
@@ -0,0 +1,87 @@
import numpy as np

try:
    from sklearn.base import BaseEstimator, RegressorMixin, MultiOutputMixin
    from sklearn.utils import check_X_y
    from sklearn.utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
except ImportError:
    raise ImportError("Install scikit-learn (e.g. pip install scikit-learn) "
                      "to use this extension.")

from .gmm import GMM


class GaussianMixtureRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Gaussian mixture regression compatible with scikit-learn.

    Parameters
    ----------
    n_components : int
        Number of MVNs that compose the GMM.

    priors : array, shape (n_components,), optional
        Weights of the components.

    means : array, shape (n_components, n_features), optional
        Means of the components.

    covariances : array, shape (n_components, n_features, n_features), optional
        Covariances of the components.

    verbose : int, optional (default: 0)
        Verbosity level.

    random_state : int or RandomState, optional (default: global random state)
        If an integer is given, it fixes the seed. Defaults to the global numpy
        random number generator.

    R_diff : float, optional (default: 1e-4)
        Minimum allowed difference of responsibilities between successive
        EM iterations.

    n_iter : int, optional (default: 500)
        Maximum number of iterations.

    init_params : str, optional (default: 'random')
        Parameter initialization strategy. If means and covariances are
        given in the constructor, this parameter will have no effect.
        'random' will sample initial means randomly from the dataset
        and set covariances to identity matrices. This is the
        computationally cheap solution.
        'kmeans++' will use k-means++ initialization for means and
        initialize covariances to diagonal matrices with variances
        set based on the average distances of samples in each dimension.
        This is computationally more expensive but often gives much
        better results.
    """

    def __init__(self, n_components, priors=None, means=None, covariances=None,
                 verbose=0, random_state=None, R_diff=1e-4, n_iter=500,
                 init_params="random"):
        self.n_components = n_components
        self.priors = priors
        self.means = means
        self.covariances = covariances
        self.verbose = verbose
        self.random_state = random_state
        self.R_diff = R_diff
        self.n_iter = n_iter
        self.init_params = init_params

    def fit(self, X, y):
        """Fit a GMM to the joint distribution of inputs and outputs."""
        self.gmm_ = GMM(self.n_components, priors=self.priors, means=self.means,
                        covariances=self.covariances, verbose=self.verbose,
                        random_state=self.random_state)

        X, y = check_X_y(X, y, estimator=self.gmm_, dtype=FLOAT_DTYPES,
                         multi_output=True)
        if y.ndim == 1:
            y = np.expand_dims(y, 1)

        self.indices_ = np.arange(X.shape[1])

        self.gmm_.from_samples(
            np.hstack((X, y)), R_diff=self.R_diff, n_iter=self.n_iter,
            init_params=self.init_params)
        return self

    def predict(self, X):
        """Predict outputs by conditioning the GMM on the inputs."""
        check_is_fitted(self, ["gmm_", "indices_"])
        X = check_array(X, estimator=self.gmm_, dtype=FLOAT_DTYPES)

        return self.gmm_.predict(self.indices_, X)
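Because the class derives from `BaseEstimator`, `RegressorMixin`, and `MultiOutputMixin`, it inherits `get_params`/`set_params` and an R² `score` method, so it should drop into standard scikit-learn tooling. A minimal usage sketch (the toy data mirrors the tests below; `cross_val_score` and `GridSearchCV` are plain scikit-learn utilities, not part of this diff):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score

from gmr.sklearn import GaussianMixtureRegressor

# Two linear segments, the same kind of data the tests below use.
X = np.linspace(0, 2, 200)[:, np.newaxis]
y = np.hstack((3 * X[:100, 0] + 1, -3 * X[100:, 0] + 7))

reg = GaussianMixtureRegressor(n_components=2, random_state=0)
reg.fit(X, y)

# RegressorMixin supplies score(): the coefficient of determination R^2.
print(reg.score(X, y))

# BaseEstimator supplies get_params/set_params, so model selection works.
print(cross_val_score(reg, X, y, cv=3))
search = GridSearchCV(reg, {"n_components": [1, 2, 3]}, cv=3)
print(search.fit(X, y).best_params_)
```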
8 changes: 8 additions & 0 deletions gmr/tests/test_gmm.py
@@ -251,6 +251,14 @@ def test_regression_with_2d_input():
     pred = gmm.predict(np.array([0, 1]), np.hstack((x, x[::-1])))
     mse = np.sum((y - pred) ** 2) / n_samples

+    n_samples = 200
+    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
+    y1 = 3 * x[:n_samples // 2] + 1
+    y2 = -3 * x[n_samples // 2:] + 7
+    noise = random_state.randn(n_samples, 1) * 0.01
+    y = np.vstack((y1, y2)) + noise
+    samples = np.hstack((x, x[::-1], y))
+

 def test_regression_without_noise():
     """Test regression without noise."""
84 changes: 84 additions & 0 deletions gmr/tests/test_sklearn.py
@@ -0,0 +1,84 @@
import numpy as np
from nose.tools import assert_less
from numpy.testing import assert_array_almost_equal
from gmr.utils import check_random_state

from gmr.sklearn import GaussianMixtureRegressor


def test_sklearn_regression():
    """Test regression with GaussianMixtureRegressor."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    noise = random_state.randn(n_samples, 1) * 0.01
    y = np.vstack((y1, y2)) + noise

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)
    assert_array_almost_equal(gmr.gmm_.priors, 0.5 * np.ones(2), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[0], np.array([0.5, 2.5]), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[1], np.array([1.5, 2.5]), decimal=1)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_with_2d_input():
    """Test regression with GaussianMixtureRegressor and two-dimensional input."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    noise = random_state.randn(n_samples, 1) * 0.01
    y = np.vstack((y1, y2)) + noise

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_with_1d_output():
    """Test regression with GaussianMixtureRegressor and one-dimensional output."""
    random_state = check_random_state(0)

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y = 3 * x + 1
    y = y.flatten()

    gmr = GaussianMixtureRegressor(n_components=1, random_state=random_state)
    gmr.fit(x, y)

    pred = gmr.predict(x)
    # Flatten the (n_samples, 1) prediction so the difference does not broadcast.
    mse = np.sum((y - pred.ravel()) ** 2) / n_samples
    assert_less(mse, 0.01)


def test_sklearn_regression_without_noise():
    """Test regression without noise."""
    random_state = 0

    n_samples = 200
    x = np.linspace(0, 2, n_samples)[:, np.newaxis]
    y1 = 3 * x[:n_samples // 2] + 1
    y2 = -3 * x[n_samples // 2:] + 7
    y = np.vstack((y1, y2))

    gmr = GaussianMixtureRegressor(n_components=2, random_state=random_state)
    gmr.fit(x, y)
    assert_array_almost_equal(gmr.gmm_.priors, 0.5 * np.ones(2), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[0], np.array([1.5, 2.5]), decimal=2)
    assert_array_almost_equal(gmr.gmm_.means[1], np.array([0.5, 2.5]), decimal=1)

    pred = gmr.predict(x)
    mse = np.sum((y - pred) ** 2) / n_samples
    assert_less(mse, 0.01)
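For reference, these tests exercise the public API only; in terms of the plain gmr API, `fit`/`predict` implement the fit-then-condition pattern visible in the diff above: stack inputs and outputs column-wise, estimate a joint `GMM`, and condition on the input columns at prediction time. A sketch of that equivalence (variable names are illustrative; with matching defaults and seeds both paths should agree):

```python
import numpy as np

from gmr import GMM
from gmr.sklearn import GaussianMixtureRegressor

n_samples = 200
x = np.linspace(0, 2, n_samples)[:, np.newaxis]
y = np.vstack((3 * x[:n_samples // 2] + 1, -3 * x[n_samples // 2:] + 7))

# What GaussianMixtureRegressor.fit/predict do, spelled out:
gmm = GMM(n_components=2, random_state=0)
gmm.from_samples(np.hstack((x, y)))                  # joint density over (x, y)
pred_joint = gmm.predict(np.arange(x.shape[1]), x)   # condition on the x columns

# The estimator wraps exactly this pattern behind fit/predict:
reg = GaussianMixtureRegressor(n_components=2, random_state=0)
pred_wrapped = reg.fit(x, y).predict(x)
```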