Improve RMH correction with covariance estimated from data.
vnmabus committed Aug 15, 2023
1 parent 2295f15 commit 6a066f7
Showing 3 changed files with 105 additions and 63 deletions.
1 change: 1 addition & 0 deletions skfda/exploratory/stats/covariance/_parametric_gaussian.py
@@ -51,6 +51,7 @@ def fit(

regressor = GaussianProcessRegressor(kernel=cov)
regressor.fit(grid_points, data_matrix.T)
self.cov_ = regressor.kernel_

# TODO: Skip cov computation?
# TODO: Use a user-public structure to represent the covariance,
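The added line keeps the kernel whose hyperparameters were optimized by the internal GaussianProcessRegressor. A minimal sketch (not part of the diff) of inspecting the fitted estimator; ``cov_`` is the attribute added here and ``covariance_`` appears later in this diff, while the exact ``fit`` call signature and the sampling parameters are assumptions:

from skfda.datasets import make_gaussian_process
from skfda.exploratory.stats.covariance import ParametricGaussianCovariance
from skfda.misc.covariances import Exponential

# Simulate trajectories of a Gaussian process with exponential covariance.
X = make_gaussian_process(
    n_samples=50,
    n_features=100,
    cov=Exponential(length_scale=2),
    random_state=0,
)

estimator = ParametricGaussianCovariance(cov=Exponential())
estimator.fit(X)

# Kernel with data-optimized hyperparameters, stored by the added line.
print(estimator.cov_)
# Estimated covariance, consumed by GaussianSampleCorrection in this commit.
print(estimator.covariance_)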
101 changes: 38 additions & 63 deletions skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py
@@ -2,7 +2,6 @@
from __future__ import annotations

import abc
import copy
import math
from typing import (
TYPE_CHECKING,
@@ -23,9 +22,14 @@
import numpy.ma as ma
import scipy.stats
import sklearn.utils
from sklearn.base import clone
from typing_extensions import Literal

import dcor
from skfda.exploratory.stats.covariance import (
CovarianceEstimator,
EmpiricalCovariance,
)

from ...._utils._sklearn_adapter import (
BaseEstimator,
@@ -132,7 +136,7 @@ class Correction(BaseEstimator):
"""

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
def fit(self, X: FDataGrid, y: NDArrayFloat) -> None:
"""
Initialize the correction for a run.
@@ -239,8 +243,6 @@ class GaussianCorrection(ConditionalMeanCorrection):
Parameters:
mean: Mean function of the Gaussian process.
cov: Covariance function of the Gaussian process.
fit_hyperparameters: If ``True`` the hyperparameters of the
covariance function are optimized for the data.
"""

@@ -249,40 +251,11 @@ def __init__(
*,
mean: Union[float, Callable[[NDArrayFloat], NDArrayFloat]] = 0,
cov: Union[float, CovarianceLike] = 1,
fit_hyperparameters: bool = False,
) -> None:
super().__init__()

self.mean = mean
self.cov = make_kernel(cov)
self.fit_hyperparameters = fit_hyperparameters

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
if self.fit_hyperparameters:
# TODO: Migrate this to scikit-learn
import GPy

T = X.grid_points[0]
X_copy = np.copy(X.data_matrix[..., 0])

y = np.ravel(y)
for class_label in np.unique(y):
trajectories = X_copy[y == class_label, :]

mean = np.mean(trajectories, axis=0)
X_copy[y == class_label, :] -= mean

gpy_kernel = getattr(self.cov, "_PicklableKernel__kernel")

m = GPy.models.GPRegression(
T[:, None],
X_copy.T,
kernel=gpy_kernel,
)
m.constrain_positive('')
m.optimize()

self.cov_ = copy.deepcopy(make_kernel(m.kern))
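The GPy-dependent hyperparameter fitting removed above is superseded: optimizing the kernel hyperparameters is now the job of a covariance estimator passed to GaussianSampleCorrection. A roughly equivalent construction after this commit (a sketch, mirroring the new test at the end of this diff):

# Before (removed):
#     GaussianCorrection(cov=Exponential(), fit_hyperparameters=True)
# After: delegate hyperparameter optimization to a covariance estimator.
correction = GaussianSampleCorrection(
    cov_estimator=ParametricGaussianCovariance(cov=Exponential()),
)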

def _evaluate_mean(self, t: NDArrayFloat) -> NDArrayFloat:

@@ -501,40 +474,42 @@ class GaussianSampleCorrection(ConditionalMeanCorrection):
Correction assuming that the process is Gaussian, using as its kernel
a covariance estimated from the data.

Parameters:
cov_estimator: Covariance estimator to use.
"""

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
def __init__(
self,
cov_estimator: CovarianceEstimator[FDataGrid] | None = None,
):
self.cov_estimator = cov_estimator

def fit(self, X: FDataGrid, y: NDArrayFloat) -> None:

X_copy = np.copy(X.data_matrix[..., 0])
self.cov_estimator_ = (
EmpiricalCovariance()
if self.cov_estimator is None
else clone(self.cov_estimator)
)

X_matrix_copy = np.copy(X.data_matrix[..., 0])

y = np.ravel(y)
for class_label in np.unique(y):
trajectories = X_copy[y == class_label, :]
class_index = (y == class_label)
trajectories = X_matrix_copy[class_index]

mean = np.mean(trajectories, axis=0)
X_copy[y == class_label, :] -= mean
X_matrix_copy[class_index] -= mean

X_copy = X.copy(data_matrix=X_matrix_copy)

self.cov_matrix_ = np.cov(X_copy, rowvar=False)
self.t_ = np.ravel(X.grid_points)
self.cov_estimator_.fit(X_copy)
self.gaussian_correction_ = GaussianCorrection(
cov=self.cov_fun,
cov=self.cov_estimator_.covariance_,
)
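Usage implied by the new ``fit``: with no argument the sample covariance is kept as the default (``EmpiricalCovariance``), while a user-supplied estimator is cloned, fitted on the per-class centered trajectories, and its ``covariance_`` handed directly to GaussianCorrection. This replaces the ``cov_fun`` lookup removed below, which indexed a precomputed covariance matrix via ``searchsorted``. A sketch:

# Default: sample covariance, as before this commit.
correction = GaussianSampleCorrection()

# New: any covariance estimator, e.g. a parametric one.
correction = GaussianSampleCorrection(
    cov_estimator=ParametricGaussianCovariance(cov=Exponential()),
)
correction.fit(X, y)  # centers each class, then fits the cloned estimator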

def cov_fun(
self,
t_0: ArrayLike,
t_1: ArrayLike,
) -> NDArrayFloat:
i = np.searchsorted(self.t_, t_0)
j = np.searchsorted(self.t_, t_1)

i_r = np.ravel(i)
j_r = np.ravel(j)

return self.cov_matrix_[ # type: ignore[no-any-return]
np.ix_(i_r, j_r)
]

def conditioned(
self,
*,
@@ -977,10 +952,10 @@ def fit( # type: ignore[override] # noqa: D102

y = np.asfarray(y)

correction = (
self.correction
if self.correction
else UniformCorrection()
self.correction_ = (
UniformCorrection()
if self.correction is None
else clone(self.correction)
)

redundancy_condition = (
@@ -1007,7 +982,7 @@ def fit( # type: ignore[override] # noqa: D102
relevances = []
first_pass = True

correction.begin(X, y)
self.correction_.fit(X, y)

while True:
dependences = _compute_dependence(
@@ -1060,12 +1035,12 @@ def fit( # type: ignore[override] # noqa: D102
mask |= influence_mask

# Correct the influence of t_max
X = correction(
X = self.correction_(
X=X,
selected_index=t_max_index,
)

correction = correction.conditioned(
self.correction_ = self.correction_.conditioned(
X=X.data_matrix,
T=X.grid_points[0],
t_0=X.grid_points[0][t_max_index],
@@ -1087,7 +1062,7 @@ def transform(self, X: FDataGrid) -> NDArrayFloat:

output = X_matrix[(slice(None),) + self.indexes_]

return output.reshape( # type: ignore[no-any-return]
return output.reshape(
X.n_samples,
-1,
)
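Taken together, the changes in this file align Correction with scikit-learn conventions: ``begin`` becomes ``fit``, the user-provided correction is cloned instead of mutated, and the fitted object is kept in a trailing-underscore attribute. The lifecycle inside RecursiveMaximaHunting.fit, condensed from the hunks above:

from sklearn.base import clone

correction_ = (
    UniformCorrection() if correction is None else clone(correction)
)
correction_.fit(X, y)  # was: correction.begin(X, y)

# For each selected point t_max inside the selection loop:
X = correction_(X=X, selected_index=t_max_index)
correction_ = correction_.conditioned(
    X=X.data_matrix,
    T=X.grid_points[0],
    t_0=X.grid_points[0][t_max_index],
)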
66 changes: 66 additions & 0 deletions skfda/tests/test_recursive_maxima_hunting.py
@@ -2,9 +2,12 @@
import unittest

import numpy as np
import pytest

import skfda
from skfda.datasets import make_gaussian_process
from skfda.exploratory.stats.covariance import ParametricGaussianCovariance
from skfda.misc.covariances import Exponential
from skfda.preprocessing.dim_reduction import variable_selection as vs


@@ -65,6 +68,69 @@ def mean_1( # noqa: WPS430
points = X.grid_points[0][point_mask]
np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1)

@pytest.mark.filterwarnings(
'ignore::sklearn.exceptions.ConvergenceWarning'
)
def test_fit_exponential(self) -> None:
"""
Test RMH with a covariance estimated parametrically from the data.

It tests a case with two homoscedastic Gaussian processes with
exponential covariance where the difference of means is piecewise
linear.

In this case RMH should return the points where the linear parts
join.
"""
n_samples = 10000
n_features = 100

def mean_1( # noqa: WPS430
t: np.typing.NDArray[np.float_],
) -> np.typing.NDArray[np.float_]:

return ( # type: ignore[no-any-return]
np.abs(t - 0.25)
- 2 * np.abs(t - 0.5)
+ np.abs(t - 0.75)
)

X_0 = make_gaussian_process(
n_samples=n_samples // 2,
n_features=n_features,
cov=Exponential(length_scale=2),
random_state=0,
)
X_1 = make_gaussian_process(
n_samples=n_samples // 2,
n_features=n_features,
mean=mean_1,
cov=Exponential(length_scale=2),
random_state=1,
)
X = skfda.concatenate((X_0, X_1))

y = np.zeros(n_samples)
y[n_samples // 2:] = 1

correction = vs.recursive_maxima_hunting.GaussianSampleCorrection(
cov_estimator=ParametricGaussianCovariance(
cov=Exponential(),
),
)
stopping_condition = vs.recursive_maxima_hunting.ScoreThresholdStop(
threshold=0.05,
)

rmh = vs.RecursiveMaximaHunting(
correction=correction,
stopping_condition=stopping_condition,
)
rmh.fit(X, y)
point_mask = rmh.get_support()
points = X.grid_points[0][point_mask]
np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1)


if __name__ == '__main__':
unittest.main()
