Improve RMH correction with covariance estimated from data.
vnmabus committed Aug 15, 2023
1 parent 2295f15 commit 6a066f7
Showing 3 changed files with 105 additions and 63 deletions.
1 change: 1 addition & 0 deletions skfda/exploratory/stats/covariance/_parametric_gaussian.py
@@ -51,6 +51,7 @@ def fit(

regressor = GaussianProcessRegressor(kernel=cov)
regressor.fit(grid_points, data_matrix.T)
self.cov_ = regressor.kernel_

# TODO: Skip cov computation?
# TODO: Use a user-public structure to represent the covariance,
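The added line keeps the kernel whose hyperparameters were optimized by the internal GaussianProcessRegressor. A minimal sketch (not part of the diff) of inspecting the fitted estimator; ``cov_`` is the attribute added here and ``covariance_`` appears later in this diff, while the exact ``fit`` call signature and the sampling parameters are assumptions:

from skfda.datasets import make_gaussian_process
from skfda.exploratory.stats.covariance import ParametricGaussianCovariance
from skfda.misc.covariances import Exponential

# Simulate trajectories of a Gaussian process with exponential covariance.
X = make_gaussian_process(
    n_samples=50,
    n_features=100,
    cov=Exponential(length_scale=2),
    random_state=0,
)

estimator = ParametricGaussianCovariance(cov=Exponential())
estimator.fit(X)

# Kernel with data-optimized hyperparameters, stored by the added line.
print(estimator.cov_)
# Estimated covariance, consumed by GaussianSampleCorrection in this commit.
print(estimator.covariance_)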
101 changes: 38 additions & 63 deletions skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py
@@ -2,7 +2,6 @@
from __future__ import annotations

import abc
import copy
import math
from typing import (
TYPE_CHECKING,
@@ -23,9 +22,14 @@
import numpy.ma as ma
import scipy.stats
import sklearn.utils
from sklearn.base import clone
from typing_extensions import Literal

import dcor
from skfda.exploratory.stats.covariance import (
CovarianceEstimator,
EmpiricalCovariance,
)

from ...._utils._sklearn_adapter import (
BaseEstimator,
@@ -132,7 +136,7 @@ class Correction(BaseEstimator):
"""

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
def fit(self, X: FDataGrid, y: NDArrayFloat) -> None:
"""
Initialize the correction for a run.
@@ -239,8 +243,6 @@ class GaussianCorrection(ConditionalMeanCorrection):
Parameters:
mean: Mean function of the Gaussian process.
cov: Covariance function of the Gaussian process.
fit_hyperparameters: If ``True`` the hyperparameters of the
covariance function are optimized for the data.
"""

@@ -249,40 +251,11 @@ def __init__(
*,
mean: Union[float, Callable[[NDArrayFloat], NDArrayFloat]] = 0,
cov: Union[float, CovarianceLike] = 1,
fit_hyperparameters: bool = False,
) -> None:
super().__init__()

self.mean = mean
self.cov = make_kernel(cov)
self.fit_hyperparameters = fit_hyperparameters

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
if self.fit_hyperparameters:
# TODO: Migrate this to scikit-learn
import GPy

T = X.grid_points[0]
X_copy = np.copy(X.data_matrix[..., 0])

y = np.ravel(y)
for class_label in np.unique(y):
trajectories = X_copy[y == class_label, :]

mean = np.mean(trajectories, axis=0)
X_copy[y == class_label, :] -= mean

gpy_kernel = getattr(self.cov, "_PicklableKernel__kernel")

m = GPy.models.GPRegression(
T[:, None],
X_copy.T,
kernel=gpy_kernel,
)
m.constrain_positive('')
m.optimize()

self.cov_ = copy.deepcopy(make_kernel(m.kern))
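The GPy-dependent hyperparameter fitting removed above is superseded: optimizing the kernel hyperparameters is now the job of a covariance estimator passed to GaussianSampleCorrection. A roughly equivalent construction after this commit (a sketch, mirroring the new test at the end of this diff):

# Before (removed):
#     GaussianCorrection(cov=Exponential(), fit_hyperparameters=True)
# After: delegate hyperparameter optimization to a covariance estimator.
correction = GaussianSampleCorrection(
    cov_estimator=ParametricGaussianCovariance(cov=Exponential()),
)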

def _evaluate_mean(self, t: NDArrayFloat) -> NDArrayFloat:

@@ -501,40 +474,42 @@ class GaussianSampleCorrection(ConditionalMeanCorrection):
Correction assuming that the process is Gaussian, using as its kernel
a covariance estimated from the data.

Parameters:
cov_estimator: Covariance estimator to use.
"""

def begin(self, X: FDataGrid, y: NDArrayFloat) -> None:
def __init__(
self,
cov_estimator: CovarianceEstimator[FDataGrid] | None = None,
):
self.cov_estimator = cov_estimator

def fit(self, X: FDataGrid, y: NDArrayFloat) -> None:

X_copy = np.copy(X.data_matrix[..., 0])
self.cov_estimator_ = (
EmpiricalCovariance()
if self.cov_estimator is None
else clone(self.cov_estimator)
)

X_matrix_copy = np.copy(X.data_matrix[..., 0])

y = np.ravel(y)
for class_label in np.unique(y):
trajectories = X_copy[y == class_label, :]
class_index = (y == class_label)
trajectories = X_matrix_copy[class_index]

mean = np.mean(trajectories, axis=0)
X_copy[y == class_label, :] -= mean
X_matrix_copy[class_index] -= mean

X_copy = X.copy(data_matrix=X_matrix_copy)

self.cov_matrix_ = np.cov(X_copy, rowvar=False)
self.t_ = np.ravel(X.grid_points)
self.cov_estimator_.fit(X_copy)
self.gaussian_correction_ = GaussianCorrection(
cov=self.cov_fun,
cov=self.cov_estimator_.covariance_,
)
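Usage implied by the new ``fit``: with no argument the sample covariance is kept as the default (``EmpiricalCovariance``), while a user-supplied estimator is cloned, fitted on the per-class centered trajectories, and its ``covariance_`` handed directly to GaussianCorrection. This replaces the ``cov_fun`` lookup removed below, which indexed a precomputed covariance matrix via ``searchsorted``. A sketch:

# Default: sample covariance, as before this commit.
correction = GaussianSampleCorrection()

# New: any covariance estimator, e.g. a parametric one.
correction = GaussianSampleCorrection(
    cov_estimator=ParametricGaussianCovariance(cov=Exponential()),
)
correction.fit(X, y)  # centers each class, then fits the cloned estimator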

def cov_fun(
self,
t_0: ArrayLike,
t_1: ArrayLike,
) -> NDArrayFloat:
i = np.searchsorted(self.t_, t_0)
j = np.searchsorted(self.t_, t_1)

i_r = np.ravel(i)
j_r = np.ravel(j)

return self.cov_matrix_[ # type: ignore[no-any-return]
np.ix_(i_r, j_r)
]

def conditioned(
self,
*,
@@ -977,10 +952,10 @@ def fit( # type: ignore[override] # noqa: D102

y = np.asfarray(y)

correction = (
self.correction
if self.correction
else UniformCorrection()
self.correction_ = (
UniformCorrection()
if self.correction is None
else clone(self.correction)
)

redundancy_condition = (
@@ -1007,7 +982,7 @@ def fit( # type: ignore[override] # noqa: D102
relevances = []
first_pass = True

correction.begin(X, y)
self.correction_.fit(X, y)

while True:
dependences = _compute_dependence(
@@ -1060,12 +1035,12 @@ def fit( # type: ignore[override] # noqa: D102
mask |= influence_mask

# Correct the influence of t_max
X = correction(
X = self.correction_(
X=X,
selected_index=t_max_index,
)

correction = correction.conditioned(
self.correction_ = self.correction_.conditioned(
X=X.data_matrix,
T=X.grid_points[0],
t_0=X.grid_points[0][t_max_index],
@@ -1087,7 +1062,7 @@ def transform(self, X: FDataGrid) -> NDArrayFloat:

output = X_matrix[(slice(None),) + self.indexes_]

return output.reshape( # type: ignore[no-any-return]
return output.reshape(
X.n_samples,
-1,
)
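Taken together, the changes in this file align Correction with scikit-learn conventions: ``begin`` becomes ``fit``, the user-provided correction is cloned instead of mutated, and the fitted object is kept in a trailing-underscore attribute. The lifecycle inside RecursiveMaximaHunting.fit, condensed from the hunks above:

from sklearn.base import clone

correction_ = (
    UniformCorrection() if correction is None else clone(correction)
)
correction_.fit(X, y)  # was: correction.begin(X, y)

# For each selected point t_max inside the selection loop:
X = correction_(X=X, selected_index=t_max_index)
correction_ = correction_.conditioned(
    X=X.data_matrix,
    T=X.grid_points[0],
    t_0=X.grid_points[0][t_max_index],
)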
66 changes: 66 additions & 0 deletions skfda/tests/test_recursive_maxima_hunting.py
@@ -2,9 +2,12 @@
import unittest

import numpy as np
import pytest

import skfda
from skfda.datasets import make_gaussian_process
from skfda.exploratory.stats.covariance import ParametricGaussianCovariance
from skfda.misc.covariances import Exponential
from skfda.preprocessing.dim_reduction import variable_selection as vs


@@ -65,6 +68,69 @@ def mean_1( # noqa: WPS430
points = X.grid_points[0][point_mask]
np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1)

@pytest.mark.filterwarnings(
'ignore::sklearn.exceptions.ConvergenceWarning'
)
def test_fit_exponential(self) -> None:
"""
Test RMH with a covariance estimated parametrically from the data.

It tests a case with two homoscedastic Gaussian processes with
exponential covariance where the difference of means is piecewise
linear.

In this case RMH should return the points where the linear parts
join.
"""
n_samples = 10000
n_features = 100

def mean_1( # noqa: WPS430
t: np.typing.NDArray[np.float_],
) -> np.typing.NDArray[np.float_]:

return ( # type: ignore[no-any-return]
np.abs(t - 0.25)
- 2 * np.abs(t - 0.5)
+ np.abs(t - 0.75)
)

X_0 = make_gaussian_process(
n_samples=n_samples // 2,
n_features=n_features,
cov=Exponential(length_scale=2),
random_state=0,
)
X_1 = make_gaussian_process(
n_samples=n_samples // 2,
n_features=n_features,
mean=mean_1,
cov=Exponential(length_scale=2),
random_state=1,
)
X = skfda.concatenate((X_0, X_1))

y = np.zeros(n_samples)
y[n_samples // 2:] = 1

correction = vs.recursive_maxima_hunting.GaussianSampleCorrection(
cov_estimator=ParametricGaussianCovariance(
cov=Exponential(),
),
)
stopping_condition = vs.recursive_maxima_hunting.ScoreThresholdStop(
threshold=0.05,
)

rmh = vs.RecursiveMaximaHunting(
correction=correction,
stopping_condition=stopping_condition,
)
rmh.fit(X, y)
point_mask = rmh.get_support()
points = X.grid_points[0][point_mask]
np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1)


if __name__ == '__main__':
unittest.main()
