Merge pull request #519 from GAA-UAM/feature/speedUpPenalization

Speed up the calculation of the penalization matrix for FDataGrid
GAA-UAM · Jun 24, 2023 · c40e372 · c40e372
2 parents 0154d69 + f69db8f
commit c40e372
Show file tree

Hide file tree

Showing 6 changed files with 315 additions and 45 deletions.
diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import numbers
-from typing import Callable, Sequence, Union
+from typing import Callable, Sequence, Tuple, Union
 
+import findiff
 import numpy as np
 import scipy.integrate
 from numpy import polyder, polyint, polymul, polyval
@@ -16,9 +17,10 @@
     CustomBasis,
     FourierBasis,
     MonomialBasis,
+    _GridBasis,
 )
 from ...typing._base import DomainRangeLike
-from ...typing._numpy import NDArrayFloat
+from ...typing._numpy import NDArrayFloat, NDArrayInt
 from ._operators import Operator, gram_matrix, gram_matrix_optimization
 
 Order = int
@@ -577,50 +579,199 @@ def bspline_penalty_matrix_optimized(
     return penalty_matrix
 
 
+def _optimized_operator_evaluation_in_grid(
+    linear_operator: LinearDifferentialOperator,
+    grid_points: NDArrayFloat,
+    findiff_accuracy: int,
+) -> Tuple[NDArrayFloat, NDArrayInt]:
+    """
+    Compute the linear operator applied to the delta basis of the grid.
+
+    The delta basis is the basis where the i-th basis function is 1 in the
+    i-th point and 0 in the rest of points.
+
+    Returns:
+    If the linear operator does not have constant weights, NotImplemented.
+    Otherwise, returns a tuple with:
+     - A matrix where the i-th row contains the values of the linear operator
+        applied to the i-th basis function of the delta basis.
+
+    - The domain limits of each basis function of the delta basis. Outside
+        these limits, the result of applying the linear operator to the
+        given basis function is zero.
+
+    Example:
+        >>> import numpy as np
+        >>> from skfda.misc.operators import LinearDifferentialOperator
+        >>> _optimized_operator_evaluation_in_grid(
+        ...     LinearDifferentialOperator(2),
+        ...     np.linspace(0, 7, 8),
+        ...     2,
+        ... )
+        (array([[ 2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
+        [-5., -2.,  1.,  0.,  0.,  0.,  0.,  0.],
+        [ 4.,  1., -2.,  1.,  0.,  0.,  0.,  0.],
+        [-1.,  0.,  1., -2.,  1.,  0.,  0.,  0.],
+        [ 0.,  0.,  0.,  1., -2.,  1.,  0., -1.],
+        [ 0.,  0.,  0.,  0.,  1., -2.,  1.,  4.],
+        [ 0.,  0.,  0.,  0.,  0.,  1., -2., -5.],
+        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  2.]]),
+        array([[0, 1],
+                [0, 2],
+                [0, 3],
+                [0, 4],
+                [3, 7],
+                [4, 7],
+                [5, 7],
+                [6, 7]]))
+
+    Explanation:
+        Each row of the first array contains the values of the linear operator
+        applied to the delta basis function.
+        As it can be seen, each row of the second array cointains the domain
+        limits of the corresponding row of the first array. That is to say,
+        the range in which their values are not 0.
+        For example, the third row of the first array is non-zero until the
+        fourth element, so the third row of the second array is [0, 3].
+    """
+    n_points = grid_points.shape[0]
+    diff_coefficients = np.zeros((n_points, n_points))
+
+    domain_limits = np.hstack(
+        [
+            np.ones((n_points, 1)) * n_points,
+            np.zeros((n_points, 1)),
+        ],
+    ).astype(int)
+
+    linear_operator_weights = linear_operator.constant_weights()
+    if linear_operator_weights is None:
+        return NotImplemented, NotImplemented
+
+    # The desired matrix can be obtained by appending the coefficients
+    # of the finite differences for each point column-wise.
+    for dif_order, w in enumerate(linear_operator_weights):
+        if w == 0:
+            continue
+        for point_index in range(n_points):
+            # Calculate the finite differences coefficients
+            # the point point_index. These coefficients express the
+            # contribution of a function in each of the points in the
+            # grid to the value of the operator applied to the function
+            # in the point point_index.
+            finite_diff = findiff.coefs.coefficients_non_uni(
+                deriv=dif_order,
+                acc=findiff_accuracy,
+                coords=grid_points,
+                idx=point_index,
+            )
+
+            # The offsets are the relative positions of the
+            # points for which the coefficients are being
+            # calculated
+            offsets = finite_diff["offsets"]
+            coefficients = finite_diff["coefficients"]
+
+            # Add the position of the point where the finite
+            # difference is calculated to get the absolute position
+            # of the points where each coefficient is applied
+            positions = offsets + point_index
+
+            # If the coefficients for calculating the value of
+            # the operator in this point (point_index)
+            # include a coefficient for the i-th point,
+            # then the i-th basis function contains the point point_index
+            # in its domain.
+            # (by definition that fuction is 1 in the i-th point).
+            domain_limits[positions, 0] = np.minimum(
+                domain_limits[positions, 0],
+                np.ones(positions.shape) * point_index,
+            )
+            domain_limits[positions, 1] = np.maximum(
+                domain_limits[positions, 1],
+                np.ones(positions.shape) * point_index,
+            )
+
+            w_eval = w(grid_points[point_index]) if callable(w) else w
+            diff_coefficients[positions, point_index] += w_eval * coefficients
+
+    return (diff_coefficients, domain_limits)
+
+
 @gram_matrix_optimization.register
-def fdatagrid_penalty_matrix_optimized(
+def gridbasis_penalty_matrix_optimized(
     linear_operator: LinearDifferentialOperator,
-    basis: FDataGrid,
+    basis: _GridBasis,
 ) -> NDArrayFloat:
     """
-    Optimized version for FDatagrid.
-
-    If the data_matrix of the FDataGrid is the identity, the resulting
-    matrix will be the gram matrix for functions discretized in the
-    given grid points. That is to say, in order to calculate the norm
-    of the linear operator applied to functions discretized in the given
-    grid points, it is enough to multiply the returned matrix by the
-    values in the points of the grid on both sides.
-
-    If the data_matrix of the FDataGrid is not the identity, the resulting
-    matrix will be the gram matrix for functions expressed as a combination
-    of the given set of functions (the entries of the fdatagrid). The
-    intended use is to calculate the norm of the linear operator for
-    functions expressed in a basis that is a FDataGrid (see CustomBasis).
-
-    Note that the first case is a particular case of the second one.
+    Optimized version for GridBasis.
+
+    It evaluates the operator in the grid points.
     """
-    evaluated_basis = sum(
-        w(basis.grid_points[0]) if callable(w) else w
-        * basis.derivative(order=i)(basis.grid_points[0])
-        for i, w in enumerate(linear_operator.weights)
+    n_points = basis.grid_points[0].shape[0]
+    findiff_accuracy = 2
+
+    evaluated_basis, domain_limits = _optimized_operator_evaluation_in_grid(
+        linear_operator,
+        basis.grid_points[0],
+        findiff_accuracy=findiff_accuracy,
     )
-    assert not isinstance(evaluated_basis, int)
 
-    indices = np.triu_indices(basis.n_samples)
-    product = evaluated_basis[indices[0]] * evaluated_basis[indices[1]]
+    # If the operator does not have constant weights, the
+    # optimized evaluation is not implemented.
+    if evaluated_basis is NotImplemented:
+        return NotImplemented
 
-    triang_vec = scipy.integrate.simps(product[..., 0], x=basis.grid_points)
+    # The support of the result of applying the linear operator
+    # is a small subset of the grid points. We calculate the
+    # maximum number of points to the left and right of the
+    # point where the linear operator is applied that can be
+    # different from zero.
 
-    matrix = np.empty((basis.n_samples, basis.n_samples))
+    max_domain_width = np.max(
+        np.abs(domain_limits[:, 0] - domain_limits[:, 1]),
+    )
 
-    # Set upper matrix
-    matrix[indices] = triang_vec
+    # Precompute the simpson quadrature for the entire domain.
+    # This is done to avoid computing it for each subinterval of integration.
+    quadrature = scipy.integrate.simpson(
+        np.identity(n_points),
+        basis.grid_points[0],
+    )
 
-    # Set lower matrix
-    matrix[(indices[1], indices[0])] = triang_vec
+    product_matrix = np.zeros((basis.n_basis, basis.n_basis))
+    for i in range(n_points):
+        # Maximum index of another function with not-disjoint support
+        max_no_disjoint_index = min(n_points, i + 2 * max_domain_width)
+
+        for j in range(i, max_no_disjoint_index):
+            if domain_limits[i, 1] < domain_limits[j, 0]:
+                continue
+
+            left_lim = min(
+                domain_limits[i, 0],
+                domain_limits[j, 0],
+            )
+            right_lim = max(
+                domain_limits[i, 1],
+                domain_limits[j, 1],
+            )
+
+            # The right limit of the interval is excluded when slicing
+            right_lim += 1
+
+            # Limit both the multiplication and the integration to
+            # the interval where the functions are not 0.
+            operator_product = (
+                evaluated_basis[i, left_lim:right_lim]
+                * evaluated_basis[j, left_lim:right_lim]
+            )
+            sub_quadrature = quadrature[left_lim:right_lim]
 
-    return matrix
+            product_matrix[i, j] = np.dot(operator_product, sub_quadrature)
+            product_matrix[j, i] = product_matrix[i, j]
+
+    return product_matrix
 
 
 @gram_matrix_optimization.register
@@ -644,6 +795,24 @@ def fdatabasis_penalty_matrix_optimized(
     return coef_matrix @ basis_pen_matrix @ coef_matrix.T
 
 
+@gram_matrix_optimization.register
+def fdatagrid_penalty_matrix_optimized(
+    linear_operator: LinearDifferentialOperator,
+    fdatagrid: FDataGrid,
+) -> NDArrayFloat:
+    """Optimized version for FDataGrid."""
+    from ...misc import inner_product_matrix
+
+    operator_evaluated = linear_operator(fdatagrid)(
+        fdatagrid.grid_points[0],
+    )
+    fdatagrid_evaluated = FDataGrid(
+        data_matrix=operator_evaluated[..., 0],
+        grid_points=fdatagrid.grid_points[0],
+    )
+    return inner_product_matrix(fdatagrid_evaluated)
+
+
 @gram_matrix_optimization.register
 def custombasis_penalty_matrix_optimized(
     linear_operator: LinearDifferentialOperator,

diff --git a/skfda/preprocessing/dim_reduction/_fpca.py b/skfda/preprocessing/dim_reduction/_fpca.py
@@ -12,7 +12,7 @@
 from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin
 from ...misc.regularization import L2Regularization, compute_penalty_matrix
 from ...representation import FData
-from ...representation.basis import Basis, FDataBasis
+from ...representation.basis import Basis, FDataBasis, _GridBasis
 from ...representation.grid import FDataGrid
 from ...typing._numpy import ArrayLike, NDArrayFloat
 
@@ -360,13 +360,8 @@ def _fit_grid(
 
         weights_matrix = np.diag(self._weights)
 
-        basis = FDataGrid(
-            data_matrix=np.identity(n_points_discretization),
-            grid_points=X.grid_points,
-        )
-
         regularization_matrix = compute_penalty_matrix(
-            basis_iterable=(basis,),
+            basis_iterable=(_GridBasis(grid_points=X.grid_points),),
             regularization_parameter=1,
             regularization=self.regularization,
         )

diff --git a/skfda/representation/basis/__init__.py b/skfda/representation/basis/__init__.py
@@ -13,6 +13,7 @@
         "_fdatabasis": ["FDataBasis", "FDataBasisDType"],
         "_finite_element_basis": ["FiniteElementBasis", "FiniteElement"],
         "_fourier_basis": ["FourierBasis", "Fourier"],
+        "_grid_basis": ["_GridBasis"],
         "_monomial_basis": ["MonomialBasis", "Monomial"],
         "_tensor_basis": ["TensorBasis", "Tensor"],
         "_vector_basis": ["VectorValuedBasis", "VectorValued"],
@@ -42,6 +43,7 @@
         Fourier as Fourier,
         FourierBasis as FourierBasis,
     )
+    from ._grid_basis import _GridBasis as _GridBasis
     from ._monomial_basis import (
         Monomial as Monomial,
         MonomialBasis as MonomialBasis,

diff --git a/skfda/representation/basis/_custom_basis.py b/skfda/representation/basis/_custom_basis.py
@@ -37,9 +37,8 @@ def __init__(
             domain_range=fdata.domain_range,
             n_basis=fdata.n_samples,
         )
-        self._check_linearly_independent(fdata)
-
         self.fdata = fdata
+        self._check_linearly_independent(fdata)
 
     @multimethod.multidispatch
     def _check_linearly_independent(self, fdata: FData) -> None:

diff --git a/skfda/representation/basis/_grid_basis.py b/skfda/representation/basis/_grid_basis.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from typing import Any, TypeVar
+
+from ..._utils import _to_grid_points
+from ...typing._base import GridPointsLike
+from ...typing._numpy import NDArrayFloat
+from ._basis import Basis
+
+T = TypeVar("T", bound="_GridBasis")
+
+
+class _GridBasis(Basis):
+    """
+    Basis associated to a grid of points.
+
+    Used to express functions whose values are known in a grid of points.
+    Each basis function is guaranteed to be zero in all points of the grid
+    except one, where it is one. The intermediate values are interpolated
+    depending on the interpolation object passed as argument.
+
+    This basis is used internally as an alternate representation of a
+    FDataGrid object. In certain cases, it is more convenient to work with
+    FDataGrid objects as a basis representation in this basis since, then,
+    all functional data can be represented as an FDataBasis object.
+
+    Parameters:
+        grid_points: Grid of points where the functions are evaluated.
+
+    """
+
+    def __init__(
+        self,
+        *,
+        grid_points: GridPointsLike,
+    ) -> None:
+        """Basis constructor."""
+        self.grid_points = _to_grid_points(grid_points)
+        domain_range = tuple((s[0], s[-1]) for s in self.grid_points)
+        super().__init__(
+            domain_range=domain_range,
+            n_basis=len(self.grid_points[0]),
+        )
+
+    def _evaluate(self, eval_points: NDArrayFloat) -> NDArrayFloat:
+        raise NotImplementedError(
+            "Evaluation is not implemented in this basis",
+        )
+
+    def __eq__(self, other: Any) -> bool:
+        return (
+            super().__eq__(other)
+            and self.grid_points == other.grid_points
+        )
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}(grid_points={self.grid_points}) "
+
+    def __hash__(self) -> int:
+        return hash(
+            (
+                super(),
+                (
+                    tuple(grid_point_axis)
+                    for grid_point_axis in self.grid_points
+                ),
+            ),
+        )