
Commit

Merge pull request #466 from GAA-UAM/feature/FPCA_Regression
Feature/fpca_regression
vnmabus committed Mar 1, 2023
2 parents 48d1fae + 5ab79ca commit cc26782
Showing 7 changed files with 551 additions and 4 deletions.
13 changes: 13 additions & 0 deletions docs/modules/ml/regression.rst
@@ -45,3 +45,16 @@ non-parametric technique that uses :class:`~skfda.misc.hat_matrix.HatMatrix` obj
:toctree: autosummary

skfda.ml.regression.KernelRegression

FPCA regression
---------------

This module includes the implementation of FPCA (Functional Principal
Component Analysis) regression for FData with scalar response. FPCA
regression consists of two steps: first, the principal component basis is
created; then, a linear regression is fitted using the coefficients of the
functions in that basis.


.. autosummary::
:toctree: autosummary

skfda.ml.regression.FPCARegression
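
A minimal usage sketch follows (illustrative only: it uses the Berkeley Growth
dataset from :func:`skfda.datasets.fetch_growth`, and the parameter values are
arbitrary).

.. code-block:: python

    import skfda
    from skfda.ml.regression import FPCARegression

    dataset = skfda.datasets.fetch_growth()
    X, y = dataset["data"], dataset["target"]

    # Both steps happen inside fit: a principal component basis is built
    # from X, and a linear model is then fitted on the coefficients of each
    # observation in that basis.
    reg = FPCARegression(n_components=2)
    reg.fit(X, y)
    print(reg.predict(X)[:5])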
110 changes: 110 additions & 0 deletions examples/plot_fpca_regression.py
@@ -0,0 +1,110 @@
"""
Functional Principal Component Analysis Regression.
===================================================
This example explores the use of functional principal component analysis
(FPCA) in regression problems.
"""

# Author: David del Val
# License: MIT

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split

import skfda
from skfda.ml.regression import FPCARegression

##############################################################################
# In this example, we will demonstrate the use of the FPCA regression method
# using the :func:`tecator <skfda.datasets.fetch_tecator>` dataset.
# This dataset contains 215 samples. Each sample comprises a spectrum of
# absorbances and the contents of water, fat and protein.

X, y = skfda.datasets.fetch_tecator(return_X_y=True, as_frame=True)
X = X.iloc[:, 0].values
y = y["fat"].values

##############################################################################
# Our goal will be to estimate the fat percentage from the spectrum. However,
# in order to better understand the data, we will first plot all the spectrum
# curves. The color of each curve depends on the amount of fat, from lowest
# (yellow) to highest (red).

X.plot(gradient_criteria=y, legend=True)
plt.show()

##############################################################################
# In order to evaluate the performance of the model, we will split the data
# into train and test sets. The former will contain 80% of the samples, while
# the latter will contain the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
random_state=1,
)

##############################################################################
# Since FPCA regression provides good results with a small number of
# components, we will start by using only 5 of them. After training the
# model, we can check its performance on the test set.

reg = FPCARegression(n_components=5)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))

##############################################################################
# We have obtained a fairly good result considering that the model has used
# only 5 components. That is to say, the dimensionality of the problem has
# been reduced from 100 (each spectrum has 100 points) to 5.
#
# However, we can improve the performance of the model by using more
# components. To do so, we will use cross-validation to find the best number
# of components. We will test values from 1 to 99.

param_grid = {"n_components": range(1, 100, 1)}
reg = FPCARegression()

# Perform grid search with cross-validation
gscv = GridSearchCV(reg, param_grid, cv=5)
gscv.fit(X_train, y_train)


print("Best params:", gscv.best_params_)
print("Best cross-validation score:", gscv.best_score_)

##############################################################################
# The best performance for the train set is obtained using 30 components.
# This still provides a good reduction in dimensionality. However, it is
# important to note that the performance of the model improves only very
# slowly as the number of components increases.
#
# This phenomenon can be seen in the following plot, which confirms that
# FPCA already provides a good approximation of the data with
# a small number of components.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.bar(param_grid["n_components"], gscv.cv_results_["mean_test_score"])
ax.set_xticks(range(0, 100, 10))
ax.set_ylabel("Number of Components")
ax.set_xlabel("Cross-validation score")
ax.set_ylim((0.5, 1))

##############################################################################
# To conclude, we can calculate the score of the model on the test set after
# it has been trained on the whole train set. As expected, the score is
# slightly higher than the one reported by cross-validation.
#
# Moreover, we can check that the score barely changes when we use a somewhat
# smaller number of components.

reg = FPCARegression(n_components=30)
reg.fit(X_train, y_train)
print("Score with 30 components:", reg.score(X_test, y_test))

reg = FPCARegression(n_components=15)
reg.fit(X_train, y_train)
print("Score with 15 components:", reg.score(X_test, y_test))
54 changes: 51 additions & 3 deletions skfda/misc/operators/_linear_differential_operator.py
@@ -8,17 +8,18 @@
from numpy import polyder, polyint, polymul, polyval
from scipy.interpolate import PPoly

from ...representation import FData, FDataGrid
from ...representation import FData, FDataBasis, FDataGrid
from ...representation.basis import (
Basis,
BSplineBasis,
ConstantBasis,
CustomBasis,
FourierBasis,
MonomialBasis,
)
from ...typing._base import DomainRangeLike
from ...typing._numpy import NDArrayFloat
from ._operators import Operator, gram_matrix_optimization
from ._operators import Operator, gram_matrix, gram_matrix_optimization

Order = int

@@ -581,7 +582,24 @@ def fdatagrid_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
basis: FDataGrid,
) -> NDArrayFloat:
"""Optimized version for FDatagrid."""
"""
Optimized version for FDatagrid.
If the data_matrix of the FDataGrid is the identity, the resulting
matrix will be the gram matrix for functions discretized in the
given grid points. That is to say, in order to calculate the norm
of the linear operator applied to functions discretized in the given
grid points, it is enough to multiply the returned matrix by the
values in the points of the grid on both sides.
If the data_matrix of the FDataGrid is not the identity, the resulting
matrix will be the gram matrix for functions expressed as a combination
of the given set of functions (the entries of the fdatagrid). The
intended use is to calculate the norm of the linear operator for
functions expressed in a basis that is a FDataGrid (see CustomBasis).
Note that the first case is a particular case of the second one.
"""
evaluated_basis = sum(
w(basis.grid_points[0]) if callable(w) else w
* basis.derivative(order=i)(basis.grid_points[0])
@@ -603,3 +621,33 @@ def fdatagrid_penalty_matrix_optimized(
matrix[(indices[1], indices[0])] = triang_vec

return matrix


@gram_matrix_optimization.register
def fdatabasis_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
fdatabasis: FDataBasis,
) -> NDArrayFloat:
"""Optimized version for FDataBasis."""
# By calculating the gram matrix of the basis first, we can
# take advantage of the optimized implementations for
# several basis types.
# Then we can calculate the penalty matrix using the
# coefficients of the functions in the basis
# and the penalty matrix of the basis.
basis_pen_matrix = gram_matrix(
linear_operator,
fdatabasis.basis,
)

coef_matrix = fdatabasis.coefficients
return coef_matrix @ basis_pen_matrix @ coef_matrix.T


@gram_matrix_optimization.register
def custombasis_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
basis: CustomBasis,
) -> NDArrayFloat:
"""Optimized version for CustomBasis."""
return gram_matrix(linear_operator, basis.fdata)
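
The comment in fdatabasis_penalty_matrix_optimized describes the identity
these optimized registrations rely on: for observations expressed in a basis
with coefficient matrix C and basis penalty (gram) matrix G, the penalty
matrix of the dataset is C @ G @ C.T. The following is a small, illustrative
NumPy-only sketch of that identity; the matrices are random placeholders
rather than skfda objects.

import numpy as np

rng = np.random.default_rng(0)

# G plays the role of the penalty (gram) matrix of the underlying basis:
# symmetric and positive semi-definite.
A = rng.normal(size=(4, 4))
G = A @ A.T

# C holds the coefficients of 3 observations expressed in that 4-function
# basis, as in FDataBasis.coefficients.
C = rng.normal(size=(3, 4))

# This mirrors what fdatabasis_penalty_matrix_optimized returns: the matrix
# of pairwise inner products induced by the linear differential operator.
penalty = C @ G @ C.T

# The result is again symmetric, as a gram matrix should be.
assert np.allclose(penalty, penalty.T)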
3 changes: 2 additions & 1 deletion skfda/ml/regression/__init__.py
@@ -1,5 +1,4 @@
"""Regression."""

from typing import TYPE_CHECKING

import lazy_loader as lazy
@@ -10,6 +9,7 @@
"_historical_linear_model": ["HistoricalLinearRegression"],
"_kernel_regression": ["KernelRegression"],
"_linear_regression": ["LinearRegression"],
"_fpca_regression": ["FPCARegression"],
"_neighbors_regression": [
"KNeighborsRegressor",
"RadiusNeighborsRegressor",
@@ -18,6 +18,7 @@
)

if TYPE_CHECKING:
from ._fpca_regression import FPCARegression
from ._historical_linear_model import (
HistoricalLinearRegression as HistoricalLinearRegression,
)
152 changes: 152 additions & 0 deletions skfda/ml/regression/_fpca_regression.py
@@ -0,0 +1,152 @@
from __future__ import annotations

from typing import TypeVar

from sklearn.utils.validation import check_is_fitted

from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin
from ...misc.regularization import L2Regularization
from ...preprocessing.dim_reduction import FPCA
from ...representation import FData
from ...representation.basis import Basis, CustomBasis, FDataBasis
from ...typing._numpy import NDArrayFloat
from ._linear_regression import LinearRegression

FPCARegressionSelf = TypeVar("FPCARegressionSelf", bound="FPCARegression")


class FPCARegression(
BaseEstimator,
RegressorMixin,
):
r"""Regression using Functional Principal Components Analysis.
It performs Functional Principal Components Analysis to reduce the
dimension of the functional data, and then uses a linear regression model
to relate the transformed data to a scalar value.
Args:
n_components: Number of principal components to keep. Defaults to 5.
fit\_intercept: If True, the linear model is calculated with an
intercept. Defaults to ``True``.
pca_regularization: Regularization parameter for the principal
component extraction. If None then no regularization is applied.
Defaults to ``None``.
regression_regularization: Regularization parameter for the linear
regression. If None then no regularization is applied.
Defaults to ``None``.
components_basis: Basis used for the principal components. If None
then the basis of the input data is used. Defaults to None.
It is only used if the input data is a FDataBasis object.
Attributes:
n\_components\_: Number of principal components used.
components\_: Principal components.
coef\_: Coefficients of the linear regression model.
explained\_variance\_: Amount of variance explained by
each of the selected components.
explained\_variance\_ratio\_: Percentage of variance
explained by each of the selected components.
Examples:
Using the Berkeley Growth Study dataset, we can fit the model.
>>> import skfda
>>> dataset = skfda.datasets.fetch_growth()
>>> fd = dataset["data"]
>>> y = dataset["target"]
>>> reg = skfda.ml.regression.FPCARegression(n_components=2)
>>> reg.fit(fd, y)
FPCARegression(n_components=2)
Then, we can predict the target values and calculate the
score.
>>> score = reg.score(fd, y)
>>> reg.predict(fd) # doctest:+ELLIPSIS
array([...])
"""

def __init__(
self,
n_components: int = 5,
fit_intercept: bool = True,
pca_regularization: L2Regularization | None = None,
regression_regularization: L2Regularization | None = None,
components_basis: Basis | None = None,
) -> None:
self.n_components = n_components
self.fit_intercept = fit_intercept
self.pca_regularization = pca_regularization
self.regression_regularization = regression_regularization
self.components_basis = components_basis

def fit(
self,
X: FData,
y: NDArrayFloat,
) -> FPCARegressionSelf:
"""Fit the model according to the given training data.
Args:
X: Functional data.
y: Target values.
Returns:
self
"""
self._fpca = FPCA(
n_components=self.n_components,
centering=True,
regularization=self.pca_regularization,
components_basis=self.components_basis,
)
self._linear_model = LinearRegression(
fit_intercept=self.fit_intercept,
regularization=self.regression_regularization,
)
transformed_coefficients = self._fpca.fit_transform(X)

# The linear model is fitted with the observations expressed in the
# basis of the principal components.
self.fpca_basis = CustomBasis(
fdata=self._fpca.components_,
)

X_transformed = FDataBasis(
basis=self.fpca_basis,
coefficients=transformed_coefficients,
)
self._linear_model.fit(X_transformed, y)

self.n_components_ = self.n_components
self.components_ = self._fpca.components_
self.coef_ = self._linear_model.coef_
self.explained_variance_ = self._fpca.explained_variance_
self.explained_variance_ratio_ = self._fpca.explained_variance_ratio_

return self

def predict(
self,
X: FData,
) -> NDArrayFloat:
"""Predict using the linear model.
Args:
X: Functional data.
Returns:
Target values.
"""
check_is_fitted(self)

X_transformed = FDataBasis(
basis=self.fpca_basis,
coefficients=self._fpca.transform(X),
)

return self._linear_model.predict(X_transformed)
