
Commit

Merge pull request #466 from GAA-UAM/feature/FPCA_Regression
Feature/fpca_regression
vnmabus committed Mar 1, 2023
2 parents 48d1fae + 5ab79ca commit cc26782
Showing 7 changed files with 551 additions and 4 deletions.
13 changes: 13 additions & 0 deletions docs/modules/ml/regression.rst
@@ -45,3 +45,16 @@ non-parametric technique that uses :class:`~skfda.misc.hat_matrix.HatMatrix` obj
:toctree: autosummary

skfda.ml.regression.KernelRegression

FPCA regression
---------------

This module includes the implementation of FPCA (Functional Principal
Component Analysis) regression for FData with scalar response. FPCA
regression consists of two steps: first, the principal component basis is
created; then, a linear regression is fitted using the coefficients of the
functions in that basis.


.. autosummary::
:toctree: autosummary

skfda.ml.regression.FPCARegression
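
A minimal usage sketch follows (illustrative only: it uses the Berkeley Growth
dataset from :func:`skfda.datasets.fetch_growth`, and the parameter values are
arbitrary).

.. code-block:: python

    import skfda
    from skfda.ml.regression import FPCARegression

    dataset = skfda.datasets.fetch_growth()
    X, y = dataset["data"], dataset["target"]

    # Both steps happen inside fit: a principal component basis is built
    # from X, and a linear model is then fitted on the coefficients of each
    # observation in that basis.
    reg = FPCARegression(n_components=2)
    reg.fit(X, y)
    print(reg.predict(X)[:5])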
110 changes: 110 additions & 0 deletions examples/plot_fpca_regression.py
@@ -0,0 +1,110 @@
"""
Functional Principal Component Analysis Regression.
===================================================
This example explores the use of functional principal component analysis
(FPCA) in regression problems.
"""

# Author: David del Val
# License: MIT

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split

import skfda
from skfda.ml.regression import FPCARegression

##############################################################################
# In this example, we will demonstrate the use of the FPCA regression method
# using the :func:`tecator <skfda.datasets.fetch_tecator>` dataset.
# This dataset contains 215 samples. Each sample comprises a spectrum of
# absorbances and the contents of water, fat and protein.

X, y = skfda.datasets.fetch_tecator(return_X_y=True, as_frame=True)
X = X.iloc[:, 0].values
y = y["fat"].values

##############################################################################
# Our goal will be to estimate the fat percentage from the spectrum. However,
# in order to better understand the data, we will first plot all the spectrum
# curves. The color of each curve depends on the amount of fat, from lowest
# (yellow) to highest (red).

X.plot(gradient_criteria=y, legend=True)
plt.show()

##############################################################################
# In order to evaluate the performance of the model, we will split the data
# into train and test sets. The former will contain 80% of the samples, while
# the latter will contain the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
random_state=1,
)

##############################################################################
# Since FPCA regression provides good results with a small number of
# components, we will start by using only 5 of them. After training the
# model, we can check its performance on the test set.

reg = FPCARegression(n_components=5)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))

##############################################################################
# We have obtained a fairly good result considering that the model has used
# only 5 components. That is to say, the dimensionality of the problem has
# been reduced from 100 (each spectrum has 100 points) to 5.
#
# However, we can improve the performance of the model by using more
# components. To do so, we will use cross-validation to find the best number
# of components. We will test values from 1 to 99.

param_grid = {"n_components": range(1, 100, 1)}
reg = FPCARegression()

# Perform grid search with cross-validation
gscv = GridSearchCV(reg, param_grid, cv=5)
gscv.fit(X_train, y_train)


print("Best params:", gscv.best_params_)
print("Best cross-validation score:", gscv.best_score_)

##############################################################################
# The best performance for the train set is obtained using 30 components.
# This still provides a good reduction in dimensionality. However, it is
# important to note that the performance of the model improves only very
# slowly as the number of components increases.
#
# This phenomenon can be seen in the following plot, which confirms that
# FPCA already provides a good approximation of the data with
# a small number of components.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.bar(param_grid["n_components"], gscv.cv_results_["mean_test_score"])
ax.set_xticks(range(0, 100, 10))
ax.set_ylabel("Number of Components")
ax.set_xlabel("Cross-validation score")
ax.set_ylim((0.5, 1))

##############################################################################
# To conclude, we can calculate the score of the model on the test set after
# it has been trained on the whole train set. As expected, the score is
# slightly higher than the one reported by cross-validation.
#
# Moreover, we can check that the score barely changes when we use a somewhat
# smaller number of components.

reg = FPCARegression(n_components=30)
reg.fit(X_train, y_train)
print("Score with 30 components:", reg.score(X_test, y_test))

reg = FPCARegression(n_components=15)
reg.fit(X_train, y_train)
print("Score with 15 components:", reg.score(X_test, y_test))
54 changes: 51 additions & 3 deletions skfda/misc/operators/_linear_differential_operator.py
@@ -8,17 +8,18 @@
from numpy import polyder, polyint, polymul, polyval
from scipy.interpolate import PPoly

from ...representation import FData, FDataGrid
from ...representation import FData, FDataBasis, FDataGrid
from ...representation.basis import (
Basis,
BSplineBasis,
ConstantBasis,
CustomBasis,
FourierBasis,
MonomialBasis,
)
from ...typing._base import DomainRangeLike
from ...typing._numpy import NDArrayFloat
from ._operators import Operator, gram_matrix_optimization
from ._operators import Operator, gram_matrix, gram_matrix_optimization

Order = int

@@ -581,7 +582,24 @@ def fdatagrid_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
basis: FDataGrid,
) -> NDArrayFloat:
"""Optimized version for FDatagrid."""
"""
Optimized version for FDatagrid.
If the data_matrix of the FDataGrid is the identity, the resulting
matrix will be the gram matrix for functions discretized in the
given grid points. That is to say, in order to calculate the norm
of the linear operator applied to functions discretized in the given
grid points, it is enough to multiply the returned matrix by the
values in the points of the grid on both sides.
If the data_matrix of the FDataGrid is not the identity, the resulting
matrix will be the gram matrix for functions expressed as a combination
of the given set of functions (the entries of the fdatagrid). The
intended use is to calculate the norm of the linear operator for
functions expressed in a basis that is a FDataGrid (see CustomBasis).
Note that the first case is a particular case of the second one.
"""
evaluated_basis = sum(
w(basis.grid_points[0]) if callable(w) else w
* basis.derivative(order=i)(basis.grid_points[0])
@@ -603,3 +621,33 @@ def fdatagrid_penalty_matrix_optimized(
matrix[(indices[1], indices[0])] = triang_vec

return matrix


@gram_matrix_optimization.register
def fdatabasis_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
fdatabasis: FDataBasis,
) -> NDArrayFloat:
"""Optimized version for FDataBasis."""
# By calculating the gram matrix of the basis first, we can
# take advantage of the optimized implementations for
# several basis types.
# Then we can calculate the penalty matrix using the
# coefficients of the functions in the basis
# and the penalty matrix of the basis.
basis_pen_matrix = gram_matrix(
linear_operator,
fdatabasis.basis,
)

coef_matrix = fdatabasis.coefficients
return coef_matrix @ basis_pen_matrix @ coef_matrix.T


@gram_matrix_optimization.register
def custombasis_penalty_matrix_optimized(
linear_operator: LinearDifferentialOperator,
basis: CustomBasis,
) -> NDArrayFloat:
"""Optimized version for CustomBasis."""
return gram_matrix(linear_operator, basis.fdata)
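
The comment in fdatabasis_penalty_matrix_optimized describes the identity
these optimized registrations rely on: for observations expressed in a basis
with coefficient matrix C and basis penalty (gram) matrix G, the penalty
matrix of the dataset is C @ G @ C.T. The following is a small, illustrative
NumPy-only sketch of that identity; the matrices are random placeholders
rather than skfda objects.

import numpy as np

rng = np.random.default_rng(0)

# G plays the role of the penalty (gram) matrix of the underlying basis:
# symmetric and positive semi-definite.
A = rng.normal(size=(4, 4))
G = A @ A.T

# C holds the coefficients of 3 observations expressed in that 4-function
# basis, as in FDataBasis.coefficients.
C = rng.normal(size=(3, 4))

# This mirrors what fdatabasis_penalty_matrix_optimized returns: the matrix
# of pairwise inner products induced by the linear differential operator.
penalty = C @ G @ C.T

# The result is again symmetric, as a gram matrix should be.
assert np.allclose(penalty, penalty.T)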
3 changes: 2 additions & 1 deletion skfda/ml/regression/__init__.py
@@ -1,5 +1,4 @@
"""Regression."""

from typing import TYPE_CHECKING

import lazy_loader as lazy
@@ -10,6 +9,7 @@
"_historical_linear_model": ["HistoricalLinearRegression"],
"_kernel_regression": ["KernelRegression"],
"_linear_regression": ["LinearRegression"],
"_fpca_regression": ["FPCARegression"],
"_neighbors_regression": [
"KNeighborsRegressor",
"RadiusNeighborsRegressor",
@@ -18,6 +18,7 @@
)

if TYPE_CHECKING:
from ._fpca_regression import FPCARegression
from ._historical_linear_model import (
HistoricalLinearRegression as HistoricalLinearRegression,
)
152 changes: 152 additions & 0 deletions skfda/ml/regression/_fpca_regression.py
@@ -0,0 +1,152 @@
from __future__ import annotations

from typing import TypeVar

from sklearn.utils.validation import check_is_fitted

from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin
from ...misc.regularization import L2Regularization
from ...preprocessing.dim_reduction import FPCA
from ...representation import FData
from ...representation.basis import Basis, CustomBasis, FDataBasis
from ...typing._numpy import NDArrayFloat
from ._linear_regression import LinearRegression

FPCARegressionSelf = TypeVar("FPCARegressionSelf", bound="FPCARegression")


class FPCARegression(
BaseEstimator,
RegressorMixin,
):
r"""Regression using Functional Principal Components Analysis.
It performs Functional Principal Components Analysis to reduce the
dimension of the functional data, and then uses a linear regression model
to relate the transformed data to a scalar value.
Args:
n_components: Number of principal components to keep. Defaults to 5.
fit\_intercept: If True, the linear model is calculated with an
intercept. Defaults to ``True``.
pca_regularization: Regularization parameter for the principal
component extraction. If None then no regularization is applied.
Defaults to ``None``.
regression_regularization: Regularization parameter for the linear
regression. If None then no regularization is applied.
Defaults to ``None``.
components_basis: Basis used for the principal components. If None
then the basis of the input data is used. Defaults to None.
It is only used if the input data is a FDataBasis object.
Attributes:
n\_components\_: Number of principal components used.
components\_: Principal components.
coef\_: Coefficients of the linear regression model.
explained\_variance\_: Amount of variance explained by
each of the selected components.
explained\_variance\_ratio\_: Percentage of variance
explained by each of the selected components.
Examples:
Using the Berkeley Growth Study dataset, we can fit the model.
>>> import skfda
>>> dataset = skfda.datasets.fetch_growth()
>>> fd = dataset["data"]
>>> y = dataset["target"]
>>> reg = skfda.ml.regression.FPCARegression(n_components=2)
>>> reg.fit(fd, y)
FPCARegression(n_components=2)
Then, we can predict the target values and calculate the
score.
>>> score = reg.score(fd, y)
>>> reg.predict(fd) # doctest:+ELLIPSIS
array([...])
"""

def __init__(
self,
n_components: int = 5,
fit_intercept: bool = True,
pca_regularization: L2Regularization | None = None,
regression_regularization: L2Regularization | None = None,
components_basis: Basis | None = None,
) -> None:
self.n_components = n_components
self.fit_intercept = fit_intercept
self.pca_regularization = pca_regularization
self.regression_regularization = regression_regularization
self.components_basis = components_basis

def fit(
self,
X: FData,
y: NDArrayFloat,
) -> FPCARegressionSelf:
"""Fit the model according to the given training data.
Args:
X: Functional data.
y: Target values.
Returns:
self
"""
self._fpca = FPCA(
n_components=self.n_components,
centering=True,
regularization=self.pca_regularization,
components_basis=self.components_basis,
)
self._linear_model = LinearRegression(
fit_intercept=self.fit_intercept,
regularization=self.regression_regularization,
)
transformed_coefficients = self._fpca.fit_transform(X)

# The linear model is fitted with the observations expressed in the
# basis of the principal components.
self.fpca_basis = CustomBasis(
fdata=self._fpca.components_,
)

X_transformed = FDataBasis(
basis=self.fpca_basis,
coefficients=transformed_coefficients,
)
self._linear_model.fit(X_transformed, y)

self.n_components_ = self.n_components
self.components_ = self._fpca.components_
self.coef_ = self._linear_model.coef_
self.explained_variance_ = self._fpca.explained_variance_
self.explained_variance_ratio_ = self._fpca.explained_variance_ratio_

return self

def predict(
self,
X: FData,
) -> NDArrayFloat:
"""Predict using the linear model.
Args:
X: Functional data.
Returns:
Target values.
"""
check_is_fitted(self)

X_transformed = FDataBasis(
basis=self.fpca_basis,
coefficients=self._fpca.transform(X),
)

return self._linear_model.predict(X_transformed)
