Merge pull request #466 from GAA-UAM/feature/FPCA_Regression
Feature/fpca_regression
Showing 7 changed files with 551 additions and 4 deletions.
@@ -0,0 +1,110 @@
"""
Functional Principal Component Analysis Regression.
===================================================

This example explores the use of functional principal component analysis
(FPCA) in regression problems.
"""

# Author: David del Val
# License: MIT

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split

import skfda
from skfda.ml.regression import FPCARegression

##############################################################################
# In this example, we will demonstrate the use of the FPCA regression method
# using the :func:`tecator <skfda.datasets.fetch_tecator>` dataset.
# This dataset contains 215 samples, each comprising a spectrum of
# absorbances and the contents of water, fat and protein.

X, y = skfda.datasets.fetch_tecator(return_X_y=True, as_frame=True)
X = X.iloc[:, 0].values
y = y["fat"].values
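
##############################################################################
# As a quick sanity check (an addition to the original example), we can
# confirm the dimensions mentioned above: 215 spectra, each observed on a
# common grid of points. This sketch assumes that ``X`` is an ``FDataGrid``,
# so it exposes ``n_samples`` and ``grid_points``.

print("Number of spectra:", X.n_samples)
print("Points per spectrum:", len(X.grid_points[0]))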

##############################################################################
# Our goal will be to estimate the fat percentage from the spectrum. However,
# in order to better understand the data, we will first plot all the spectra.
# The color of each curve depends on its fat content, from lowest (yellow) to
# highest (red).

X.plot(gradient_criteria=y, legend=True)
plt.show()

##############################################################################
# In order to evaluate the performance of the model, we will split the data
# into train and test sets. The former will contain 80% of the samples, while
# the latter will contain the remaining 20%.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

##############################################################################
# Since FPCA regression provides good results with a small number of
# components, we will start by using only 5 components. After training the
# model, we can check its performance on the test set.

reg = FPCARegression(n_components=5)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))
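
##############################################################################
# As a short aside (not in the original example), the fitted regressor
# exposes the FPCA attributes, so we can check how much of the variance of
# the spectra is retained by the 5 components.

print("Explained variance ratio:", reg.explained_variance_ratio_.sum())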

##############################################################################
# We have obtained a fairly good result considering that the model uses only
# 5 components. That is to say, the dimensionality of the problem has been
# reduced from 100 (each spectrum is observed at 100 points) to 5.
#
# However, we can improve the performance of the model by using more
# components. To do so, we will use cross-validation to find the best number
# of components, testing every value from 1 to 99.

param_grid = {"n_components": range(1, 100, 1)}
reg = FPCARegression()

# Perform grid search with cross-validation
gscv = GridSearchCV(reg, param_grid, cv=5)
gscv.fit(X_train, y_train)

print("Best params:", gscv.best_params_)
print("Best cross-validation score:", gscv.best_score_)

##############################################################################
# The best performance on the training set is obtained using 30 components.
# This still provides a good reduction in dimensionality. However, it is
# important to note that the performance of the model improves very slowly
# as the number of components grows.
#
# This phenomenon can be seen in the following plot, which confirms that
# FPCA already provides a good approximation of the data with a small number
# of components.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.bar(param_grid["n_components"], gscv.cv_results_["mean_test_score"])
ax.set_xticks(range(0, 100, 10))
ax.set_xlabel("Number of components")
ax.set_ylabel("Cross-validation score")
ax.set_ylim((0.5, 1))
plt.show()

##############################################################################
# To conclude, we can calculate the score of the model on the test set after
# it has been trained on the whole training set. As expected, the score is
# slightly higher than the one reported by cross-validation.
#
# Moreover, we can check that the score barely changes when we use a somewhat
# smaller number of components.

reg = FPCARegression(n_components=30)
reg.fit(X_train, y_train)
print("Score with 30 components:", reg.score(X_test, y_test))

reg = FPCARegression(n_components=15)
reg.fit(X_train, y_train)
print("Score with 15 components:", reg.score(X_test, y_test))
@@ -0,0 +1,152 @@
from __future__ import annotations

from typing import TypeVar

from sklearn.utils.validation import check_is_fitted

from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin
from ...misc.regularization import L2Regularization
from ...preprocessing.dim_reduction import FPCA
from ...representation import FData
from ...representation.basis import Basis, CustomBasis, FDataBasis
from ...typing._numpy import NDArrayFloat
from ._linear_regression import LinearRegression

FPCARegressionSelf = TypeVar("FPCARegressionSelf", bound="FPCARegression")


class FPCARegression(
    BaseEstimator,
    RegressorMixin,
):
    r"""Regression using Functional Principal Components Analysis.

    It performs Functional Principal Components Analysis to reduce the
    dimension of the functional data, and then uses a linear regression model
    to relate the transformed data to a scalar value.

    Args:
        n_components: Number of principal components to keep. Defaults to 5.
        fit\_intercept: If True, the linear model is calculated with an
            intercept. Defaults to ``True``.
        pca_regularization: Regularization parameter for the principal
            component extraction. If None then no regularization is applied.
            Defaults to ``None``.
        regression_regularization: Regularization parameter for the linear
            regression. If None then no regularization is applied.
            Defaults to ``None``.
        components_basis: Basis used for the principal components. If None
            then the basis of the input data is used. Defaults to None.
            It is only used if the input data is an FDataBasis object.

    Attributes:
        n\_components\_: Number of principal components used.
        components\_: Principal components.
        coef\_: Coefficients of the linear regression model.
        explained\_variance\_: Amount of variance explained by
            each of the selected components.
        explained\_variance\_ratio\_: Percentage of variance
            explained by each of the selected components.

    Examples:
        Using the Berkeley Growth Study dataset, we can fit the model.

        >>> import skfda
        >>> dataset = skfda.datasets.fetch_growth()
        >>> fd = dataset["data"]
        >>> y = dataset["target"]
        >>> reg = skfda.ml.regression.FPCARegression(n_components=2)
        >>> reg.fit(fd, y)
        FPCARegression(n_components=2)

        Then, we can predict the target values and calculate the score.

        >>> score = reg.score(fd, y)
        >>> reg.predict(fd)  # doctest:+ELLIPSIS
        array([...])
    """

    def __init__(
        self,
        n_components: int = 5,
        fit_intercept: bool = True,
        pca_regularization: L2Regularization | None = None,
        regression_regularization: L2Regularization | None = None,
        components_basis: Basis | None = None,
    ) -> None:
        self.n_components = n_components
        self.fit_intercept = fit_intercept
        self.pca_regularization = pca_regularization
        self.regression_regularization = regression_regularization
        self.components_basis = components_basis

    def fit(
        self,
        X: FData,
        y: NDArrayFloat,
    ) -> FPCARegressionSelf:
        """Fit the model according to the given training data.

        Args:
            X: Functional data.
            y: Target values.

        Returns:
            self
        """
        self._fpca = FPCA(
            n_components=self.n_components,
            centering=True,
            regularization=self.pca_regularization,
            components_basis=self.components_basis,
        )
        self._linear_model = LinearRegression(
            fit_intercept=self.fit_intercept,
            regularization=self.regression_regularization,
        )
        transformed_coefficients = self._fpca.fit_transform(X)

        # The linear model is fitted with the observations expressed in the
        # basis of the principal components.
        self.fpca_basis = CustomBasis(
            fdata=self._fpca.components_,
        )

        X_transformed = FDataBasis(
            basis=self.fpca_basis,
            coefficients=transformed_coefficients,
        )
        self._linear_model.fit(X_transformed, y)

        self.n_components_ = self.n_components
        self.components_ = self._fpca.components_
        self.coef_ = self._linear_model.coef_
        self.explained_variance_ = self._fpca.explained_variance_
        self.explained_variance_ratio_ = self._fpca.explained_variance_ratio_

        return self

    def predict(
        self,
        X: FData,
    ) -> NDArrayFloat:
        """Predict using the linear model.

        Args:
            X: Functional data.

        Returns:
            Target values.
        """
        check_is_fitted(self)

        # Express the new observations in the basis of the fitted principal
        # components before applying the linear model.
        X_transformed = FDataBasis(
            basis=self.fpca_basis,
            coefficients=self._fpca.transform(X),
        )

        return self._linear_model.predict(X_transformed)
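
Since ``FPCARegression`` builds on the project's scikit-learn adapter
(``BaseEstimator`` and ``RegressorMixin``), it plugs into standard
scikit-learn model-selection utilities, as the example above does with
``GridSearchCV``. Below is a minimal sketch of the same idea using
``cross_val_score``, reusing the Berkeley Growth data from the docstring
example (an illustration added here, not part of the diff):

import skfda
from sklearn.model_selection import cross_val_score
from skfda.ml.regression import FPCARegression

# Berkeley Growth Study data, as in the docstring example above.
dataset = skfda.datasets.fetch_growth()
fd, y = dataset["data"], dataset["target"]

# Five-fold cross-validation; this works because FPCARegression follows the
# scikit-learn estimator API (get_params/set_params, fit, predict, score).
scores = cross_val_score(FPCARegression(n_components=2), fd, y, cv=5)
print(scores.mean())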