Merge pull request #533 from GAA-UAM/feature/missing_data

Add missing data imputation via interpolation
GAA-UAM · Apr 16, 2023 · 6a39d6a · 6a39d6a
2 parents 4865912 + 5760d9f
commit 6a39d6a
Show file tree

Hide file tree

Showing 5 changed files with 205 additions and 1 deletion.
diff --git a/docs/modules/preprocessing.rst b/docs/modules/preprocessing.rst
@@ -10,11 +10,19 @@ this category deal with this problem.
    :caption: Modules:
    :hidden:
 
+   preprocessing/missing
    preprocessing/smoothing
    preprocessing/registration
    preprocessing/dim_reduction
    preprocessing/feature_construction
 
+Missing data
+------------
+
+When the observations contain missing data, it is necessary to reconstruct
+the invalid information before processing it further.
+:doc:`Here <preprocessing/missing>` you can learn more about this procedure.
+
 Smoothing
 ---------
 

diff --git a/docs/modules/preprocessing/missing.rst b/docs/modules/preprocessing/missing.rst
@@ -0,0 +1,12 @@
+Missing data
+============
+
+Sometimes data contains invalid or missing values.
+Before doing any kind of processing, the invalid points in the functions
+should be replaced with valid values.
+This module deals with that kind of procedure.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.preprocessing.missing.MissingValuesInterpolation
diff --git a/skfda/preprocessing/__init__.py b/skfda/preprocessing/__init__.py
@@ -5,9 +5,10 @@
 __getattr__, __dir__, __all__ = lazy.attach(
     __name__,
     submodules=[
+        "dim_reduction",
         "feature_construction",
+        "missing",
         "registration",
         "smoothing",
-        "dim_reduction",
     ],
 )
diff --git a/skfda/preprocessing/missing/__init__.py b/skfda/preprocessing/missing/__init__.py
@@ -0,0 +1,19 @@
+"""Imputation of missing values."""
+from __future__ import annotations
+
+import importlib
+from typing import TYPE_CHECKING, Any
+
+import lazy_loader as lazy
+
+# Expose the public names lazily: ``lazy.attach`` builds the module-level
+# ``__getattr__``/``__dir__``/``__all__`` so that ``_interpolate`` is only
+# imported when ``MissingValuesInterpolation`` is first requested.
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__,
+    submod_attrs={
+        "_interpolate": ["MissingValuesInterpolation"],
+    },
+)
+
+# Static type checkers do not execute ``lazy.attach``, so the name is also
+# imported explicitly for them. The redundant ``X as X`` alias marks the
+# import as an intentional re-export.
+if TYPE_CHECKING:
+    from ._interpolate import (
+        MissingValuesInterpolation as MissingValuesInterpolation
+    )
diff --git a/skfda/preprocessing/missing/_interpolate.py b/skfda/preprocessing/missing/_interpolate.py
@@ -0,0 +1,164 @@
+from typing import Any, TypeVar
+
+import numpy as np
+from scipy.interpolate import InterpolatedUnivariateSpline
+from scipy.interpolate.interpnd import LinearNDInterpolator
+
+from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin
+from ...representation import FDataGrid
+from ...typing._base import GridPoints
+from ...typing._numpy import NDArrayFloat, NDArrayInt
+
+T = TypeVar("T", bound=FDataGrid)
+
+
+def _coords_from_indices(
+    coord_indices: NDArrayInt,
+    grid_points: GridPoints,
+) -> NDArrayFloat:
+    return np.stack([
+        grid_points[i][coord_index]
+        for i, coord_index in enumerate(coord_indices.T)
+    ]).T
+
+
+def _interpolate_nans(
+    fdatagrid: T,
+) -> T:
+
+    data_matrix = fdatagrid.data_matrix.copy()
+
+    for n_sample in range(fdatagrid.n_samples):
+        for n_coord in range(fdatagrid.dim_codomain):
+
+            data_points = data_matrix[n_sample, ..., n_coord]
+            nan_pos = np.isnan(data_points)
+            valid_pos = ~nan_pos
+            coord_indices = np.argwhere(valid_pos)
+            desired_coord_indices = np.argwhere(nan_pos)
+            coords = _coords_from_indices(
+                coord_indices,
+                fdatagrid.grid_points,
+            )
+            desired_coords = _coords_from_indices(
+                desired_coord_indices,
+                fdatagrid.grid_points,
+            )
+            values = data_points[valid_pos]
+
+            if fdatagrid.dim_domain == 1:
+                interpolation = InterpolatedUnivariateSpline(
+                    coords,
+                    values,
+                    k=1,
+                    ext=3,
+                )
+            else:
+                interpolation = LinearNDInterpolator(
+                    coords,
+                    values,
+                )
+
+            new_values = interpolation(
+                desired_coords,
+            )
+
+            data_matrix[n_sample, nan_pos, n_coord] = new_values.ravel()
+
+    return fdatagrid.copy(data_matrix=data_matrix)
+
+
+class MissingValuesInterpolation(
+    BaseEstimator,
+    InductiveTransformerMixin[T, T, Any],
+):
+    """
+    Class to interpolate missing values.
+
+    Missing values are represented as NaNs.
+    They are interpolated from nearby values with valid data.
+    Note that this may be a poor choice if there are large contiguous portions
+    of the function with missing values, as some of them would be inferred from
+    very far away points.
+
+    Examples:
+        It is possible to interpolate NaNs in scalar-valued univariate
+        functions:
+
+        >>> from skfda import FDataGrid
+        >>> from skfda.preprocessing.missing import MissingValuesInterpolation
+        >>> import numpy as np
+
+        >>> X = FDataGrid([
+        ...     [1, 2, np.nan, 4],
+        ...     [5, np.nan, 7, 8],
+        ...     [9, 10, np.nan, 12],
+        ... ])
+        >>> nan_interp = MissingValuesInterpolation()
+        >>> X_transformed = nan_interp.fit_transform(X)
+        >>> X_transformed.data_matrix[..., 0]
+        array([[ 1.,  2.,  3.,  4.],
+               [ 5.,  6.,  7.,  8.],
+               [ 9., 10., 11., 12.]])
+
+        For vector-valued functions each coordinate is interpolated
+        independently:
+
+        >>> X = FDataGrid(
+        ...     [
+        ...         [
+        ...             (1, 5),
+        ...             (2, np.nan),
+        ...             (np.nan, 7),
+        ...             (4, 8),
+        ...         ],
+        ...         [
+        ...             (9, 13),
+        ...             (10, np.nan),
+        ...             (np.nan, np.nan),
+        ...             (12, 16),
+        ...         ],
+        ...     ],
+        ...     grid_points=np.linspace(0, 1, 4)
+        ... )
+        >>> nan_interp = MissingValuesInterpolation()
+        >>> X_transformed = nan_interp.fit_transform(X)
+        >>> X_transformed.data_matrix # doctest: +NORMALIZE_WHITESPACE
+        array([[[  1.,  5.],
+                [  2.,  6.],
+                [  3.,  7.],
+                [  4.,  8.]],
+               [[  9., 13.],
+                [ 10., 14.],
+                [ 11., 15.],
+                [ 12., 16.]]])
+
+        For multivariate functions, such as surfaces, all dimensions are
+        considered. This is currently done using
+        :external:class:`~scipy.interpolate.LinearNDInterpolator`, which
+        triangulates the space and performs linear barycentric interpolation:
+
+        >>> X = FDataGrid(
+        ...     [
+        ...         [
+        ...             [1, 2, 3, 4],
+        ...             [5, np.nan, 7, 8],
+        ...             [10, 10, np.nan, 10],
+        ...             [13, 14, 15, 16],
+        ...         ],
+        ...     ],
+        ...     grid_points=(np.linspace(0, 1, 4), np.linspace(0, 1, 4))
+        ... )
+        >>> nan_interp = MissingValuesInterpolation()
+        >>> X_transformed = nan_interp.fit_transform(X)
+        >>> X_transformed.data_matrix[..., 0]
+        array([[[  1.,   2.,   3.,   4.],
+                [  5.,   6.,   7.,   8.],
+                [ 10.,  10.,  11.,  10.],
+                [ 13.,  14.,  15.,  16.]]])
+    """
+
+    def transform(
+        self,
+        X: T,
+    ) -> T:
+        """
+        Interpolate the NaN values of the data.
+
+        Args:
+            X: Data that may contain NaN values.
+
+        Returns:
+            Result of applying the NaN interpolation to ``X``.
+
+        """
+        return _interpolate_nans(X)