Merge pull request #536 from GAA-UAM/feature/irregular_operations

Feature/irregular_structure_and_operations
GAA-UAM · Mar 11, 2024 · 6b11674 · 6b11674
2 parents adb80fe + 21f7bad
commit 6b11674
Show file tree

Hide file tree

Showing 18 changed files with 3,500 additions and 40 deletions.
diff --git a/docs/modules/representation.rst b/docs/modules/representation.rst
@@ -85,6 +85,29 @@ methods.
 
    skfda.representation.basis.Basis
 
+
+Irregular representation
+------------------------
+
+In practice, many functional datasets do not contain functions evaluated
+uniformly over a fixed grid. It is therefore paramount to be able
+to represent irregular functional data.
+
+While the FDataGrid class could support this kind of dataset by filling a
+common grid with possibly empty (or NaN) values, it is inefficient to store a
+complete grid with low data density. Furthermore, there are specific methods
+that can be applied to irregular data in order to obtain, among other things,
+a better conversion to basis representation.
+
+The FDataIrregular class provides the functionality which suits these purposes.
+
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.representation.irregular.FDataIrregular
+
+
 Generic representation
 ----------------------
 

diff --git a/skfda/__init__.py b/skfda/__init__.py
@@ -17,7 +17,9 @@
         "representation",
     ],
     submod_attrs={
-        'representation': ["FData", "FDataBasis", "FDataGrid"],
+        'representation': [
+            "FData", "FDataBasis", "FDataGrid", "FDataIrregular",
+        ],
         'representation._functional_data': ['concatenate'],
     },
 )

diff --git a/skfda/datasets/__init__.py b/skfda/datasets/__init__.py
@@ -20,6 +20,7 @@
             "fetch_tecator",
             "fetch_ucr",
             "fetch_weather",
+            "fetch_bone_density",
         ],
         "_samples_generators": [
             "make_gaussian",

diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py
@@ -12,6 +12,7 @@
 from typing_extensions import Literal
 
 from ..representation import FDataGrid
+from ..representation.irregular import FDataIrregular
 from ..typing._numpy import NDArrayFloat, NDArrayInt
 
 
@@ -162,7 +163,7 @@ def fetch_ucr(
     return_X_y: bool = False,
     **kwargs: Any,
 ) -> Bunch | Tuple[FDataGrid, NDArrayInt]:
-    """
+    r"""
     Fetch a dataset from the UCR/UEA repository.
 
     The UCR/UEA Time Series Classification repository, hosted at
@@ -173,6 +174,7 @@ def fetch_ucr(
 
     Args:
         name: Dataset name.
+        return_X_y: Return tuple (data, target)
         kwargs: Additional parameters for the function
             :func:`skdatasets.repositories.ucr.fetch`.
 
@@ -247,7 +249,7 @@ def _fetch_fda_usc(name: str) -> Any:
     Acoustic-Phonetic Continuous Speech Corpus, NTIS, US Dept of Commerce)
     which is a widely used resource for research in speech recognition. A
     dataset was formed by selecting five phonemes for
-    classification based on digitized speech from this database.   
+    classification based on digitized speech from this database.
     phonemes are transcribed as follows: "sh" as in "she", "dcl" as in
     "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and
     "ao" as the first vowel in "water". From continuous speech of 50 male
@@ -1551,3 +1553,92 @@ def fetch_mco(
         cite=":footcite:p:`ruiz-meana++_2003_cariporide`",
         bibliography=".. footbibliography::",
     ) + _param_descr
+
+
+def _fetch_loon_data(name: str) -> Any:
+    """Fetch the dataset ``name`` from version 0.1.3 of the loon.data package."""
+    package_name = "loon.data"
+    package_version = "0.1.3"
+
+    return _fetch_cran_no_encoding_warning(
+        name,
+        package_name,
+        version=package_version,
+    )
+
+
+# Human-readable description of the Bone Density dataset; it is exposed as
+# the ``DESCR`` entry of the Bunch returned by ``fetch_bone_density``.
+_bone_density_descr = """
+    The Bone Density dataset is a study of bone density
+    in boys and girls aged 8-17. It contains data from 423
+    individuals, measured irregularly in different times,
+    with an average of ~3 points per individual.
+
+    References:
+        https://cran.r-project.org/package=loon.data
+        Laura K. Bachrach, Trevor Hastie, May-Choo Wang,
+            Balasubramanian Narasimhan, and Robert Marcus (1999)
+            "Bone Mineral Acquisition in Healthy Asian, Hispanic, Black
+            and Caucasian Youth. A Longitudinal Study",
+            J Clin Endocrinol Metab, 84, 4702-12.
+        Trevor Hastie, Robert Tibshirani, and Jerome Friedman (2009)
+            "The Elements of Statistical Learning",
+            2nd Edition, Springer New York <doi:10.1007/978-0-387-84858-7>
+
+"""
+
+
+def fetch_bone_density(
+    return_X_y: bool = False,
+    as_frame: bool = False,
+) -> Bunch | Tuple[FDataIrregular, NDArrayInt] | Tuple[DataFrame, Series]:
+    """
+    Load the Bone Density dataset. This is an irregular dataset.
+
+    The data is obtained from the ``bone_ext`` dataset of the R package
+    'loon.data', which compiles several irregular datasets. Each curve
+    groups the ``spnbmd`` measurements of one individual (``idnum``) as a
+    function of ``age``; the target is the ``sex`` of the individual.
+
+    Args:
+        return_X_y: If ``True``, return only the tuple ``(data, target)``
+            instead of a Bunch.
+        as_frame: If ``True``, the data is a one-column DataFrame holding
+            the curves and the target is a Series. Otherwise the data is a
+            FDataIrregular and the target is the array of integer codes of
+            the target categories.
+
+    Returns:
+        The tuple ``(data, target)`` when ``return_X_y`` is ``True``.
+        Otherwise a Bunch with the entries ``data``, ``target``, ``frame``
+        (``None`` unless ``as_frame`` is ``True``), ``categories``,
+        ``feature_names``, ``target_names`` and ``DESCR``.
+
+    """
+    descr = _bone_density_descr
+    frame = None
+
+    raw_dataset = _fetch_loon_data("bone_ext")
+
+    data = raw_dataset["bone_ext"]
+
+    curve_name = "idnum"
+    argument_name = "age"
+    target_name = "sex"
+    coordinate_name = "spnbmd"
+
+    curves = FDataIrregular._from_dataframe(
+        data,
+        id_column=curve_name,
+        argument_columns=argument_name,
+        coordinate_columns=coordinate_name,
+        argument_names=[argument_name],
+        coordinate_names=[coordinate_name],
+        dataset_name="bone_ext",
+    )
+
+    # The raw table has one row per measurement; keep a single row per
+    # individual so that there is exactly one target value per curve.
+    target = pd.Series(
+        data.drop_duplicates(subset=[curve_name])[target_name],
+        name="group",
+    )
+
+    feature_name = curves.dataset_name.lower()
+    # ``target_names`` holds the class labels, not one entry per sample.
+    # The values are categorical (their ``codes`` are used below).
+    target_names = target.values.categories.tolist()
+
+    if as_frame:
+        curves = pd.DataFrame({feature_name: curves})
+        target_as_frame = target.reset_index(drop=True).to_frame()
+        frame = pd.concat([curves, target_as_frame], axis=1)
+    else:
+        target = target.values.codes
+
+    if return_X_y:
+        return curves, target
+
+    return Bunch(
+        data=curves,
+        target=target,
+        frame=frame,
+        categories={},
+        feature_names=[argument_name],
+        target_names=target_names,
+        DESCR=descr,
+    )
diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py
@@ -6,13 +6,12 @@
 from typing import Callable, TypeVar, Union
 
 import numpy as np
-from scipy import integrate
 from scipy.stats import rankdata
 
 from skfda._utils.ndfunction import average_function_value
 
 from ...misc.metrics._lp_distances import l2_distance
-from ...representation import FData, FDataBasis, FDataGrid
+from ...representation import FData, FDataBasis, FDataGrid, FDataIrregular
 from ...typing._metric import Metric
 from ...typing._numpy import NDArrayFloat
 from ..depth import Depth, ModifiedBandDepth
@@ -103,7 +102,7 @@ def cov(
 
 
 @functools.singledispatch
-def std(X: F, correction: int = 1) -> F:
+def std(X: F, correction: int = 0) -> F:
     r"""
     Compute the standard deviation of all the samples in a FData object.
 
@@ -127,7 +126,7 @@ def std(X: F, correction: int = 1) -> F:
 
 
 @std.register
-def std_fdatagrid(X: FDataGrid, correction: int = 1) -> FDataGrid:
+def std_fdatagrid(X: FDataGrid, correction: int = 0) -> FDataGrid:
     """Compute the standard deviation of a FDataGrid."""
     return X.copy(
         data_matrix=np.std(
@@ -138,7 +137,25 @@ def std_fdatagrid(X: FDataGrid, correction: int = 1) -> FDataGrid:
 
 
 @std.register
-def std_fdatabasis(X: FDataBasis, correction: int = 1) -> FDataBasis:
+def std_fdatairregular(
+    X: FDataIrregular, correction: int = 0,
+) -> FDataIrregular:
+    """
+    Compute the standard deviation of a FDataIrregular.
+
+    The deviation is taken along axis 0 of the values returned by
+    ``X._get_common_points_and_values()``, using ``correction`` as the
+    ``ddof`` of :func:`numpy.std`. The result is wrapped in a new
+    FDataIrregular with a single, unnamed sample.
+    """
+    shared_points, shared_values = X._get_common_points_and_values()
+
+    return FDataIrregular(
+        start_indices=np.array([0]),
+        points=shared_points,
+        values=np.std(shared_values, axis=0, ddof=correction),
+        sample_names=(None,),
+    )
+
+
+@std.register
+def std_fdatabasis(X: FDataBasis, correction: int = 0) -> FDataBasis:
     """Compute the standard deviation of a FDataBasis."""
     from ..._utils import function_to_fdatabasis