Skip to content

Commit

Permalink
Improve ICTAI examples and add references.
Browse files Browse the repository at this point in the history
  • Loading branch information
vnmabus committed Nov 8, 2023
1 parent 4a368bd commit 7488932
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 43 deletions.
75 changes: 75 additions & 0 deletions docs/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,21 @@ @article{berrendero++_2022_functional
keywords = {62J12,62R10,Functional data,Kernel methods in statistics,Logistic regression,Mathematics - Statistics Theory,Reproducing kernel Hilbert spaces}
}

@article{borggaard+thodberg_1992_optimal,
title = {Optimal Minimal Neural Interpretation of Spectra},
author = {Borggaard, {\relax Claus}. and Thodberg, Hans Henrik.},
year = {1992},
month = mar,
journal = {Analytical Chemistry},
volume = {64},
number = {5},
pages = {545--551},
issn = {0003-2700},
doi = {10.1021/ac00029a018},
url = {https://doi.org/10.1021/ac00029a018},
urldate = {2018-12-09}
}

@article{cuesta-albertos++_2017_ddgclassifier,
title = {The {{DDᴳ}}-Classifier in the Functional Setting},
author = {{Cuesta-Albertos}, J. A. and {Febrero-Bande}, M. and {Oviedo~de~la~Fuente}, M.},
Expand Down Expand Up @@ -293,6 +308,26 @@ @article{ghosh+chaudhuri_2005_maximum
keywords = {Bayes risk,cross-validation,data depth,elliptic symmetry,kernel density estimation,location shift model,Mahalanobis distance,misclassification rate,Vapnik Chervonenkis dimension}
}

@article{hastie++_1995_penalized,
title = {Penalized Discriminant Analysis},
author = {Hastie, Trevor and Buja, Andreas and Tibshirani, Robert},
year = {1995},
month = feb,
journal = {The Annals of Statistics},
volume = {23},
number = {1},
pages = {73--102},
issn = {0090-5364, 2168-8966},
doi = {10.1214/aos/1176324456},
url = {https://projecteuclid.org/euclid.aos/1176324456},
urldate = {2018-12-09},
abstract = {Fisher's linear discriminant analysis (LDA) is a popular data-analytic tool for studying the relationship between a set of predictors and a categorical response. In this paper we describe a penalized version of LDA. It is designed for situations in which there are many highly correlated predictors, such as those obtained by discretizing a function, or the grey-scale values of the pixels in a series of images. In cases such as these it is natural, efficient and sometimes essential to impose a spatial smoothness constraint on the coefficients, both for improved prediction performance and interpretability. We cast the classification problem into a regression framework via optimal scoring. Using this, our proposal facilitates the use of any penalized regression technique in the classification setting. The technique is illustrated with examples in speech recognition and handwritten character recognition.},
langid = {english},
mrnumber = {MR1331657},
zmnumber = {0821.62031},
keywords = {discrimination,regularization,Signal and image classification}
}

@article{li++_2012_ddclassifier,
title = {{{DD-Classifier}}: {{Nonparametric}} Classification Procedure Based on {{DD-Plot}}},
shorttitle = {{{DD-Classifier}}},
Expand Down Expand Up @@ -347,6 +382,14 @@ @article{marron++_2015_functional
keywords = {alignment,dynamic time warping,elastic metric,Fisher–Rao metric,Functional data analysis,registration,warping}
}

@misc{meteorologicalstateagencyofspainaemet_2009_meteorological,
title = {Meteorological Data of {{Spanish}} Weather Stations between Years 1980 and 2009.},
author = {{Meteorological State Agency of Spain (AEMET)}},
year = {2009},
url = {http://www.aemet.es/},
abstract = {The data were obtained from the FTP of AEMET in 2009.}
}

@article{pini++_2018_hotelling,
title = {Hotelling's {{T2}} in Separable {{Hilbert}} Spaces},
author = {Pini, Alessia and Stamm, Aymeric and Vantini, Simone},
Expand Down Expand Up @@ -379,6 +422,20 @@ @incollection{pintado+romo_2005_depthbased
isbn = {978-0-8218-3596-8}
}

@inproceedings{ramos-carreno++_2022_scikitfda,
title = {{{scikit-fda}}: {{Computational}} Tools for Machine Learning with Functional Data},
shorttitle = {Scikit-Fda},
booktitle = {2022 {{IEEE}} 34th {{International Conference}} on {{Tools}} with {{Artificial Intelligence}} ({{ICTAI}})},
author = {{Ramos-Carre{\~n}o}, Carlos and Torrecilla, Jos{\'e} Luis and Hong, Yujian and Su{\'a}rez, Alberto},
year = {2022},
month = oct,
pages = {213--218},
issn = {2375-0197},
doi = {10.1109/ICTAI56018.2022.00038},
abstract = {Machine learning from functional data poses particular challenges that require specific computational tools that take into account their structure. In this work, we present scikit-fda, a Python library for functional data analysis, visualization, preprocessing, and machine learning. The library is designed for smooth integration in the Python scientific ecosystem. In particular, it complements and can be used in combination with scikit-learn, the reference Python library for machine learning. The functionality of scikit-fda is illustrated in clustering, regression, and classification problems from different areas of application.},
keywords = {Data analysis,data visualization,Data visualization,Documentation,Ecosystems,Extrapolation,functional data analysis,Interpolation,machine learning,Machine learning,Python toolbox}
}

@inbook{ramsay+silverman_2005_functionala,
title = {From Functional Data to Smooth Functions},
booktitle = {Functional Data Analysis},
Expand Down Expand Up @@ -436,6 +493,24 @@ @inbook{ramsay+silverman_2005_registration
keywords = {Multivariate analysis}
}

@incollection{romeo+marzoljaen_2014_analisis,
title = {{An\'alisis del viento y la niebla en el aeropuerto de Los Rodeos (Tenerife). Cambios y tendencias}},
booktitle = {{Cambio clim\'atico y cambio global.}},
author = {Romeo, V{\'i}ctor M. and Marzol Ja{\'e}n, M{\textordfeminine} Victoria},
editor = {Fern{\'a}ndez Montes, Sonia and S{\'a}nchez Rodrigo, Fernando},
year = {2014},
series = {{Publicaciones de la Asociaci\'on Espa\~nola de Climatolog\'ia. Serie A.}},
number = {9},
pages = {325--334},
publisher = {{Asociaci\'on Espa\~nola de Climatolog\'ia}},
url = {https://repositorio.aemet.es/handle/20.500.11765/8173},
urldate = {2022-07-28},
abstract = {[ES]La particular localizaci\'on del aeropuerto Tenerife Norte-Los Rodeos, en el nordeste de la isla de Tenerife, es la causa de la modificaci\'on de la direcci\'on habitual de los vientos alisios del NE a vientos del NO, acompa\~nados frecuentemente por nubosidad. Esto ocasiona que en un n\'umero significativo de d\'ias al a\~no este aeropuerto est\'e cubierto por la niebla, lo que afecta de una forma muy importante a su operatividad. El objetivo del trabajo es caracterizar su r\'egimen de vientos y de la niebla durante los \'ultimos trece a\~nos (2000-2012) y comprobar si se han producido cambios significativos en ambas variables clim\'aticas con respecto a los periodos normales de 1961-1990 y 1971-2000. Los resultados indican que las dos direcciones dominantes del viento, NO y SE, mantienen sus frecuencias no as\'i la niebla que ha aumentado su presencia en los \'ultimos a\~nos.},
copyright = {Licencia CC: Reconocimiento CC BY},
isbn = {978-84-16027-69-9},
langid = {spanish}
}

@article{ruiz-meana++_2003_cariporide,
title = {Cariporide Preserves Mitochondrial Proton Gradient and Delays {{ATP}} Depletion in Cardiomyocytes during Ischemic Conditions},
author = {{Ruiz-Meana}, Marisol and {Garcia-Dorado}, David and Pina, Pilar and Inserte, Javier and Agull{\'o}, Luis and {Soler-Soler}, Jordi},
Expand Down
95 changes: 81 additions & 14 deletions examples/full_examples/plot_aemet_unsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,56 @@
from skfda.ml.clustering import KMeans
from skfda.preprocessing.dim_reduction import FPCA

##############################################################################
# We will first load the AEMET dataset and plot it.
# %%
# In this example we explore the curves of daily temperatures at
# different weather stations from Spain, included in the AEMET dataset\
# :footcite:p:`meteorologicalstateagencyofspainaemet_2009_meteorological`.
#
# We first show how the visualization tools of scikit-fda can be used to
# detect and interpret magnitude and shape outliers.
# We also explain how to employ a clustering method on these temperature
# curves using scikit-fda.
# Then, the location of each weather station is plotted into a map of Spain
# with a different color according to its cluster.
# This reveals a remarkable resemblance to a Spanish climate map.
# A posterior analysis using functional principal component analysis (FPCA)
# explains how the two first principal components are related with relevant
# meteorological concepts, and can be used to reconstruct and interpret
# the original clustering.
#
# This is one of the examples presented in the ICTAI conference\
# :footcite:p:`ramos-carreno++_2022_scikitfda`.

# %%
# We first load the AEMET dataset and plot it.
X, _ = fetch_aemet(return_X_y=True)
X = X.coordinates[0]

X.plot()
plt.show()

##############################################################################
# %%
# A boxplot can show magnitude outliers, in this case Navacerrada.
# Here the temperatures are lower than in the other curves, as this
# weather station is at a high altitude, near a ski resort.
Boxplot(
X,
depth_method=ModifiedBandDepth(),
).plot()
plt.show()

##############################################################################
# %%
# A magnitude-shape plot can be used to detect shape outliers, such as the
# Canary islands, with a less steeper temperature curve.
# The Canary islands are at a lower latitude, closer to the equator.
# Thus, they have a subtropical climate which presents less temperature
# variation during the year.
MagnitudeShapePlot(
X,
).plot()
plt.show()

##############################################################################
# %%
# We now attempt to cluster the curves using functional k-means.
n_clusters = 5
n_init = 10
Expand All @@ -67,7 +92,7 @@
)
fda_clusters = fda_kmeans.fit_predict(X)

##############################################################################
# %%
# We want to plot the cluster of each station in the map of Spain. We need to
# define first auxiliary variables and functions for plotting.
coords_spain = (-10, 5, 34.98, 44.8)
Expand Down Expand Up @@ -130,15 +155,36 @@ def plot_cluster_points(

# Names of each climate (for this particular seed)
climate_names = {
0: "Cold-mountain",
0: "Cold/mountain",
1: "Mediterranean",
2: "Atlantic",
3: "Subtropical",
4: "Continental",
}

##############################################################################
# %%
# We now plot the obtained clustering in the maps.
#
# It is possible to notice that each cluster seems to roughly correspond with
# a particular climate:
#
# - Red points, only present in the Canary islands, correspond to a subtropical
# climate.
# - Yellow points, in the Mediterranean coast, correspond to a Mediterranean
# climate.
# - Green points, in the Atlantic coast of mainland Spain, correspond to an
# Atlantic or oceanic climate.
# - Orange points, in the center of mainland Spain, correspond to a continental
# climate.
# - Finally the purple points are located at the coldest regions of Spain, such
# as in high mountain ranges.
# Thus, it can be associated with a cold or mountain climate.
#
# Notice that a couple of points in the Canary islands are not red.
# The purple one is easy to explain, as it is in mount Teide, the highest
# mountain of Spain.
# The yellow one is in the airport of Los Rodeos, which has its own
# microclimate\ :footcite:p:`romeo+marzoljaen_2014_analisis`.

# Mainland
fig_spain = create_map(coords_spain, figsize=(8, 6))
Expand All @@ -161,7 +207,7 @@ def plot_cluster_points(
)
plt.show()

##############################################################################
# %%
# We now can compute the first two principal components for interpretability,
# and project the data over these directions.
fpca = FPCA(n_components=2)
Expand All @@ -173,12 +219,17 @@ def plot_cluster_points(

X_red = fpca.transform(X)

##############################################################################
# %%
# We now plot the first two principal components as perturbations over the
# mean.
#
# The ``factor`` parameters is a number that multiplies each component in
# order to make their effect more noticeable.
#
# It is possible to observe that the first component measures a global
# increase/decrease in temperature.
# The second component instead has the effect of increasing/decreasing
# the variability of the temperatures during the year.
fig = plt.figure(figsize=(8, 4))
FPCAPlot(
fpca.mean_,
Expand All @@ -188,8 +239,15 @@ def plot_cluster_points(
).plot()
plt.show()

##############################################################################
# We also plot the projections over the first two principal components.
# %%
# We also plot the projections over the first two principal components,
# keeping the cluster colors.
# Here we can easily observe the corresponding characteristics of each
# climate in terms of global temperature and annual variability.
# The two outliers of the Mediterranean climate correspond to the
# aforementioned airport of Los Rodeos, and to Tarifa, which is
# located at the strait of Gibraltar and thus receives also the cold
# waters of the Atlantic, explaining its lower annual variability.
fig, ax = plt.subplots(1, 1)
for cluster in range(n_clusters):
selection = fda_clusters == cluster
Expand All @@ -205,7 +263,7 @@ def plot_cluster_points(
ax.legend()
plt.show()

##############################################################################
# %%
# We now attempt a multivariate clustering using only these projections.
mv_kmeans = sklearn.cluster.KMeans(
n_clusters=n_clusters,
Expand All @@ -214,9 +272,12 @@ def plot_cluster_points(
)
mv_clusters = mv_kmeans.fit_predict(X_red)

##############################################################################
# %%
# We now plot the multivariate clustering in the maps. We define a different
# color map to match cluster colors with the previously obtained ones.
# As you can see, the clustering using only the two first principal components
# matches almost perfectly with the original one, that used the complete
# temperature curves.

mv_color_map = {
0: "yellow",
Expand Down Expand Up @@ -246,3 +307,9 @@ def plot_cluster_points(
ax=fig_canary.axes[0],
)
plt.show()

# %%
# References
# ----------
#
# .. footbibliography::

0 comments on commit 7488932

Please sign in to comment.