# PCA + k-Means on ERA5 Data

## Results

- Data loading takes ~5 min
- Data preprocessing takes ~4 min
- PCA with 500 PCs takes ~8 min
- PCA (cumulative explained variance):
  - N_pcs=24 covers 0.7 of the variance
  - N_pcs=80 covers 0.8 of the variance
  - N_pcs=171 covers 0.85 of the variance
  - N_pcs=421 covers 0.9 of the variance


In [None]:
import a6
import sklearn.cluster
import sklearn.decomposition
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import xarray as xr

In [None]:
%%time

# Helper objects from the a6 package; `coordinates` is passed on to the
# masking step below and reused by the preprocessing cell.
coordinates = a6.datasets.coordinates.Coordinates()
variables = a6.datasets.variables.Model()

# Location of the ERA5 netCDF file on the cluster file system.
path = pathlib.Path(
    "/p/project/deepacf/maelstrom/emmerich1/data/ecmwf_era5/era5_pl_1964_2023_12.nc"
)
ds = xr.open_dataset(path)

# NOTE(review): judging by its name, this replaces NaNs with a mean value so
# that the later PCA does not see missing values -- confirm against a6.
masked = a6.datasets.methods.mask.set_nans_to_mean(ds, coordinates=coordinates)

In [None]:
%%time

# Individual preprocessing steps; they are chained below with `>>` in
# exactly this order.
weighting = a6.features.methods.weighting.weight_by_latitudes(
    latitudes=coordinates.latitude,
    use_sqrt=True,
)
reshaping = a6.features.methods.reshape.xarray.reshape_spatio_temporal_data(
    # Set to None to avoid memory excess in function
    time_coordinate=None,
)
normalization = a6.features.methods.standardization.normalize_features()

# Apply the chained steps to the masked dataset and materialize the result.
data = (weighting >> reshaping >> normalization).apply_to(masked).compute()

# Drop the references to the source datasets now that `data` is computed.
del ds, masked

In [None]:
%%time

# Fit a 500-component PCA on the preprocessed data.
# `sklearn.decomposition.PCA` spells its in-place flag `copy`; `copy_X` is the
# `KernelPCA` keyword and makes the PCA constructor raise a TypeError.
# With `copy=False`, `fit` is allowed to overwrite `data` in place instead of
# duplicating it in memory.
pca = sklearn.decomposition.PCA(n_components=500, copy=False).fit(data)

# Cumulative explained variance ratio: entry i is the share of the variance
# covered by the first i + 1 principal components.
cum_evr = np.cumsum(pca.explained_variance_ratio_)

In [None]:
def _n_components_for(cumulative_ratio, threshold):
    """Return the smallest component count whose cumulative ratio exceeds ``threshold``.

    ``cumulative_ratio[i]`` is the ratio covered by the first ``i + 1``
    components, so the 0-based index of the first exceeding entry is
    converted into a count. Returns ``None`` if no entry exceeds
    ``threshold``.
    """
    exceeding = np.flatnonzero(cumulative_ratio > threshold)
    if exceeding.size == 0:
        return None
    return int(exceeding[0]) + 1


# Plot against the number of components (1-based), not the array index.
plt.plot(np.arange(1, cum_evr.size + 1), cum_evr)
for var in np.arange(0, 0.9, 0.1):
    n_pcs = _n_components_for(cum_evr, var)
    if n_pcs is None:
        print(f"N_pcs>{cum_evr.size} needed to cover {var:.1f} of the variance")
        continue
    # Format the threshold explicitly to avoid float noise such as
    # 0.30000000000000004 in the output.
    print(f"N_pcs={n_pcs} cover {var:.1f} of the variance")

# Number of components required to exceed 80 % of the variance.
n_pcs = _n_components_for(cum_evr, 0.80)
if n_pcs is None:
    raise ValueError(
        f"The {cum_evr.size} fitted components do not cover 0.8 of the variance"
    )
transformed = sklearn.decomposition.PCA(n_components=n_pcs).fit_transform(data)
kmeans = sklearn.cluster.KMeans(n_clusters=40).fit(transformed)

# The reduced data is only needed for fitting the clustering.
del transformed

kmeans.labels_

Kernel PCA using a Gaussian radial basis function (RBF) kernel with $\sigma = 200$ (see http://dx.doi.org/10.1016/j.procs.2011.08.043).

In [None]:
%%time
# Width of the Gaussian RBF kernel. scikit-learn parametrizes the kernel as
# exp(-gamma * ||x - y||^2), so a width sigma corresponds to
# gamma = 1 / (2 * sigma^2); passing sigma itself as gamma is not equivalent.
# NOTE(review): assumes the reference defines the kernel as
# exp(-||x - y||^2 / (2 * sigma^2)) -- confirm against the paper.
rbf_sigma = 200
rbf_gamma = 1.0 / (2.0 * rbf_sigma**2)

kpca = sklearn.decomposition.KernelPCA(
    n_components=50, kernel="rbf", gamma=rbf_gamma, copy_X=False, n_jobs=-1
).fit(data)

# KernelPCA does not expose `explained_variance_ratio_`. Use the eigenvalues
# of the centered kernel matrix instead (`eigenvalues_`; named `lambdas_` in
# scikit-learn < 1.0).
eigenvalues = (
    kpca.eigenvalues_ if hasattr(kpca, "eigenvalues_") else kpca.lambdas_
)
# NOTE: the ratio is relative to the retained components only, so the
# cumulative sum always reaches 1.0 at the last component.
cum_evr = np.cumsum(eigenvalues / eigenvalues.sum())

In [None]:
def _n_components_for(cumulative_ratio, threshold):
    """Return the smallest component count whose cumulative ratio exceeds ``threshold``.

    ``cumulative_ratio[i]`` is the ratio covered by the first ``i + 1``
    components, so the 0-based index of the first exceeding entry is
    converted into a count. Returns ``None`` if no entry exceeds
    ``threshold``.
    """
    exceeding = np.flatnonzero(cumulative_ratio > threshold)
    if exceeding.size == 0:
        return None
    return int(exceeding[0]) + 1


# KernelPCA has no `n_components_` attribute; take the number of components
# from the cumulative ratio array and plot against a 1-based component count.
plt.plot(np.arange(1, cum_evr.size + 1), cum_evr)
for var in np.arange(0, 0.9, 0.1):
    n_pcs = _n_components_for(cum_evr, var)
    if n_pcs is None:
        print(f"N_pcs>{cum_evr.size} needed to cover {var:.1f} of the variance")
        continue
    # Format the threshold explicitly to avoid float noise such as
    # 0.30000000000000004 in the output.
    print(f"N_pcs={n_pcs} cover {var:.1f} of the variance")

# Number of components required to exceed a ratio of 0.8.
n_pcs = _n_components_for(cum_evr, 0.80)
if n_pcs is None:
    raise ValueError(
        f"The {cum_evr.size} fitted components do not cover 0.8 of the variance"
    )

# scikit-learn's RBF kernel is exp(-gamma * ||x - y||^2), so a Gaussian width
# sigma corresponds to gamma = 1 / (2 * sigma^2).
# NOTE(review): assumes the reference defines the kernel as
# exp(-||x - y||^2 / (2 * sigma^2)) -- confirm against the paper.
rbf_sigma = 200
rbf_gamma = 1.0 / (2.0 * rbf_sigma**2)

transformed = sklearn.decomposition.KernelPCA(
    n_components=n_pcs, kernel="rbf", gamma=rbf_gamma, n_jobs=-1
).fit_transform(data)
kmeans = sklearn.cluster.KMeans(n_clusters=40).fit(transformed)

# The reduced data is only needed for fitting the clustering.
del transformed

kmeans.labels_