### Notebook to produce the 60-year dataset, already normalized and reduced (PCA), using the results obtained from the analysis over the 60y_jump_6_days

OUTPUT: matrix covering 60 years (1961-2020), normalized and with PCA applied
- X_pca_60y.npy

In [1]:
# Make the parent of the current working directory importable so that the
# local `utils` package can be found.
import sys
import os
sys.path.append(os.path.abspath(".."))
import xarray as xr
import numpy as np
import gc
import joblib
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings('ignore')

import importlib
import utils.data_processing
import utils.visualization
# Reload the local modules so that edits made to them are picked up without
# restarting the kernel.
importlib.reload(utils.data_processing)
importlib.reload(utils.visualization)

# Import the functions 
from utils.data_processing import prepare_data_matrix



In [2]:
# Go back to the raw 10-year GRIB file and rebuild a two-variable dataset.
# NOTE: `file_name` and `year` describe the same period and must be edited
# together whenever another file is processed.

year = "2011_2020"                                 ###### CHANGE WHEN CHANGING FILE
file_name = "10y/era5_2011_2020_t850_z500.grib"    ###### CHANGE WHEN CHANGING FILE

# Options shared by both reads of the GRIB file
grib_options = {"engine": "cfgrib"}

# First pass over the file: keep only the messages whose shortName is 't'
ds_t = xr.open_dataset(file_name, filter_by_keys={"shortName": "t"}, **grib_options)
print(list(ds_t.data_vars.keys()))

# Second pass over the file: keep only the messages whose shortName is 'z'
ds_z = xr.open_dataset(file_name, filter_by_keys={"shortName": "z"}, **grib_options)
print(list(ds_z.data_vars.keys()))

# Drop the 'isobaricInhPa' coordinate from each field before putting the two
# variables side by side in a single Dataset.
t_850 = ds_t["t"].reset_coords("isobaricInhPa", drop=True)
z_500 = ds_z["z"].reset_coords("isobaricInhPa", drop=True)
ds_filtered = xr.Dataset({"t": t_850, "z": z_500})

# Quick summary of what was assembled
variable_names = list(ds_filtered.data_vars.keys())
coordinate_names = list(ds_filtered.coords.keys())
print("Overview of the combined dataset:")
print(f"   • Variables: {variable_names}")
print(f"   • Coordinates: {coordinate_names}")
print(f"   • Shape: {ds_filtered.dims}")

['t']
['z']
Overview of the combined dataset:
   • Variables: ['t', 'z']
   • Coordinates: ['number', 'time', 'step', 'latitude', 'longitude', 'valid_time']


In [3]:
# Flatten the combined dataset into a 2-D matrix; the second value returned by
# prepare_data_matrix is not needed here.
# NOTE(review): the slicing below assumes the temperature columns come first
# and the geopotential columns second -- confirm against prepare_data_matrix.
X, _ = prepare_data_matrix(ds_filtered)

# Normalization statistics saved by the 60-year analysis. They are reused
# as-is so that this decade is scaled the same way as the data the PCA model
# was fitted on.
t_mean = np.load("Mid_result_to_save/t_mean_60y.npy")
t_std = np.load("Mid_result_to_save/t_std_60y.npy") 
z_mean = np.load("Mid_result_to_save/z_mean_60y.npy")
z_std = np.load("Mid_result_to_save/z_std_60y.npy")

# Spatial dimensions of the grid (number of grid points per variable)
n_lat = 201  
n_lon = 321  
spatial_size = n_lat * n_lon

# Fail loudly if the matrix does not match the hard-coded grid: otherwise the
# split below would cut the columns at the wrong position and mix the two
# variables without any error.
if X.ndim != 2 or X.shape[1] != 2 * spatial_size:
    raise ValueError(
        f"Unexpected data matrix shape {X.shape}: expected (n_times, {2 * spatial_size}) "
        f"for two variables on a {n_lat}x{n_lon} grid"
    )

# Separate temperature and geopotential data
X_temperature = X[:, :spatial_size]
X_geopotential = X[:, spatial_size:]

# Standardize each variable with its own saved mean / standard deviation
X_temperature_std = (X_temperature - t_mean) / t_std
X_geopotential_std = (X_geopotential - z_mean) / z_std

# Put the two standardized blocks back side by side, in the original order
X_std = np.concatenate([X_temperature_std, X_geopotential_std], axis=1)

# Release the large intermediates; only X_std is needed from here on.
# (After this, the cell cannot be re-run without re-running the loading cell.)
del ds_filtered, X, X_temperature, X_geopotential, X_temperature_std, X_geopotential_std
gc.collect()




Processing t...
     → t: ('time', 'latitude', 'longitude') → (3653, 64521)
Processing z...
     → z: ('time', 'latitude', 'longitude') → (3653, 64521)

Combined matrix shape: (3653, 129042)


44

In [4]:
# Project the standardized matrix with the PCA model saved by the 60-year
# analysis (the model is only applied here, not re-fitted).
ipca = joblib.load("Mid_result_to_save/ipca_60y.pkl")
X_reduced = ipca.transform(X_std)

# Store the reduced matrix for this period; the path is built once and reused
# for both the save and the confirmation message.
reduced_path = f"Mid_result_to_save/X_pca_{year}.npy"
np.save(reduced_path, X_reduced)
print(f"Dati ridotti salvati in {reduced_path}")

Dati ridotti salvati in Mid_result_to_save/X_pca_2011_2020.npy


_The previous code has been applied to the two files era5_1994_2008.grib and era5_2009_2023.grib. This yields the two matrices X_pca_...period..., which are merged below to obtain the final 30-year matrix with normalization and PCA applied._

______  ____________  ________  _________

#### 60 years matrix
Now let's merge all the ten-year X_pca_... files to form the 60-year dataset, already normalized and projected onto the PCs.  

<span style="color: red;">Run the following cell only after having produced (with the cells above) all the X_pca_... files needed</span>

In [5]:

# Ten-year periods, in chronological order, whose reduced matrices were
# produced by running the cells above once per file.
decade_labels = (
    "1961_1970",
    "1971_1980",
    "1981_1990",
    "1991_2000",
    "2001_2010",
    "2011_2020",
)

# Load every per-decade matrix; X1..X6 stay available as individual names.
decade_matrices = [
    np.load(f"Mid_result_to_save/X_pca_{label}.npy") for label in decade_labels
]
X1, X2, X3, X4, X5, X6 = decade_matrices

# Stack the decades along the first axis, keeping the chronological order
X_pca_60y = np.concatenate(decade_matrices, axis=0)

print(f"Shape unified matrix: {X_pca_60y.shape}")

# Persist the full 60-year matrix
np.save("Mid_result_to_save/X_pca_60y.npy", X_pca_60y)


Shape unified matrix: (21915, 20)
