## Notebook to create the 30-year EVEN data matrix with normalization and PCA already applied

OUTPUT: quantities computed over the 30y_EVEN dataset  
- _ipca_ saved as `ipca_30y.pkl` for further use
- _global_mean_ saved as `global_mean_30y.npy`
- _global_std_ saved as `global_std_30y.npy`
- the combined EVEN dataset itself is also written to `era5_30years_even.nc`

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# the project-local `utils` package can be imported by the following cells.
# The membership check keeps re-runs of this cell from stacking duplicate
# entries onto sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
# Third-party and stdlib dependencies used throughout the notebook.
import xarray as xr
import numpy as np
import gc
import joblib
import warnings
# NOTE(review): this silences every warning category for the rest of the
# kernel session, including deprecation warnings from the libraries above.
warnings.filterwarnings('ignore')

# Import the project-local helper modules and reload them, so that edits made
# to their source files are picked up without restarting the kernel.
# NOTE(review): nothing from utils.visualization is referenced in the cells
# visible here - confirm whether it is still needed.
import importlib
import utils.data_processing
import utils.visualization
importlib.reload(utils.data_processing)
importlib.reload(utils.visualization)

# Bind the helper functions by name after the reload above, so the names
# refer to the freshly reloaded definitions.
from utils.data_processing import prepare_data_matrix, apply_global_standardization, perform_incremental_pca


In [3]:
# GRIB inputs, in chronological order (year ranges as given by the file names).
GRIB_PATHS = ['../era5_1994_2008.grib', '../era5_2009_2023.grib']


def load_even_timesteps(grib_path):
    """Open one GRIB file and return only its even-indexed time steps.

    Parameters
    ----------
    grib_path : str
        Path to a GRIB file that the cfgrib engine can read.

    Returns
    -------
    xarray.Dataset
        The time steps at positions 0, 2, 4, ... of the opened file.
    """
    ds_full = xr.open_dataset(grib_path, engine='cfgrib')
    # The full dataset handle goes out of scope on return; only the
    # subsampled view is handed back to the caller.
    return ds_full.isel(time=slice(0, None, 2))


# Subsample each file on its own, then join the pieces along time.
# Note that the stride restarts at index 0 inside every file: the selection is
# "every other time step per file", not a stride over the concatenated axis.
even_parts = [load_even_timesteps(path) for path in GRIB_PATHS]
ds_even = xr.concat(even_parts, dim='time')

# Release the per-file pieces once the combined dataset exists.
del even_parts
gc.collect()

print("Overview of the combined dataset:")
print(f"   • Variables: {list(ds_even.data_vars.keys())}")
print(f"   • Coordinates: {list(ds_even.coords.keys())}")
# `Dataset.sizes` is the supported dimension-name -> length mapping; treating
# `Dataset.dims` as a mapping is deprecated in recent xarray releases.
print(f"   • Shape: {dict(ds_even.sizes)}")

Ignoring index file '../era5_1994_2008.grib.5b7b6.idx' incompatible with GRIB file


Overview of the combined dataset:
   • Variables: ['z', 't']
   • Coordinates: ['number', 'time', 'step', 'isobaricInhPa', 'latitude', 'longitude', 'valid_time']


In [4]:
# Report the dtype of both variables (one per line) before casting.
print(ds_even['z'].dtype, ds_even['t'].dtype, sep="\n")

# Cast the dataset to single precision and write it to a NetCDF file in the
# current working directory.
ds_even = ds_even.astype('float32')
ds_even.to_netcdf(path="era5_30years_even.nc")

# Drop the in-memory dataset; the value returned by the collector is the last
# expression of the cell and therefore shows up as the cell output.
del ds_even
gc.collect()

float32
float32


16

In [6]:
# Re-open the combined EVEN dataset from the NetCDF file saved earlier.
ds_even = xr.open_dataset("era5_30years_even.nc")

# Keep a single pressure level per variable. Selecting with a one-element list
# (rather than a scalar) keeps the level as a length-1 dimension.
z_500 = ds_even['z'].sel(isobaricInhPa=[500])   # geopotential at 500 hPa
t_850 = ds_even['t'].sel(isobaricInhPa=[850])   # temperature at 850 hPa

# The two selections hold different levels on the same 'isobaricInhPa'
# coordinate; give each its own dimension name to avoid a conflict when they
# are placed in one Dataset.
z_500 = z_500.rename({'isobaricInhPa': 'pressure_z'})
t_850 = t_850.rename({'isobaricInhPa': 'pressure_t'})

# Build the filtered dataset directly from the two selections (no empty
# placeholder Dataset is needed beforehand).
ds_filtered = xr.Dataset({
    'z': z_500,
    't': t_850
})

print(f"Filtered Dataset - Variables: {list(ds_filtered.data_vars.keys())}")
# `Dataset.sizes` is the supported dimension-name -> length mapping; treating
# `Dataset.dims` as a mapping is deprecated in recent xarray releases.
print(f"Dimensions: {dict(ds_filtered.sizes)}")
print(f"Time range: {ds_filtered.time.min().values} to {ds_filtered.time.max().values}")

print(ds_filtered)

print("-----------------")

# Turn the filtered dataset into the 2-D data matrix X (plus the helper's
# second return value) using the project-local utility.
X, data_matrices = prepare_data_matrix(ds_filtered)


Filtered Dataset - Variables: ['z', 't']
Dimensions: {'time': 5479, 'pressure_z': 1, 'latitude': 201, 'longitude': 321, 'pressure_t': 1}
Time range: 1994-01-01T00:00:00.000000000 to 2023-12-30T00:00:00.000000000
<xarray.Dataset> Size: 3GB
Dimensions:     (time: 5479, pressure_z: 1, latitude: 201, longitude: 321,
                 pressure_t: 1)
Coordinates:
    number      int64 8B 0
  * time        (time) datetime64[ns] 44kB 1994-01-01 1994-01-03 ... 2023-12-30
    step        timedelta64[ns] 8B 00:00:00
  * pressure_z  (pressure_z) float64 8B 500.0
  * latitude    (latitude) float64 2kB 70.0 69.75 69.5 69.25 ... 20.5 20.25 20.0
  * longitude   (longitude) float64 3kB -40.0 -39.75 -39.5 ... 39.5 39.75 40.0
    valid_time  (time) datetime64[ns] 44kB 1994-01-01 1994-01-03 ... 2023-12-30
  * pressure_t  (pressure_t) float64 8B 850.0
Data variables:
    z           (time, pressure_z, latitude, longitude) float32 1GB ...
    t           (time, pressure_t, latitude, longitude) float32 1GB ..

In [7]:
print("GLOBAL STANDARDIZATION")

# Standardize X and keep the statistics that were used.
# WARNING: X is overwritten here, so this cell is not idempotent. Re-running
# it would feed already-standardized data back in and replace global_mean /
# global_std; re-run the cell that builds X first.
X, global_mean, global_std = apply_global_standardization(X)

# NOTE(review): the recorded shape (5479, 129042) equals 2 x 201 x 321 columns,
# i.e. X holds both the geopotential and the temperature field. The pooled
# mean/std therefore have no single physical unit and are printed without one.
print("Before standardization:")
print(f" • Global mean: {global_mean:.2f}")
print(f" • Global std: {global_std:.2f}")
print(f"Dataset shape: {X.shape}")


print("After standardization:")
print(f" • New global mean: {X.mean():.6f}")
print(f" • New global std: {X.std():.6f}")

# Range of the standardized values as a quick sanity check.
print(f" • Sample min: {X.min():.3f}")
print(f" • Sample max: {X.max():.3f}")

GLOBAL STANDARDIZATION
Before standardization:
 • Global mean: 27779.39m
 • Global std: 27544.21m
Dataset shape: (5479, 129042)
After standardization:
 • New global mean: 0.000253
 • New global std: 1.000214
 • Sample min: -1.000
 • Sample max: 1.138


In [8]:
# Persist the standardization statistics in the current working directory;
# each call writes one .npy file.
np.save(file="global_mean_30y.npy", arr=global_mean)
np.save(file="global_std_30y.npy", arr=global_std)

In [9]:
print("INCREMENTAL PCA ANALYSIS")

# Settings forwarded to perform_incremental_pca.
batch_size = 100    # author's note: raised so a batch can accommodate the component count
n_components = 20   # author's note: kept at 20 for safety with batch processing

print(f"   • Target components: {n_components}")
print(f"   • Batch size: {batch_size}")

# Run the project-local incremental PCA helper on the standardized matrix and
# unpack its four return values.
(
    X_pca,
    ipca,
    explained_variance_ratio,
    cumulative_variance,
) = perform_incremental_pca(
    X,
    n_components=n_components,
    batch_size=batch_size,
)

INCREMENTAL PCA ANALYSIS
   • Target components: 20
   • Batch size: 100
Original shape: (5479, 129042)
PCA shape: (5479, 20)

First 10 components variance: [0.49234794 0.11733436 0.09122786 0.07440938 0.05198909 0.03327221
 0.0219017  0.02104583 0.01374796 0.01086561]
Total explained variance 20 components cumulative: 0.976


In [10]:
# Serialize the PCA object returned by perform_incremental_pca to disk.
# The call is the last expression of the cell, so its return value (the list
# of written file names) is displayed as the cell output.
joblib.dump(value=ipca, filename="ipca_30y.pkl")

['ipca_30y.pkl']