In [1]:
import xarray as xr
import numpy as np
import logging
import glob
import os

from info import train_years as years

In [2]:
# initialize logger
# force=True replaces any handlers already attached to the root logger, so the
# format and level below take effect even when this cell is re-run or when an
# imported module configured logging first (basicConfig is otherwise a no-op
# once the root logger has handlers)
logging_level = logging.INFO
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s : %(message)s",
    level=logging_level,
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)
logging.info('Program started')

[2025-01-22 17:25:38] INFO : Program started


In [3]:
# data root and the sub-folders used by this notebook:
#   scaler_dir   -> <data_dir>/patches        (created here if it is missing)
#   datasets_dir -> <data_dir>/patches/train
data_dir = '../data/'
scaler_dir = os.path.join(data_dir, 'patches')
datasets_dir = os.path.join(scaler_dir, 'train')
os.makedirs(scaler_dir, exist_ok=True)

In [4]:
# glob pattern that matches every zarr store inside the training folder
zarr_glob = '*.zarr'
dataset_pattern_dir = os.path.join(datasets_dir, zarr_glob)

In [12]:
# name the scaler file after the first and last entries of `years`
first_year, last_year = years[0], years[-1]
scaler_filename = f'scaler_{first_year}_{last_year}.nc'

# full path of the scaler file; shown as the cell output
scaler_path = os.path.join(scaler_dir, scaler_filename)
scaler_path

'../data/patches/scaler_1980_2009.nc'

In [6]:
# names of the dataset variables (drivers) to compute scaling statistics for
drivers = ['fg10', 'i10fg', 'msl', 't_500', 't_300', 'vo_850', 'sst']

# every path matching the dataset pattern, in lexicographic order
files = glob.glob(dataset_pattern_dir)
files.sort()

# preview the first few paths
files[:5]

['../data/patches/train/cyclone-1980.zarr',
 '../data/patches/train/cyclone-1981.zarr',
 '../data/patches/train/cyclone-1982.zarr',
 '../data/patches/train/cyclone-1983.zarr',
 '../data/patches/train/cyclone-1984.zarr']

In [7]:
# Load the selected drivers from every dataset and stack them into one array
# laid out as (driver, sample, <patch dim>, <patch dim>).
#
# The arrays are collected in a list and concatenated once at the end, instead
# of growing `data` inside the loop (which re-copies everything already loaded
# on each iteration). This also removes the hard-coded 40x40 patch size: the
# trailing dimensions are taken from the data itself.

# fail early: with no input files the statistics below would be all-NaN and
# would still be written to disk
if not files:
    raise FileNotFoundError(f'no datasets found matching {dataset_pattern_dir}')

yearly_arrays = []
for file in files:
    # log
    logging.info(f'   {file}')
    # the context manager closes the store once the values are in memory;
    # to_array() puts the selected drivers on a new leading axis
    with xr.open_zarr(file) as ds:
        yearly_arrays.append(ds[drivers].to_array().values)

# single concatenation along the sample axis (axis 1); float64 matches the
# dtype of the accumulator the loop used to grow, so the statistics computed
# from `data` keep the same precision
data = np.concatenate(yearly_arrays, axis=1).astype(np.float64, copy=False)

[2025-01-22 17:25:38] INFO :    ../data/patches/train/cyclone-1980.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1981.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1982.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1983.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1984.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1985.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1986.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1987.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1988.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1989.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1990.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1991.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-1992.zarr
[2025-01-22 17:25:39] INFO :    ../data/patches/train/cyclone-19

In [8]:
# sanity check: first axis is one entry per driver, second is the stacked samples
np.shape(data)

(7, 492, 40, 40)

In [9]:
# per-driver mean: reduce over every axis except the leading driver axis,
# skipping NaN entries
mean_axes = (1, 2, 3)
data_mean = np.nanmean(data, axis=mean_axes)
data_mean

array([8.76776420e+00, 8.10639425e+00, 1.01137806e+05, 2.66241849e+02,
       2.40507119e+02, 2.18670501e-06, 2.99233324e+02])

In [10]:
# per-driver standard deviation: reduce over every axis except the leading
# driver axis, skipping NaN entries
std_axes = (1, 2, 3)
data_std = np.nanstd(data, axis=std_axes)
data_std

array([4.18378852e+00, 4.12865754e+00, 5.83242028e+02, 4.35574411e+00,
       4.76354345e+00, 4.56283861e-05, 5.12574549e+00])

In [11]:
# collect the statistics in a dataset holding two scalar variables per driver:
# 'mean-<driver>' and 'std-<driver>'
scaler_ds = xr.Dataset()
for var, var_mean, var_std in zip(drivers, data_mean, data_std):
    scaler_ds[f'mean-{var}'] = var_mean
    scaler_ds[f'std-{var}'] = var_std
scaler_ds

In [32]:
# write the scaler statistics to disk as a netCDF file at scaler_path
scaler_ds.to_netcdf(path=scaler_path)