In [None]:
import xarray as xr
import numpy as np
import logging
import glob
import os
import toml

from info import train_years as years

In [None]:
# initialize logger: timestamped records at INFO level and above
logging_level = logging.INFO
logging.basicConfig(
    level=logging_level,
    format="[%(asctime)s] %(levelname)s : %(message)s",
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info('Program started')

In [None]:
# config file
config = toml.load("./config/cnns.toml")

# data directories: the training patches live in a 'train' sub-folder of the
# directory that will hold the scaler file
data_dir = '../data/'
scaler_dir = os.path.join(data_dir, 'patches')
datasets_dir = os.path.join(scaler_dir, 'train')

# make sure the output directory exists before anything is written to it
os.makedirs(scaler_dir, exist_ok=True)

In [None]:
# glob pattern matching every zarr store inside the training datasets directory
zarr_glob = '*.zarr'
dataset_pattern_dir = os.path.join(datasets_dir, zarr_glob)

In [None]:
# define scaler filename from the first and last entries of the training years
first_year, last_year = years[0], years[-1]
scaler_filename = 'scaler_{}_{}.nc'.format(first_year, last_year)

# full output path of the scaler file (displayed as the cell result)
scaler_path = os.path.join(scaler_dir, scaler_filename)
scaler_path

In [None]:
# get all the filenames matching the pattern, in sorted order
files = glob.glob(dataset_pattern_dir)
files.sort()

# define drivers to scale (read from the configuration file)
drivers = config["data"]["drivers"]

# preview the first few matched files
files[:5]

In [None]:
# Load the selected drivers from every patch file and stack them along axis 1.
#
# The per-file arrays are collected in a list and concatenated once at the end:
# calling np.concatenate inside the loop re-copies everything loaded so far on
# each iteration. Building from the loaded chunks also removes the need to
# hard-code the patch size in a seed array.
if not files:
    # fail fast instead of silently producing an all-NaN scaler downstream
    raise FileNotFoundError(f'No datasets found matching {dataset_pattern_dir}')

chunks = []
for file in files:
    # log
    logging.info(f'   {file}')
    # the context manager closes the store once the values are in memory;
    # to_array() stacks the selected variables along a new leading axis
    with xr.open_zarr(file) as ds:
        chunks.append(ds[drivers].to_array().load().values)

# single concatenation; float64 matches the dtype the previous
# np.empty-seeded accumulation produced
data = np.concatenate(chunks, axis=1).astype(np.float64, copy=False)
del chunks

In [None]:
# sanity check: the leading axis indexes the drivers, axis 1 is the concatenation axis
np.shape(data)

In [None]:
# NaN-aware mean per driver: reduce over every axis except the leading driver axis
mean_axes = (1, 2, 3)
data_mean = np.nanmean(data, axis=mean_axes)
data_mean

In [None]:
# NaN-aware standard deviation per driver: reduce over every axis except the
# leading driver axis
std_axes = (1, 2, 3)
data_std = np.nanstd(data, axis=std_axes)
data_std

In [None]:
# collect the statistics in a dataset with one scalar 'mean-<driver>' and one
# scalar 'std-<driver>' variable per driver
scaler_ds = xr.Dataset(data_vars={})
for driver_name, driver_mean, driver_std in zip(drivers, data_mean, data_std):
    scaler_ds[f'mean-{driver_name}'] = driver_mean
    scaler_ds[f'std-{driver_name}'] = driver_std
scaler_ds

In [None]:
# persist the scaler statistics to disk as a netCDF file
scaler_ds.to_netcdf(path=scaler_path)