In [None]:
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

In [None]:
# Load the ERA5 dataset fully into memory
ds_era5 = xr.load_dataset('data/perdigao_era5_2020.nc')

# Derive the horizontal wind speed magnitude from the u100/v100 components
u100, v100 = ds_era5['u100'], ds_era5['v100']
vel100 = np.sqrt(u100 ** 2 + v100 ** 2)
vel100.attrs = {'long_name': '100 meter horizontal wind speed', 'units': 'm/s'}
ds_era5['vel100'] = vel100

# Bare last expression: rich display of the dataset
ds_era5

In [None]:
# Raw (pre-cleaning) low-resolution dataset, loaded eagerly into memory
low_res_pre_path = 'data/perdigao_low_res_1H_2020.nc'
ds_low_res_pre = xr.load_dataset(low_res_pre_path)
ds_low_res_pre

In [None]:
# Raw (pre-cleaning) high-resolution dataset, loaded eagerly into memory
high_res_pre_path = 'data/perdigao_high_res_1H_2020.nc'
ds_high_res_pre = xr.load_dataset(high_res_pre_path)
ds_high_res_pre

In [None]:
# Report, for every data variable of every loaded dataset, whether it
# contains at least one NaN.
datasets_to_check = (
    ("ds_era5 ", ds_era5),
    ("ds_low_res_pre ", ds_low_res_pre),
    ("ds_high_res_pre ", ds_high_res_pre),
)
for label, dataset in datasets_to_check:
    for var_name in dataset.data_vars:
        has_nan = np.isnan(dataset[var_name]).any().to_numpy()
        print(label, var_name, has_nan)

In [None]:
# Check the pattern where NaN shows up.
# Expected pattern: for a given time, NaN appears on all sites for all variables.


def _nan_time_indices(ds, label, skip=("absolute_height",)):
    """Collect, per data variable, the leading-axis indices that contain NaN.

    For every data variable of ``ds`` (except those named in ``skip``) this
    finds the sorted, unique indices along axis 0 where at least one value is
    NaN. It prints the total NaN count and the number of affected indices,
    warns when a slice is only partially NaN (which contradicts the expected
    pattern), and finally asserts that all variables share exactly the same
    set of affected indices.

    Axis 0 is assumed to be the time axis, because the indices are later
    passed to ``drop_isel(time=...)`` -- TODO confirm for every variable.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose data variables are inspected.
    label : str
        Name used in the progress and error messages.
    skip : tuple of str
        Variable names excluded from the check.

    Returns
    -------
    list of numpy.ndarray
        One sorted index array per inspected variable.

    Raises
    ------
    AssertionError
        If two inspected variables have different NaN indices.
    """
    idx_per_var = []
    for var_name in ds.data_vars:
        if var_name in skip:
            continue
        print("Doing ", var_name, f" for {label}")
        nan_mask = np.isnan(ds[var_name].to_numpy())

        # Collapse every axis except the leading one
        trailing_axes = tuple(range(1, nan_mask.ndim))
        any_nan = nan_mask.any(axis=trailing_axes)
        all_nan = nan_mask.all(axis=trailing_axes)

        nan_t_idx = np.flatnonzero(any_nan)
        print((int(np.count_nonzero(nan_mask)),), nan_t_idx.shape)

        # The expected pattern says an affected slice is NaN everywhere;
        # report (without aborting) any slice that is only partially NaN.
        n_partial = int(np.count_nonzero(any_nan & ~all_nan))
        if n_partial:
            print(f"WARNING: {var_name} in {label} has {n_partial} "
                  "time slices that are only partially NaN")

        idx_per_var.append(nan_t_idx)

    # All variables must be missing at exactly the same indices
    for first, second in zip(idx_per_var, idx_per_var[1:]):
        assert np.array_equal(first, second), (
            f"NaN time indices differ between variables of {label}"
        )
    return idx_per_var


low_res_nan_t_idx_list = _nan_time_indices(ds_low_res_pre, "ds_low_res_pre")
high_res_nan_t_idx_list = _nan_time_indices(ds_high_res_pre, "ds_high_res_pre")

In [None]:
# Data preprocessing
# Time slices that contain NaN must be removed before the data is used for
# deep learning.

# All variables share the same NaN indices, so the first entry of each list
# is representative of its dataset.
low_nan_t_idx = low_res_nan_t_idx_list[0]
high_nan_t_idx = high_res_nan_t_idx_list[0]
print("Low, ", low_nan_t_idx.shape, low_nan_t_idx)
print("High, ", high_nan_t_idx.shape, high_nan_t_idx)

# Union of the indices affected in either dataset
nan_t_idx_union = np.union1d(low_nan_t_idx, high_nan_t_idx)
print("Union, ", nan_t_idx_union.shape, nan_t_idx_union)

In [None]:
# Drop the affected time slices from the low-resolution dataset.
# The dataset is printed before and after so the change is visible.

print(ds_low_res_pre)
ds_low_res = ds_low_res_pre.drop_isel({"time": nan_t_idx_union})
print(ds_low_res)

In [None]:
# Drop the same time slices from the high-resolution dataset.
# The dataset is printed before and after so the change is visible.

print(ds_high_res_pre)
ds_high_res = ds_high_res_pre.drop_isel({"time": nan_t_idx_union})
print(ds_high_res)

In [None]:
# Re-run the NaN report on the cleaned datasets; every line is expected to
# end with False.

cleaned_datasets = (
    ("ds_low_res ", ds_low_res),
    ("ds_high_res ", ds_high_res),
)
for label, dataset in cleaned_datasets:
    for var_name in dataset.data_vars:
        has_nan = np.isnan(dataset[var_name]).any().to_numpy()
        print(label, var_name, has_nan)

In [None]:
# Finally save the cleaned datasets to disk

# to_netcdf does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
Path('processed_data').mkdir(parents=True, exist_ok=True)

ds_low_res.to_netcdf('processed_data/perdigao_low_res_1H_2020.nc')
ds_high_res.to_netcdf('processed_data/perdigao_high_res_1H_2020.nc')

In [None]:
# Round-trip check: reload the files just written and compare them with the
# in-memory datasets (prints True when they match).

ds_low_res_reopen = xr.load_dataset('processed_data/perdigao_low_res_1H_2020.nc')
ds_high_res_reopen = xr.load_dataset('processed_data/perdigao_high_res_1H_2020.nc')

round_trip_pairs = (
    (ds_low_res, ds_low_res_reopen),
    (ds_high_res, ds_high_res_reopen),
)
for original, reopened in round_trip_pairs:
    print(original.equals(reopened))

# Compute the statistics of low and high resolution data for GAN

In [None]:
# Global mean (over every dimension) of each wind component, per resolution
mean_lr_u, mean_lr_v = (ds_low_res[comp].mean().to_numpy() for comp in ("u", "v"))
mean_hr_u, mean_hr_v = (ds_high_res[comp].mean().to_numpy() for comp in ("u", "v"))

In [None]:
# Show the means in the order: low-res u, low-res v, high-res u, high-res v
for mean_value in (mean_lr_u, mean_lr_v, mean_hr_u, mean_hr_v):
    print(mean_value)

In [None]:
# Global standard deviation (over every dimension) of each wind component
stddev_lr_u, stddev_lr_v = (ds_low_res[comp].std().to_numpy() for comp in ("u", "v"))
stddev_hr_u, stddev_hr_v = (ds_high_res[comp].std().to_numpy() for comp in ("u", "v"))

In [None]:
# Show the standard deviations in the order: low-res u, low-res v,
# high-res u, high-res v
for stddev_value in (stddev_lr_u, stddev_lr_v, stddev_hr_u, stddev_hr_v):
    print(stddev_value)

# Standardize the dataset for GAN

In [None]:
# Standardize each high-resolution component: subtract its global mean and
# divide by its global standard deviation.
da_high_res_std_u = (ds_high_res["u"] - mean_hr_u) / stddev_hr_u
da_high_res_std_v = (ds_high_res["v"] - mean_hr_v) / stddev_hr_v

# Sanity check: print the means (u, v) and then the standard deviations (u, v)
# of the standardized fields.
for reduce in (xr.DataArray.mean, xr.DataArray.std):
    for standardized in (da_high_res_std_u, da_high_res_std_v):
        print(reduce(standardized).to_numpy())

In [None]:
# Build a channel-last array holding the standardized u and v components:
# stack them on a new leading axis, then move that axis to the end.
#
# NOTE(review): an earlier note here said that stacking with axis=-1 made the
# std/mean check "incorrect". The likely cause (to confirm -- it depends on
# the data being stored in low precision such as float32) is that NumPy only
# applies its more accurate pairwise summation along the fast memory axis, so
# reducing over the leading axes of a channel-last contiguous array
# accumulates rounding error. The checks below therefore accumulate in
# float64, which makes them independent of the memory layout.
np_hr_std_pre = np.stack((da_high_res_std_u.to_numpy(), da_high_res_std_v.to_numpy()), axis=0)
print(np_hr_std_pre.shape)

np_hr_std = np_hr_std_pre.transpose(1,2,3,0)
print(np_hr_std.shape)

# Per-channel statistics over all remaining axes
print(np_hr_std.std(axis=(0,1,2), dtype=np.float64))
print(np_hr_std.mean(axis=(0,1,2), dtype=np.float64))

In [None]:
# Standardize each low-resolution component: subtract its global mean and
# divide by its global standard deviation.
da_low_res_std_u = (ds_low_res["u"] - mean_lr_u) / stddev_lr_u
da_low_res_std_v = (ds_low_res["v"] - mean_lr_v) / stddev_lr_v

# Sanity check: print the means (u, v) and then the standard deviations (u, v)
# of the standardized fields.
for reduce in (xr.DataArray.mean, xr.DataArray.std):
    for standardized in (da_low_res_std_u, da_low_res_std_v):
        print(reduce(standardized).to_numpy())

In [None]:
# Build a channel-last array holding the standardized u and v components:
# stack them on a new leading axis, then move that axis to the end.
#
# NOTE(review): an earlier note here said that stacking with axis=-1 made the
# std/mean check "incorrect". The likely cause (to confirm -- it depends on
# the data being stored in low precision such as float32) is that NumPy only
# applies its more accurate pairwise summation along the fast memory axis, so
# reducing over the leading axes of a channel-last contiguous array
# accumulates rounding error. The checks below therefore accumulate in
# float64, which makes them independent of the memory layout.
np_lr_std_pre = np.stack((da_low_res_std_u.to_numpy(), da_low_res_std_v.to_numpy()), axis=0)
print(np_lr_std_pre.shape)

np_lr_std = np_lr_std_pre.transpose(1,2,3,0)
print(np_lr_std.shape)

# Per-channel statistics over all remaining axes
print(np_lr_std.std(axis=(0,1,2), dtype=np.float64))
print(np_lr_std.mean(axis=(0,1,2), dtype=np.float64))

In [None]:
# Pack the per-component statistics as [u, v] arrays so they can be stored
# next to the standardized data and used to undo the standardization.
lr_mean, hr_mean = (
    np.array([mean_lr_u, mean_lr_v]),
    np.array([mean_hr_u, mean_hr_v]),
)
lr_stddev, hr_stddev = (
    np.array([stddev_lr_u, stddev_lr_v]),
    np.array([stddev_hr_u, stddev_hr_v]),
)

In [None]:
# Write the standardized arrays and their statistics to a single HDF5 file.
# The dict preserves the dataset creation order.
gan_arrays = {
    "np_lr": np_lr_std,
    "np_hr": np_hr_std,
    "np_lr_mean": lr_mean,
    "np_hr_mean": hr_mean,
    "np_lr_stddev": lr_stddev,
    "np_hr_stddev": hr_stddev,
}
with h5py.File('processed_data/np_gan_standard.h5', 'w') as hf:
    for dataset_name, array in gan_arrays.items():
        hf.create_dataset(dataset_name, data=array)

In [None]:
# Read everything back from the HDF5 file; the unpacking consumes each
# generator while the file is still open, so all arrays are in memory
# before the file is closed.
with h5py.File('processed_data/np_gan_standard.h5', 'r') as hf:
    data_lr, data_lr_mean, data_lr_stddev = (
        hf[key][:] for key in ('np_lr', 'np_lr_mean', 'np_lr_stddev')
    )
    data_hr, data_hr_mean, data_hr_stddev = (
        hf[key][:] for key in ('np_hr', 'np_hr_mean', 'np_hr_stddev')
    )

In [None]:
# The low-resolution array read back from HDF5 should equal the one written
lr_roundtrip_ok = np.array_equal(np_lr_std, data_lr)
print(lr_roundtrip_ok)

In [None]:
# The high-resolution array read back from HDF5 should equal the one written
hr_roundtrip_ok = np.array_equal(np_hr_std, data_hr)
print(hr_roundtrip_ok)

In [None]:
# Undo the standardization with the stored statistics; the trailing channel
# axis (u, v) broadcasts against the length-2 mean/stddev arrays.
np_lr_reopen = data_lr * data_lr_stddev + data_lr_mean

# Largest absolute reconstruction error per component. The absolute value
# matters: a plain max of the signed difference would hide a large negative
# error.
print(np.max(np.abs(np_lr_reopen[:,:,:,0] - ds_low_res["u"].to_numpy())))
print(np.max(np.abs(np_lr_reopen[:,:,:,1] - ds_low_res["v"].to_numpy())))

In [None]:
# Undo the standardization with the stored statistics; the trailing channel
# axis (u, v) broadcasts against the length-2 mean/stddev arrays.
np_hr_reopen = data_hr * data_hr_stddev + data_hr_mean

# Largest absolute reconstruction error per component. The absolute value
# matters: a plain max of the signed difference would hide a large negative
# error.
print(np.max(np.abs(np_hr_reopen[:,:,:,0] - ds_high_res["u"].to_numpy())))
print(np.max(np.abs(np_hr_reopen[:,:,:,1] - ds_high_res["v"].to_numpy())))