In [1]:
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Load the raw hourly high-resolution dataset from its NetCDF file.
high_res_raw_path = 'data/perdigao_high_res_1H_2020.nc'
ds_high_res_pre = xr.load_dataset(high_res_raw_path)
# Bare trailing expression: the notebook renders the dataset summary.
ds_high_res_pre

In [3]:
# Report, for each data variable of the raw dataset, whether it holds any NaN.
for var_name, data_array in ds_high_res_pre.data_vars.items():
    has_nan = np.isnan(data_array).any().to_numpy()
    print("ds_high_res_pre ", var_name, has_nan)

ds_high_res_pre  absolute_height True
ds_high_res_pre  std True
ds_high_res_pre  temp True
ds_high_res_pre  u True
ds_high_res_pre  v True
ds_high_res_pre  vel True


In [4]:
# Check the pattern in which NaN shows up.
# Expected pattern: whenever a timestep contains NaN, it is NaN at every grid
# point, and the affected timesteps are the same for every checked variable.
# (absolute_height is skipped by this check.)

high_res_nan_t_idx_list = []
for var_name in ds_high_res_pre.data_vars:
    if var_name == "absolute_height":
        continue
    print("Doing ", var_name, " for ds_high_res_pre")
    nan_mask = np.isnan(ds_high_res_pre[var_name].to_numpy())
    # Axis 0 is treated as the time axis, so the first component of nonzero()
    # is the time position of every individual NaN cell.
    nan_cell_t_idx = np.nonzero(nan_mask)[0]
    nan_t_idx = np.unique(nan_cell_t_idx)
    print(nan_cell_t_idx.shape, nan_t_idx.shape)
    # Verify the "all sites" half of the expected pattern: the total NaN count
    # must equal (number of NaN timesteps) x (grid cells per timestep).
    # Otherwise some timestep is only partially NaN and dropping it would
    # silently discard valid data.
    cells_per_timestep = int(np.prod(nan_mask.shape[1:]))
    assert nan_cell_t_idx.size == nan_t_idx.size * cells_per_timestep, (
        f"{var_name}: some timesteps are only partially NaN"
    )
    high_res_nan_t_idx_list.append(nan_t_idx)

# The set of NaN timesteps must be identical across all checked variables.
for idx_a, idx_b in zip(high_res_nan_t_idx_list, high_res_nan_t_idx_list[1:]):
    assert np.array_equal(idx_a, idx_b)

Doing  std  for ds_high_res_pre
(9363456,) (254,)
Doing  temp  for ds_high_res_pre
(9363456,) (254,)
Doing  u  for ds_high_res_pre
(9363456,) (254,)
Doing  v  for ds_high_res_pre
(9363456,) (254,)
Doing  vel  for ds_high_res_pre
(9363456,) (254,)


In [5]:
# Data preprocessing
# The timesteps that contain only NaN are going to be removed; the original
# author notes this is critical for DL. Show how many there are and which
# time positions they occupy.

nan_time_positions = high_res_nan_t_idx_list[0]
print(nan_time_positions.shape, nan_time_positions)

(254,) [1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621
 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1944 1945 1946 1947
 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
 1962 1963 1964 1965 1966 1967 4165 4166 4167 4168 4169 4170 4171 4172
 4173 4174 4175 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199
 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4476 4477 4478
 4479 4480 4481 4482 4483 4484 4485 4486 4487 4500 4501 4502 4503 4504
 4505 4506 4507 4508 4509 4510 4511 4525 4526 4527 4528 4529 4530 4531
 4532 4533 4534 4535 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678
 4679 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4740 4741
 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4764 4765 4766 4767
 4768 4769 4770 4771 4772 4773 4774 4775 4788 4789 4790 4791 4792 4793
 4794 4795 4796 4797 4798 4799 4813 4814 4815 4816 4817 4818 4819 4820
 4821 4822 4823 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847

In [6]:
# Then drop the NaN timesteps from the dataset

print(ds_high_res_pre)
# drop_isel expects integer positions along a single dimension. The
# per-variable index arrays were asserted to be equal when they were
# collected, so one of them is passed rather than the whole list of arrays
# (which only works by accident when the arrays stack into a 2-D index).
nan_t_idx_to_drop = high_res_nan_t_idx_list[0]
ds_high_res = ds_high_res_pre.drop_isel(time=nan_t_idx_to_drop)
# The indices are unique, so exactly that many timesteps must have disappeared.
assert ds_high_res.sizes["time"] == ds_high_res_pre.sizes["time"] - nan_t_idx_to_drop.size
print(ds_high_res)

<xarray.Dataset>
Dimensions:          (time: 8784, yf: 192, xf: 192)
Coordinates:
  * time             (time) datetime64[ns] 2020-01-01 ... 2020-12-31T23:00:00
    height           float32 100.0
  * xf               (xf) float64 7.72e+03 7.8e+03 ... 2.292e+04 2.3e+04
  * yf               (yf) float64 7.72e+03 7.8e+03 ... 2.292e+04 2.3e+04
Data variables:
    absolute_height  (time, yf, xf) float32 255.8 258.2 256.9 ... 318.7 300.5
    std              (time, yf, xf) float32 0.07866 0.08326 ... 0.9995 1.074
    temp             (time, yf, xf) float32 284.3 284.3 284.3 ... 278.7 278.8
    u                (time, yf, xf) float32 -2.474 -2.472 -2.497 ... 6.397 6.251
    v                (time, yf, xf) float32 0.4507 0.4227 ... -3.267 -3.64
    vel              (time, yf, xf) float32 2.516 2.509 2.529 ... 7.224 7.292
Attributes:
    site:         Perdigao, Portugal
    description:  80m x 80m x 20m LES simulation
    copyright:    GE Renewable Energy
<xarray.Dataset>
Dimensions:          (t

In [7]:
# Confirm that no data variable of the cleaned dataset contains NaN any more.

for var_name, data_array in ds_high_res.data_vars.items():
    still_has_nan = np.isnan(data_array).any().to_numpy()
    print("ds_high_res ", var_name, still_has_nan)

ds_high_res  absolute_height False
ds_high_res  std False
ds_high_res  temp False
ds_high_res  u False
ds_high_res  v False
ds_high_res  vel False


In [9]:
# Finally save the cleaned dataset to disk

processed_nc_path = Path('processed_data_ae/perdigao_high_res_1H_2020.nc')
# Create the output directory if needed so a fresh "Restart & Run All" does
# not fail on a missing folder.
processed_nc_path.parent.mkdir(parents=True, exist_ok=True)
ds_high_res.to_netcdf(processed_nc_path)

In [10]:
# Round-trip check: reload the file that was just written and compare it with
# the in-memory dataset.

processed_nc_file = 'processed_data_ae/perdigao_high_res_1H_2020.nc'
ds_high_res_reopen = xr.load_dataset(processed_nc_file)
roundtrip_matches = ds_high_res.equals(ds_high_res_reopen)
print(roundtrip_matches)

True


In [11]:
# Combine the u and v fields into one numpy array with the component axis last.

da_u, da_v = ds_high_res["u"], ds_high_res["v"]

# NOTE(review): the original author observed that stacking directly with
# axis=-1 led to incorrect std/mean results. The cause is not established
# here, so the components are stacked on a leading axis first and the axis is
# moved afterwards, exactly as before.
np_data_pre = np.stack([da_u.to_numpy(), da_v.to_numpy()], axis=0)
print(np_data_pre.shape)

# Move the leading component axis to the end; same axis order as transpose(1, 2, 3, 0).
np_data = np.moveaxis(np_data_pre, 0, -1)
print(np_data.shape)

(2, 8530, 192, 192)
(8530, 192, 192, 2)


In [12]:
# Write the combined array to an HDF5 file as the dataset "np_data".

h5_output_path = 'processed_data_ae/np_data.h5'
with h5py.File(h5_output_path, mode='w') as h5_out:
    h5_out.create_dataset("np_data", data=np_data)

In [13]:
# Test that the HDF5 file can be reopened and that its content is correct

with h5py.File('processed_data_ae/np_data.h5', 'r') as hf:
    # [:] reads the whole dataset into a numpy array before the file closes.
    data_reopen = hf['np_data'][:]

print(data_reopen.shape)
print(np.array_equal(np_data, data_reopen))
# Compare each component with its source field using the largest ABSOLUTE
# deviation. The max of a signed difference would report 0.0 even when the
# reloaded values are smaller than the originals somewhere.
print(np.max(np.abs(data_reopen[..., 0] - ds_high_res["u"].to_numpy())))
print(np.max(np.abs(data_reopen[..., 1] - ds_high_res["v"].to_numpy())))

(8530, 192, 192, 2)
True
0.0
0.0
