In [None]:
import h5py
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import os
import datetime
import time

In [None]:
import sys

# Make the project checkout importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
_project_root = '/work/ka1176/caroline/gitlab/2020-03-gfz-remote-sensing/'
if _project_root not in sys.path:
    sys.path.append(_project_root)
from gfz_202003.preprocessing import preprocess as prep

In [None]:
# Seed NumPy's global random state so the index shuffle further down is reproducible.
SEED = 1619
np.random.seed(SEED)

In [None]:
import xarray as xr
import netCDF4 as nc

# Only the last expression of a cell is displayed; placed in the middle of the
# cell the version string was evaluated and silently discarded.
xr.__version__

In [None]:
# Directory with the raw NetCDF files to inspect.
raw_data_dir = '/work/ka1176/shared_data/2020-03/raw_data/2020/212/'

# os.listdir returns entries in arbitrary order. Sort the paths so that the
# positional accesses used later (raw_data_files[0], raw_data_files[1]) select
# the same file on every run and on every machine.
raw_data_files = sorted(
    os.path.join(raw_data_dir, ff)
    for ff in os.listdir(raw_data_dir)
    if ff.endswith('.nc')
)

print(raw_data_files)

In [None]:
# Open the second entry of raw_data_files with xarray's multi-file opener.
# NOTE(review): open_mfdataset presumably returns lazily loaded (dask-backed)
# variables, so nothing is read from disk at this point — confirm.
ds = xr.open_mfdataset(raw_data_files[1])

In [None]:
# Number of samples per batch used by the read-timing loops in this notebook.
batch_size = 128

In [None]:
# Total sample count, taken as the length of the `brcs` variable of `ds`.
n_samples = len(ds.brcs)

In [None]:
# Count only complete batches; a trailing partial batch is not iterated over.
batches, _leftover = divmod(n_samples, batch_size)
print(batches)

In [None]:
%%time
# Benchmark: contiguous batch reads from the xarray dataset.
for i in range(batches):
    batch = slice(i * batch_size, (i + 1) * batch_size)
    # `ds` was opened with open_mfdataset, so slicing alone only builds a lazy
    # selection and the timing would not include any disk access. `.values`
    # materialises the batch as a NumPy array, which is what the h5py loops
    # in this notebook measure as well.
    X = ds.brcs[batch].values
    y = ds.ERA5_u10[batch].values

In [None]:
# Random ordering of all sample indices 0 .. n_samples-1, used for the
# shuffled-access benchmarks. permutation(n) shuffles arange(n) internally.
batch_ix = np.random.permutation(n_samples)

In [None]:
%%time
# Benchmark: randomly ordered batch reads from the xarray dataset.
for i in range(batches):
    # Compute the batch's shuffled indices once and reuse them for X and y.
    idx = batch_ix[i * batch_size:(i + 1) * batch_size]
    # `.values` forces the actual read; indexing the lazily opened dataset
    # on its own would leave the timing without any disk access.
    X = ds.brcs[idx].values
    y = ds.ERA5_u10[idx].values

In [None]:
# Open the training HDF5 file read-only and keep handles to two of its
# datasets. Only dataset objects are fetched here; no slice is read yet.
train_h5_path = '/work/ka1176/shared_data/2020-03/dev_data/dev_data_random_all_map/train_data.h5'
source_file = h5py.File(train_h5_path, mode='r')
brcs, windspeed = source_file['brcs'], source_file['windspeed']

In [None]:
%%time
# Benchmark: contiguous batch reads directly from the HDF5 datasets.
for batch_no in range(batches):
    start = batch_no * batch_size
    stop = start + batch_size
    X, y = brcs[start:stop], windspeed[start:stop]

In [None]:
%%time
# Benchmark: randomly selected batch reads directly from the HDF5 datasets.
# NOTE(review): batch_ix is built from n_samples of the raw xarray dataset;
# confirm it does not exceed the length of these HDF5 datasets.
for i in range(batches):
    # h5py fancy indexing requires an increasing index list, hence the sort.
    idx = list(np.sort(batch_ix[i*batch_size:(i+1)*batch_size]))
    X = brcs[idx]
    # Read the targets with the same indices as X. The previous contiguous
    # slice paired every X row with an unrelated target and timed a different
    # access pattern for y than for X.
    y = windspeed[idx]

In [None]:
# Try to generate new NetCDF files that are somewhat less complex than the original ones.

In [None]:
# full set of variables that we would like to use
# brcs
# eff_scatter
# windspeed
# sp_lat
# sp_lon
# ddm_timestamp_unix

In [None]:
# Data split to export; selects the `{flag}_data.h5` input file.
flag = 'valid'

# Read every variable completely into memory (`[:]`) and close the file right
# away; the arrays stay usable after the handle is released.
with h5py.File(f'/work/ka1176/shared_data/2020-03/dev_data/dev_data_random_all_map/{flag}_data.h5', 'r') as source_file:
    brcs = source_file['brcs'][:]
    eff_scatter = source_file['eff_scatter'][:]
    sp_lat = source_file['sp_lat'][:]
    sp_lon = source_file['sp_lon'][:]
    ddm_timestamp_unix = source_file['ddm_timestamp_unix'][:]
    windspeed = source_file['windspeed'][:]

# Day index of every sample, counted in 24 h steps from the earliest timestamp.
# The minimum is used as reference instead of element [0], and floor division
# instead of truncation: with unsorted timestamps the old form produced
# negative offsets that were truncated towards zero, merging two different
# days into day 0 (or yielding negative day numbers).
seconds_per_day = 24 * 3600
daystamp = ((ddm_timestamp_unix - ddm_timestamp_unix.min()) // seconds_per_day).astype(int)

In [None]:
# Export one NetCDF file per day index, holding that day's samples of
# windspeed, ddm_timestamp_unix, brcs and eff_scatter.
for day in np.unique(daystamp):
    ix = daystamp == day          # boolean mask selecting this day's samples
    N = int(np.sum(ix))           # number of samples on this day

    # Dictionary layout understood by xr.Dataset.from_dict.
    dd = {'coords': {'sample': {'dims': ('sample',),
                                'attrs': {},
                                'data': np.arange(N)},},
          'attrs': {},
          # Delay/doppler sizes are read from the data instead of hard-coded.
          'dims': {'sample': N, 'delay': brcs.shape[1], 'doppler': brcs.shape[2]},
          'data_vars': {'windspeed': {'dims': ('sample',),
                                      'attrs': {'Units': 'm/s', 'Source': 'ERA5', 'Description': 'Surface wind speed'},
                                      'data': windspeed[ix]},
                        'ddm_timestamp_unix': {'dims': ('sample',),
                                               'attrs': {'Units': 's', 'Source': 'CyGNSS', 'Description': 'Sample (DDM) time stamp'},
                                               'data': ddm_timestamp_unix[ix]},
                        'brcs': {'dims': ('sample', 'delay', 'doppler',),
                                 'attrs': {'Units': 'm*m', 'Source': 'CyGNSS', 'Description': 'Bistatic radar cross section DDM'},
                                 'data': brcs[ix]},
                        'eff_scatter': {'dims': ('sample', 'delay', 'doppler',),
                                        'attrs': {'Units': 'm*m', 'Source': 'CyGNSS', 'Description': 'Effective scatter area --> DDM'},
                                        'data': eff_scatter[ix]}
                       }}

    ds = xr.Dataset.from_dict(dd)
    filename = f'/work/ka1176/shared_data/training/CyGNSS-2/{flag}/day_{day:03d}.nc'
    # Create the target directory on first use so the write cannot fail on a
    # missing folder.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Remove a file left over from an earlier run before writing the new one.
    if os.path.exists(filename):
        os.remove(filename)
    ds.to_netcdf(filename)
    print(filename)

In [None]:
# Re-open the file written in the last iteration of the export loop
# (`filename` still holds its path) and display it as a sanity check.
check = xr.open_dataset(filename)
check

In [None]:
# Open the first raw file and print its summary — presumably to compare the
# original file structure with the exported files.
raw_ds = xr.open_dataset(raw_data_files[0])
print(raw_ds)

In [None]:
# NOTE: `ds` was rebound inside the export loop, so this shows the last
# per-day Dataset that was written, not the raw dataset opened with
# open_mfdataset near the top of the notebook.
ds