In [1]:
import h5py
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import os
import datetime
import time

In [2]:
import sys
sys.path.append('/work/ka1176/caroline/gitlab/2020-03-gfz-remote-sensing/')
from gfz_202003.preprocessing import preprocess as prep

In [3]:
np.random.seed(1619)

In [4]:
import xarray as xr
xr.__version__

import netCDF4 as nc

In [5]:
raw_data_dir = '/work/ka1176/shared_data/2020-03/raw_data/2020/212/'
raw_data_files = [os.path.join(raw_data_dir, ff) for ff in os.listdir(raw_data_dir) if ff.endswith('.nc')]

print(raw_data_files)

['/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg01.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg03.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg02.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg07.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg08.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg06.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg04.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc', '/work/ka1176/shared_data/2020-03/raw_data/2020/212/cyg05.ddmi.s20200730-000000-e20200730-235959.l1.power-brcs.a30.d31.nc']


In [6]:
ds = xr.open_mfdataset(raw_data_files[1])

In [7]:
batch_size = 128

In [8]:
n_samples = len(ds.brcs)

In [9]:
batches = n_samples // batch_size
print(batches)

1350


In [10]:
%%time
for i in range(batches):
    X = ds.brcs[i*batch_size:(i+1)*batch_size]
    y = ds.ERA5_u10[i*batch_size:(i+1)*batch_size]

CPU times: user 4.68 s, sys: 2 ms, total: 4.69 s
Wall time: 4.68 s


In [11]:
batch_ix = np.arange(n_samples)
np.random.shuffle(batch_ix)

In [12]:
%%time
for i in range(batches):
    X = ds.brcs[batch_ix[i*batch_size:(i+1)*batch_size]]
    y = ds.ERA5_u10[batch_ix[i*batch_size:(i+1)*batch_size]]

CPU times: user 6.22 s, sys: 5 ms, total: 6.23 s
Wall time: 6.22 s


In [13]:
source_file = h5py.File('/work/ka1176/shared_data/2020-03/dev_data/dev_data_random_all_map/train_data.h5', 'r')
brcs = source_file['brcs']
windspeed = source_file['windspeed']

In [14]:
%%time
for i in range(batches):
    X = brcs[i*batch_size:(i+1)*batch_size]
    y = windspeed[i*batch_size:(i+1)*batch_size]

CPU times: user 730 ms, sys: 115 ms, total: 845 ms
Wall time: 2.5 s


In [15]:
%%time
for i in range(batches):
    X = brcs[list(np.sort(batch_ix[i*batch_size:(i+1)*batch_size]))]
    y = windspeed[i*batch_size:(i+1)*batch_size]

CPU times: user 41.1 s, sys: 14 s, total: 55.1 s
Wall time: 55.1 s


In [16]:
# try to generate new netcdf files with a bit less complexity than the original ones

In [17]:
# full set of variables that we would like to use
# brcs
# eff_scatter
# windspeed
# sp_lat
# sp_lon
# ddm_timestamp_unix

In [18]:
source_file = h5py.File(f'/work/ka1176/shared_data/2020-03/dev_data/dev_data_random_all_map/train_data.h5', 'r')

brcs = source_file['brcs'][:]
eff_scatter = source_file['eff_scatter'][:]
sp_lat = source_file['sp_lat'][:]
sp_lon = source_file['sp_lon'][:]
ddm_timestamp_unix = source_file['ddm_timestamp_unix'][:]
ddm_timestamp_unix -= ddm_timestamp_unix[0]
windspeed = source_file['windspeed'][:]
daystamp = ((ddm_timestamp_unix) / 24 / 3600).astype(int)

In [19]:
artificial_nan_fraction = 1e-3
artifical_neg_v_fraction = 1e-4

In [24]:
for day in np.unique(daystamp):
    if day < 100:
        flag = 'train'
    elif day < 120:
        flag = 'valid'
    elif day < 140:
        flag = 'test'
    else:
        continue
    ix = daystamp==day
    dictionary = dict()
    dictionary['coords'] = dict(sample=dict(dims=('sample',), attrs={}, data=np.arange(np.sum(ix))), attrs={}, dims=dict(sample=np.sum(ix)))
    dictionary['data_vars'] = dict(#brcs=dict(dims='sample', attrs={}, data=brcs[ix]),
                                   windspeed=dict(dims=('sample',), attrs={}, data=windspeed[ix]))
    
    N = np.sum(ix)
    
    brcs_vals = brcs[ix]
    # replace BRCS by NaN randomly
    zeta = np.random.rand(N)
    brcs_vals[zeta < artificial_nan_fraction, :, :] = None
    
    windspeed_vals = windspeed[ix]
    zeta = np.random.rand(N)
    windspeed_vals[zeta < artifical_neg_v_fraction] = -1.0
    
    zeta = np.random.rand(N)
    windspeed_vals[zeta < artificial_nan_fraction] = None
    
    dd = {'coords': {'sample': {'dims': ('sample',), 
                     'attrs': {'Description': 'CyGNSS dataset for the Efficient Data Preprocessing Tutorial'}, 
                     'data': np.arange(N)},},
          'attrs': {},
          'dims': {'sample': N, 'delay': 17, 'doppler': 11},
          'data_vars': {'windspeed': {'dims': ('sample',),
                                      'attrs': {'Unit': 'meter/second', 'Source': 'ERA5', 'Description': 'Surface wind speed', 'Fill value': '-1'},
                                      'data': windspeed_vals},
                        'ddm_timestamp': {'dims': ('sample',),
                                          'attrs': {'Unit': 'second', 'Source': 'CyGNSS', 'Description': 'Sample (DDM) time stamp'},
                                          'data': ddm_timestamp_unix[ix]},
                        'brcs': {'dims': ('sample', 'delay', 'doppler',),
                                 'attrs': {'Unit': 'meter2', 'Source': 'CyGNSS', 'Description': 'Bistatic radar cross section DDM'},
                                 'data': brcs_vals},
                        #'eff_scatter': {'dims': ('sample', 'delay', 'doppler',),
                        #                'attrs': {'Unit': 'meter2', 'Source': 'CyGNSS', 'Description': 'Effective scatter area --> DDM'},
                        #                'data': eff_scatter[ix]}
                       }}
    
    
    ds = xr.Dataset.from_dict(dd)
    filename = f'/work/ka1176/shared_data/training/CyGNSS-2/{flag}/day_{day:03d}.nc'
    if os.path.exists(filename):
        os.remove(filename)
    ds.to_netcdf(filename)
    #print(ds)
    #break
    print(filename)

/work/ka1176/shared_data/training/CyGNSS-2/train/day_000.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_001.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_002.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_003.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_004.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_005.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_006.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_007.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_008.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_009.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_010.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_011.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_012.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_013.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_014.nc
/work/ka1176/shared_data/training/CyGNSS-2/train/day_015.nc
/work/ka1176/shared_data/training/CyGNSS

In [None]:
ds = xr.open_dataset(raw_data_files[-1])

In [None]:
ds