In [None]:
import os
import time
import datetime
import numpy as np

In [None]:
import xarray
import h5py

In [None]:
from torch.utils.data import Dataset, DataLoader

## Preprocessing CyGNSS data

https://xarray.pydata.org/en/stable/index.html

### Raw data

We have raw data from the CyGNSS satellite mission that we want to use for a machine learning algorithm. In advance, we decided to use 100 days for training, and 20 days each for validation and test, and sorted the raw data NetCDF files accordingly. Below, we set up the paths to the raw data files.

**TASK**: Open a terminal connection to Mistral within Jupyterhub. Change directory to `/work/ka1176/shared_data/training/CyGNSS-2/` and look at the contents of this folder.

In [None]:
def _sorted_file_paths(data_dir):
    '''Return the full paths of all entries in `data_dir`, ordered by entry name.'''
    return [os.path.join(data_dir, entry) for entry in sorted(os.listdir(data_dir))]


# One directory per split
train_data_dir = '/work/ka1176/shared_data/training/CyGNSS-2/train/'
valid_data_dir = '/work/ka1176/shared_data/training/CyGNSS-2/valid/'
test_data_dir = '/work/ka1176/shared_data/training/CyGNSS-2/test/'

# Sorted lists of the raw data files of each split
all_train_files = _sorted_file_paths(train_data_dir)
all_valid_files = _sorted_file_paths(valid_data_dir)
all_test_files = _sorted_file_paths(test_data_dir)

print(f'Number of files for train dataset: {len(all_train_files):3d}')
print(f'Number of files for valid dataset: {len(all_valid_files):3d}')
print(f'Number of files for test dataset:  {len(all_test_files):3d}')

### Open NetCDF files

Start by opening one of the raw data files and investigate what is there.

**TASK**: Open a few different raw data files by changing `day_ix`. What kind of variables are in the NetCDF files? How many samples are in the NetCDF files? Use the interactive explorer in Jupyter notebook to answer these questions.

In [None]:
# Position of the file within the sorted list of training files; change it to inspect other files
day_ix = 3
ds = xarray.open_dataset(all_train_files[day_ix])

In [None]:
ds # As the last expression of the cell, this displays the dataset and opens an interactive explorer

### Load data from NetCDF files

xarray always uses *lazy loading* for NetCDF files. That means that data is not actually loaded into memory until we explicitly say so. We can use array operations like arithmetic operations, slicing, and subsetting, without loading the data. Only at computation time, the data has to be loaded.

**TASK**: Execute the following two cells. What is the output of each command? 

HINT: If the cell output is too long, enable scrolling in the cell context menu (right-click)

In [None]:
# Attribute-style access to the variable `brcs` of the dataset
v = ds.brcs
v

In [None]:
# `.values` of the same variable; compare the displayed output with the previous cell
x = ds.brcs.values
x

**TASK**: The variable `ddm_timestamp` contains a sample timestamp, measured in seconds (verify this by checking the *Attributes* section of the variable). Add a new variable `ddm_day` to the dataset, which stores the *day* on which the sample was recorded.

In [None]:
# Display the timestamp variable together with its attributes
ds.ddm_timestamp

In [None]:
# TASK placeholder: replace the Ellipsis (`...`) by an expression that converts the timestamp in seconds to whole days
ds['ddm_day'] = ... # calculate the day here (SOLUTION: (ds.ddm_timestamp / 24 / 3600).astype(int))

In [None]:
# Display the days computed in the previous cell as a plain array
ds.ddm_day.values

### Combine several NetCDF files

**TASK**: Use `xarray.open_mfdataset` to open all files at once and form the train / valid / test dataset. How many samples are available in each dataset? Note that `xarray` now loads the input data in *chunks*, instead of loading all at once.

In [None]:
%%time
# Open all files of a split at once and concatenate them along the `sample` dimension
#del ds_train
ds_train = xarray.open_mfdataset(all_train_files, combine='nested', concat_dim='sample')
# TASK placeholders: open all_valid_files / all_test_files in the same way
ds_valid = ...
ds_test  = ...

In [None]:
# Display the combined training dataset
ds_train

## Preprocessing

Raw data needs to be preprocessed before it can be used in a machine learning algorithm. Below, we demonstrate some typical data cleaning tasks. `xarray` implements many `numpy` functions for its `Dataset` and `DataArray`.

### Check for missing values

**TASK**: Remove the missing values from the dataset by executing the following cells. First, we create a `mask`, which is a boolean array that marks the missing (NaN) values. Then, we select only the samples that meet the condition of the boolean array. How many samples have been removed? Can you estimate the fraction of NaN values?

Check the documentation of `xarray.ufuncs` (or of the equivalent method `xarray.DataArray.isnull`, which is available in all xarray versions) and `xarray.Dataset.sel` for more details on these functions.

In [None]:
# Boolean mask along `sample`: True where the windspeed value is missing (NaN).
# `isnull()` is used instead of `xarray.ufuncs.isnan`, because the `xarray.ufuncs`
# module is not available in every xarray version.
mask = ds_train.windspeed.isnull()
mask

In [None]:
# NOTE: this overwrites ds_train; rebuild `mask` from the current ds_train before running this cell again
ds_train = ds_train.sel(sample=~mask, drop=True) # note the condition is ~mask (NOT mask): we want to keep the samples that are NOT NaN
ds_train

Repeat for the `brcs` variable. Note this variable has additional dimensions `delay` and `doppler`. We compute the maximum of the boolean mask across these dimensions: if any pixel is NaN, the sample is discarded.

In [None]:
# Per-pixel mask: True where a brcs value is missing (NaN).
# `isnull()` is used instead of `xarray.ufuncs.isnan`, because the `xarray.ufuncs`
# module is not available in every xarray version.
mask = ds_train.brcs.isnull()
# Reduce to one value per sample: the maximum of booleans is True as soon as any pixel is True
mask = mask.max(dim=['delay', 'doppler'])
mask

In [None]:
# Keep only the samples without any missing brcs pixel (~mask).
# NOTE: this overwrites ds_train; rebuild `mask` from the current ds_train before running this cell again
ds_train = ds_train.sel(sample=~mask, drop=True)
ds_train

### Check for fill values

Sometimes variables contain fill values, indicating missing raw data that is filled not with `NaN`, but with a finite numeric value. In the CyGNSS dataset, the fill value for the `windspeed` variable is `-1` (verify this by looking at the variable attributes).

**TASK**: Remove the samples that have a fill value for `windspeed`

In [None]:
# Boolean mask along `sample`: True where windspeed equals the fill value -1
mask = ds_train.windspeed==-1
mask

In [None]:
# Keep only the samples whose windspeed is not the fill value (~mask).
# NOTE: this overwrites ds_train; rebuild `mask` from the current ds_train before running this cell again
ds_train = ds_train.sel(sample=~mask, drop=True)
ds_train

### Apply normalization

Input feature normalization can be applied during preprocessing or at a later stage (directly before the samples enter the neural network). For demonstration purposes, we apply the normalization right here. We chose min / max normalization; you could apply a different normalization instead.

In [None]:
# Extrema over all samples and pixels of the training inputs
brcs_max = ds_train.brcs.max().values
brcs_min = ds_train.brcs.min().values

# Min / max scaling in two in-place steps: shift by the minimum, then divide by the value range
ds_train['brcs'] -= brcs_min
ds_train['brcs'] /= (brcs_max - brcs_min)

# brcs_max / brcs_min still hold the values from before the normalization;
# the second line recomputes the extrema of the normalized variable
print(f'Before normalization: max = {brcs_max:.2e}, min = {brcs_min:.2e}')
print(f'After normalization:  max = {ds_train.brcs.max().values:.1f}, min = {ds_train.brcs.min().values:.1f}')

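**TASK**: Apply the input feature normalization to the validation set. Use the values `brcs_min` and `brcs_max` computed on the training set, so that both datasets are scaled in the same way.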

In [None]:
# TASK placeholder: replace the Ellipsis (`...`) by the normalized `brcs` variable of the validation set
ds_valid['brcs'] = ...

## Dataset for Machine Learning

In [None]:
# TODO "reset" the ds_train if it is not already done before
# ds_train = xarray.open_mfdataset(all_train_files, combine='nested', concat_dim='sample')

### Feeding samples

A neural network processes the training data in *minibatches* (see Tutorial, Part 1). Feeding data to the neural network can be a bottleneck for training. In this section you will learn:
- How to measure the execution time of code and identify bottlenecks
- How to use efficient file formats for machine learning

We will use the `brcs` variable as the input feature and the `windspeed` variable as the target variable. A sample is a tuple `(X, y) = (ds['brcs'][i], ds['windspeed'][i])`

- One epoch of training: cycle all minibatches
- One minibatch: collect `batch_size` samples *randomly* from the dataset
- Little sidenote: the Python iterator (`yield`)

In [None]:
N_samples = len(ds_train.sample) # total number of samples
batch_size = 128 # typical value for the minibatch size
n_batches = N_samples // batch_size # integer division: a trailing incomplete minibatch is not counted

print(f'Train dataset contains {N_samples:.1e} samples')
print(f'Batch size {batch_size} ==> {n_batches} minibatches')

**TASK**: Execute the cell below to measure the execution time of loading one minibatch. Note this minibatch is not shuffled, we just load the first `batch_size` samples. How long would it take to load the data for a full epoch? How fast is random access compared to accessing the first `batch_size` samples in the previous **TASK**?

In [None]:
# Demonstrate one batch: measure how long it takes to read the first `batch_size` samples
# (inputs X and targets y) into memory.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; unlike time.time() it is not affected by adjustments of the system clock.
start_time = time.perf_counter()
X = ds_train.brcs[:batch_size].values
y = ds_train.windspeed[:batch_size].values
end_time = time.perf_counter()
print(f'Execution took {end_time - start_time:.2e} seconds')

In [None]:
# TASK placeholder: estimated time to load all minibatches of one epoch, from the time measured above
T_epoch = ... # solution: n_batches * 1.22e-2

**TASK**: Now we would like to load a shuffled minibatch. How long would it take to load the data for a full epoch this way? 

Note: We replaced the cumbersome calculation of the execution time by a *cell magic* function `%%time`, that is a nice feature of jupyter notebooks

In [None]:
%%time
# create random indices
# (replace=True: the same sample index may be drawn more than once)
random_ix = np.random.choice(len(ds_train.sample), size=batch_size, replace=True)
# `.values` is evaluated first, i.e. on the complete variable; the random samples are picked afterwards
X = ds_train.brcs.values[random_ix]
y = ds_train.windspeed.values[random_ix]

In [None]:
# TASK placeholder: estimated time to load all shuffled minibatches of one epoch, from the time measured above
T_epoch = ... # solution: n_batches * 2.84

The CyGNSS dataset is large, but it still fits comfortably in memory. Therefore, we could also load the full dataset in advance, instead of loading only the samples for one batch.

In [None]:
# 1e9 bytes = 1 GB
ds_train.nbytes / 1e9 # ds_train size in GB

**TASK**: Execute the following cell to load the full dataset in memory. Again, retrieve the time it takes to form one minibatch. How does this time compare to the previous measurements? Note: we are using the magic cell function `%%timeit`, which executes the same code several times and reports mean / std dev of execution times

In [None]:
%%time
# Read the data of all variables into memory; ds_train is not reassigned, the following cells use the same object
ds_train.load()

In [None]:
%%timeit
# Same statements as in the earlier shuffled-minibatch cell, now timed after ds_train.load()
# create random indices
random_ix = np.random.choice(len(ds_train.sample), size=batch_size, replace=True)
X = ds_train.brcs.values[random_ix]
y = ds_train.windspeed.values[random_ix]

In [None]:
# TASK placeholder: estimated time per epoch when the dataset is already in memory
T_epoch = ...

### Save dataset in hdf5 format

We are now done with preprocessing the CyGNSS data. It is good practice to separate these steps from your machine learning algorithm. This way, we avoid repeating the preprocessing every time we load training data, and the code is modularized.

**TASK**: Use the function `save_ds_to_hdf5` to save the train, valid, and test datasets.

In [None]:
def save_ds_to_hdf5(ds, h5_file_name, overwrite=True):
    '''
    Save the `brcs` and `windspeed` variables of a dataset as hdf5.

    Both variables are stored as chunked float32 datasets with Fletcher32
    checksums, and the time of writing is stored in the file attribute
    `timestamp`.

    Parameters:
    -----------
    ds : xarray.Dataset
        Needs the dimension `sample` and the variables `brcs` and
        `windspeed`, with `sample` as their first dimension.
    h5_file_name : target filename
    overwrite : if True, overwrite existing files. If False and the target
        exists, a message is printed and nothing is written.

    Raises:
    -------
    ValueError : if `ds` contains no samples
    '''
    start_time = time.perf_counter()

    if os.path.exists(h5_file_name) and not overwrite:
        print(f'Cannot overwrite {h5_file_name}')
        return

    n_samples = len(ds.sample)
    if n_samples == 0:
        # Chunked hdf5 datasets need at least one element per chunk
        raise ValueError(f'Dataset is empty, nothing to save to {h5_file_name}')

    # Materialise the data before touching the target, so that an existing
    # file is only replaced once the new content is known to be available
    brcs = ds.brcs.values
    windspeed = ds.windspeed.values

    # Per-sample shape taken from the data instead of a hard-coded (17, 11)
    sample_shape = tuple(brcs.shape[1:])
    # A chunk must not be larger than the dataset itself
    chunk_len = min(1000, n_samples)

    if os.path.exists(h5_file_name):
        print(f'Overwrite {h5_file_name}')
        os.remove(h5_file_name)

    # The context manager closes the file even if writing fails
    with h5py.File(h5_file_name, 'w') as h5_file:
        h5_file.create_dataset('brcs',
                               shape=(n_samples,) + sample_shape,
                               chunks=(chunk_len,) + sample_shape,
                               fletcher32=True,
                               dtype='float32')
        h5_file.create_dataset('windspeed',
                               shape=(n_samples,),
                               chunks=(chunk_len,),
                               fletcher32=True,
                               dtype='float32')

        h5_file['brcs'][:] = brcs
        h5_file['windspeed'][:] = windspeed
        h5_file.attrs['timestamp'] = str(datetime.datetime.now())
        h5_file.flush()

        run_time = time.perf_counter() - start_time
        print(f'{n_samples} samples appended to file '
              f'{h5_file.filename} in {run_time:.2f} seconds')

In [None]:
# The relative filename places train_data.h5 in the current working directory
save_ds_to_hdf5(ds_train, 'train_data.h5')
# TASK placeholder: replace both Ellipsis (`...`) arguments by the dataset and the target filename
save_ds_to_hdf5(..., ...) # repeat for the valid dataset

How to open an HDF5 file in Python:

In [None]:
# Open the file read-only ('r'); the handle is kept open because the next cell reads from it
ds_train_hdf5 = h5py.File('train_data.h5', 'r')
print(ds_train_hdf5)
# The keys are the names of the datasets stored in the file
print(ds_train_hdf5.keys())

How to load a variable:

In [None]:
%%time
# `[:]` reads the complete `brcs` dataset; the trailing `;` suppresses the cell output
ds_train_hdf5['brcs'][:];

## PyTorch Dataset

Finally, we will look in detail at the two classes that are needed in PyTorch for feeding data to the neural network:
* Dataset
* DataLoader

The Dataset for CyGNSS data is defined in the following cell. 

**TASK** Look at the class definition below. Generate the train and the validation dataset.

In [None]:
class CyGNSSDataset(Dataset):
    '''
    In-memory PyTorch dataset of the samples stored in `<flag>_data.h5`.

    A sample is the tuple (X, y) = (brcs[idx], windspeed[idx]).
    '''

    def __init__(self, flag):
        '''
        Load data from hdf5 file

        The complete `brcs` (input) and `windspeed` (label) datasets are
        read into memory as float32 arrays `self.X` and `self.y`. The file
        is closed again afterwards, so the dataset object holds no open file
        handle.

        Parameters:
        -----------
        flag : string
            Any of train / valid / test. Defines dataset.
        '''
        # rdcc_nbytes=0 disables the chunk cache. The context manager closes
        # the file once both arrays have been copied into memory.
        with h5py.File(flag + '_data.h5', 'r', rdcc_nbytes=0) as h5_file:
            # copy=False: do not duplicate arrays that already are float32
            self.y = h5_file['windspeed'][:].astype(np.float32, copy=False)
            self.X = h5_file['brcs'][:].astype(np.float32, copy=False)

        print(f'load {flag} input data: {self.X.shape} ({self.X.nbytes // 1e6}MB)')
        print(f'load {flag} labels: {self.y.shape} ({self.y.nbytes // 1e6}MB)')

    def __len__(self):
        '''required function for the pytorch dataloader: returns len(samples)'''
        return self.X.shape[0]

    def __getitem__(self, idx):
        '''required function for the pytorch dataloader: yields sample at idx'''
        X = self.X[idx]
        y = self.y[idx]
        return (X, y)


In [None]:
# TASK placeholders: replace the Ellipsis (`...`) by instances of the dataset class defined above
train_dataset = ...
valid_dataset = ...

The `DataLoader` takes a pytorch `Dataset` and produces minibatches. 

**TASK** Generate a `DataLoader` for the train and validation dataset each. Look up the documentation of the `DataLoader` and discuss it / write a question about it in the google doc.

In [None]:
# shuffle=True:   the samples are drawn in a new random order in every epoch
# drop_last=True: a final minibatch with fewer than batch_size samples is discarded
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# TASK placeholder: replace the Ellipsis (`...`) by the DataLoader for the validation dataset
valid_dataloader = ...

In [None]:
# Cycle through all minibatches of one epoch; nothing is done with the samples yet
for i, batch in enumerate(train_dataloader):
    X, y = batch