In [None]:
import os
import time

In [None]:
import h5py
import numpy as np
import xarray
from torch.utils.data import DataLoader, Dataset

## Preprocessing CyGNSS data

### Raw data

#### xarray

- TODO: task with xarray.open_dataset
- TODO: task with xarray.open_mfdataset

In [None]:
# Directory that holds the CyGNSS training files
data_dir = '/work/ka1176/shared_data/training/CyGNSS-2/train/'
# os.listdir returns the entries in arbitrary order. Sort the paths so that
# indexing (all_train_files[2]) and slicing (all_train_files[:10]) in the
# following cells select the same files on every run.
all_train_files = sorted(os.path.join(data_dir, ff) for ff in os.listdir(data_dir))

print(len(all_train_files))

In [None]:
ds = xarray.open_dataset(all_train_files[2])

In [None]:
ds # look at the dataset; the Jupyter notebook offers an interactive explorer --> click on the descriptions etc.
# Questions: How many samples are in the dataset?
# Can you see which type of data is in the dataset?

In [None]:
# Task: Extracting values from a dataset
ds.brcs

In [None]:
%%time
# Task: actually loading the dataset
# Accessing .values converts the brcs variable into an in-memory array;
# the %%time magic at the top of this cell reports how long that takes.
x = ds.brcs.values
print(x.shape)

In [None]:
# Task: open several files and combine them in one dataset

In [None]:
%%time
# Open the first ten training files as one dataset, concatenated along the
# 'sample' dimension. Note: this rebinds `ds`, replacing the single-file dataset.
ds = xarray.open_mfdataset(all_train_files[:10], combine='nested', concat_dim='sample')

In [None]:
ds # how many samples are there now?

In [None]:
# Task: demonstrate a dataloader here (one epoch - loop all training samples)
# Demonstrate that the order actually matters with lazy loading :-)
# Introduce the %%time magic function
# Introduce the alternative: start_time = time.time() --> time.time() - start_time

In [None]:
# Total number of samples: length of ds.sample in the combined dataset
N_samples = len(ds.sample)
# Mini-batch size; the batch-loading cells below read this notebook-level name
batch_size = 128
# Floor division: samples left over after the last full batch are not counted
n_batches = N_samples // batch_size # integer division
print(n_batches)

In [None]:
# Demonstrate one batch
# Timing by hand: take a timestamp before and after the work and print the difference.
start_time = time.time()
# Slice first, then call .values: only the first batch_size samples are
# converted to in-memory arrays.
X = ds.brcs[:batch_size].values
y = ds.windspeed[:batch_size].values
end_time = time.time()
print(f'Execution took {end_time - start_time:.2f} seconds')

In [None]:
%%time
# Demonstrate one batch using the cell magic function
# Here .values is called BEFORE slicing: the complete variable is converted to
# an in-memory array and only afterwards cut down to the first batch_size samples.
X = ds.brcs.values[:batch_size]
y = ds.windspeed.values[:batch_size]

In [None]:
# Question: which method is faster??

In [None]:
%%time
# One "epoch": read every full mini-batch once, in order. Batch i covers the
# samples [i*batch_size, (i+1)*batch_size); each batch is sliced before .values
# is called, so only that batch is converted to arrays per iteration.
for i in range(n_batches):
    X = ds.brcs[i*batch_size:(i+1)*batch_size].values
    y = ds.windspeed[i*batch_size:(i+1)*batch_size].values

Task: Estimate how long one epoch would take (--> reading directly from netCDF files is inefficient)

#### save as hdf5

TODO: adapt the function xarray-->hdf5 from cygnss git

Task:
    - save the datasets as {train,valid,test}_data.h5 by executing the provided function
    - Again, try to form a minibatch by randomly reading from these files. Time the execution. Is there a speedup? (--> it is good practice to save the data in a format that suits your later application)

### PyTorch Dataset

In [None]:
class CyGNSSDataset(Dataset):
    '''In-memory PyTorch dataset built from one CyGNSS HDF5 file.

    The constructor reads the labels and all requested input maps from
    ``<flag>_data.h5``, standard-scales the input maps and keeps the resulting
    arrays in ``self.X`` / ``self.y``. The HDF5 file is closed again once the
    data has been copied, so no file handle is kept on the instance.
    '''

    def __init__(self, flag, input_v_map=['brcs'], normalization_values=None, filter_quality=False):
        '''
        Load data and apply transforms during setup

        Parameters:
        -----------
        flag : string
            Any of train / valid / test. Defines dataset.
        input_v_map : list
            Input maps, choice of ['brcs', 'eff_scatter']
        normalization_values : dict
            Mean and standard deviation, needed for scaling the input variables.
            Ignored for flag='train' (the values are computed from the data);
            required for every other flag, with keys '<v_map>_mean' and
            '<v_map>_std' for each entry of input_v_map.
        filter_quality : bool
            Filter samples that are flagged as bad quality (default: False)
        -----------
        Raises:
        -----------
        ValueError
            If flag is not 'train' and no normalization_values are provided.
        -----------
        Returns: dataset
        '''
        is_train = flag == 'train'
        if not is_train and normalization_values is None:
            # Fail early with a clear message instead of a TypeError on None below
            raise ValueError(
                f"normalization_values is required for flag='{flag}'; "
                "pass the normalization_values of the train dataset"
            )

        # normalize main input data
        # Save normalization values together with the trained model
        # For inference load the normalization values
        if is_train:  # determine normalization values
            self.normalization_values = dict()
        else:
            self.normalization_values = normalization_values

        h5_path = os.path.join('/work/ka1176/shared_data/CyGNSS/', flag + '_data.h5')

        # Everything is loaded into memory, so the file only needs to be open
        # while reading; the context manager closes it even if loading fails.
        with h5py.File(h5_path, 'r', rdcc_nbytes=0) as h5_file:  # disable cache
            # load labels
            self.y = h5_file['windspeed'][:].astype(np.float32)

            # stack map vars (2D vars)
            scaled_maps = []
            for v_map in input_v_map:
                X_v_map = h5_file[v_map][:].astype(np.float32)

                if is_train:
                    X_v_map_scaled, X_mean, X_std = self._standard_scale(X_v_map)
                    self.normalization_values[f'{v_map}_mean'] = X_mean
                    self.normalization_values[f'{v_map}_std']  = X_std
                else:
                    X_mean = self.normalization_values[f'{v_map}_mean']
                    X_std = self.normalization_values[f'{v_map}_std']
                    X_v_map_scaled = self._standard_scale_given(X_v_map, X_mean, X_std)

                scaled_maps.append(X_v_map_scaled)  # append scaled 2D map

            # New axis 1 indexes the input maps in the order of input_v_map
            self.X = np.stack(scaled_maps, axis=1)

            if filter_quality:
                n_before = len(self.y)
                # NOTE(review): used directly as an index; this only acts as a
                # keep-mask if 'quality' is stored with boolean dtype - confirm.
                mask = h5_file['quality'][:]
                self.X, self.y = self.X[mask], self.y[mask]
                print(f'After filter_quality, {len(self.y)} samples remain ({len(self.y)/n_before*100:.1f}%)')

        print(f'load and transform {flag} input data: {self.X.shape} ({self.X.nbytes // 1e6}MB)')
        print(f'load and transform {flag} labels: {self.y.shape} ({self.y.nbytes // 1e6}MB)')

    def _standard_scale(self, v):
        '''apply standard scale and return mean / std

        Mean and standard deviation are computed over all elements of v.
        Returns the tuple (scaled array, mean, std).
        '''
        mean = np.mean(v)
        sigma = np.std(v)
        v_tilde = (v - mean) / sigma
        return v_tilde, mean, sigma

    def _standard_scale_given(self, v, mean, sigma):
        '''apply standard scale with pre-determined mean / std'''
        v_tilde = (v - mean) / sigma
        return v_tilde

    def _filter_all_data_by_mask(self, mask, flag, name=''):
        '''filter the input data by the provided mask

        Replaces self.X and self.y by their mask-indexed versions and prints
        the resulting input shape; flag and name only appear in the message.
        '''
        self.X, self.y = self.X[mask], self.y[mask]
        print(f'{flag} input data after {name} downsampling: {self.X.shape} ({self.X.nbytes // 1e6}MB)')

    def __len__(self):
        '''required function for the pytorch dataloader: number of samples'''
        return self.X.shape[0]

    def __getitem__(self, idx):
        '''required function for the pytorch dataloader: (input, label) at idx'''
        X = self.X[idx]
        y = self.y[idx]
        return (X, y)


In [None]:
def setup_dataloaders(filter_quality=False, input_v_map=['brcs'], batch_size=None):
    '''Load the datasets and create PyTorch dataloaders

    The train dataset is created first; its normalization values are passed
    on to the validation and test datasets so all splits are scaled alike.

    Input parameters:
    -------------------------
    filter_quality : apply a filter for sample quality (default: False)
    input_v_map    : list of input features (default: ['brcs'])
    batch_size     : number of samples per mini-batch. If None (default), the
                     notebook-level variable `batch_size` is used, which is the
                     value this function read implicitly before the parameter
                     existed.
    -------------------------

    Returns:
    -------------------------
    pytorch DataLoader instances for train / validation / test set
    '''

    if batch_size is None:
        # Backward-compatible fallback to the global defined in an earlier cell
        try:
            batch_size = globals()['batch_size']
        except KeyError:
            raise NameError("name 'batch_size' is not defined; pass batch_size explicitly") from None

    train_dataset = CyGNSSDataset('train', filter_quality=filter_quality, input_v_map=input_v_map)
    valid_dataset = CyGNSSDataset('valid', filter_quality=filter_quality, input_v_map=input_v_map, normalization_values=train_dataset.normalization_values)
    test_dataset = CyGNSSDataset('test', filter_quality=filter_quality, input_v_map=input_v_map, normalization_values=train_dataset.normalization_values)

    # Training batches are shuffled; validation and test keep the stored order.
    # drop_last=True discards a final incomplete batch (train and validation only).
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
    test_dataloader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    return train_dataloader, valid_dataloader, test_dataloader

Tasks:
    - iterate through samples by iterating a dataloader (python concept "yield")
    - pass an argument to the dataset to add another variable