In [1]:
import os
import gc
import numpy as np
import xarray as xr
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn.utils as utils

2022-05-18 08:53:26.822417: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
## Preprocessing flags.
## NOTE(review): z_score and light_rain_threshold are not referenced by any other
## cell shown in this notebook -- confirm whether they are still needed.
z_score = True
light_rain_threshold = 0.2

## Read by the cell that builds `save_path`; it has to exist before that cell runs,
## otherwise a fresh-kernel "Run All" stops with a NameError.
## False selects the plain (non-'_augmented') output directory.
data_augmentation = False

In [3]:
## Period to process: 2011..2018 and the twelve zero-padded month numbers, all as strings
years = list(map(str, range(2011, 2019)))
months = ['{:02d}'.format(month) for month in range(1, 13)]

res = 'hres' #Can be hres or lres

load_path = '/p/project/deepacf/deeprain/rojascampos1/data/sauerland/original/'
save_path = '/p/scratch/deepacf/deeprain/rojascampos1/data/radar_enhancement/' + res

## Augmented data is written to its own '<res>_augmented/' directory
save_path += '_augmented/' if data_augmentation else '/'

print('Load path:', load_path)
print('Save path:', save_path)

Load path: /p/project/deepacf/deeprain/rojascampos1/data/sauerland/original/
Save path: /p/scratch/deepacf/deeprain/rojascampos1/data/radar_enhancement/hres/


In [4]:
## Accumulators: one entry per month is appended to each list, so that everything
## can later be concatenated into a single file per variable
time_list, radar_list, cosmo_list = [], [], []

In [5]:
## Iterates over years and months, loads the monthly COSMO and radar files and
## collects them in the lists.
## NOTE: the lists are appended to, so re-running this cell without re-creating
## them first duplicates every month.
for year in years:
    for month in months:

        ## Stops at the end of the period. `break` only leaves the month loop,
        ## which is sufficient because '2018' is the last entry of `years`.
        if (year == '2018' and month == '06'):
            break

        month_dir = load_path+'/'+year+'/'
        cosmo_time = np.load(month_dir+'ensemble_stats_'+year+'_'+month+'_time.npy')
        cosmo_data = np.load(month_dir+'ensemble_stats_'+year+'_'+month+'_data.npy')
        radar_time = np.load(month_dir+'radar_'+res+'_'+year+'_'+month+'_time.npy')
        radar_data = np.load(month_dir+'radar_'+res+'_'+year+'_'+month+'_data.npy')

        ## Check for temporal compatibility: keep only the timesteps present in BOTH
        ## sources. Filtering the radar side alone would leave COSMO samples without
        ## a radar counterpart and silently shift x against y after concatenation.
        radar_mask = np.isin(radar_time, cosmo_time)
        cosmo_mask = np.isin(cosmo_time, radar_time)
        radar_time = radar_time[radar_mask]
        radar_data = radar_data[radar_mask]
        cosmo_time = cosmo_time[cosmo_mask]
        cosmo_data = cosmo_data[cosmo_mask]
        print(year, month, ':', cosmo_data.shape , '--->', radar_data.shape, 'Same timepoints?', np.array_equal(cosmo_time, radar_time))

        ## Creates one big file
        time_list.append(radar_time)
        radar_list.append(radar_data)
        cosmo_list.append(cosmo_data)

2011 01 : (247, 36, 36, 143) ---> (247, 72, 72) Same timepoints? True
2011 02 : (224, 36, 36, 143) ---> (224, 72, 72) Same timepoints? True
2011 03 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2011 04 : (238, 36, 36, 143) ---> (238, 72, 72) Same timepoints? True
2011 05 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2011 06 : (239, 36, 36, 143) ---> (239, 72, 72) Same timepoints? True
2011 07 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2011 08 : (243, 36, 36, 143) ---> (243, 72, 72) Same timepoints? True
2011 09 : (240, 36, 36, 143) ---> (240, 72, 72) Same timepoints? True
2011 10 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2011 11 : (240, 36, 36, 143) ---> (240, 72, 72) Same timepoints? True
2011 12 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2012 01 : (248, 36, 36, 143) ---> (248, 72, 72) Same timepoints? True
2012 02 : (224, 36, 36, 143) ---> (224, 72, 72) Same timepoints? True
2012 03 : (248, 36, 

In [6]:
## Concatenate the monthly pieces along the first axis and write one file per
## variable: timestamps, radar fields (y) and COSMO fields (x)
np.save(f'{save_path}00_time.npy', np.concatenate(time_list, axis=0))
np.save(f'{save_path}00_y.npy', np.concatenate(radar_list, axis=0))
np.save(f'{save_path}00_x.npy', np.concatenate(cosmo_list, axis=0))

In [7]:
## The monthly pieces are on disk now; drop the lists to release their memory
del time_list, radar_list, cosmo_list

### Split the data into train, test and validation sets

In [8]:
## Read the merged '00_*' files back from the save directory
time = np.load(f'{save_path}00_time.npy')
x = np.load(f'{save_path}00_x.npy')
y = np.load(f'{save_path}00_y.npy')

## Index 97 of the last axis of x is kept as a separate array.
## NOTE(review): the meaning of channel 97 is not visible here -- confirm.
c = x[..., 97]

print('x =', x.shape, '---> y =', y.shape)

x = (21427, 36, 36, 143) ---> y = (21427, 72, 72)


In [9]:
## Exclude every timestep whose radar field (y) contains at least one NaN
n_nans = np.isnan(y).sum(axis=(1, 2))   # NaN count per timestep
mask = n_nans == 0                      # True where the field is complete
print('Removed', (np.sum(~mask)/y.shape[0]) * 100 , '% of timesteps' )

## Apply the same selection to all arrays so they stay aligned
x = x[mask]
c = c[mask]
y = y[mask]
time = time[mask]

Removed 3.929621505577076 % of timesteps


In [10]:
## Split by day of month: 4 fixed days go to the test set, 4 other fixed days to
## the validation set, and every remaining day to the training set
time = pd.DatetimeIndex(time)
day_of_month = time.day
mask_test  = np.isin(day_of_month, [1, 9, 17, 25])
mask_valid = np.isin(day_of_month, [5, 13, 21, 28])

## Training = neither test nor validation
mask_train = ~(mask_test | mask_valid)

## This count is 0 by construction, since mask_train is the complement of the other two
print('Number of common elements from groups =', np.sum(mask_test & mask_valid & mask_train))

Number of common elements from groups = 0


In [15]:
## Apply each mask to the inputs (x), the channel-97 slice (c), the radar
## targets (y) and the timestamps, in that order
tst_x, tst_c, tst_y, tst_t = (arr[mask_test] for arr in (x, c, y, time))

vld_x, vld_c, vld_y, vld_t = (arr[mask_valid] for arr in (x, c, y, time))

trn_x, trn_c, trn_y, trn_t = (arr[mask_train] for arr in (x, c, y, time))

## Report the resulting shapes of every split
print('Train: x =', trn_x.shape, '---> y =', trn_y.shape,
      'cosmo:', trn_c.shape, 'time:', trn_t.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape,
      'cosmo:', tst_c.shape, 'time:', tst_t.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape,
      'cosmo:', vld_c.shape, 'time:', vld_t.shape)

Train: x = (15189, 36, 36, 143) ---> y = (15189, 72, 72) cosmo: (15189, 36, 36) time: (15189,)
Test: x = (2671, 36, 36, 143) ---> y = (2671, 72, 72) cosmo: (2671, 36, 36) time: (2671,)
Valid: x = (2725, 36, 36, 143) ---> y = (2725, 72, 72) cosmo: (2725, 36, 36) time: (2725,)


In [16]:
## Store the timestamps of each split as '01_<split>_t.npy'
for split, stamps in (('trn', trn_t), ('tst', tst_t), ('vld', vld_t)):
    np.save(save_path + '01_' + split + '_t.npy', stamps)

In [17]:
## Standardise the inputs with the mean and standard deviation computed over the
## first axis of the TRAINING split only; the same statistics are then applied to
## the test and validation splits.
x_mean = np.mean(trn_x, axis=0)
x_std  = np.std(trn_x, axis=0)

## An entry that is constant over the training split has std == 0; dividing by it
## would turn that entry into NaN/inf in all three splits. Use 1 instead, so such
## entries are only centred.
x_std[x_std == 0] = 1

trn_x = (trn_x - x_mean)/x_std
tst_x = (tst_x - x_mean)/x_std
vld_x = (vld_x - x_mean)/x_std

print('Train: x =', trn_x.shape, '---> y =', trn_y.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape)

Train: x = (15189, 36, 36, 143) ---> y = (15189, 72, 72)
Test: x = (2671, 36, 36, 143) ---> y = (2671, 72, 72)
Valid: x = (2725, 36, 36, 143) ---> y = (2725, 72, 72)


In [18]:
## Write inputs (x), targets (y) and the channel-97 slice (c) of every split
## Training split
np.save(f'{save_path}01_trn_x.npy', trn_x)
np.save(f'{save_path}01_trn_y.npy', trn_y)
np.save(f'{save_path}01_trn_c.npy', trn_c)

## Test split
np.save(f'{save_path}01_tst_x.npy', tst_x)
np.save(f'{save_path}01_tst_y.npy', tst_y)
np.save(f'{save_path}01_tst_c.npy', tst_c)

## Validation split
np.save(f'{save_path}01_vld_x.npy', vld_x)
np.save(f'{save_path}01_vld_y.npy', vld_y)
np.save(f'{save_path}01_vld_c.npy', vld_c)

In [19]:
## Drop the full arrays and the per-split arrays from the namespace to release memory
del time, x, y, c
del trn_x, trn_y, trn_c
del tst_x, tst_y, tst_c
del vld_x, vld_y, vld_c

# Convolutional approach

In [None]:
## Load the inputs (x) and targets (y) of every split from the '01_*' files
trn_x, trn_y = (np.load(save_path + '01_trn_' + part + '.npy') for part in ('x', 'y'))
tst_x, tst_y = (np.load(save_path + '01_tst_' + part + '.npy') for part in ('x', 'y'))
vld_x, vld_y = (np.load(save_path + '01_vld_' + part + '.npy') for part in ('x', 'y'))

print('Train: x =', trn_x.shape, '---> y =', trn_y.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape)

### Write tfrecords

In [None]:
def write_tfrecords(x, y, n_records, name,
                    out_dir='/p/scratch/deepacf/deeprain/rojascampos1/data/radar_enhancement/hres_tminusone/',
                    window=2):
    """Write (input window, target) examples into `n_records` TFRecord files.

    `x` and `y` are split along their first axis into `n_records` chunks with
    `np.array_split`; chunk `i` is written to `<out_dir>/<name>/<i:03d>.tfrecord`.
    Inside a chunk, one example is written for every index `idx >= window`:

    * 'feature': `forecast[idx-window:idx]`, i.e. the `window` entries that
      precede `idx`, flattened into a float list;
    * 'label'  : `prec[idx]`, flattened into a float list.

    Windows never cross a chunk boundary, so the first `window` entries of every
    chunk are never used as a label.
    NOTE(review): this treats neighbouring entries of `x` as consecutive
    timesteps -- confirm that this holds for the arrays passed in.

    Parameters
    ----------
    x, y : array-like
        Inputs and targets, aligned along the first axis.
    n_records : int
        Number of TFRecord files to write.
    name : str
        Sub-directory of `out_dir` that receives the files (e.g. 'train_set').
    out_dir : str, optional
        Root output directory. Defaults to the previously hard-coded location.
    window : int, optional
        Number of preceding entries of `x` stored in each 'feature'. Default 2.

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be >= 1, got {}'.format(window))

    ## Make sure the target directory exists before any writer is opened
    target_dir = os.path.join(out_dir, name)
    os.makedirs(target_dir, exist_ok=True)

    x = np.array_split(x, n_records)
    y = np.array_split(y, n_records)

    ## Write n_records files
    for i, (forecast, prec) in enumerate(zip(x, y)):

        ## Inside each file do:
        with tf.io.TFRecordWriter(os.path.join(target_dir, '{:03d}.tfrecord'.format(i))) as tfrecord:

            for idx in range(window, forecast.shape[0]):

                features = {
                    'feature' : tf.train.Feature(float_list=tf.train.FloatList( value = forecast[idx-window:idx].flatten() )),
                    'label'   : tf.train.Feature(float_list=tf.train.FloatList( value = prec[idx].flatten() ))}

                example = tf.train.Example(features=tf.train.Features(feature=features))
                tfrecord.write(example.SerializeToString())

        ## 1-based counter, so the last message reads n_records/n_records
        print(name, str(i+1)+'/'+str(n_records)+' wrote')

In [None]:
## One folder of TFRecord files per split: 100 files for training,
## 10 each for test and validation
write_tfrecords(x=trn_x, y=trn_y, n_records=100, name='train_set')
write_tfrecords(x=tst_x, y=tst_y, n_records=10, name='test_set')
write_tfrecords(x=vld_x, y=vld_y, n_records=10, name='validation_set')