### Tests concerning the speed of training

In [None]:
# Ran with 800GB (750GB should also be fine)

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# For Leaky_ReLU:
from tensorflow import nn 

# Reference time; the cross-validation cell subtracts the time elapsed since t0 from the time budget
t0 = time.time()
# Base directory and derived locations for figures, saved models and the input data
path = '/pf/b/b309170'
path_figures = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/figures'
path_model = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/saved_models'
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/based_on_var_interpolated_data'

# Add path with my_classes to sys.path (inserted at position 0, so it is searched first)
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import read_mean_and_std
from my_classes import TimeOut

# Minutes per fold (reduced later by the time already spent since t0)
timeout = 2120 

# For logging purposes
days = 'all_days'

# Maximum amount of epochs for each model
# NOTE(review): in the visible cells only the commented-out model.fit call uses this value
epochs = 30

# Set seed for reproducibility
# Only TensorFlow's global seed is set here; NumPy's global random state is not seeded in this cell
seed = 10
tf.random.set_seed(seed)

# For store_mean_model_biases
VERT_LAYERS = 31

# List of physical GPUs; used further below to enable memory growth per device
gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

In [None]:
# Prevents crashes of the code: make only the first physical GPU visible to TensorFlow.
# Guarded so that a machine without any GPU falls back to the default devices
# instead of failing with an IndexError on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

In [None]:
# Allow the growth of memory Tensorflow allocates (limits memory usage overall)
# Applied to every physical GPU in `gpus`; the loop body does not run if the list is empty
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Feature scaler; it is fitted on the training samples of a fold in the cross-validation cell
scaler = StandardScaler()

### Load the data

In [None]:
# input_narval = np.load(path_data + '/cloud_cover_input_narval.npy')
# input_qubicc = np.load(path_data + '/cloud_cover_input_qubicc.npy')
# output_narval = np.load(path_data + '/cloud_cover_output_narval.npy')
# output_qubicc = np.load(path_data + '/cloud_cover_output_qubicc.npy')

In [None]:
# Stack the NARVAL and QUBICC arrays along axis 0 (NARVAL first, then QUBICC).
# The per-source arrays are passed straight into np.concatenate, so no name keeps
# referring to them once the combined array has been built.
input_data = np.concatenate((np.load(path_data + '/cloud_cover_input_narval.npy'), 
                             np.load(path_data + '/cloud_cover_input_qubicc.npy')), axis=0)
# Same order for the outputs, so input and output entries stay aligned by index
output_data = np.concatenate((np.load(path_data + '/cloud_cover_output_narval.npy'), 
                              np.load(path_data + '/cloud_cover_output_qubicc.npy')), axis=0)

In [None]:
# Number of NARVAL samples (first axis of the NARVAL output array).
# The file is memory-mapped, so only its shape is inspected and the array data
# is not loaded into memory a second time.
samples_narval = np.load(path_data + '/cloud_cover_output_narval.npy', mmap_mode='r').shape[0]

In [None]:
# Unpack the two dimensions of the combined input array: number of samples and number of input features
(samples_total, no_of_features) = input_data.shape
# Bare expression so that the notebook displays both values as the cell output
(samples_total, no_of_features)

*Temporal cross-validation*

Split the data into six equal increments (two-week increments when working with 3 months of data; 25-day increments with 5 months of data). <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [None]:
# Length of one of the six temporal increments. Integer division: any remainder
# samples at the end of the data never enter a validation fold.
two_week_incr = samples_total//6
training_folds, validation_folds = [], []

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    # Fold i validates on increments i and i+3 (0-based)
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))

    validation_folds.append(np.concatenate((first_incr, second_incr)))
    # Training indices: every sample index that is not part of this fold's validation set
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[-1]))

### Define the model

Activation function for the hidden layers (leaky ReLU; the output layer uses a linear activation)

In [None]:
def lrelu(x):
    """Return ``nn.leaky_relu`` applied to ``x`` with ``alpha`` fixed to 0.01.

    Wrapping the call in a plain function lets it be passed as the
    ``activation`` argument of the Dense layers.
    """
    negative_slope = 0.01
    return nn.leaky_relu(x, alpha=negative_slope)

In [None]:
# Create the model: two hidden layers with 256 units each, followed by a single linear output unit
def _weight_penalty():
    """Return a new combined L1/L2 kernel regularizer (one separate instance per layer)."""
    return l1_l2(l1=0.000162, l2=0.007437)


model = Sequential([
    # First hidden layer (input_dim fixes the number of input features)
    Dense(units=256, activation=lrelu, input_dim=no_of_features,
          kernel_regularizer=_weight_penalty()),
    # Second hidden layer
    Dense(units=256, activation=lrelu, kernel_regularizer=_weight_penalty()),
    # We drop 18% of the hidden nodes (dropout rate 0.184124)
    Dropout(0.184124),
    # Output layer
    Dense(1, activation='linear', kernel_regularizer=_weight_penalty()),
])

Preliminary baselines

In [None]:
# # This would be the loss of a NN which outputs zeros everywhere
# np.mean(np.array(output_data)**2)

In [None]:
# # This would be the loss of a NN which outputs the best constant value everywhere
# constant_mean = np.mean(np.array(output_data))
# np.mean((np.array(output_data) - constant_mean)**2)

In [None]:
# # Freeing up memory (~46 GB). Memory usage after this cell: 251 GB
# del input_narval, input_qubicc, output_narval, output_qubicc

# gc.collect()

### 3-fold cross-validation

In [None]:
# By decreasing timeout we make sure every fold gets the same amount of time
# After all, data-loading took some time (Have 3 folds, 60 seconds/minute)
# timeout = timeout - 1/3*1/60*(time.time() - t0)
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

#We loop through the folds
# NOTE: range(1,2) runs only the second fold (i == 1). The `del` statement in the loop
# body removes input_data, output_data and the fold index lists, so this loop cannot
# be extended to several folds without reloading the data.
for i in range(1,2):
    
    filename = 'cross_validation_cell_based_fold_%d'%(i+1)
    
    #Standardize according to the fold and load the data for the respective fold.
    #fit_transform fits the scaler on the training samples and transforms them in one call,
    #so the (large) fancy-indexed copy input_data[training_folds[i]] is created only once
    #instead of once for fit and once for transform.
    input_train = scaler.fit_transform(input_data[training_folds[i]])
    input_valid = scaler.transform(input_data[validation_folds[i]]) 
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]
    
    # Clear memory (Reduces memory requirement to 151 GB)
    del input_data, output_data, first_incr, second_incr, validation_folds, training_folds
    gc.collect()
    
#     # Column-based: batchsize of 128
#     # Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
#     train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
#                                 tf.data.Dataset.from_tensor_slices(output_train))) \
#                 .shuffle(1000, seed=seed) \
#                 .batch(batch_size=1028, drop_remainder=True) \
#                 .prefetch(1)
    
#     # Clear memory
#     del input_train, output_train
#     gc.collect()
    
#     # No need to add prefetch.
#     # tf data with batch_size=10**5 makes the validation evaluation 10 times faster
#     valid_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_valid), 
#                                 tf.data.Dataset.from_tensor_slices(output_valid))) \
#                 .batch(batch_size=10**5, drop_remainder=True)
    
#     # Clear memory (Reduces memory requirement to 151 GB)
#     del input_valid, output_valid
#     gc.collect()
    
    #Feed the model: Adam optimizer with mean squared error as the loss
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.008726, epsilon=0.1),
        loss=tf.keras.losses.MeanSquaredError()
    )
    
#     #Train the model
# #     time_callback = TimeOut(t0, timeout*(i+1))
#     time_callback = TimeOut(t0, timeout)
#     history = model.fit(train_ds, epochs=epochs, verbose=2, validation_data=valid_ds, 
#                         callbacks=[time_callback])
# #     history = model.fit(train_ds, epochs=epochs, validation_data=valid_ds, callbacks=[time_callback])

#     #Save the model     
#     #Serialize model to YAML
#     model_yaml = model.to_yaml()
#     with open(os.path.join(path_model, filename+".yaml"), "w") as yaml_file:
#         yaml_file.write(model_yaml)
#     #Serialize model and weights to a single HDF5-file
#     model.save(os.path.join(path_model, filename+'.h5'), "w")
#     print('Saved model to disk')
    
#     #Plot the training history
#     if len(history.history['loss']) > len(history.history['val_loss']):
#         del history.history['loss'][-1]
#     pd.DataFrame(history.history).plot(figsize=(8,5))
#     plt.grid(True)
#     plt.ylabel('Mean Squared Error')
#     plt.xlabel('Number of epochs')
#     plt.savefig(os.path.join(path_figures, filename+'.pdf'))
    
#     with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
#         file.write('Results from the %d-th fold\n'%(i+1))
#         file.write('Training epochs: %d\n'%(len(history.history['val_loss'])))
#         file.write('Weights restored from epoch: %d\n\n'%(1+np.argmin(history.history['val_loss'])))

### Training with 800 million samples (three-hourly QUBICC data and stricter class balancing)

In [16]:
# Pipeline under test: shuffle buffer 100000, batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [None]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [16]:
# Pipeline under test: shuffle buffer 1000000, batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(1000000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [17]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Pipeline under test: shuffle buffer 100000, batch size 512 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=512
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed) \
            .batch(batch_size=512, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [None]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
 222813/1313689 [====>.........................] - ETA: 55:20 - loss: 215.5292

### Training with two-hourly QUBICC data is quite infeasible:

A batch size of 1028 trains faster than one of 2048, so 1028 is the best batch size tested.
A shuffle buffer of 100000 worked best.

In [16]:
# Pipeline under test: shuffle buffer 100000 (seed=seed), batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [17]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
 241540/1226713 [====>.........................] - ETA: 54:04 - loss: 120.0847

KeyboardInterrupt: 

In [20]:
# Pipeline under test: shuffle buffer 100000 with a different shuffle seed (200), batch size 1028, prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=200) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [None]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [16]:
# Shuffle after every epoch: reshuffle_each_iteration=True is passed explicitly
# (shuffle buffer 100000, seed=seed, batch size 1028, prefetch 1)
# NOTE(review): reshuffle_each_iteration is believed to default to True in tf.data, in which case
# this pipeline equals the one without the flag -- confirm for the TensorFlow version in use.
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed, reshuffle_each_iteration=True) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [17]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Shuffle after every epoch: reshuffle_each_iteration=True is passed explicitly
# (shuffle buffer 100000, shuffle seed 200, batch size 1028, prefetch 1)
# NOTE(review): reshuffle_each_iteration is believed to default to True in tf.data, in which case
# this pipeline equals the one without the flag -- confirm for the TensorFlow version in use.
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=200, reshuffle_each_iteration=True) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [None]:
history = model.fit(train_ds, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5

In [26]:
# Pipeline under test: shuffle buffer 1000, batch size 1028 (incomplete last batch dropped), prefetch 1
# Original note: "Column-based: batchsize of 1028"
# NOTE(review): what "Column-based" refers to is not visible in this notebook -- confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(1000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Observations (the second note matches this cell's settings; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [27]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [20]:
# Pipeline under test: very small shuffle buffer (10), batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(10, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

In [21]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [22]:
# Pipeline under test: shuffle buffer 1000, batch size 2048 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=2048
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(1000, seed=seed) \
            .batch(batch_size=2048, drop_remainder=True) \
            .prefetch(1)

In [23]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [24]:
# Pipeline under test: shuffle buffer 10000, batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(10000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [25]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [16]:
# Pipeline under test: shuffle buffer 100000 (seed=seed), batch size 1028 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [17]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [40]:
# Different seed: the shuffle seed is 100 here instead of the notebook-wide `seed`
# (shuffle buffer 100000, batch size 1028, incomplete last batch dropped, prefetch 1)
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=100) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [41]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [34]:
# Try to shuffle after batching: .batch() is applied before .shuffle(), so the shuffle buffer
# of 100000 holds whole batches of 1028 samples and only the order of the batches is shuffled
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .batch(batch_size=1028, drop_remainder=True) \
            .shuffle(100000, seed=seed) \
            .prefetch(1)

In [35]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [42]:
# Shuffle dataset with numpy and train afterwards
# A generator seeded with the notebook-wide `seed` makes the permutation reproducible;
# NumPy's global random state is never seeded in this notebook (only TensorFlow's seed is set).
permuted_indices = np.random.default_rng(seed).permutation(input_train.shape[0])

# The same permutation is applied to inputs and outputs so that the pairs stay aligned.
# No tf.data shuffle here: batches of 1028 (incomplete last batch dropped), prefetch 1
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train[permuted_indices]), 
                            tf.data.Dataset.from_tensor_slices(output_train[permuted_indices]))) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

In [43]:
history = model.fit(train_ds, epochs=1, verbose=1)



In [38]:
# Is a batchsize of 512 really much slower than one of 1028? Yes

# Pipeline under test: very small shuffle buffer (10), batch size 512 (incomplete last batch dropped), prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=512
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(10, seed=seed) \
            .batch(batch_size=512, drop_remainder=True) \
            .prefetch(1)

In [39]:
history = model.fit(train_ds, epochs=1, verbose=1)



KeyboardInterrupt: 

In [44]:
# Pipeline used for the multi-epoch runs below: shuffle buffer 100000 (seed=seed), batch size 1028, prefetch 1
# NOTE(review): the original note "Column-based: batchsize of 128" does not match batch_size=1028
# below -- presumably it refers to a different (column-based) setup; confirm.
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(100000, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# Notes from other runs (different settings than this cell; what "increases" refers to is not stated -- presumably the loss, TODO confirm):
# shuffle 1000 and bs 2056 increases...
# shuffle 1000 and bs 1028 increases... but it starts to decrease after a while!

In [None]:
# Try multiple epochs
history = model.fit(train_ds, epochs=3, verbose=1)

Epoch 1/3

In [None]:
# Try multiple epochs
history = model.fit(train_ds, epochs=6, verbose=1)