## Hyperparameter Tuning

Run **SHERPA**. Fix batchsize = 512. Fix Adam. Do not shuffle the input data as that takes a lot of time. <br>
*First:* Start with 3 epochs each. Here we can already discard some models. <br>
*Then:* Run 20 epochs for all networks that have learned. Possibly add the learning rate scheduler. Usually one uses cross-validation here to truly get a good estimate of generalization error! <br>
*Then:* Run 50 epochs to get the best network. <br>

If no network learns then the batch size is simply too large?

To vary: 
- Learning rate (Learning rate scheduler)
- Model layers (only max 1-4 hidden layers)
- Regularization methods
- Hidden Units
- Activation Functions (not the last)

Batchsize 256: 37 minutes per epoch. <br>
Batchsize 512: 9 minutes per epoch.

**---> Find that we can not really beat the architecture of the models N1-N3**

In [1]:
# Requires at least 500GB to run

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

t0 = time.time()
path = '/pf/b/b309170'
path_figures = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/figures'
path_model = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/saved_models'
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data'

# Add path with my_classes to sys.path
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')
# Add sherpa
sys.path.insert(0, path + '/my_work/sherpa')

import datetime
#import sherpa
#import sherpa.algorithms.bayesian_optimization as bayesian_optimization

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import read_mean_and_std
from my_classes import TimeOut

# Minutes per fold
timeout = 2120 

# For logging purposes
days = 'all_days'

# Maximum amount of epochs for each model
epochs = 50

# Set seed for reproducibility
seed = 10
tf.random.set_seed(seed)

# For store_mean_model_biases
VERT_LAYERS = 31

gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

In [2]:
# Won't run on a CPU node
try:
    # Prevents crashes of the code
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

    # Allow the growth of memory Tensorflow allocates (limits memory usage overall)
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except:
    pass

In [3]:
scaler = StandardScaler()

In [4]:
# To save the SHERPA output in
today = str(datetime.date.today())[:7] # YYYY-MM
random_num = np.random.randint(500000)
print(random_num)

out_path = '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/sherpa_results/'+\
        today+'_'+str(random_num)

# Create the directory and save the SHERPA-output in it
try:
    os.mkdir(out_path)
except OSError:
    print('Creation of the directory %s failed' % out_path)
else: 
    print('Successfully created the directory %s' % out_path)

217047
Successfully created the directory /pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/sherpa_results/2021-05_217047


### Load the data

In [5]:
# input_narval = np.load(path_data + '/cloud_cover_input_narval.npy')
# input_qubicc = np.transpose(np.load(path_data + '/cloud_cover_input_qubicc.npy'))
# output_narval = np.load(path_data + '/cloud_cover_output_narval.npy')
# output_qubicc = np.transpose(np.load(path_data + '/cloud_cover_output_qubicc.npy'))

In [6]:
# input_data = np.concatenate(input_narval, input_qubicc)
# output_data = np.concatenate(output_narval, output_qubicc)

input_data = np.concatenate((np.load(path_data + '/cloud_cover_input_narval.npy'), 
                             np.transpose(np.load(path_data + '/cloud_cover_input_qubicc.npy'))), axis=0)
output_data = np.concatenate((np.load(path_data + '/cloud_cover_output_narval.npy'), 
                              np.transpose(np.load(path_data + '/cloud_cover_output_qubicc.npy'))), axis=0)

In [7]:
(samples_total, no_of_features) = input_data.shape
(samples_total, no_of_features)

(176209421, 163)

In [8]:
np.random.seed(seed)

# Take only a subset to test with
indices = np.random.randint(samples_total, size=10**7)
input_data = input_data[indices]
output_data = output_data[indices]

(samples_total, no_of_features) = input_data.shape
(samples_total, no_of_features)

(10000000, 163)

*Temporal cross-validation*

Split into 2-weeks increments (when working with 3 months of data). It's 25 day increments with 5 months of data. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [9]:
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
    second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

    validation_folds.append(np.append(first_incr, second_incr))
    training_folds.append(np.arange(samples_total))
    training_folds[i] = np.delete(training_folds[i], validation_folds[i])

Remove columns that are constant in at least one of the training folds

In [10]:
# This takes a bit of time
# remove_fields = []
# constant_0 = (np.max(input_data[training_folds[0]], axis=0) - np.min(input_data[training_folds[0]], axis=0) < 1e-10)
# constant_1 = (np.max(input_data[training_folds[1]], axis=0) - np.min(input_data[training_folds[1]], axis=0) < 1e-10)
# constant_2 = (np.max(input_data[training_folds[2]], axis=0) - np.min(input_data[training_folds[2]], axis=0) < 1e-10)
# for i in range(no_of_features):
#     if constant_0[i] or constant_1[i] or constant_2[i]:
#         print(i)
#         remove_fields.append(i)

# These features correspond to qc_4, qc_5, qc_6, qc_7, qc_8, qc_9, zg_4, zg_5, zg_6
remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]
assert no_of_features == 163
input_data = np.delete(input_data, remove_fields, axis=1)
no_of_features = no_of_features - len(remove_fields)

### Define the model

Activation function for the last layer

In [11]:
# Activation function for the last layer
def my_act_fct(x):
    return K.minimum(K.maximum(x, 0), 100)

### 3-fold cross-validation

In [12]:
# By decreasing timeout we make sure every fold gets the same amount of time
# After all, data-loading took some time (Have 3 folds, 60 seconds/minute)
# timeout = timeout - 1/3*1/60*(time.time() - t0)
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

#We loop through the folds
# for i in range(3):
for i in range(0,1):
    
    filename = 'cross_validation_column_based_fold_%d'%(i+1)
    
    #Standardize according to the fold
    scaler.fit(input_data[training_folds[i],:])

    #Load the data for the respective fold and convert it to tf data
    input_train = scaler.transform(input_data[training_folds[i]])
    input_valid = scaler.transform(input_data[validation_folds[i]])
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]
    # Use a batchsize of 64 or 128
    # Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
    train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                                tf.data.Dataset.from_tensor_slices(output_train))) \
                .batch(batch_size=128, drop_remainder=True).prefetch(1)
    
    # No need to add prefetch.
    # tf data with batch_size=10**5 makes the validation evaluation 10 times faster
    valid_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_valid), 
                                tf.data.Dataset.from_tensor_slices(output_valid))) \
                .batch(batch_size=10**5, drop_remainder=True)
    
#     #Feed the model
#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
#         loss=tf.keras.losses.MeanSquaredError()
#     )
    
#     #Train the model
# #     time_callback = TimeOut(t0, timeout*(i+1))
#     time_callback = TimeOut(t0, timeout)
#     # Batch size is specified by the tf dataset
#     history = model.fit(train_ds, epochs=epochs, verbose=2, 
#                         validation_data=valid_ds, callbacks=[time_callback])
    
#     #Save the model     
#     #Serialize model to YAML
#     model_yaml = model.to_yaml()
#     with open(os.path.join(path_model, filename+".yaml"), "w") as yaml_file:
#         yaml_file.write(model_yaml)
#     #Serialize model and weights to a single HDF5-file
#     model.save(os.path.join(path_model, filename+'.h5'), "w")
#     print('Saved model to disk')
    
#     #Plot the training history
#     if len(history.history['loss']) > len(history.history['val_loss']):
#         del history.history['loss'][-1]
#     pd.DataFrame(history.history).plot(figsize=(8,5))
#     plt.grid(True)
#     plt.ylabel('Mean Squared Error')
#     plt.xlabel('Number of epochs')
#     plt.savefig(os.path.join(path_figures, filename+'.pdf'))
    
#     with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
#         file.write('Results from the %d-th fold\n'%(i+1))
#         file.write('Training epochs: %d\n'%(len(history.history['val_loss'])))
#         file.write('Weights restored from epoch: %d\n\n'%(np.argmin(history.history['val_loss'])))

In [13]:
# Clear memory (Reduces memory requirement to 238 GB)
del input_data, output_data, first_incr, second_incr, validation_folds, training_folds, input_train, output_train
gc.collect()

22

In [14]:
def save_model(study, today, optimizer):  
    study.results = study.results[study.results['Status']=='COMPLETED'] #To specify results
    study.results.index = study.results['Trial-ID']  #Trial-ID serves as a better index
    # Remove those hyperparameters that actually do not appear in the model
    for i in range(1, max(study.results['Trial-ID']) + 1):
        depth = study.results.at[i, 'model_depth']
        for j in range(depth, 5): #Or up to 8
            study.results.at[i, 'activation_%d'%j] = None
#             study.results.at[i, 'bn_%d'%j] = None
    study.save(out_path)

In [15]:
# Good Reference: https://arxiv.org/pdf/1206.5533.pdf (Bengio), https://arxiv.org/pdf/2004.10652.pdf (Ott)
# lrelu = lambda x: relu(x, alpha=0.01)

# For Leaky_ReLU:
from tensorflow import nn 

def lrelu(x):
    return nn.leaky_relu(x, alpha=0.01)

parameters = [sherpa.Ordinal('num_units', [16, 32, 64, 128, 256]), #No need to vary these per layer. Could add 512.
             sherpa.Discrete('model_depth', [2, 5]), #Originally [2,8] although 8 was never truly tested
             sherpa.Choice('activation_1', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]), #Adding SeLU is trickier
             sherpa.Choice('activation_2', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]), 
             sherpa.Choice('activation_3', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]),
             sherpa.Choice('activation_4', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]),
             sherpa.Choice('last_activation', ['linear', my_act_fct]), # my_act_fct doesn't work in the grid cell based case
             sherpa.Continuous('lrinit', [1e-4, 1e-1], 'log'),
             sherpa.Ordinal('epsilon', [1e-8, 1e-7, 0.1, 1]),
             sherpa.Continuous('dropout', [0]), # Better to have 0 here
             sherpa.Continuous('l1_reg', [0, 0.01]),
             sherpa.Continuous('l2_reg', [0, 0.01]),
             sherpa.Ordinal('bn_1', [0, 1]),
             sherpa.Ordinal('bn_2', [0, 1]),
             sherpa.Ordinal('bn_3', [1]), # Better to have 1 here
             sherpa.Ordinal('bn_4', [0]), # Better to have 0 here
             sherpa.Choice('optimizer', ['adam', 'RMSprop', 'adadelta', 'nadam'])]

In [16]:
# max_num_trials is left unspecified, so the optimization will run until the end of the job-runtime

# good_hyperparams = pd.DataFrame({'num_units': [256], 'model_depth': [3], 'activation_1': ['relu'], 'activation_2':['relu'],
#                    'activation_3':['relu'], 'activation_4':['relu'], 'last_activation':[my_act_fct], 'lrinit':[0.001], 'epsilon':[1e-7],
#                    'dropout':[0], 'l1_reg':[0], 'l2_reg':[0], 'bn_1':[0], 'bn_2':[0], 'bn_3':[0], 'bn_4':[0], 'optimizer':['adam']})

# alg = bayesian_optimization.GPyOpt(initial_data_points=good_hyperparams, max_num_trials=100)

alg = bayesian_optimization.GPyOpt() 
study = sherpa.Study(parameters=parameters, algorithm=alg, lower_is_better=True)

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.50.13.245:8880 if on a cluster or
http://localhost:8880 if running locally.
-------------------------------------------------------


 * Serving Flask app "sherpa.app.app" (lazy loading)
 * Debug mode: on
 * Environment: production
[2m   Use a production WSGI server instead.[0m


In [None]:
#Starting only with a few epochs
epochs = 3

for trial in study:

    # Create the model
    model = Sequential()
    par = trial.parameters
    
    # Input layer
    model.add(Dense(units=par['num_units'], activation=par['activation_1'], input_dim=no_of_features,
                   kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))
#     if (par['bn_1']==1):
#         model.add(BatchNormalization()) #There's some debate on whether to use it before or after the activation fct
    
    # Hidden layers    
    for j in range(2, par['model_depth']):
        model.add(Dense(units=par['num_units'], activation=par['activation_'+str(j)], 
                        kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))
        model.add(Dropout(par['dropout'])) #After every hidden layer we (potentially) add a dropout layer
#         if (par['bn_'+str(j)]==1):
#             model.add(BatchNormalization())
    
    # Output layer
    model.add(Dense(27, activation=par['last_activation'], 
                    kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))
    
    if par['optimizer'] == 'adam':
        # Optimizer: Adam is relatively robust w.r.t. its beta-parameters 
        optimizer = tf.keras.optimizers.Adam(lr=par['lrinit'], epsilon=par['epsilon'])
    elif par['optimizer'] == 'RMSprop':
        # Optimizer: RMSprop is robust w.r.t. its hyperparameters
        optimizer = tf.keras.optimizers.RMSprop(lr=par['lrinit'], epsilon=par['epsilon'])
    elif par['optimizer'] == 'SGD':
        optimizer = tf.keras.optimizers.SGD(lr=par['lrinit'], momentum=par['epsilon']) 
    elif par['optimizer'] == 'adadelta':
        optimizer = tf.keras.optimizers.Adadelta(lr=par['lrinit'], epsilon=par['epsilon'])
    elif par['optimizer'] == 'nadam':
        optimizer = tf.keras.optimizers.Nadam(lr=par['lrinit'], epsilon=par['epsilon'])
        
    model.compile(loss='mse', optimizer=optimizer)
    
    # Train the model
    model.fit(train_ds, epochs=epochs, verbose=2, validation_data=valid_ds,
              callbacks=[study.keras_callback(trial, objective_name='val_loss')]) ## 3 epochs
    
    
    study.finalize(trial)
    save_model(study, today, par['optimizer'])

Epoch 1/3
52083/52083 - 133s - loss: 343.7736 - val_loss: 342.9725
Epoch 2/3
52083/52083 - 132s - loss: 342.9219 - val_loss: 342.7646
Epoch 3/3
52083/52083 - 132s - loss: 342.7440 - val_loss: 342.5469
Epoch 1/3
52083/52083 - 277s - loss: 357.1911 - val_loss: 349.1093
Epoch 2/3
52083/52083 - 276s - loss: 353.6483 - val_loss: 349.0049
Epoch 3/3
52083/52083 - 277s - loss: 353.3495 - val_loss: 348.7092
Epoch 1/3
52083/52083 - 174s - loss: 1665.8573 - val_loss: 665.3303
Epoch 2/3
52083/52083 - 173s - loss: 1400.4211 - val_loss: 656.0166
Epoch 3/3
52083/52083 - 173s - loss: 807.2425 - val_loss: 443.8656
Epoch 1/3
52083/52083 - 149s - loss: 348.0784 - val_loss: 346.1411
Epoch 2/3
52083/52083 - 148s - loss: 346.8481 - val_loss: 345.5026
Epoch 3/3
52083/52083 - 148s - loss: 346.7727 - val_loss: 345.5513
Epoch 1/3
52083/52083 - 216s - loss: 350.9333 - val_loss: 347.8096
Epoch 2/3
52083/52083 - 215s - loss: 350.1321 - val_loss: 347.9420
Epoch 3/3
52083/52083 - 214s - loss: 350.0687 - val_loss: 34