## Can we recover the good results from sherpa_results2021-05-04_adam?

Can we achieve a validation error of <40 by using a smaller batch size (i.e. 512)?

In [7]:
# No shuffling

In [9]:
# Ran with 800GB (750GB should also be fine)

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# Wall-clock start of the run. The cross-validation cell subtracts the time
# elapsed since t0 from `timeout`, so that data loading counts against the budget.
t0 = time.time()
# Base directory; all other paths below are built from it by string concatenation
path = '/pf/b/b309170'
# Output directory for figures
path_figures = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/figures'
# Output directory for saved models
path_model = path + '/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/saved_models'
# Directory holding the cloud_cover_{input,output}_{narval,qubicc}.npy files loaded below
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/based_on_var_interpolated_data'

# Add path with my_classes to sys.path
# (inserted at position 0 so it takes precedence over other entries on sys.path)
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')
# Add sherpa (local checkout, also given precedence)
sys.path.insert(0, path + '/my_work/sherpa')

import sherpa
import sherpa.algorithms.bayesian_optimization as bayesian_optimization

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import read_mean_and_std
from my_classes import TimeOut

import datetime

# Minutes per fold
# (reduced in the cross-validation cell by the minutes elapsed since t0)
timeout = 2120 

# For logging purposes
days = 'all_days'

# Maximum amount of epochs for each model
# NOTE: the model-training cell further down overwrites this with epochs = 3
epochs = 50

# Set seed for reproducibility
# Only TensorFlow's global seed is set here; NumPy's RNG is not seeded in this cell
seed = 10
tf.random.set_seed(seed)

# For store_mean_model_biases
# NOTE(review): not referenced anywhere else in the visible part of this notebook
VERT_LAYERS = 31

# All physical GPUs TensorFlow can see; used by the memory-growth cell below
gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

In [11]:
# Prevents crashes of the code: restrict TensorFlow to the first physical GPU.
# Guard against an empty device list so that the cell does not fail with an
# IndexError on a node without a (detected) GPU.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
else:
    print('No GPU found; TensorFlow will run on the CPU.')

In [13]:
# Allow the growth of memory Tensorflow allocates (limits memory usage overall)
# `gpus` is the list of physical GPUs collected in the setup cell; the loop is a
# no-op when that list is empty.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [15]:
# Feature scaler; it is fitted on the training indices of the selected fold in the
# cross-validation cell and then applied to both the training and validation inputs.
scaler = StandardScaler()

### Load the data

In [17]:
# input_narval = np.load(path_data + '/cloud_cover_input_narval.npy')
# input_qubicc = np.load(path_data + '/cloud_cover_input_qubicc.npy')
# output_narval = np.load(path_data + '/cloud_cover_output_narval.npy')
# output_qubicc = np.load(path_data + '/cloud_cover_output_qubicc.npy')

In [19]:
# Stack the NARVAL arrays in front of the QUBICC arrays along the first axis.
# The loaded arrays are only held inside the comprehension, so no extra
# references survive the concatenation.
input_data = np.concatenate(
    [np.load(path_data + file_name)
     for file_name in ('/cloud_cover_input_narval.npy', '/cloud_cover_input_qubicc.npy')],
    axis=0)
output_data = np.concatenate(
    [np.load(path_data + file_name)
     for file_name in ('/cloud_cover_output_narval.npy', '/cloud_cover_output_qubicc.npy')],
    axis=0)

In [21]:
# Number of NARVAL samples (they occupy the first rows of input_data/output_data).
# Memory-map the file instead of reading it: only the header is needed to get the
# shape, so the array is not loaded into RAM a second time.
samples_narval = np.load(path_data + '/cloud_cover_output_narval.npy', mmap_mode='r').shape[0]

In [23]:
# Rows are samples, columns are input features
samples_total, no_of_features = input_data.shape
# Display both numbers as the cell output
samples_total, no_of_features

(1008913906, 10)

*Temporal cross-validation*

Split into two-week increments (when working with 3 months of data); with 5 months of data, the increments are 25 days long. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [25]:
# Temporal 3-fold split: the sample axis is cut into six consecutive increments of
# equal length. Fold i (0-based) validates on increments i and i+3 and trains on
# everything else. Samples beyond 6*two_week_incr (the remainder of the integer
# division) are never validated on and therefore belong to every training fold.
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    # (first_incr/second_incr are kept as names because a later cell deletes them explicitly)
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))

    validation_folds.append(np.append(first_incr, second_incr))
    # The training indices are the complement of the two validation increments.
    # Assemble them from the three contiguous ranges around those increments; this
    # yields the same sorted index array as np.delete(np.arange(samples_total), ...)
    # but without allocating the full-length temporaries.
    training_folds.append(np.concatenate((np.arange(0, two_week_incr*i),
                                          np.arange(two_week_incr*(i+1), two_week_incr*(i+3)),
                                          np.arange(two_week_incr*(i+4), samples_total))))

### Define the model

Activation function for the last layer

In [None]:
# Activation function for the last layer
def my_act_fct(x):
    """Clamp x element-wise to the interval [0, 100]."""
    # First cut off everything below 0, then everything above 100
    floored = K.maximum(x, 0)
    return K.minimum(floored, 100)

### 3-fold cross-validation

Only the second fold (i=1) is actually run here.

In [None]:
# By decreasing timeout we make sure every fold gets the same amount of time
# After all, data-loading took some time (Have 3 folds, 60 seconds/minute)
# timeout = timeout - 1/3*1/60*(time.time() - t0)
# Subtract the minutes elapsed since t0 from the per-fold budget, then restart the clock.
# Note: in the live code of this cell, timeout and t0 are not used any further; they
# are only consumed by the commented-out TimeOut callback below.
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

#We loop through the folds
# range(1,2) runs a single iteration with i=1, i.e. only the second fold.
# The loop body deletes input_data, output_data and the fold index lists, so it
# cannot run a second iteration (and the cell cannot be re-run) without
# re-executing the data-loading and fold-construction cells first.
for i in range(1,2):
    
    # Base name for the artefacts of this fold (only used by the commented-out code below)
    filename = 'cross_validation_cell_based_fold_%d'%(i+1)
    
    #Standardize according to the fold
    # (mean/std are estimated on the training indices only)
    scaler.fit(input_data[training_folds[i]])

    #Load the data for the respective fold and convert it to tf data
    # Inputs are standardized with the training statistics; outputs are left unscaled
    input_train = scaler.transform(input_data[training_folds[i]])
    input_valid = scaler.transform(input_data[validation_folds[i]]) 
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]
    
    # Clear memory (Reduces memory requirement to 151 GB)
    # The fold-specific copies above are all that is needed from here on
    del input_data, output_data, first_incr, second_incr, validation_folds, training_folds
    gc.collect()
    
    # Column-based: batchsize of 128
    # (note carried over from another setup; this cell batches with batch_size=512)
    # Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
    # I'm not shuffling for hyperparameter tuning
    # drop_remainder=True discards the last, incomplete batch
    train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                                tf.data.Dataset.from_tensor_slices(output_train))) \
                .batch(batch_size=512, drop_remainder=True) \
                .prefetch(1)
    
    # Clear memory
    del input_train, output_train
    gc.collect()
    
    # No need to add prefetch.
    # tf data with batch_size=10**5 makes the validation evaluation 10 times faster
    # drop_remainder=True also applies here: validation samples that do not fill a
    # complete batch of 10**5 are left out of the validation loss
    valid_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_valid), 
                                tf.data.Dataset.from_tensor_slices(output_valid))) \
                .batch(batch_size=10**5, drop_remainder=True)
    
    # Clear memory (Reduces memory requirement to 151 GB)
    del input_valid, output_valid
    gc.collect()
    
#     #Feed the model
#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
#         loss=tf.keras.losses.MeanSquaredError()
#     )
    
#     #Train the model
# #     time_callback = TimeOut(t0, timeout*(i+1))
#     time_callback = TimeOut(t0, timeout)
#     history = model.fit(train_ds, epochs=epochs, verbose=2, validation_data=valid_ds, 
#                         callbacks=[time_callback])
# #     history = model.fit(train_ds, epochs=epochs, validation_data=valid_ds, callbacks=[time_callback])

#     #Save the model     
#     #Serialize model to YAML
#     model_yaml = model.to_yaml()
#     with open(os.path.join(path_model, filename+".yaml"), "w") as yaml_file:
#         yaml_file.write(model_yaml)
#     #Serialize model and weights to a single HDF5-file
#     model.save(os.path.join(path_model, filename+'.h5'), "w")
#     print('Saved model to disk')
    
#     #Plot the training history
#     if len(history.history['loss']) > len(history.history['val_loss']):
#         del history.history['loss'][-1]
#     pd.DataFrame(history.history).plot(figsize=(8,5))
#     plt.grid(True)
#     plt.ylabel('Mean Squared Error')
#     plt.xlabel('Number of epochs')
#     plt.savefig(os.path.join(path_figures, filename+'.pdf'))
    
#     with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
#         file.write('Results from the %d-th fold\n'%(i+1))
#         file.write('Training epochs: %d\n'%(len(history.history['val_loss'])))
#         file.write('Weights restored from epoch: %d\n\n'%(1+np.argmin(history.history['val_loss'])))

In [None]:
# random_num = np.random.randint(500000)
# print(random_num)

# def save_model(study, today, optimizer):
#     path = '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/sherpa_results/'+\
#             today+'_'+optimizer+'_'+random_num
    
#     study.results = study.results[study.results['Status']=='COMPLETED'] #To specify results
#     study.results.index = study.results['Trial-ID']  #Trial-ID serves as a better index
#     # Remove those hyperparameters that actually do not appear in the model
#     for i in range(1, max(study.results['Trial-ID']) + 1):
#         depth = study.results.at[i, 'model_depth']
#         for j in range(depth, 5): #Or up to 8
#             study.results.at[i, 'activation_%d'%j] = None
# #             study.results.at[i, 'bn_%d'%j] = None
#     # Create the directory and save the SHERPA-output in it
#     try:
#         os.mkdir(path)
#     except OSError:
#         print('Creation of the directory %s failed' % path)
#     else: 
#         print('Successfully created the directory %s' % path)
#     study.save(path)

In [None]:
# # Good Reference: https://arxiv.org/pdf/1206.5533.pdf (Bengio), https://arxiv.org/pdf/2004.10652.pdf (Ott)
# # lrelu = lambda x: relu(x, alpha=0.01)

# For Leaky_ReLU:
from tensorflow import nn 

def lrelu(x):
    """Leaky ReLU with a slope of 0.01 for negative inputs."""
    negative_slope = 0.01
    return nn.leaky_relu(x, alpha=negative_slope)

# OPTIMIZER = 'adam'
# parameters = [sherpa.Ordinal('num_units', [16, 32, 64, 128, 256]), #No need to vary these per layer. Could add 512.
#              sherpa.Discrete('model_depth', [2, 5]), #Originally [2,8] although 8 was never truly tested
#              sherpa.Choice('activation_1', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]), #Adding SeLU is trickier
#              sherpa.Choice('activation_2', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]), 
#              sherpa.Choice('activation_3', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]),
#              sherpa.Choice('activation_4', ['relu', 'elu', 'tanh', nn.leaky_relu, lrelu]),
#              sherpa.Continuous('lrinit', [1e-4, 1e-1], 'log'),
#              sherpa.Ordinal('epsilon', [1e-8, 1e-7, 0.1, 1]),
#              sherpa.Continuous('dropout', [0., 0.5]),
#              sherpa.Continuous('l1_reg', [0, 0.01]),
#              sherpa.Continuous('l2_reg', [0, 0.01])]

              
# #              sherpa.Ordinal('bn_1', [0, 1]),
# #              sherpa.Ordinal('bn_2', [0, 1]),
# #              sherpa.Ordinal('bn_3', [0, 1]),
# #              sherpa.Ordinal('bn_4', [0, 1]),
# #              sherpa.Ordinal('bn_5', [0, 1]),
# #              sherpa.Ordinal('bn_6', [0, 1]),
# #              sherpa.Ordinal('bn_7', [0, 1])]

In [None]:
# # max_num_trials is left unspecified, so the optimization will run until the end of the job-runtime

# good_hyperparams = pd.DataFrame({'num_units': [256], 'model_depth': [3], 'activation_1': [lrelu], 'activation_2':[lrelu],
#                    'activation_3':['relu'], 'activation_4':['relu'], 'activation_5':['relu'], 'activation_6':['relu'],
#                    'activation_7':['relu'], 'lrinit':[0.008725626554323051], 'epsilon':[0.1], 'dropout':[0.1841244119677411], 
#                                  'l1_reg':[0.00016220861742929693], 'l2_reg':[0.007436944699610299]})

# # I expect an objective of around 61.

# alg = bayesian_optimization.GPyOpt(initial_data_points=good_hyperparams)

# # alg = bayesian_optimization.GPyOpt() 
# study = sherpa.Study(parameters=parameters, algorithm=alg, lower_is_better=True)

In [None]:
#Starting only with a few epochs
# (this overrides the epochs value from the configuration cell)
epochs = 3

# Usually setting patience=8
today = str(datetime.date.today())[:7] # YYYY-MM

# Fixed hyperparameter set for this run (no search is performed in this cell).
# With model_depth = 3 only activation_1 and activation_2 are read.
par = {'num_units': 256, 'model_depth': 3, 'activation_1': lrelu, 'activation_2': lrelu, 
       'lrinit': 0.008726, 'epsilon': 0.1, 'dropout': 0.184124,
       'l1_reg': 0.000162, 'l2_reg': 0.007437}

# Create the model
model = Sequential()

# Input layer
# (first Dense layer; it receives the no_of_features input columns)
model.add(Dense(units=par['num_units'], activation=par['activation_1'], input_dim=no_of_features,
               kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))
#     if (par['bn_1']==1):
#         model.add(BatchNormalization()) #There's some debate on whether to use it before or after the activation fct

# Hidden layers    
# range(2, model_depth) adds model_depth - 2 further Dense layers, each followed by Dropout.
# For model_depth = 3 the network is Dense -> Dense -> Dropout -> Dense(1).
for j in range(2, par['model_depth']):
    model.add(Dense(units=par['num_units'], activation=par['activation_'+str(j)], 
                    kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))
    model.add(Dropout(par['dropout'])) #After every hidden layer we (potentially) add a dropout layer
#         if (par['bn_'+str(j)]==1):
#             model.add(BatchNormalization())

# Output layer
# (linear output; the clamping activation my_act_fct defined above is not applied here)
model.add(Dense(1, activation='linear', 
                kernel_regularizer=l1_l2(l1=par['l1_reg'], l2=par['l2_reg'])))

# Optimizer: Adam is relatively robust w.r.t. its beta-parameters 
# Use the `learning_rate` keyword: `lr` is only a deprecated alias and is rejected
# by newer Keras releases.
optimizer = Adam(learning_rate=par['lrinit'], epsilon=par['epsilon']) 
model.compile(loss='mse', optimizer=optimizer)

# Train the model
model.fit(train_ds, epochs=epochs, verbose=1, validation_data=valid_ds) ## 3 epochs

**Learning rate scheduler**

We should add it to Sherpa when we test more than 10 epochs

In [None]:
# def scheduler(epoch, lr):
#     if epoch < 10:
#         return lr
#     else:
#         return lr * tf.math.exp(-0.1)
    
# callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
    
# model = tf.keras.Sequential(
#             [
#                 tf.keras.layers.Dense(256, activation='relu', input_dim = no_of_features),
#                 tf.keras.layers.Dense(256, activation='relu'),
#                 tf.keras.layers.Dense(1, activation=my_act_fct, dtype='float32'),
#             ],
#             name="cell_based_model",
#         )

# #Feed the model
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
#     loss=tf.keras.losses.MeanSquaredError()
# )

In [None]:
# history = model.fit(train_ds, epochs=5, verbose=2, callbacks=[callback])