## We try to reduce the complexity of the DYAMOND cl_area NNs with 4-7 features

See #236.

In [None]:
# Ran with 800GB (750GB should also be fine)

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# For Leaky_ReLU:
from tensorflow import nn 

t0 = time.time()
path = '/home/b/b309170'

# Add path with my_classes to sys.path
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import write_infofile
from my_classes import read_mean_and_std
from my_classes import TimeOut

# Index of the cross-validation fold used for training/validation.
# We always pick the second fold (fold = 1).
fold = 1

# Hyperparameters are read from the command line:
#   <no_features> <bn> <third_layer> <no_units>

# Number of input features of the model to train (between 4 and 7)
no_features = int(sys.argv[1])

# Flags passed as 0/1: add a batch normalization layer / add a third hidden layer
bn = bool(int(sys.argv[2]))
third_layer = bool(int(sys.argv[3]))

# Number of units per hidden layer (one of 16, 32, 64, 128)
no_units = int(sys.argv[4])

# Time budget for training, in minutes
timeout = 450 

# Maximum number of epochs
epochs = 25 

# Seed for TensorFlow's random number generator (also reused for dataset shuffling)
seed = 10
tf.random.set_seed(seed)

# gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

# Record the TensorFlow version in the run log
print(tf.__version__)

In [9]:
import matplotlib
# Select the non-interactive PDF backend: figures are only written to file
matplotlib.use('PDF')

In [15]:
# Target variable of the NN; must be one of {'cl_volume', 'cl_area'}
output_var = 'cl_area'

# Project directory (models, figures) and directory holding the .npy input/output arrays
path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Destinations for the trained models/info files and for the training-history plots
path_model = os.path.join(path_base, 'saved_models/hyperparameter_tests')
path_figures = os.path.join(path_base, 'figures/hyperparameter_tests')

In [16]:
# Restrict TensorFlow to the first GPU and let it allocate GPU memory on demand
# instead of reserving all of it up front. On a CPU node there is nothing to configure.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
        # Iterate over the devices we actually queried. The previous loop used an
        # undefined name, and the resulting NameError was silently swallowed,
        # so memory growth was never enabled.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except (RuntimeError, ValueError) as err:
        # RuntimeError: the TensorFlow runtime has already been initialized
        # ValueError: the device specification was rejected
        print('GPU configuration skipped: %s' % err)
else:
    print('No GPU found, running on CPU')

In [17]:
# Standardizes each input feature to zero mean and unit variance; it is fitted on the training fold further below
scaler = StandardScaler()

### Load the data

In [18]:
# Names of the 24 candidate input features.
# The position in this list is used as the column index into the stored input array
# (assumes the array was written in exactly this order).
features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Lookup table: feature name -> column index
loc = {name: column for column, name in enumerate(features)}

In [19]:
# Input data
# The 17_15 ran with the largest amount of training data.
# open() does not expand '~', so the home directory is resolved explicitly.
seq_results_path = os.path.expanduser(
    '~/workspace_icon-ml/symbolic_regression/finding_symmetries/seq_feature_selector_dyamond_nns/'
    'seq_feat_selector_training_data_%s_17_15.json' % output_var)
with open(seq_results_path, 'r') as file:
    seq_results = json.load(file)

# Feature names stored for the requested number of inputs
selected_vars = seq_results['features_%d'%no_features]

# Load all candidate features, then keep only the selected columns (in the order of selected_vars)
input_data = np.load(path_data + '/cloud_cover_input_dyamond.npy')
input_data = np.concatenate([np.expand_dims(input_data[:, loc[sel_var]], axis=1) for sel_var in selected_vars], axis = 1)

In [23]:
# Sanity check: (number of samples, number of selected features)
input_data.shape

(285179494, 4)

In [25]:
# Output data: load the target array that belongs to output_var
if output_var == 'cl_volume':
    output_data = np.load(path_data + '/cloud_cover_output_dyamond.npy')
elif output_var == 'cl_area':
    output_data = np.load(path_data + '/cloud_area_output_dyamond.npy')
else:
    # Fail right away instead of running into an undefined output_data much later
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % output_var)

In [26]:
# Number of samples (rows) of the input array; the column count is not needed here
samples_total, _ = input_data.shape
# Display (number of samples, number of input features)
(samples_total, no_features)

(285179494, 4)

*Temporal cross-validation*

Split the data into two-week increments (when working with 3 months of data); with 5 months of data the increments are 25 days long. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [27]:
# Length of one of the six consecutive increments the samples are divided into
two_week_incr = samples_total // 6

validation_folds, training_folds = [], []

# Fold k validates on increments k and k+3 and trains on all remaining samples.
# Note that this is a temporal split since time was the first dimension in the original tensor.
for k in range(3):
    first_incr = np.arange(k * two_week_incr, (k + 1) * two_week_incr)
    second_incr = np.arange((k + 3) * two_week_incr, (k + 4) * two_week_incr)
    held_out = np.append(first_incr, second_incr)

    validation_folds.append(held_out)
    # Everything that is not held out (including the samples_total % 6 leftover samples) is used for training
    training_folds.append(np.delete(np.arange(samples_total), held_out))

### Define the model

Activation function for the last layer

In [37]:
# L1/L2 penalty strengths applied to the kernel of every Dense layer
reg_strength = dict(l1=0.004749, l2=0.008732)

# Assemble the network layer by layer
model = Sequential()

# Hidden layer 1: tanh activation, takes the no_features inputs
model.add(Dense(units=no_units, activation='tanh', input_dim=no_features,
                kernel_regularizer=l1_l2(**reg_strength)))

# Hidden layer 2: leaky ReLU activation, optionally followed by batch normalization
model.add(Dense(units=no_units, activation=nn.leaky_relu,
                kernel_regularizer=l1_l2(**reg_strength)))
if bn:
    model.add(BatchNormalization())

# Hidden layer 3 (optional): tanh activation
if third_layer:
    model.add(Dense(units=no_units, activation='tanh',
                    kernel_regularizer=l1_l2(**reg_strength)))

# Output layer: a single unit with linear activation
model.add(Dense(1, activation='linear', kernel_regularizer=l1_l2(**reg_strength)))

Preliminary baselines

In [38]:
# # This would be the loss of a NN which outputs zeros everywhere
# np.mean(np.array(output_data)**2)

In [39]:
# # This would be the loss of a NN which outputs the best constant value everywhere
# constant_mean = np.mean(np.array(output_data))
# np.mean((np.array(output_data) - constant_mean)**2)

### 3-fold cross-validation

When the training gets stuck in a local minimum, a re-run with a different initialization of the model weights often helps.
A different shuffling seed may help as well.

In [None]:
# Subtract the minutes that have already passed since the start of the script
# (mostly data loading) from the time budget, then restart the clock.
# Earlier variant that spread the elapsed time over three folds:
# timeout = timeout - 1/3*1/60*(time.time() - t0)
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

# Common file stem for the model, info and figure files; it encodes all hyperparameters of this run
filename = 'neighborhood_based_sfs_%s_no_features_%s_%s_%s_%d'%(output_var, no_features, bn, third_layer, no_units)

# Fit the scaler on the training samples of the chosen fold only
scaler.fit(input_data[training_folds[fold]])

# Write the scaler parameters and the accompanying info file, but only if the info file does not exist yet
if not os.path.exists(os.path.join(path_model, filename + '.txt')):
    # Two-digit identifier: leading 0 for cl_volume or 1 for cl_area, followed by the fold index
    if output_var == 'cl_volume':
        seed_i = int(str(0) + str(fold))
    elif output_var == 'cl_area':
        seed_i = int(str(1) + str(fold))
    # NOTE(review): the file name depends only on output_var and fold and the file is opened in
    # append mode, so runs with different feature sets add their parameters to the same file
    with open(path_model+'/scaler_%d.txt'%seed_i, 'a') as file:
        file.write('Standard Scaler mean values:\n')
        file.write(str(scaler.mean_))
        file.write('\nStandard Scaler standard deviation:\n')
        file.write(str(np.sqrt(scaler.var_)))

    # Taken from preprocessing: the variable names are passed to write_infofile as strings
    # (write_infofile comes from my_classes; its output format is not visible here)
    in_and_out_variables = np.array(selected_vars + [output_var])
    input_variables = np.array(selected_vars)
    with open(os.path.join(path_model, filename + '.txt'), 'a') as file:
        write_infofile(file, str(in_and_out_variables), str(input_variables), path_model, path_data, seed_i)

# Extract the training/validation samples of the chosen fold; the inputs are standardized, the outputs are not
input_train = scaler.transform(input_data[training_folds[fold]])
input_valid = scaler.transform(input_data[validation_folds[fold]]) 
output_train = output_data[training_folds[fold]]
output_valid = output_data[validation_folds[fold]]

# The full arrays and the index arrays are no longer needed
# Clear memory (Reduces memory requirement to 151 GB)
del input_data, output_data, first_incr, second_incr, validation_folds, training_folds
gc.collect()

# Training pipeline: (input, output) pairs -> shuffle -> batch -> prefetch
# Column-based: batchsize of 128
# Cell-based: batchsize of at least 512
# Shuffling is very important because the data starts with the uppermost layers, where clc=0 basically throughout.
# Without it, training can be pushed into a local minimum that preferably yields clc=0.
# The size of the shuffle buffer significantly impacts RAM requirements.
# NOTE(review): this comment used to warn against buffers above 10000, but the buffer below is 10**5 - confirm which is intended
# Possibly better to use .apply(tf.data.experimental.copy_to_device("/gpu:0")) before prefetch
# We might want to cache before shuffling, however it seems to slow down training
# We do not repeat after shuffle, because the validation set should be evaluated after each epoch
train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train), 
                            tf.data.Dataset.from_tensor_slices(output_train))) \
            .shuffle(10**5, seed=seed) \
            .batch(batch_size=1028, drop_remainder=True) \
            .prefetch(1)

# The NumPy training arrays are no longer referenced by name
# Clear memory
del input_train, output_train
gc.collect()

# Validation pipeline: no shuffling and no prefetch needed.
# tf data with batch_size=10**5 makes the validation evaluation 10 times faster
# NOTE(review): drop_remainder=True discards the last incomplete batch, so up to 10**5 - 1
# validation samples are never evaluated - confirm that this is acceptable
valid_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_valid), 
                            tf.data.Dataset.from_tensor_slices(output_valid))) \
            .batch(batch_size=10**5, drop_remainder=True)

# Clear memory (Reduces memory requirement to 151 GB)
del input_valid, output_valid
gc.collect()

# Compile with Adam and a mean-squared-error loss.
# Rule of thumb: increase the learning rate by a factor of 2 when increasing the batch size by a factor of 4.
adam = tf.keras.optimizers.Adam(learning_rate=0.000433, epsilon=0.1)
mse_loss = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=adam, loss=mse_loss)

# TimeOut (from my_classes) receives the start time and the time budget in minutes;
# presumably it stops training once the budget is used up - confirm in my_classes
time_callback = TimeOut(t0, timeout)

# Train the model (roughly 20 minutes per epoch); the validation set is evaluated after every epoch
history = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=epochs,
    verbose=2,
    callbacks=[time_callback],
)

# Save the model
# Serialize model and weights to a single HDF5 file first, so that the trained weights
# are on disk even if exporting the architecture fails afterwards.
# The second positional argument of model.save is `overwrite`, so it is passed by keyword.
model.save(os.path.join(path_model, filename+'.h5'), overwrite=True)

# Serialize the architecture. model.to_yaml() raises a RuntimeError from TensorFlow 2.6 on,
# does not exist in newer Keras versions, and needs PyYAML in older ones.
# In all these cases we fall back to the JSON export.
try:
    model_config, config_ext = model.to_yaml(), '.yaml'
except (RuntimeError, AttributeError, ImportError):
    model_config, config_ext = model.to_json(), '.json'
with open(os.path.join(path_model, filename + config_ext), "w") as config_file:
    config_file.write(model_config)
print('Saved model to disk')

# Plot the training history
train_losses = history.history['loss']
val_losses = history.history['val_loss']

# If there is one more training loss than validation losses (presumably because training
# stopped before the last validation pass), drop it so that both curves have equal length
if len(train_losses) > len(val_losses):
    train_losses.pop()

history_ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
history_ax.grid(True)
history_ax.set_ylabel('Mean Squared Error')
history_ax.set_xlabel('Number of epochs')
history_ax.get_figure().savefig(os.path.join(path_figures, filename+'.pdf'))

# Append the results of this run to the info file
completed_epochs = len(val_losses)
best_epoch = 1 + np.argmin(val_losses)
with open(os.path.join(path_model, filename+'.txt'), 'a') as info_file:
    info_file.write('Results from the %d-th fold\n'%(fold+1))
    info_file.write('Training epochs: %d\n'%(completed_epochs))
    info_file.write('Weights restored from epoch: %d\n\n'%(best_epoch))

Starting training
Epoch 1/25
