## Cross-Validation

1. We read the data from the npy files
2. We combine the QUBICC and NARVAL data
3. We remove columns that are constant throughout
4. Set up cross validation

During cross-validation:

1. We scale the data, convert to tf data (note that it is troublesome to save and load tf data)
2. Plot training progress
3. Write epochs into a text file

In [3]:
# Requires at least 500GB to run

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# Wall-clock start of the notebook; the cross-validation cell uses it to
# deduct the setup/data-loading time from the training timeout
t0 = time.time()
# Root directory from which the code, model and data paths below are built
path = '/pf/b/b309170'

# Add path with my_classes to sys.path
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import read_mean_and_std
from my_classes import TimeOut

# Minutes per fold
timeout = 2120

# For logging purposes
days = 'all_days'

# Maximum amount of epochs for each model
epochs = 50

# Set seed for reproducibility
seed = 10
tf.random.set_seed(seed)

# For store_mean_model_biases
VERT_LAYERS = 31

# Restrict TensorFlow to the first GPU. On a machine without a visible GPU the
# list is empty, so indexing it unconditionally would raise an IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
else:
    print('No GPU found: TensorFlow will run on the CPU')

In [4]:
# Cloud Cover or Cloud Area?
output_var = 'cl_area' # Set output_var to one of {'clc', 'cl_area'}
# QUBICC only or QUBICC+NARVAL training data? Always True for the paper
qubicc_only = True

path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data')

# Long name of the output variable as it appears in file and folder names.
# Fail right here on an unsupported value instead of with a NameError further down.
_full_output_var_names = {'clc': 'cloud_cover', 'cl_area': 'cloud_area'}
if output_var not in _full_output_var_names:
    raise ValueError("output_var must be one of {'clc', 'cl_area'}, got %r" % (output_var,))
full_output_var_name = _full_output_var_names[output_var]

# Models and figures are stored in a folder named after the output variable and the training data
if qubicc_only:
    output_folder = '%s_R2B5_QUBICC'%full_output_var_name
else:
    output_folder = '%s_R2B5_QUBICC+NARVAL'%full_output_var_name
path_model = os.path.join(path_base, 'saved_models', output_folder)
path_figures = os.path.join(path_base, 'figures', output_folder)

# File names of the output (target) arrays inside path_data
narval_output_file = '%s_output_narval.npy'%full_output_var_name
qubicc_output_file = '%s_output_qubicc.npy'%full_output_var_name

In [None]:
# Prevents crashes of the code: make only the first GPU visible to TensorFlow.
# `gpus` is empty on a machine without a GPU, so it is only indexed when non-empty.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

In [None]:
# Allow the growth of memory Tensorflow allocates (limits memory usage overall)
# `gpus` is the list of physical GPUs queried in the previous cell; the loop
# is a no-op when that list is empty.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Standardizer shared by all folds; the cross-validation loop fits it on the
# training samples of the respective fold before transforming the data
scaler = StandardScaler()

### Load the data

In [7]:
# input_narval = np.load(path_data + '/cloud_cover_input_narval.npy')
# input_qubicc = np.transpose(np.load(path_data + '/cloud_cover_input_qubicc.npy'))
# output_narval = np.load(path_data + '/cloud_cover_output_narval.npy')
# output_qubicc = np.transpose(np.load(path_data + '/cloud_cover_output_qubicc.npy'))

In [None]:
def _stack_narval_qubicc(narval_path, qubicc_path):
    """Load one NARVAL and one QUBICC array and stack them along the first axis.

    The QUBICC array is transposed before stacking. NARVAL rows come first
    in the result. The two loaded parts are local to this function, so they
    are released as soon as the concatenated array is returned.
    """
    narval_part = np.load(narval_path)
    qubicc_part = np.transpose(np.load(qubicc_path))
    return np.concatenate((narval_part, qubicc_part), axis=0)

# The input files are the same for both output variables
input_data = _stack_narval_qubicc(path_data + '/cloud_cover_input_narval.npy',
                                  path_data + '/cloud_cover_input_qubicc.npy')
# The output files depend on the chosen output variable
output_data = _stack_narval_qubicc(os.path.join(path_data, narval_output_file),
                                   os.path.join(path_data, qubicc_output_file))

In [None]:
# Number of NARVAL samples, i.e. the number of leading rows in the concatenated arrays.
# Only the shape is needed, so memory-map the file instead of reading the whole array.
samples_narval = np.load(path_data + '/cloud_cover_output_narval.npy', mmap_mode='r').shape[0]

In [None]:
# Drop the NARVAL samples, which occupy the first samples_narval rows of the
# concatenated arrays. Note: this cell is not idempotent; running it a second
# time without reloading the data would cut off another samples_narval rows.
if qubicc_only:
    input_data = input_data[samples_narval:]
    output_data = output_data[samples_narval:]

In [9]:
# Number of samples (rows) and features (columns) of the input array;
# the bare tuple on the last line is shown as the cell output
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(176209421, 163)

*Temporal cross-validation*

Split into two-week increments (when working with 3 months of data). With 5 months of data, the increments are 25 days long. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [10]:
# One index array per fold, indexing the first axis of the data arrays
training_folds = []
validation_folds = []
# Length of one of the six consecutive increments
two_week_incr = samples_total//6

for i in range(3):
    # Fold i validates on increments i and i+3.
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))
    validation_folds.append(np.concatenate((first_incr, second_incr)))

    # All remaining samples form the training set of this fold
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[i]))

Remove columns that are constant in at least one of the training folds

In [11]:
# This takes a bit of time
# remove_fields = []
# constant_0 = (np.max(input_data[training_folds[0]], axis=0) - np.min(input_data[training_folds[0]], axis=0) < 1e-10)
# constant_1 = (np.max(input_data[training_folds[1]], axis=0) - np.min(input_data[training_folds[1]], axis=0) < 1e-10)
# constant_2 = (np.max(input_data[training_folds[2]], axis=0) - np.min(input_data[training_folds[2]], axis=0) < 1e-10)
# for i in range(no_of_features):
#     if constant_0[i] or constant_1[i] or constant_2[i]:
#         print(i)
#         remove_fields.append(i)

# Columns 27-32 and 135-137 are dropped.
# These features correspond to qc_4, qc_5, qc_6, qc_7, qc_8, qc_9, zg_4, zg_5, zg_6
remove_fields = list(range(27, 33)) + list(range(135, 138))
# The hard-coded indices refer to the full set of 163 features; the assertion
# fails if the feature count differs (e.g. if the columns were already removed)
assert no_of_features == 163
input_data = np.delete(input_data, remove_fields, axis=1)
no_of_features -= len(remove_fields)

### Define the model

Activation function for the last layer

In [13]:
# Fully connected network: two hidden ReLU layers with 256 units each,
# followed by a linear float32 output layer with 27 units
model = Sequential(
    layers=[
        Dense(256, activation='relu', input_dim=no_of_features),
        Dense(256, activation='relu'),
        Dense(27, activation='linear', dtype='float32'),
    ],
    name="column_based_model",
)

In [14]:
# # Stepping down on the ladder of complexity
# # Can't save a custom model
# class CustomModel(tf.keras.Model):
    
#     def __init__(self, model):
#         super(CustomModel, self).__init__()
#         self.model = model
    
#     def compile(self, optimizer, loss_fn):
#         super(CustomModel, self).compile()
#         self.optimizer = optimizer
#         self.loss_fn = loss_fn
    
#     # Call accepts only tf.tensors
#     def call(self, x):
#         return self.model(x)  
    
#     # Compile with XLA (throws an error)
#     # @tf.function(experimental_compile=True)
# #     @tf.function
#     def train_step(self, data):
#         # Unpack the data. Its structure depends on your model and
#         # on what you pass to `fit()`.
#         x,y = data

#         with tf.GradientTape() as tape:
#             y_pred = self(x, training=True)  # Forward pass
#             # Compute the loss value
#             # (the loss function is configured in `compile()`)
#             loss = self.loss_fn(y, y_pred)

#         # Compute gradients
#         trainable_vars = self.trainable_variables
#         gradients = tape.gradient(loss, trainable_vars)
#         # Update weights
#         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
#         # Update metrics (includes the metric that tracks the loss)
#         self.compiled_metrics.update_state(y, y_pred)
#         # Return a dict mapping metric names to current value
#         return {'loss': loss}
    
#     # Without the test step, our model would yield 0 in every kind of evaluation outside training itself
#     def test_step(self, data):
#         # Unpack the data
#         x, y = data
#         # Compute predictions
#         y_pred = self(x, training=False)
#         # Updates the metrics tracking the loss
#         loss = self.loss_fn(y, y_pred)
#         # Update the metrics.
# #         self.compiled_metrics.update_state(y, y_pred)
#         # Return a dict mapping metric names to current value.
#         # Note that it will include the loss (tracked in self.metrics).
#         return {'loss': loss}

### 3-fold cross-validation

In [None]:
# By decreasing timeout we make sure every fold gets the same amount of time.
# After all, data-loading took some time. timeout is given in minutes and
# time.time() in seconds, hence the factor 1/60.
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

# Create the output folders up front, so that a missing directory cannot make
# the run fail only after a fold has finished training
os.makedirs(path_model, exist_ok=True)
os.makedirs(path_figures, exist_ok=True)

# We loop through the folds
for i in range(3):

    filename = 'cross_validation_column_based_fold_%d'%(i+1)

    # Every fold gets its own time budget of `timeout` minutes.
    # NOTE(review): assumes TimeOut(start, minutes) stops the training once
    # `minutes` have passed since `start` -- confirm in my_classes.
    fold_t0 = time.time()

    train_idx = training_folds[i]
    valid_idx = validation_folds[i]

    # Standardize according to the fold: the scaler is fitted on the training
    # samples only and then applied to training and validation inputs.
    # fit_transform avoids extracting the training rows twice.
    input_train = scaler.fit_transform(input_data[train_idx])
    input_valid = scaler.transform(input_data[valid_idx])
    output_train = output_data[train_idx]
    output_valid = output_data[valid_idx]

    # The source arrays (input_data, output_data) and the fold indices must stay
    # alive here: deleting them inside the loop makes every fold after the first
    # one fail with a NameError. The per-fold copies are freed at the end of the
    # iteration instead.
    # NOTE(review): this keeps more memory allocated during training than
    # deleting the source arrays would -- confirm it fits on the target machine.

    # Alternative input pipeline with tf.data (see the timings below this cell):
    #     train_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_train),
    #                                     tf.data.Dataset.from_tensor_slices(output_train))) \
    #                 .batch(batch_size=128, drop_remainder=True).prefetch(1)
    #     valid_ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input_valid),
    #                                     tf.data.Dataset.from_tensor_slices(output_valid))) \
    #                 .batch(batch_size=10**5, drop_remainder=True)
    #     history = model.fit(train_ds, epochs=epochs, verbose=2,
    #                         validation_data=valid_ds, callbacks=[time_callback])

    # Start every fold from newly initialized weights. Re-using the same model
    # instance would continue training the weights of the previous fold, which
    # has already seen the samples this fold validates on.
    model = tf.keras.models.clone_model(model)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.MeanSquaredError()
    )

    # Train the model on the NumPy arrays with an explicit batch size
    time_callback = TimeOut(fold_t0, timeout)
    history = model.fit(input_train, output_train, epochs=epochs, verbose=2, batch_size=128,
                        validation_data=(input_valid, output_valid), callbacks=[time_callback])

    # Save the model
    # Serialize model to YAML
    # NOTE: Model.to_yaml() raises a RuntimeError from TensorFlow 2.6 on;
    # Model.to_json() is the replacement on newer versions.
    model_yaml = model.to_yaml()
    with open(os.path.join(path_model, filename+".yaml"), "w") as yaml_file:
        yaml_file.write(model_yaml)
    # Serialize model and weights to a single HDF5-file (the second argument of
    # save() is the overwrite flag, not a file mode)
    model.save(os.path.join(path_model, filename+'.h5'), overwrite=True)
    print('Saved model to disk')

    # Plot the training history. If the last epoch was cut short there is one
    # more training loss than validation losses; drop it to align the curves.
    if len(history.history['loss']) > len(history.history['val_loss']):
        del history.history['loss'][-1]
    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Number of epochs')
    plt.savefig(os.path.join(path_figures, filename+'.pdf'))

    # Append a short summary of this fold to its text file
    with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
        file.write('Results from the %d-th fold\n'%(i+1))
        file.write('Training epochs: %d\n'%(len(history.history['val_loss'])))
        file.write('Weights restored from epoch: %d\n\n'%(np.argmin(history.history['val_loss']) + 1))

    # Free the per-fold copies before the next fold allocates its own
    del input_train, input_valid, output_train, output_valid
    gc.collect()

How much time did it take? <br>
With batch_size=10^5: 330s <br>
With batch_size=10^3: 477s <br>
With batch_size=10^1: 8511s <br>
Without tf data: 3000s