## Cross-Validation

1. We read the data from the npy files
2. We combine the QUBICC and NARVAL data
3. Set up cross validation

During cross-validation:

1. We scale the data, convert to tf data
2. Plot training progress, model biases 
3. Write losses and epochs into file

In [1]:
# Ran with 800GB (750GB should also be fine)

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# For Leaky_ReLU:
from tensorflow import nn 

import matplotlib
matplotlib.use('PDF')

# Wall-clock reference point; the training cell subtracts the time elapsed since here from the timeout
t0 = time.time()

# Root directory below which both the workspace and the data directories are located
path = '/home/b/b309170'

# Prepend the directory that holds my_classes to the module search path
my_classes_dir = path + '/workspace_icon-ml/cloud_cover_parameterization/'
sys.path.insert(0, my_classes_dir)

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import write_infofile
from my_classes import read_mean_and_std
from my_classes import TimeOut

# Cross-validation fold, read from the first command-line argument (one of 0, 1, 2)
fold = int(sys.argv[1])
if fold not in (0, 1, 2):
    # Fail fast: an invalid fold would otherwise only be noticed after the data has been
    # loaded, and a negative value would silently select a different fold via negative indexing
    raise ValueError('fold must be 0, 1 or 2, got %d' % fold)

# 'all', 'no_spinup'
days = 'no_spinup'

# Minutes per fold
timeout = 450

# Maximum amount of epochs for each model
epochs = 30

# Set seed for reproducibility
seed = 10
tf.random.set_seed(seed)

# gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

print(tf.__version__)

2.7.0


In [2]:
# Cloud Cover or Cloud Area?
output_var = sys.argv[2] # Set output_var to one of {'cl_volume', 'cl_area'}
if output_var not in ('cl_volume', 'cl_area'):
    # Fail fast: the later cells only handle these two values and would otherwise
    # fail much later with an undefined variable
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

# Workspace directory (models and figures are written below it) and data directory
path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/grid_column_based_DYAMOND')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_DYAMOND')

path_model = os.path.join(path_base, 'saved_models')
path_figures = os.path.join(path_base, 'figures')

In [None]:
# Best-effort GPU configuration. Won't run on a CPU node: there the device list is empty
# and indexing it raises IndexError, which is reported and skipped.
try:
    # Prevents crashes of the code
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
    # Allow the growth of memory Tensorflow allocates (limits memory usage overall)
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except (IndexError, RuntimeError, ValueError) as err:
    # IndexError: no GPU present; RuntimeError/ValueError: TensorFlow rejected the setting
    # (e.g. because the devices were already initialized)
    print('GPU configuration skipped: %r' % (err,))

In [4]:
# Standardizer for the input features; it is fitted on the training samples of the selected fold in the training cell
scaler = StandardScaler()

### Load the data

In [5]:
# File holding the targets for each supported output variable
output_files = {
    'cl_volume': 'cloud_cover_output_dyamond.npy',
    'cl_area': 'cloud_area_output_dyamond.npy',
}
if output_var not in output_files:
    # Checked before the (large) input file is read so that a typo does not cost a full load
    raise ValueError("output_var must be one of %s, got %r" % (sorted(output_files), output_var))

# The stored arrays are transposed so that axis 0 indexes the samples and axis 1 the variables
input_data = np.transpose(np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy')))
output_data = np.transpose(np.load(os.path.join(path_data, output_files[output_var])))

In [6]:
def _discard_spinup(data, t_steps, h_fields):
    """Remove the spin-up time steps from a flattened (t_steps*h_fields, no_vars) array.

    Each column of ``data`` is interpreted as a (t_steps, h_fields) field with time as the
    slow axis. Only the time steps 80..328 and (329+72)..end are kept; the remaining
    time steps are discarded as spin-up.

    Parameters:
        data: 2D array with t_steps*h_fields rows and one column per variable.
        t_steps: number of time steps contained in ``data``.
        h_fields: number of horizontal fields per time step.

    Returns:
        A float64 array of shape (kept_steps*h_fields, no_vars) with the same row
        ordering (time-major) as the input.

    Raises:
        ValueError: if a column cannot be reshaped to (t_steps, h_fields).
    """
    no_vars = data.shape[1]
    all_steps = np.arange(t_steps)
    # Discard spinup: same two windows for every variable
    kept_steps = np.concatenate((all_steps[80:329], all_steps[(329+72):]))
    trimmed = np.zeros((kept_steps.size * h_fields, no_vars))
    # Working column by column avoids holding a full-size unfolded copy of the data in memory
    for i in range(no_vars):
        # Invert reshaping, select the kept time steps, and flatten back
        field = np.reshape(data[:, i], (t_steps, h_fields))
        trimmed[:, i] = np.reshape(field[kept_steps], -1)
    return trimmed


# Actually, we have to remove the spinup here, if we want to have a model comparable to the other ones from symbolic regression!
if days == 'no_spinup':

    t_steps = 619
    h_fields = 79342

    input_data = _discard_spinup(input_data, t_steps, h_fields)
    output_data = _discard_spinup(output_data, t_steps, h_fields)

In [10]:
# Rows are samples, columns are input features; the pair is displayed as the cell output
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(37052714, 163)

*Temporal cross-validation*

The data is split into six increments: two-week increments when working with 3 months of data, and 25-day increments when working with 5 months of data. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [7]:
# Sample indices of the training and validation part of each of the three folds
training_folds = []
validation_folds = []

# Length of one of the six consecutive increments (in samples)
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    # Fold i validates on increment i and on increment i+3
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))
    validation_folds.append(np.concatenate((first_incr, second_incr)))

    # Every sample that is not used for validation is used for training
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[-1]))

Remove columns that are constant in at least one of the training folds

In [13]:
# Hard-coded result of the check sketched below (kept for reference, not executed):
# a column is listed if max - min < 1e-6 on the training samples of at least one fold.
#
#     remove_fields = []
#     for k in range(3):
#         train = input_data[training_folds[k]]
#         constant_k = np.max(train, axis=0) - np.min(train, axis=0) < 1e-6
#         ...collect every index i for which any constant_k[i] is True...
remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]

# The indices refer to the 163-column layout; this also stops an accidental second run of the cell
assert no_of_features == 163
input_data = np.delete(input_data, remove_fields, axis=1)
no_of_features -= len(remove_fields)

[]

### Define the model

The output layer uses a linear activation function.

In [None]:
# Fully connected network: two hidden ReLU layers with 256 units each,
# followed by a linear float32 output layer with 27 units
model = Sequential(name="column_based_model")
model.add(Dense(256, activation='relu', input_dim=no_of_features))
model.add(Dense(256, activation='relu'))
model.add(Dense(27, activation='linear', dtype='float32'))

### 3-fold cross-validation

When the training gets stuck in a local minimum, a re-run with a different initialization of the model weights
(or possibly a different shuffling seed) often helps.

In [None]:
# By decreasing timeout we make sure every fold gets the same amount of time
# After all, data-loading took some time (timeout is in minutes, time.time() in seconds)
timeout = timeout - 1/60*(time.time() - t0)
t0 = time.time()

if output_var not in ('cl_volume', 'cl_area'):
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

if days == 'all':
    filename = 'cross_validation_column_based_%s_fold_%d'%(output_var, fold+1)
elif days == 'no_spinup':
    filename = 'cross_validation_column_based_%s_fold_%d_no_spinup'%(output_var, fold+1)
else:
    raise ValueError("days must be 'all' or 'no_spinup', got %r" % (days,))

# Make sure the output directories exist before the (long) training starts
os.makedirs(path_model, exist_ok=True)
os.makedirs(path_figures, exist_ok=True)

#Standardize according to the fold
scaler.fit(input_data[training_folds[fold]])

# Write the accompanying info-file [only once]
if not os.path.exists(os.path.join(path_model, filename + '.txt')):
    # We save the scaling parameters in a file [only once]
    # The file id is built from a digit for the output variable followed by the fold
    if output_var == 'cl_volume':
        seed_i = int(str(0) + str(fold))
    elif output_var == 'cl_area':
        seed_i = int(str(1) + str(fold))
    with open(path_model+'/scaler_%d.txt'%seed_i, 'a') as file:
        file.write('Standard Scaler mean values:\n')
        file.write(str(scaler.mean_))
        file.write('\nStandard Scaler standard deviation:\n')
        file.write(str(np.sqrt(scaler.var_)))
    # Taken from preprocessing
    variables = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg']
    input_variables = [el+'_%d'%i for el in variables for i in range(21, 48)]
    input_variables.append('fr_land')
    # Drop the names of the columns that were removed from input_data, so that the
    # listed names line up with the features the model is actually trained on
    removed_columns = set(remove_fields)
    input_variables = [name for idx, name in enumerate(input_variables) if idx not in removed_columns]

    in_and_out_variables = input_variables + [output_var+'_%d'%i for i in range(21, 48)]
    with open(os.path.join(path_model, filename + '.txt'), 'a') as file:
        write_infofile(file, str(in_and_out_variables), str(input_variables), path_model, path_data, seed_i)

#Load the data for the respective fold and scale the inputs with the fold's scaler
input_train = scaler.transform(input_data[training_folds[fold]])
input_valid = scaler.transform(input_data[validation_folds[fold]])
output_train = output_data[training_folds[fold]]
output_valid = output_data[validation_folds[fold]]

# Clear memory (Reduces memory requirement to 151 GB)
del input_data, output_data, first_incr, second_incr, validation_folds, training_folds
gc.collect()

#Feed the model. Increase the learning rate by a factor of 2 when increasing the batch size by a factor of 4
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.MeanSquaredError()
)

#Train the model (the TimeOut callback receives the start time and the remaining minutes)
time_callback = TimeOut(t0, timeout)
history = model.fit(input_train, output_train, epochs=epochs, verbose=2, batch_size=128,
                    validation_data=(input_valid, output_valid), callbacks=[time_callback])

#Save the model
#Serialize model and weights to a single HDF5-file first, so the trained weights are on disk
#before anything else can fail
model.save(os.path.join(path_model, filename+'.h5'), overwrite=True)
#Serialize the architecture to JSON; Model.to_yaml() raises a RuntimeError since TensorFlow 2.6
with open(os.path.join(path_model, filename+".json"), "w") as json_file:
    json_file.write(model.to_json())
print('Saved model to disk')

#Plot the training history
# If the training loss has one more entry than the validation loss, drop the
# surplus entry so that both curves have the same length
if len(history.history['loss']) > len(history.history['val_loss']):
    del history.history['loss'][-1]
pd.DataFrame(history.history).plot(figsize=(8,5))
plt.grid(True)
plt.ylabel('Mean Squared Error')
plt.xlabel('Number of epochs')
plt.savefig(os.path.join(path_figures, filename+'.pdf'))

# NOTE(review): the reported epoch is the one with the lowest validation loss; no weight
# restoration is visible in this cell - presumably TimeOut handles it, confirm in my_classes
with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
    file.write('Results from the %d-th fold\n'%(fold+1))
    file.write('Training epochs: %d\n'%(len(history.history['val_loss'])))
    file.write('Weights restored from epoch: %d\n\n'%(1+np.argmin(history.history['val_loss'])))