## Cross-Validation

1. We read the data from the npy files
2. We combine the QUBICC and NARVAL data
3. Set up cross validation

During cross-validation:

1. We scale the data, convert to tf data
2. Plot training progress, model biases 
3. Write losses and epochs into file

In [1]:
# Ran with 800GB (750GB should also be fine)

import sys
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# For Leaky_ReLU:
from tensorflow import nn 

# Wall-clock reference taken at start-up (not read again in the visible cells).
t0 = time.time()
# Root directory below which the code checkout and the data directories are looked up.
# NOTE(review): absolute, user-specific path -- has to be adapted when run elsewhere.
path = '/home/b/b309170'

# Add path with my_classes to sys.path (has to happen before `import my_classes` below)
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

# Helpers from the project-local module; none of them is called in the visible cells.
from my_classes import write_infofile
from my_classes import read_mean_and_std
from my_classes import TimeOut

# Cross-validation fold (in 0,1,2), first command-line argument
fold = int(sys.argv[1])
if fold not in (0, 1, 2):
    # Fail right away instead of with an IndexError (or a silently wrapped
    # negative index) once the fold lists are accessed.
    raise ValueError('fold must be one of 0, 1, 2 (got %d)'%fold)

# Minutes per fold
timeout = 450

# Maximum amount of epochs for each model
epochs = 30

# Set seed for reproducibility
seed = 10
tf.random.set_seed(seed)

# Which time steps to evaluate on, second command-line argument:
# 'all' keeps every time step, 'no_spinup' discards the spin-up time steps.
# Choose from ['all', 'no_spinup']
days = sys.argv[2]
if days not in ('all', 'no_spinup'):
    # Any other value would skip every data-loading branch and only surface
    # later as a NameError on the missing arrays.
    raise ValueError("days must be 'all' or 'no_spinup' (got %r)"%days)

print(tf.__version__)

2.4.1


In [2]:
# Cloud Cover or Cloud Area? Third command-line argument.
output_var = sys.argv[3] # Set output_var to one of {'cl_volume', 'cl_area'}
if output_var not in ('cl_volume', 'cl_area'):
    # Without this check an unsupported value leaves output_data undefined
    # and the run only fails once the outputs are first accessed.
    raise ValueError("output_var must be 'cl_volume' or 'cl_area' (got %r)"%output_var)

# Code checkout (models, figures) and data directory of the column-based DYAMOND setup
path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/grid_column_based_DYAMOND')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_DYAMOND')

# Where the trained models / loss text files and the figure values are stored
path_model = os.path.join(path_base, 'saved_models')
path_figures = os.path.join(path_base, 'figures')

In [3]:
# Won't run on a CPU node
# Restrict TensorFlow to the first GPU and let it allocate GPU memory on demand.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Prevents crashes of the code
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
        # Allow the growth of memory Tensorflow allocates (limits memory usage overall).
        # The loop has to run over the devices listed above; the former loop variable
        # source (`gpus`) was never defined, so memory growth was silently never enabled.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except (RuntimeError, ValueError) as err:
        # Device configuration is best effort: both calls are rejected once the
        # GPUs have been initialized. Report it instead of hiding it and go on.
        print('Could not configure the GPU devices: %s'%err)

In [4]:
# Feature scaler. Note that this name is bound to a fresh StandardScaler once more
# further below, before anything is fitted.
scaler = StandardScaler()

### Load the data

In [5]:
def remove_spinup(data, t_steps, h_fields, keep_slices):
    '''
        Drop time steps from a (t_steps*h_fields, no_vars) sample matrix.

        The rows are interpreted time-major: row t*h_fields + h belongs to
        time step t and horizontal field h.

        data: 2D array with t_steps*h_fields rows
        t_steps: Number of time steps contained in data
        h_fields: Number of horizontal fields per time step
        keep_slices: Iterable of slices over the time axis that are kept (in this order)

        Returns a float64 array of shape (kept_steps*h_fields, no_vars).
    '''
    no_vars = data.shape[1]
    # Invert reshaping: (time step, horizontal field, variable)
    per_step = np.reshape(data, (t_steps, h_fields, no_vars))
    # Discard spinup
    kept = np.concatenate([per_step[time_slice] for time_slice in keep_slices], axis=0)
    # Reshape back. The result is float64 irrespective of the dtype on disk.
    return np.reshape(kept, (-1, no_vars)).astype(np.float64, copy=False)

# File that holds the requested output variable
output_files = {'cl_volume': '/cloud_cover_output_dyamond.npy',
                'cl_area': '/cloud_area_output_dyamond.npy'}

if days not in ('all', 'no_spinup'):
    raise ValueError("days must be 'all' or 'no_spinup' (got %r)"%days)
if output_var not in output_files:
    raise ValueError("output_var must be 'cl_volume' or 'cl_area' (got %r)"%output_var)

# Both settings of `days` start from the same files; samples end up in the rows
input_data = np.transpose(np.load(path_data + '/cloud_cover_input_dyamond.npy'))
output_data = np.transpose(np.load(path_data + output_files[output_var]))

if days == 'no_spinup':
    ## Remove the spin-up & reshape back (467*79342, 163)
    # Actually, we have to remove the spinup here, if we want to have a model comparable to the other ones from symbolic regression!
    t_steps = 619
    h_fields = 79342
    # Time steps that are kept: 80-328 and everything from 329+72 onwards
    spinup_free = (slice(80, 329), slice(329+72, None))

    input_data = remove_spinup(input_data, t_steps, h_fields, spinup_free)
    output_data = remove_spinup(output_data, t_steps, h_fields, spinup_free)

In [6]:
# Rows are samples, columns are input features
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(49112698, 163)

*Temporal cross-validation*

Split into two-week increments (when working with 3 months of data); with 5 months of data the increments are 25 days long. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [7]:
# Length (in samples) of one of the six temporal increments
two_week_incr = samples_total//6
all_samples = np.arange(samples_total)

validation_folds = []
training_folds = []

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    # Fold i validates on increment i and on increment i+3
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))
    validation_folds.append(np.concatenate((first_incr, second_incr)))

    # Everything that is not used for validation is used for training
    training_folds.append(np.delete(all_samples, validation_folds[i]))

Remove columns that are constant in at least one of the training folds

In [8]:
# Indices of the input columns that are dropped
remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]
# Guards against dropping the columns a second time when the cell is re-run
assert no_of_features == 163
input_data = np.delete(input_data, remove_fields, axis=1)
no_of_features -= len(remove_fields)

### Define the model

Activation function for the last layer

In [9]:
# Lets load_model resolve the name 'leaky_relu' when deserializing the saved model
custom_objects = {'leaky_relu': nn.leaky_relu}

In [10]:
# Saved models are numbered starting from 1
fold_number = fold + 1

# The models for the data without spin-up carry an additional suffix
if days == 'all':
    name_template = 'cross_validation_column_based_%s_fold_%d.h5'
elif days == 'no_spinup':
    name_template = 'cross_validation_column_based_%s_fold_%d_no_spinup.h5'
model_name = name_template%(output_var, fold_number)

model = load_model(os.path.join(path_model, model_name), custom_objects=custom_objects)

#### The data will need to be scaled according to the training folds

In [11]:
# Fresh, unfitted scaler; it is fitted on the rows of the training fold further below.
scaler = StandardScaler()

#### Useful functions to plot results

In [12]:
def mean_clc_per_vertical_layer(model, input_data, output_data, batch_size=2**20):
    '''
        Model prediction and the Ground Truth

        Computes the column-wise mean of the model predictions (clipped to [0, 100])
        and the column-wise mean of output_data.

        model: Object with a predict_on_batch method returning a 2D array
        input_data: 2D array of model inputs, one sample per row
        output_data: 2D array with the ground truth, one sample per row
        batch_size: Number of samples passed to the model at once

        Returns (list of prediction means, list of ground-truth means), one entry per column.
        Raises ValueError if input_data contains no samples.
    '''
    # output_var means, one per column of the ground truth
    clc_data_mean = [np.mean(output_data[:, i], dtype=np.float64) for i in range(output_data.shape[1])]

    # Step through ALL samples; the last batch may be shorter than batch_size.
    # (Iterating over shape[0]//batch_size full batches would drop the remainder
    # and predict nothing at all for fewer than batch_size samples.)
    predictions = []
    for start in range(0, input_data.shape[0], batch_size):
        predictions.append(model.predict_on_batch(input_data[start:start+batch_size]))
        # Release memory between the batches
        K.clear_session()
        gc.collect()

    if not predictions:
        raise ValueError('input_data contains no samples')

    # Concatenate once instead of re-copying the growing array for every batch
    a = np.concatenate(predictions, axis=0)

    # Bounded output!
    pred_adj = np.minimum(np.maximum(a, 0), 100)

    return list(np.mean(pred_adj, axis=0, dtype=np.float64)), clc_data_mean

#### Evaluate the models on the data

Add training and validation losses to the text files. <br>
Print results per vertical layer (respective validation set)

In [13]:
# Result containers. Only the train/valid lists are filled in this cell; the
# remaining names are kept defined as before.
train_losses = []
valid_losses = []
valid_means = []
valid_model_predictions = []
narval_means = []
narval_model_predictions = []
qubicc_means = []
qubicc_model_predictions = []
qubicc_month_0 = []
qubicc_model_pred_month_0 = []
qubicc_month_1 = []
qubicc_model_pred_month_1 = []
qubicc_month_2 = []
qubicc_model_pred_month_2 = []

# Base name of the text file the losses are appended to.
# NOTE(review): unlike model_name this name carries no '_no_spinup' suffix, so both
# settings of `days` append to the same text file -- confirm that this is intended.
filename = 'cross_validation_column_based_%s_fold_%d'%(output_var, (fold+1))

#Standardize according to the fold
#Load the data for the respective fold
# fit_transform yields the same result as fit followed by transform, but selects
# the (large) training rows only once.
input_train = scaler.fit_transform(input_data[training_folds[fold]])
input_valid = scaler.transform(input_data[validation_folds[fold]])
output_train = output_data[training_folds[fold]]
output_valid = output_data[validation_folds[fold]]

## Training and validation losses
train_loss = model.evaluate(input_train, output_train, verbose=2, batch_size=10**5)
valid_loss = model.evaluate(input_valid, output_valid, verbose=2, batch_size=10**5)

train_losses.append(train_loss)
valid_losses.append(valid_loss)

with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
    file.write('Unbounded training loss: %.4f\n'%(train_loss))
    file.write('Unbounded validation loss: %.4f\n'%(valid_loss))

## Compute mean cloud cover per vertical layer
# On the validation set of the fold
try:
    clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid)
except tf.errors.ResourceExhaustedError:
    # The exception class lives in tf.errors; the bare name was never imported, so
    # the fallback itself would have failed with a NameError.
    # Retry with much smaller batches.
    print('Resource Exhausted Qubicc')
    clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid, batch_size=2**15)
valid_means.append(clc_data_mean)
valid_model_predictions.append(clc_pred_mean)

328/328 - 18s - loss: 53.6250
164/164 - 8s - loss: 56.1918


In [14]:
# In case we want to reproduce the plots without running everything again:
# NOTE(review): this file name uses the 0-based fold, whereas the model and loss
# file names use fold+1 -- confirm that this is intended.
figure_values_path = os.path.join(path_figures, 'values_for_figures_%s_fold_%d_%s.txt'%(output_var, fold, days))
with open(figure_values_path, 'w') as file:
    file.write('On validation sets\n')
    # One list per line; without the line breaks both lists run into each other
    file.write(str(valid_means) + '\n')
    file.write(str(valid_model_predictions) + '\n')

#### Compute bounded losses

We also save the scaling parameters for the fold-based models as we haven't done that yet.

In [15]:
def compute_bounded_loss(model, input_data, output_data, batch_size=2**20):
    '''
        Mean squared error between the model predictions, clipped to [0, 100], and output_data.

        model: Object with a predict_on_batch method returning a 2D array
        input_data: 2D array of model inputs, one sample per row
        output_data: Array with the ground truth, matching the shape of the predictions
        batch_size: Number of samples passed to the model at once

        Returns the mean squared error as a float64 scalar.
        Raises ValueError if input_data contains no samples.
    '''
    # Step through all samples; the last batch may be shorter than batch_size.
    # Stepping by start index never hands an empty batch to the model, which
    # 1 + shape[0]//batch_size iterations do whenever the number of samples is a
    # multiple of batch_size.
    predictions = []
    for start in range(0, input_data.shape[0], batch_size):
        predictions.append(model.predict_on_batch(input_data[start:start+batch_size]))
        # Release memory between the batches
        K.clear_session()
        gc.collect()

    if not predictions:
        raise ValueError('input_data contains no samples')

    # Concatenate once instead of re-copying the growing array for every batch
    a = np.concatenate(predictions, axis=0)

    # Bounded output!
    pred_adj = np.minimum(np.maximum(a, 0), 100)

    # Mean Squared Error
    return np.mean((pred_adj - output_data)**2, dtype=np.float64)

In [None]:
# Losses of the predictions after clipping them to the valid output range
train_loss = compute_bounded_loss(model, input_train, output_train, batch_size=2**15)
valid_loss = compute_bounded_loss(model, input_valid, output_valid, batch_size=2**15)

bounded_loss_lines = [
    'Bounded training loss: %.4f\n'%(train_loss),
    'Bounded validation loss: %.4f\n'%(valid_loss),
]

# Append to the text file that already holds the unbounded losses
with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
    file.writelines(bounded_loss_lines)