## Cross-Validation

1. We read the data from the npy files
2. We combine the QUBICC and NARVAL data
3. Set up cross validation

During cross-validation:

1. We scale the data, convert to tf data
2. Plot training progress, model biases 
3. Write losses and epochs into file

In [1]:
# Ran with 800GB (750GB should also be fine)

import sys
import json
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation

# For Leaky_ReLU:
from tensorflow import nn 

# Wall-clock time at the start of the script
t0 = time.time()
# Base directory from which all other paths in this script are built
path = '/home/b/b309170'

# Add path with my_classes to sys.path
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import importlib
import my_classes
importlib.reload(my_classes)

from my_classes import write_infofile
from my_classes import read_mean_and_std
from my_classes import TimeOut

# Command-line arguments (all parsed as integers):
#   argv[1]: no_features, argv[2]: bn (0/1), argv[3]: third_layer (0/1), argv[4]: no_units

# We always pick the second fold (fold = 1)
fold = 1

# Which one of the 10 models to train (no_features in [4, 7])
no_features = int(sys.argv[1])

# Batch normalization and third layer: Bool
bn = bool(int(sys.argv[2]))
third_layer = bool(int(sys.argv[3]))

# Number of units per layer [16,32,64,128]
no_units = int(sys.argv[4])

# Minutes per fold
# NOTE(review): timeout and epochs are not referenced in the visible part of this script
timeout = 450 

# Maximum amount of epochs for each model
epochs = 30 

# Set seed for reproducibility
seed = 10
tf.random.set_seed(seed)

# gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[3], 'GPU')

# Log the TensorFlow version used for this run
print(tf.__version__)

2.4.1


In [2]:
# Cloud Cover or Cloud Area?
# output_var selects which output file is loaded and is part of the model/log file names
output_var = 'cl_area' # Set output_var to one of {'cl_volume', 'cl_area'} 

# Directory that contains the saved models and figures
path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND')
# Directory that contains the .npy input/output arrays
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')
    
# Saved models and the loss text files are read from / written to path_model
path_model = os.path.join(path_base, 'saved_models/hyperparameter_tests')
path_figures = os.path.join(path_base, 'figures/hyperparameter_tests')

In [3]:
# Won't run on a CPU node
try:
    # Restrict TensorFlow to the first GPU (prevents crashes of the code)
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
    # Allow the growth of memory Tensorflow allocates (limits memory usage overall).
    # Iterate over the devices listed above: the previously used name `gpus` was never
    # defined, so the resulting NameError was silently swallowed and memory growth
    # was never actually enabled.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except (IndexError, RuntimeError, ValueError) as err:
    # IndexError: no GPU available (CPU node).
    # RuntimeError/ValueError: raised by tf.config when the devices can no longer be configured.
    # The setup stays best-effort, but the reason is reported instead of being hidden.
    print('GPU setup skipped: %r' % (err,))

In [4]:
# Scaler used to standardize the input features
scaler = StandardScaler()

### Load the data

In [5]:
# Names of all candidate input features; the position in this list is the column index in the input array
features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Lookup table: feature name -> column index
loc = {name: column for column, name in enumerate(features)}

In [6]:
# Input data
# The 17_15 ran with the largest amount of training data
# open() does not expand '~' itself, so the home directory is resolved explicitly
seq_results_path = os.path.expanduser(
    '~/workspace_icon-ml/symbolic_regression/finding_symmetries/seq_feature_selector_dyamond_nns/'
    'seq_feat_selector_training_data_%s_17_15.json' % output_var)
with open(seq_results_path, 'r') as file:
    seq_results = json.load(file)
# Names of the features picked by the sequential feature selector for this number of features
selected_vars = seq_results['features_%d'%no_features]

input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))
# Keep only the selected feature columns, in the order given by selected_vars
input_data = np.concatenate([np.expand_dims(input_data[:, loc[sel_var]], axis=1) for sel_var in selected_vars], axis = 1)

# Vertical layer of every sample
layers_data = np.load(os.path.join(path_data, 'samples_vertical_layers_dyamond.npy'))

In [7]:
# Sanity check: (number of samples, number of selected features)
input_data.shape

(285179494, 4)

In [8]:
# Output data
# The file that is loaded depends on the chosen output variable
if output_var == 'cl_volume':
    output_data = np.load(path_data + '/cloud_cover_output_dyamond.npy')
elif output_var == 'cl_area':
    output_data = np.load(path_data + '/cloud_area_output_dyamond.npy')
else:
    # Fail right away instead of with a NameError on output_data further down
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

In [9]:
# Total number of samples (first dimension of input_data)
samples_total, _ = input_data.shape
# Displayed as a sanity check
(samples_total, no_features)

(285179494, 4)

*Temporal cross-validation*

Split into two-week increments (when working with 3 months of data). With 5 months of data, the increments are 25 days long. <br>
1.: Validate on increments 1 and 4 <br>
2.: Validate on increments 2 and 5 <br>
3.: Validate on increments 3 and 6

--> 2/3 training data, 1/3 validation data

In [10]:
# Size of one of the six consecutive increments the samples are divided into
two_week_incr = samples_total // 6
validation_folds = []
training_folds = []

# Fold k validates on increments k and k+3.
# Note that this is a temporal split since time was the first dimension in the original tensor
for fold_idx in range(3):
    early_block = np.arange(two_week_incr * fold_idx, two_week_incr * (fold_idx + 1))
    late_block = np.arange(two_week_incr * (fold_idx + 3), two_week_incr * (fold_idx + 4))
    held_out = np.append(early_block, late_block)

    validation_folds.append(held_out)
    # Every sample index that is not held out for validation belongs to the training fold
    training_folds.append(np.delete(np.arange(samples_total), held_out))

### Define the model

Activation function for the last layer

In [11]:
# Custom objects handed to load_model so that 'leaky_relu' can be resolved
custom_objects = {'leaky_relu': nn.leaky_relu}

In [None]:
# The file name encodes the output variable and the hyperparameters given on the command line
hyperparameters = (output_var, no_features, bn, third_layer, no_units)
model_name = 'neighborhood_based_sfs_%s_no_features_%s_%s_%s_%d.h5' % hyperparameters

# Load the saved network from the model directory
model_file = os.path.join(path_model, model_name)
model = load_model(model_file, custom_objects=custom_objects)

#### The data will need to be scaled according to the training folds

In [None]:
# Fresh scaler; it is fit on the training fold further below
scaler = StandardScaler()

#### Useful functions to plot results

In [None]:
def mean_clc_per_vertical_layer(model, input_data, output_data, layers_data, batch_size=2**20):
    '''
        Model prediction and the Ground Truth means per vertical layer

        Input: 
            model: neural network (has to provide predict_on_batch)
            input_data: Usually the validation data
            output_data: The ground truth output
            layers_data: Vector that tells us the vertical layer of a given sample
            batch_size: Number of samples passed to predict_on_batch at a time

        Output:
            clc_pred_mean: List of the mean predictions (bounded to [0, 100]) for the layers 5 to 31
            clc_data_mean: List of the means of output_data for the layers 5 to 31
    '''
    # Predicted cloud cover means
    # Curiously it works best if we use predict_on_batch on small subsets of the data instead of predict(..., batch_size=...) 
    n_samples = input_data.shape[0]
    batch_predictions = []
    # Stepping by batch_size avoids a trailing empty batch when n_samples is a multiple of
    # batch_size; max(..., 1) still issues one call for empty input so the output shape is kept.
    for start in range(0, max(n_samples, 1), batch_size):
        batch_predictions.append(model.predict_on_batch(input_data[start:start + batch_size]))
        K.clear_session()
        gc.collect()
    # Concatenate once at the end instead of re-copying the growing array in every iteration
    a = np.concatenate(batch_predictions, axis=0)

    # Bound the predictions to the interval [0, 100]
    pred_adj = np.minimum(np.maximum(a, 0), 100) 
    
    # Computing means with the help of layers_data
    clc_pred_mean = []
    clc_data_mean = []
    for layer in range(5, 32):
        ind = np.where(layers_data == layer)
        # float64 accumulation for the means
        clc_data_mean.append(np.mean(output_data[ind], dtype=np.float64))
        clc_pred_mean.append(np.mean(pred_adj[ind], dtype=np.float64))
    
    return clc_pred_mean, clc_data_mean

#### Evaluate the models on the data

Add training and validation losses to the text files. <br>
Print results per vertical layer (respective validation set)

In [None]:
# Containers for losses and per-layer means (only the train/valid ones are filled in this cell)
train_losses = [] ; valid_losses = [] ; valid_means = [] ; valid_model_predictions = [] ;
narval_means = [] ; narval_model_predictions = [] ; qubicc_means = [] ; qubicc_model_predictions = [] ;
qubicc_month_0 = [] ; qubicc_model_pred_month_0 = [] ; qubicc_month_1 = [] ; qubicc_model_pred_month_1 = [] ;
qubicc_month_2 = [] ; qubicc_model_pred_month_2 = [] ;

# Base name of the text file the losses are appended to
filename = 'neighborhood_based_sfs_%s_no_features_%s_%s_%s_%d'%(output_var, no_features, bn, third_layer, no_units)

#Standardize according to the fold
# The training fold is indexed only once: every index-array lookup copies the selected rows
train_inputs = input_data[training_folds[fold]]
scaler.fit(train_inputs)

#Load the data for the respective fold
input_train = scaler.transform(train_inputs)
del train_inputs
input_valid = scaler.transform(input_data[validation_folds[fold]])
output_train = output_data[training_folds[fold]]
output_valid = output_data[validation_folds[fold]]

## Training and validation losses
# NOTE(review): the '%.4f' formatting below assumes model.evaluate returns a single scalar
# (i.e. the model was compiled without additional metrics) -- confirm
train_loss = model.evaluate(input_train, output_train, verbose=2, batch_size=10**5)
valid_loss = model.evaluate(input_valid, output_valid, verbose=2, batch_size=10**5)

train_losses.append(train_loss)
valid_losses.append(valid_loss)

with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
    file.write('Unbounded training loss: %.4f\n'%(train_loss))
    file.write('Unbounded validation loss: %.4f\n'%(valid_loss))

## Compute mean cloud cover per vertical layer
# On the validation set of the chosen fold
layers_valid = layers_data[validation_folds[fold]]
try:
    clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid, 
                                                               layers_valid)
except tf.errors.ResourceExhaustedError:
    # The bare name ResourceExhaustedError was never imported, so this fallback used to
    # fail with a NameError. Retry with a much smaller batch size instead.
    print('Resource Exhausted Qubicc')
    clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid, 
                                                               layers_valid, batch_size=2**15)
valid_means.append(clc_data_mean)
valid_model_predictions.append(clc_pred_mean)

In [15]:
# # In case we want to reproduce the plots without running everything again:
# with open(os.path.join(path_figures, 'values_for_figures.txt'), 'w') as file:
#     file.write('On validation sets\n')
#     file.write(str(valid_means))
#     file.write(str(valid_model_predictions))

#### Compute bounded losses

We also save the scaling parameters for the fold-based models as we haven't done that yet.

In [16]:
def compute_bounded_loss(model, input_data, output_data, batch_size=2**20):
    '''
        Mean squared error between the bounded model predictions and output_data

        Input:
            model: neural network (has to provide predict_on_batch)
            input_data: Inputs of the model
            output_data: The ground truth output; compared against the first output column of the model
            batch_size: Number of samples passed to predict_on_batch at a time

        Output:
            Mean squared error (accumulated in float64) after bounding the predictions to [0, 100]
    '''
    n_samples = input_data.shape[0]
    batch_predictions = []
    # Stepping by batch_size avoids a trailing empty batch when n_samples is a multiple of
    # batch_size; max(..., 1) still issues one call for empty input so the output shape is kept.
    for start in range(0, max(n_samples, 1), batch_size):
        batch_predictions.append(model.predict_on_batch(input_data[start:start + batch_size]))
        K.clear_session()
        gc.collect()
    # Concatenate once at the end instead of re-copying the growing array in every iteration
    a = np.concatenate(batch_predictions, axis=0)
        
    # Bounded output!
    pred_adj = np.minimum(np.maximum(a[:,0], 0), 100) 
    
    # Mean Squared Error
    return np.mean((pred_adj - output_data)**2, dtype=np.float64)

In [None]:
# Recompute both losses with bounded predictions, using a smaller batch size than the default
train_loss = compute_bounded_loss(model, input_train, output_train, batch_size=2**15)
valid_loss = compute_bounded_loss(model, input_valid, output_valid, batch_size=2**15)

# Append the bounded losses to the text file of this model
loss_file = os.path.join(path_model, filename+'.txt')
with open(loss_file, 'a') as file:
    file.writelines([
        'Bounded training loss: %.4f\n'%(train_loss),
        'Bounded validation loss: %.4f\n'%(valid_loss),
    ])