### Evaluate the cross-validation models

Compared to cross_validation_evaluate.ipynb in the column-based model it sufficed to <br>
i) Change the names from column to cell, <br>
ii) add leaky_relu, <br>
iii) load layers_data and adapt the function mean_clc_per_vertical_layer accordingly

I called model_all_data model_final before. Note that final_model does not mean it is the one that will be used in practice!

In [1]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import gc
import importlib

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.errors import ResourceExhaustedError

# Add path with my_classes to sys.path
path = '/home/b/b309170'
sys.path.insert(0, path + '/workspace_icon-ml/iconml_clc/')

import my_classes
importlib.reload(my_classes)
from my_classes import write_infofile
from my_classes import read_mean_and_std

# For Leaky_ReLU:
import tensorflow as tf
from tensorflow import nn 

2022-04-20 16:04:29.387951: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [26]:
## Set these parameters!

# Cloud Cover or Cloud Area?
output_var = 'clc' # Set output_var to one of {'clc', 'cl_area'}
# QUBICC only or QUBICC+NARVAL training data? Always set to True for the paper
qubicc_only = True
# Do we evaluate a model trained on all data? Always set to False for the paper
all_data_model = False

In [27]:
path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/based_on_var_interpolated_data')

if output_var == 'clc':
    full_output_var_name = 'cloud_cover'
elif output_var == 'cl_area':
    full_output_var_name = 'cloud_area'
    
if qubicc_only:
    output_folder = '%s_R2B5_QUBICC'%full_output_var_name
else:
    output_folder = '%s_R2B5_QUBICC+NARVAL'%full_output_var_name
path_model = os.path.join(path_base, 'saved_models', output_folder)
path_figures = os.path.join(path_base, 'figures', output_folder)
narval_output_file = '%s_output_narval.npy'%full_output_var_name
qubicc_output_file = '%s_output_qubicc.npy'%full_output_var_name

#### Load models

In [28]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[0], 'GPU')

IndexError: list index out of range

In [29]:
# def lrelu(x):
#     return nn.leaky_relu(x, alpha=0.01)

custom_objects = {}
custom_objects['leaky_relu'] = nn.leaky_relu

In [30]:
fold_1 = 'cross_validation_cell_based_fold_1.h5'
fold_2 = 'cross_validation_cell_based_fold_2.h5'
fold_3 = 'cross_validation_cell_based_fold_3.h5'

model_fold_1 = load_model(os.path.join(path_model, fold_1), custom_objects)
model_fold_2 = load_model(os.path.join(path_model, fold_2), custom_objects)
model_fold_3 = load_model(os.path.join(path_model, fold_3), custom_objects)

if all_data_model:
    all_data = 'cell_based_all_data_seed_10.h5'
    model_all_data = load_model(os.path.join(path_model, all_data), custom_objects)

# model_fold_1 = load_model(os.path.join(path_model, fold_1))
# model_fold_2 = load_model(os.path.join(path_model, fold_2))
# model_fold_3 = load_model(os.path.join(path_model, fold_3))

#### Load data

In [31]:
os.listdir(path_data)

['cloud_area_output_qubicc.npy',
 'cloud_cover_input_qubicc.npy',
 'cloud_cover_output_qubicc.npy',
 'samples_vertical_layers_qubicc.npy',
 'cloud_cover_input_narval.npy',
 'cloud_area_output_narval.npy',
 'cloud_cover_output_narval.npy',
 'ps_input_qubicc.npy',
 'samples_vertical_layers_narval.npy']

In [32]:
input_data = np.concatenate((np.load(path_data + '/cloud_cover_input_narval.npy'), 
                             np.load(path_data + '/cloud_cover_input_qubicc.npy')), axis=0)
output_data = np.concatenate((np.load(os.path.join(path_data, narval_output_file)), 
                              np.load(os.path.join(path_data, qubicc_output_file))), axis=0)

In [8]:
layers_data = np.concatenate((np.load(path_data + '/samples_vertical_layers_narval.npy'), 
                              np.load(path_data + '/samples_vertical_layers_qubicc.npy')), axis=0)

In [9]:
samples_narval = np.load(os.path.join(path_data, narval_output_file)).shape[0]

In [10]:
(samples_total, no_of_features) = input_data.shape

#### Define cross-validation folds to recreate training and validation data

In [11]:
def set_training_validation_folds(samples_total, samples_narval):
    training_folds = []
    validation_folds = []
    two_week_incr = (samples_total-samples_narval)//6

    for i in range(3):
        # Note that this is a temporal split since time was the first dimension in the original tensor
        first_incr = np.arange(samples_narval+two_week_incr*i, samples_narval+two_week_incr*(i+1))
        second_incr = np.arange(samples_narval+two_week_incr*(i+3), samples_narval+two_week_incr*(i+4))
        
        print(second_incr)

        validation_folds.append(np.append(first_incr, second_incr))
        training_folds.append(np.arange(samples_narval, samples_total))
        training_folds[i] = np.setdiff1d(training_folds[i], validation_folds[i])
        
    return training_folds, validation_folds

if qubicc_only:
    # We have to skip the NARVAL data if we do qubicc_only
    training_folds, validation_folds = set_training_validation_folds(samples_total, samples_narval)
else:
    training_folds, validation_folds = set_training_validation_folds(samples_total, 0)

[568092977 568092978 568092979 ... 715172741 715172742 715172743]
[715172744 715172745 715172746 ... 862252508 862252509 862252510]
[ 862252511  862252512  862252513 ... 1009332275 1009332276 1009332277]


#### The data will need to be scaled according to the training folds

In [14]:
scaler = StandardScaler()

#### Useful functions to plot results

In [15]:
def mean_clc_per_vertical_layer(model, input_data, output_data, layers_data, batch_size=2**20):
    '''
        Input: 
            model: neural network
            input_data: Usually the validation data
            output_data: The ground truth output
            layers_data: Vector that tells us the vertical layer of a given sample
            
        Model prediction and the Ground Truth means per vertical layer
    '''
    # Predicted cloud cover means
    # Curiously it works best if we use predict_on_batch on small subsets of the data instead of predict(..., batch_size=...) 
    for i in range(1 + input_data.shape[0]//batch_size):
        if i == 0:
            a = model.predict_on_batch(input_data[i*batch_size:(i+1)*batch_size])
        else:
            a = np.concatenate((a, model.predict_on_batch(input_data[i*batch_size:(i+1)*batch_size])), axis=0)
        K.clear_session()
        gc.collect()
        
    pred_adj = np.minimum(np.maximum(a, 0), 100) 
    
    # Computing means with the help of layers_data
    clc_pred_mean = []; clc_data_mean = [];
    for i in range(5, 32):
        ind = np.where(layers_data == i)
        clc_data_mean.append(np.mean(output_data[ind], dtype=np.float64))
        clc_pred_mean.append(np.mean(pred_adj[ind], dtype=np.float64))
    
    return clc_pred_mean, clc_data_mean

In [16]:
def save_figure(fig_name, fig_title, model_predictions, valid_means=None, all_data_model=False):
    '''
        Note that this figure truly is a different performance measure than the validation error.
        The reason is that the mean can in principle be good even when the model is really bad.
        
        model_predictions: Array of length 3 or 4, covers predictions from all three folds for a given TL setup
        valid_means: Array of length 3 or 4, covers validation means from all three folds for a given TL setup
    '''
#     assert len(model_biases) == 3
    
    # Vertical layers
    a = np.linspace(5, 31, 27)
    fig = plt.figure(figsize=(11,7))
    # For model
    ax = fig.add_subplot(111, xlabel='Mean %s'%output_var, ylabel='Vertical layer', title=fig_title)
    
    if all_data_model:
        if not valid_means[0] == valid_means[1] == valid_means[2] == valid_means[3]:
            colors = ['g', 'b', 'r']
            for i in range(len(model_predictions)):
                ax.plot(model_predictions[i], a, colors[i])
                if valid_means != None:
                    ax.plot(valid_means[i], a, '%s--'%colors[i])
            plt.gca().invert_yaxis()
            ax.legend(['Model Fold 1 Predictions', 'Fold 1 Truth', 'Model Fold 2 Predictions', 'Fold 2 Truth', 
                       'Model Fold 3 Predictions', 'Fold 3 Truth', 'Model All Data Predictions', 'Truth'])
        else:
            for i in range(len(model_predictions)):
                ax.plot(model_predictions[i], a)
            ax.plot(valid_means[0], a, 'black')
            plt.gca().invert_yaxis()
            ax.legend(['Model Fold 1 Predictions', 'Model Fold 2 Predictions', 'Model Fold 3 Predictions', 
                       'Model All Data Predictions', 'Truth'])
    else:
        if not valid_means[0] == valid_means[1] == valid_means[2]:
            colors = ['g', 'b', 'r']
            for i in range(len(model_predictions)):
                ax.plot(model_predictions[i], a, colors[i])
                if valid_means != None:
                    ax.plot(valid_means[i], a, '%s--'%colors[i])
            plt.gca().invert_yaxis()
            ax.legend(['Model Fold 1 Predictions', 'Fold 1 Truth', 'Model Fold 2 Predictions', 'Fold 2 Truth', 
                       'Model Fold 3 Predictions', 'Fold 3 Truth'])
        else:
            for i in range(len(model_predictions)):
                ax.plot(model_predictions[i], a)
            ax.plot(valid_means[0], a, 'black')
            plt.gca().invert_yaxis()
            ax.legend(['Model Fold 1 Predictions', 'Model Fold 2 Predictions', 'Model Fold 3 Predictions', 
                       'Truth'])

    fig.savefig(os.path.join(path_figures, fig_name+'.pdf'))

#### Evaluate the models on the data

Add training and validation losses to the text files. <br>
Print results per vertical layer (respective validation set/NARVAL/QUBICC)

In [16]:
train_losses = [] ; valid_losses = [] ; valid_means = [] ; valid_model_predictions = [] ;
narval_means = [] ; narval_model_predictions = [] ; qubicc_means = [] ; qubicc_model_predictions = [] ;
qubicc_month_0 = [] ; qubicc_model_pred_month_0 = [] ; qubicc_month_1 = [] ; qubicc_model_pred_month_1 = [] ;
qubicc_month_2 = [] ; qubicc_model_pred_month_2 = [] ;

for i in range(3): 
    filename = 'cross_validation_cell_based_fold_%d'%(i+1)
    # Choose appropriate model for this fold
    if i == 0: model = model_fold_1
    if i == 1: model = model_fold_2
    if i == 2: model = model_fold_3
    
    #Standardize according to the fold
    scaler.fit(input_data[training_folds[i]])
    
    #Load the data for the respective fold
    input_train = scaler.transform(input_data[training_folds[i]])
    input_valid = scaler.transform(input_data[validation_folds[i]])
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]
    
    ## Training and validation losses
    train_loss = model.evaluate(input_train, output_train, verbose=2, batch_size=10**5)
    valid_loss = model.evaluate(input_valid, output_valid, verbose=2, batch_size=10**5)
    
    # Clear up some memory
    del input_train, output_train
    gc.collect()
    
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
        file.write('Unbounded training loss: %.4f\n'%(train_loss))
        file.write('Unbounded validation loss: %.4f\n'%(valid_loss))
        
    ## Compute mean cloud cover per vertical layer
    # On the respective validation sets (QUBICC and NARVAL)
    try:
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid, 
                                                                   layers_data[validation_folds[i]])
    except(ResourceExhaustedError):
        print('Resource Exhausted Qubicc')
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_valid, output_valid, 
                                                                   layers_data[validation_folds[i]], batch_size=2**15)
    valid_means.append(clc_data_mean)
    valid_model_predictions.append(clc_pred_mean)
    
    # Clear up some memory
    del input_valid, output_valid
    gc.collect()
    
    # For NARVAL
    input_narval = scaler.transform(input_data[:samples_narval])
    output_narval = output_data[:samples_narval]
    try:
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_narval, output_narval,
                                                                  layers_data[:samples_narval])
    except(ResourceExhaustedError):
        print('Resource Exhausted Narval')
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_narval, output_narval, 
                                                                   layers_data[:samples_narval], 
                                                                   batch_size=2**15)
    narval_means.append(clc_data_mean)
    narval_model_predictions.append(clc_pred_mean)
    
    # Clear up some memory
    del input_narval, output_narval
    gc.collect()
    
    # For QUBICC  
    input_qubicc = scaler.transform(input_data[samples_narval:])
    output_qubicc = output_data[samples_narval:]
    try:
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc,
                                                                  layers_data[samples_narval:])
    except(ResourceExhaustedError):
        print('Resource Exhausted Qubicc')
        clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc, 
                                                                   layers_data[samples_narval:], 
                                                                   batch_size=2**15)
    qubicc_means.append(clc_data_mean)
    qubicc_model_predictions.append(clc_pred_mean)
    
    # Clear up some memory
    del input_qubicc, output_qubicc
    gc.collect()
    
    # QUBICC months
    qubicc_month = (samples_total - samples_narval)//3
    for month in range(3):
        first_ind = samples_narval + month*qubicc_month
        last_ind = samples_narval + (month+1)*qubicc_month
        input_qubicc = scaler.transform(input_data[first_ind:last_ind])
        output_qubicc = output_data[first_ind:last_ind]
        try:
            clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc,
                                                                      layers_data[first_ind:last_ind])
        except(ResourceExhaustedError):
            print('Resource Exhausted Qubicc')
            clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc, 
                                                                       layers_data[first_ind:last_ind],
                                                                       batch_size=2**15)
        if month==0: 
            qubicc_month_0.append(clc_data_mean)
            qubicc_model_pred_month_0.append(clc_pred_mean)
        if month==1:
            qubicc_month_1.append(clc_data_mean)
            qubicc_model_pred_month_1.append(clc_pred_mean)
        if month==2:
            qubicc_month_2.append(clc_data_mean)
            qubicc_model_pred_month_2.append(clc_pred_mean)

    # Clear up some memory
    del input_qubicc, output_qubicc
    gc.collect()

5884/5884 - 43s - loss: 33.4026
2942/2942 - 22s - loss: 33.7873
5884/5884 - 46s - loss: 33.2210
2942/2942 - 22s - loss: 32.7654
5884/5884 - 45s - loss: 39.6006
2942/2942 - 21s - loss: 40.9376


For model_all_data

In [17]:
# if all_data_model:
#     filename = 'cell_based_all_data_seed_10'
#     model = model_all_data

#     #Standardize according to the fold
#     scaler.fit(input_data)

#     #Load the data for the respective fold
#     input_train = scaler.transform(input_data)
#     output_train = output_data

#     ## Training loss
#     train_loss = model.evaluate(input_train, output_train, verbose=2, batch_size=10**5)

#     # Clear up some memory
#     del input_train, output_train
#     gc.collect()

#     train_losses.append(train_loss)

#     with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
#         file.write('Unbounded training loss: %.4f\n'%(train_loss))

#     ## For NARVAL
#     input_narval = scaler.transform(input_data[:samples_narval])
#     output_narval = output_data[:samples_narval]
#     try:
#         clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_narval, output_narval,
#                                                                   layers_data[:samples_narval])
#     except(ResourceExhaustedError):
#         print('Resource Exhausted Narval')
#         clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_narval, output_narval, 
#                                                                    layers_data[:samples_narval], 
#                                                                    batch_size=2**15)
#     narval_means.append(clc_data_mean)
#     narval_model_predictions.append(clc_pred_mean)

#     # Clear up some memory
#     del input_narval, output_narval
#     gc.collect()

#     ## For QUBICC  
#     input_qubicc = scaler.transform(input_data[samples_narval:])
#     output_qubicc = output_data[samples_narval:]
#     try:
#         clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc,
#                                                                   layers_data[samples_narval:])
#     except(ResourceExhaustedError):
#         print('Resource Exhausted Qubicc')
#         clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc, 
#                                                                    layers_data[samples_narval:], 
#                                                                    batch_size=2**15)
#     qubicc_means.append(clc_data_mean)
#     qubicc_model_predictions.append(clc_pred_mean)

#     # Clear up some memory
#     del input_qubicc, output_qubicc
#     gc.collect()

#     ## QUBICC months
#     qubicc_month = (samples_total - samples_narval)//3
#     for month in range(3):
#         first_ind = samples_narval + month*qubicc_month
#         last_ind = samples_narval + (month+1)*qubicc_month
#         input_qubicc = scaler.transform(input_data[first_ind:last_ind])
#         output_qubicc = output_data[first_ind:last_ind]
#         try:
#             clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc,
#                                                                       layers_data[first_ind:last_ind])
#         except(ResourceExhaustedError):
#             print('Resource Exhausted Qubicc')
#             clc_pred_mean, clc_data_mean = mean_clc_per_vertical_layer(model, input_qubicc, output_qubicc, 
#                                                                        layers_data[first_ind:last_ind],
#                                                                        batch_size=2**15)
#         if month==0: 
#             qubicc_month_0.append(clc_data_mean)
#             qubicc_model_pred_month_0.append(clc_pred_mean)
#         if month==1:
#             qubicc_month_1.append(clc_data_mean)
#             qubicc_model_pred_month_1.append(clc_pred_mean)
#         if month==2:
#             qubicc_month_2.append(clc_data_mean)
#             qubicc_model_pred_month_2.append(clc_pred_mean)

#     # Clear up some memory
#     del input_qubicc, output_qubicc
#     gc.collect()

In [18]:
# # # Plot results
# save_figure('cross_validation_validation_means', 'Cell-based models on the respective validation sets', 
#             valid_model_predictions, valid_means, all_data_model)
# save_figure('cross_validation_narval', 'Cell-based models on the NARVAL data', 
#             narval_model_predictions, narval_means, all_data_model)
# save_figure('cross_validation_qubicc', 'Cell-based models on the QUBICC data', 
#             qubicc_model_predictions, qubicc_means, all_data_model)
# # Qubicc months
# save_figure('cross_validation_qubicc_hc2', 'Cell-based models on the QUBICC data, November 2004', 
#             qubicc_model_pred_month_0, qubicc_month_0, all_data_model)
# save_figure('cross_validation_qubicc_hc3', 'Cell-based models on the QUBICC data, April 2005', 
#             qubicc_model_pred_month_1, qubicc_month_1, all_data_model)
# save_figure('cross_validation_qubicc_hc4', 'Cell-based models on the QUBICC data, November 2005', 
#             qubicc_model_pred_month_2, qubicc_month_2, all_data_model)

In [19]:
# In case we want to reproduce the plots without running everything again:
with open(os.path.join(path_figures, 'values_for_figures.txt'), 'w') as file:
    file.write('On validation sets\n')
    file.write(str(valid_means))
    file.write(str(valid_model_predictions))
    file.write('\n\nNARVAL data\n')
    file.write(str(narval_means))
    file.write(str(narval_model_predictions))
    file.write('\n\nQubicc data\n')
    file.write(str(qubicc_means))
    file.write(str(qubicc_model_predictions))
    file.write('\n\nQubicc data, November 2004\n')
    file.write(str(qubicc_month_0))
    file.write(str(qubicc_model_pred_month_0))
    file.write('\n\nQubicc data, April 2005\n')
    file.write(str(qubicc_month_1))
    file.write(str(qubicc_model_pred_month_1))
    file.write('\n\nQubicc data, November 2005\n')
    file.write(str(qubicc_month_2))
    file.write(str(qubicc_model_pred_month_2))

#### Compute bounded losses

We also save the scaling parameters for the fold-based models as we haven't done that yet.

In [13]:
def compute_bounded_loss(model, input_data, output_data, batch_size=2**20):
    for i in range(1 + input_data.shape[0]//batch_size):
        if i == 0:
            a = model.predict_on_batch(input_data[i*batch_size:(i+1)*batch_size])
        else:
            a = np.concatenate((a, model.predict_on_batch(input_data[i*batch_size:(i+1)*batch_size])), axis=0)
        K.clear_session()
        gc.collect()
        
    # Bounded output!
    pred_adj = np.minimum(np.maximum(a[:,0], 0), 100) 
    
    # Mean Squared Error
    return np.mean((pred_adj - output_data)**2, dtype=np.float64)

In [None]:
seed = 10

for i in range(3): #for i in range(3):
    filename = 'cross_validation_cell_based_fold_%d'%(i+1)
    # Choose appropriate model for this fold
    if i == 0: model = model_fold_1
    if i == 1: model = model_fold_2
    if i == 2: model = model_fold_3
        
    #Standardize according to the fold
    scaler.fit(input_data[training_folds[i]])
    
    # We save the scaling parameters in a file [only once]
#     seed_i = int(str(seed) + str(i))
#     with open(path_model+'/scaler_%d.txt'%seed_i, 'a') as file:
#         file.write('Standard Scaler mean values:\n')
#         file.write(str(scaler.mean_))
#         file.write('\nStandard Scaler standard deviation:\n')
#         file.write(str(np.sqrt(scaler.var_)))

    # Taken from preprocessing_narval
    in_and_out_variables = np.array(['qv', 'qc', 'qi', 'temp', 'pres', 'u', 'v', 'zg', 'coriolis', 'fr_land', output_var])
    input_variables = np.array(['qv', 'qc', 'qi', 'temp', 'pres', 'u', 'v', 'zg', 'coriolis', 'fr_land'])

#     # Write the accompanying info-file [only once]
#     with open(os.path.join(path_model, filename + '.txt'), 'a') as file:
#         write_infofile(file, str(in_and_out_variables), str(input_variables), path_model, path_data, seed_i)
    
    #Load the data for the respective fold
    input_train = scaler.transform(input_data[training_folds[i]])
    input_valid = scaler.transform(input_data[validation_folds[i]])
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]
    
    train_loss = compute_bounded_loss(model, input_train, output_train, batch_size=2**15)
    valid_loss = compute_bounded_loss(model, input_valid, output_valid, batch_size=2**15)
        
    with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
        file.write('Bounded training loss: %.4f\n'%(train_loss))
        file.write('Bounded validation loss: %.4f\n'%(valid_loss))

Trained on all data

In [None]:
# if all_data_model:
#     filename = 'cell_based_all_data_seed_10'
#     model = model_all_data

#     #Standardize according to the fold
#     scaler.fit(input_data)

#     #Load the data for the respective fold
#     input_train = scaler.transform(input_data)
#     output_train = output_data

#     train_loss = compute_bounded_loss(model, input_train, output_train, batch_size=2**15)

#     with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
#         file.write('Bounded training loss: %.4f\n'%(train_loss))

### Range of possible output values

In [12]:
#Standardize the input data. Why did we look at fold 3 here, if the second one is the best one??
mean, std = read_mean_and_std(os.path.join(path_model, 
                                               'cross_validation_cell_based_fold_3.txt'))
input_train = (input_data - mean)/std

In [14]:
# model_fold_3 is implemented in ICON-A
batch_size = 2**20

for i in range(1 + input_train.shape[0]//batch_size):
    if i == 0:
        image = model_fold_3.predict_on_batch(input_train[i*batch_size:(i+1)*batch_size])
    else:
        image = np.concatenate((image, model_fold_3.predict_on_batch(input_train[i*batch_size:(i+1)*batch_size])), axis=0)
    K.clear_session()
    gc.collect()

In [15]:
image.shape

(1009332282, 1)

In [16]:
np.min(image, axis=0)

array([-5.9592514], dtype=float32)

In [17]:
np.max(image, axis=0)

array([107.00978], dtype=float32)

### How often are the predictions of the model from split 2 outside [0, 100]?

In [33]:
model = model_fold_2

In [34]:
#Standardize the input data.
mean, std = read_mean_and_std(os.path.join(path_model, 
                                               'cross_validation_cell_based_fold_2.txt'))

input_train = (input_data[training_folds[1]] - mean)/std
input_valid = (input_data[validation_folds[1]] - mean)/std
output_train = output_data[training_folds[1]]
output_valid = output_data[validation_folds[1]]

Validation data

In [35]:
# Need 170GB in total
batch_size = 2**26

for i in range(1 + input_valid.shape[0]//batch_size): ## 1 + input_valid.shape[0]//batch_size
    if i == 0:
        image_valid = model.predict_on_batch(input_valid[i*batch_size:(i+1)*batch_size])
    else:
        image_valid = np.concatenate((image_valid, model.predict_on_batch(input_valid[i*batch_size:(i+1)*batch_size])), axis=0)
    K.clear_session()
    gc.collect()

In [36]:
# Outside of the [0, 100] range
(np.sum(np.where(image_valid < 0, True, False)) + np.sum(np.where(image_valid > 100, True, False)))/len(image_valid)

0.3348307656756078

In [37]:
# Outside of the [-1, 100] range
(np.sum(np.where(image_valid < -1, True, False)) + np.sum(np.where(image_valid > 100, True, False)))/len(image_valid)

0.08770510222524353

Training data

In [44]:
# Need 170GB in total
batch_size = 2**25

for i in range(1 + input_train.shape[0]//batch_size):
    if i == 0:
        image_train = model_fold_3.predict_on_batch(input_train[i*batch_size:(i+1)*batch_size])
    else:
        image_train = np.concatenate((image_train, model_fold_3.predict_on_batch(input_train[i*batch_size:(i+1)*batch_size])), axis=0)
    K.clear_session()
    gc.collect()

In [45]:
# Outside of the [0, 100] range
(np.sum(np.where(image_train < 0, True, False)) + np.sum(np.where(image_train > 100, True, False)))/len(image_train)

0.1338802594521362

In [46]:
# Outside of the [-1, 100] range
(np.sum(np.where(image_train < -1, True, False)) + np.sum(np.where(image_train > 100, True, False)))/len(image_train)

0.007492116454793429

Entire data set

In [47]:
image_all = np.concatenate((image_train, image_valid))

In [48]:
# Outside of the [0, 100] range
(np.sum(np.where(image_all < 0, True, False)) + np.sum(np.where(image_all > 100, True, False)))/len(image_all)

0.20086376122301144

In [49]:
# Outside of the [-1, 100] range
(np.sum(np.where(image_all < -1, True, False)) + np.sum(np.where(image_all > 100, True, False)))/len(image_all)

0.03422977825708332