### Evaluate the cross-validation models

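Throws an error when run inside a Slurm job:

*QStandardPaths: XDG_RUNTIME_DIR not set, defaulting to '/tmp/runtime-b309170'
qt.qpa.screen: QXcbConnection: Could not connect to display mlogin103:31.0
Could not connect to any X display.*

-> This error is raised inside `save_figure`: the Qt plotting backend needs an X display, which a batch job does not have. Selecting a non-interactive matplotlib backend (e.g. `Agg`) before `pyplot` is imported should avoid it.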

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
import gc

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.errors import ResourceExhaustedError
import tensorflow as tf

# Add path with my_classes to sys.path
path = '/home/b/b309170'
sys.path.insert(0, path + '/workspace_icon-ml/iconml_clc/')

import my_classes
from my_classes import write_infofile
from my_classes import read_mean_and_std

2022-04-20 16:04:03.175493: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# Restrict TensorFlow to the first GPU. On a node without a GPU the list is empty,
# so guard the index instead of failing with an IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
else:
    print('No GPU found, TensorFlow will run on the CPU')

In [4]:
# Won't run on a CPU node
try:
    # Prevents crashes of the code
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
    # Allow the growth of memory Tensorflow allocates (limits memory usage overall).
    # The loop used to iterate over an undefined name (`gpus`); the resulting NameError was
    # swallowed by a bare except, so memory growth was never actually enabled.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except (IndexError, RuntimeError, ValueError) as gpu_error:
    # IndexError: no GPU available. RuntimeError/ValueError: raised by TensorFlow when the
    # device configuration can no longer be changed or is invalid.
    print('GPU configuration skipped: %r'%gpu_error)

In [49]:
# Cloud Cover or Cloud Area?
output_var = 'clc' # Set output_var to one of {'clc', 'cl_area'}
# QUBICC only or QUBICC+NARVAL training data?
qubicc_only = True
# Do we evaluate a model trained on all data?
all_data_model = False

path_base = os.path.join(path, 'workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05')
path_data = os.path.join(path, 'my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data')

# Long names used in folder and file names. Fail right here on a typo in output_var
# instead of with a NameError further down.
output_var_names = {'clc': 'cloud_cover', 'cl_area': 'cloud_area'}
if output_var not in output_var_names:
    raise ValueError("output_var must be one of %s, got %r"%(sorted(output_var_names), output_var))
full_output_var_name = output_var_names[output_var]

# Models and figures live in a folder named after the output variable and the training data
if qubicc_only:
    output_folder = '%s_R2B5_QUBICC'%full_output_var_name
else:
    output_folder = '%s_R2B5_QUBICC+NARVAL'%full_output_var_name
path_model = os.path.join(path_base, 'saved_models', output_folder)
path_figures = os.path.join(path_base, 'figures', output_folder)
narval_output_file = '%s_output_narval.npy'%full_output_var_name
qubicc_output_file = '%s_output_qubicc.npy'%full_output_var_name

#### Load models

In [50]:
# File names of the three saved cross-validation models
fold_1, fold_2, fold_3 = ['cross_validation_column_based_fold_%d.h5'%fold_no for fold_no in (1, 2, 3)]

# Load the models in fold order from the model folder
model_fold_1, model_fold_2, model_fold_3 = [
    load_model(os.path.join(path_model, model_file)) for model_file in (fold_1, fold_2, fold_3)
]

#### Load data

In [51]:
def _stack_narval_and_qubicc(narval_file, qubicc_file):
    '''
        Load both files and stack them along the sample axis, NARVAL rows first.
        The QUBICC array is transposed before stacking.
    '''
    narval_part = np.load(narval_file)
    qubicc_part = np.transpose(np.load(qubicc_file))
    return np.concatenate((narval_part, qubicc_part), axis=0)

input_data = _stack_narval_and_qubicc(path_data + '/cloud_cover_input_narval.npy',
                                      path_data + '/cloud_cover_input_qubicc.npy')
output_data = _stack_narval_and_qubicc(os.path.join(path_data, narval_output_file),
                                       os.path.join(path_data, qubicc_output_file))

In [52]:
# Only the number of NARVAL samples is needed here, so memory-map the file instead of
# reading the whole array from disk a second time
samples_narval = np.load(os.path.join(path_data, narval_output_file), mmap_mode='r').shape[0]

In [53]:
# Rows are samples (NARVAL first, then QUBICC), columns are input features
samples_total, no_of_features = input_data.shape

#### Remove columns that were constant in at least one of the training folds

In [54]:
# Column indices 27-32 and 135-137 are dropped from the input
remove_fields = list(range(27, 33)) + list(range(135, 138))
# Guards against running this cell twice on already reduced data
assert no_of_features == 163
input_data = np.delete(input_data, remove_fields, axis=1)
no_of_features -= len(remove_fields)

#### Define cross-validation folds to recreate training and validation data

In [31]:
def set_training_validation_folds(samples_total, samples_narval):
    '''
        Build three cross-validation folds over the sample indices samples_narval..samples_total-1.

        The index range is cut into six blocks of equal length (a possible remainder at the end
        is never used for validation). Fold k validates on blocks k and k+3 and trains on all
        remaining indices of the range.

        Returns (training_folds, validation_folds), two lists holding three index arrays each.
    '''
    block_len = (samples_total - samples_narval)//6
    all_indices = np.arange(samples_narval, samples_total)

    training_folds, validation_folds = [], []
    for fold in range(3):
        # Note that this is a temporal split since time was the first dimension in the original tensor
        early_start = samples_narval + block_len*fold
        late_start = samples_narval + block_len*(fold+3)
        early_block = np.arange(early_start, early_start + block_len)
        late_block = np.arange(late_start, late_start + block_len)

        print(late_block)

        held_out = np.concatenate((early_block, late_block))
        validation_folds.append(held_out)
        training_folds.append(np.setdiff1d(all_indices, held_out))

    return training_folds, validation_folds

# We have to skip the NARVAL data if we do qubicc_only; otherwise the folds start at index 0
first_fold_index = samples_narval if qubicc_only else 0
training_folds, validation_folds = set_training_validation_folds(samples_total, first_fold_index)

[ 91933934  91933935  91933936 ... 120025759 120025760 120025761]
[120025762 120025763 120025764 ... 148117587 148117588 148117589]
[148117590 148117591 148117592 ... 176209415 176209416 176209417]


#### The data will need to be scaled according to the training folds

In [12]:
# One shared scaler instance; it is re-fitted on the training rows of each fold before use
scaler = StandardScaler()

#### Useful functions to plot results

In [13]:
def mean_clc_per_vertical_layer(model, input_data, output_data, batch_size=2**20):
    '''
        Mean model prediction and mean ground truth per output column (vertical layer).

        model: Object providing predict_on_batch
        input_data: 2D array with the (already scaled) model input, one sample per row
        output_data: 2D array with the ground truth, one column per vertical layer
        batch_size: Number of samples passed to predict_on_batch at once

        Returns (predicted_means, data_means): Two lists with one value per column.
        The predictions are clipped to [0, 100] before they are averaged.

        Raises ValueError if input_data contains no samples.
    '''
    # Ground truth mean of every output column
    clc_data_mean = [np.mean(output_data[:, layer], dtype=np.float64)
                     for layer in range(output_data.shape[1])]

    # Curiously it works best if we use predict_on_batch on small subsets of the data instead of
    # predict(..., batch_size=...). Stepping through the start indices also covers the last,
    # possibly shorter batch, which used to be dropped.
    predicted_batches = []
    for start in range(0, input_data.shape[0], batch_size):
        predicted_batches.append(model.predict_on_batch(input_data[start:start+batch_size]))
        K.clear_session()
        gc.collect()

    if not predicted_batches:
        raise ValueError('input_data does not contain any samples')

    # Concatenate once at the end instead of copying the growing array in every iteration
    pred_adj = np.minimum(np.maximum(np.concatenate(predicted_batches, axis=0), 0), 100)

    return list(np.mean(pred_adj, axis=0, dtype=np.float64)), clc_data_mean

In [14]:
def save_figure(fig_name, fig_title, model_predictions, valid_means=None, all_data_model=False):
    '''
        Plot the mean prediction per vertical layer for every model and save the figure as
        fig_name + '.pdf' in path_figures.

        Note that this figure truly is a different performance measure than the validation error.
        The reason is that the mean can in principle be good even when the model is really bad.

        fig_name: File name of the figure without the '.pdf' extension
        fig_title: Title of the plot
        model_predictions: Array of length 3 or 4, covers predictions from all three folds for a given TL setup
        valid_means: Array of length 3 or 4, covers validation means from all three folds for a given TL setup.
                     If None, only the predictions are drawn.
        all_data_model: If True, a fourth entry is labelled as the model trained on all data
   '''
    prediction_labels = ['Model Fold 1 Predictions', 'Model Fold 2 Predictions', 'Model Fold 3 Predictions']
    truth_labels = ['Fold 1 Truth', 'Fold 2 Truth', 'Fold 3 Truth']
    if all_data_model:
        prediction_labels.append('Model All Data Predictions')
        truth_labels.append('Truth')

    def pick_label(labels, index, fallback):
        # Generic label for entries beyond the named ones, so that no line ends up mislabelled
        return labels[index] if index < len(labels) else fallback%(index+1)

    # Vertical layers
    layers = np.linspace(5, 31, 27)
    fig = plt.figure(figsize=(11,7))
    # For model
    ax = fig.add_subplot(111, xlabel='Mean %s'%output_var, ylabel='Vertical layer', title=fig_title)

    # When the first three truth profiles coincide, a single black 'Truth' line is drawn.
    # The None check has to come first: valid_means is optional.
    shared_truth = (valid_means is not None and len(valid_means) >= 3
                    and np.array_equal(valid_means[0], valid_means[1])
                    and np.array_equal(valid_means[1], valid_means[2]))

    if valid_means is None or shared_truth:
        for i, prediction in enumerate(model_predictions):
            ax.plot(prediction, layers, label=pick_label(prediction_labels, i, 'Model %d Predictions'))
        if shared_truth:
            ax.plot(valid_means[0], layers, 'black', label='Truth')
    else:
        # One colour per model: solid line for the prediction, dashed line for its truth.
        # The fourth colour is needed for the model trained on all data.
        colors = ['g', 'b', 'r', 'k']
        for i, prediction in enumerate(model_predictions):
            color = colors[i % len(colors)]
            ax.plot(prediction, layers, color, label=pick_label(prediction_labels, i, 'Model %d Predictions'))
            ax.plot(valid_means[i], layers, '%s--'%color, label=pick_label(truth_labels, i, 'Truth %d'))

    ax.invert_yaxis()
    ax.legend()

    fig.savefig(os.path.join(path_figures, fig_name+'.pdf'))

#### Evaluate the models on the data

Add training and validation losses to the text files. <br>
Print results per vertical layer (respective validation set/NARVAL/QUBICC)

In [14]:
# Results of the evaluation, one list entry per fold
train_losses, valid_losses = [], []
valid_means, valid_model_predictions = [], []
narval_means, narval_model_predictions = [], []
qubicc_means, qubicc_model_predictions = [], []
qubicc_month_0, qubicc_model_pred_month_0 = [], []
qubicc_month_1, qubicc_model_pred_month_1 = [], []
qubicc_month_2, qubicc_model_pred_month_2 = [], []

def _mean_clc_with_fallback(model, inputs, outputs, label):
    '''
        Call mean_clc_per_vertical_layer with its default batch size and retry once with a
        batch size of 2**15 if the GPU runs out of memory. label names the data set in the message.
    '''
    try:
        return mean_clc_per_vertical_layer(model, inputs, outputs)
    except ResourceExhaustedError:
        print('Resource Exhausted %s'%label)
        return mean_clc_per_vertical_layer(model, inputs, outputs, batch_size=2**15)

fold_models = [model_fold_1, model_fold_2, model_fold_3]
monthly_means = [qubicc_month_0, qubicc_month_1, qubicc_month_2]
monthly_predictions = [qubicc_model_pred_month_0, qubicc_model_pred_month_1, qubicc_model_pred_month_2]
# The QUBICC samples are split into three parts of equal length, one per month
qubicc_month = (samples_total - samples_narval)//3

for i, model in enumerate(fold_models):
    filename = 'cross_validation_column_based_fold_%d'%(i+1)

    #Standardize according to the fold (select the training rows only once)
    train_inputs_raw = input_data[training_folds[i]]
    scaler.fit(train_inputs_raw)

    #Load the data for the respective fold
    input_train = scaler.transform(train_inputs_raw)
    del train_inputs_raw
    input_valid = scaler.transform(input_data[validation_folds[i]])
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]

    ## Training and validation losses
    train_loss = model.evaluate(input_train, output_train, verbose=2, batch_size=10**5)
    valid_loss = model.evaluate(input_valid, output_valid, verbose=2, batch_size=10**5)

    # Clear up some memory
    del input_train, output_train
    gc.collect()

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
        file.write('Unbounded training loss: %.4f\n'%(train_loss))
        file.write('Unbounded validation loss: %.4f\n'%(valid_loss))

    ## Compute mean cloud cover per vertical layer
    # On the respective validation sets (QUBICC and NARVAL).
    # The out-of-memory message used to say 'Qubicc' here as well.
    clc_pred_mean, clc_data_mean = _mean_clc_with_fallback(model, input_valid, output_valid, 'Validation')
    valid_means.append(clc_data_mean)
    valid_model_predictions.append(clc_pred_mean)

    # Clear up some memory
    del input_valid, output_valid
    gc.collect()

    # For NARVAL
    input_narval = scaler.transform(input_data[:samples_narval])
    output_narval = output_data[:samples_narval]
    clc_pred_mean, clc_data_mean = _mean_clc_with_fallback(model, input_narval, output_narval, 'Narval')
    narval_means.append(clc_data_mean)
    narval_model_predictions.append(clc_pred_mean)

    # Clear up some memory
    del input_narval, output_narval
    gc.collect()

    # For QUBICC
    input_qubicc = scaler.transform(input_data[samples_narval:])
    output_qubicc = output_data[samples_narval:]
    clc_pred_mean, clc_data_mean = _mean_clc_with_fallback(model, input_qubicc, output_qubicc, 'Qubicc')
    qubicc_means.append(clc_data_mean)
    qubicc_model_predictions.append(clc_pred_mean)

    # Clear up some memory
    del input_qubicc, output_qubicc
    gc.collect()

    # QUBICC months
    for month in range(3):
        first_ind = samples_narval + month*qubicc_month
        last_ind = samples_narval + (month+1)*qubicc_month
        input_qubicc = scaler.transform(input_data[first_ind:last_ind])
        output_qubicc = output_data[first_ind:last_ind]
        clc_pred_mean, clc_data_mean = _mean_clc_with_fallback(model, input_qubicc, output_qubicc, 'Qubicc')
        monthly_means[month].append(clc_data_mean)
        monthly_predictions[month].append(clc_pred_mean)

    # Clear up some memory
    del input_qubicc, output_qubicc
    gc.collect()

In [15]:
# Plot results: one figure per data set, each described by
# (file name, title, model predictions, ground truth means)
figure_specs = [
    ('cross_validation_validation_means', 'Column-based models on the respective validation sets',
     valid_model_predictions, valid_means),
    ('cross_validation_narval', 'Column-based models on the NARVAL data',
     narval_model_predictions, narval_means),
    ('cross_validation_qubicc', 'Column-based models on the QUBICC data',
     qubicc_model_predictions, qubicc_means),
    # Qubicc months (I checked below that the order is hc2, then hc3, then hc4.)
    ('cross_validation_qubicc_hc2', 'Column-based models on the QUBICC data, November 2004',
     qubicc_model_pred_month_0, qubicc_month_0),
    ('cross_validation_qubicc_hc3', 'Column-based models on the QUBICC data, April 2005',
     qubicc_model_pred_month_1, qubicc_month_1),
    ('cross_validation_qubicc_hc4', 'Column-based models on the QUBICC data, November 2005',
     qubicc_model_pred_month_2, qubicc_month_2),
]

for spec_name, spec_title, spec_predictions, spec_means in figure_specs:
    save_figure(spec_name, spec_title, spec_predictions, spec_means, all_data_model)

In [16]:
# In case we want to reproduce the plots without running everything again:
with open(os.path.join(path_figures, 'values_for_figures.txt'), 'w') as file:
    file.write('On validation sets\n')
    file.write(str(valid_means))
    file.write(str(valid_model_predictions))
    file.write('\n\nNARVAL data\n')
    file.write(str(narval_means))
    file.write(str(narval_model_predictions))
    file.write('\n\nQubicc data\n')
    file.write(str(qubicc_means))
    file.write(str(qubicc_model_predictions))
    file.write('\n\nQubicc data, November 2004\n')
    file.write(str(qubicc_month_0))
    file.write(str(qubicc_model_pred_month_0))
    file.write('\n\nQubicc data, April 2005\n')
    file.write(str(qubicc_month_1))
    file.write(str(qubicc_model_pred_month_1))
    file.write('\n\nQubicc data, November 2005\n')
    file.write(str(qubicc_month_2))
    file.write(str(qubicc_model_pred_month_2))

In [17]:
# The QUBICC data is loaded in the order that I would expect (hc2, then hc3, then hc4)

# NOTE: `xr` (xarray) is not imported in the import cell of this notebook,
# it has to be imported (import xarray as xr) before this cell can run.

# Use a dedicated name here: assigning to `path` would overwrite the base path
# that the configuration cell uses to build path_base and path_data.
path_qubicc_raw = '/pf/b/b309170/my_work/QUBICC/data_var_vertinterp_R02B05/'
resolution = 'R02B05'

# Order of experiments: print the first time stamp of each third of the time axis
DS = xr.open_mfdataset(path_qubicc_raw+'hus/*'+resolution+'.nc', combine='by_coords')
for third in range(3):
    print(DS.time[third*len(DS.time)//3])

#### Compute bounded losses

We also save the scaling parameters for the fold-based models, as we haven't done that yet. (The code that writes them is commented out in the cell below because it only needs to run once.)

In [18]:
# Takes long!
def compute_bounded_loss(model, input_data, output_data, batch_size=2**20):
    '''
        Mean squared error between the ground truth and the model predictions
        after the predictions have been clipped to [0, 100].

        model: Object providing predict_on_batch
        input_data: Array with the (already scaled) model input, one sample per row
        output_data: Array with the ground truth belonging to input_data
        batch_size: Number of samples passed to predict_on_batch at once

        Raises ValueError if input_data contains no samples.
    '''
    # Stepping through the start indices never produces an empty batch, not even when the
    # number of samples is a multiple of batch_size
    predicted_batches = []
    for start in range(0, input_data.shape[0], batch_size):
        predicted_batches.append(model.predict_on_batch(input_data[start:start+batch_size]))
        K.clear_session()
        gc.collect()

    if not predicted_batches:
        raise ValueError('input_data does not contain any samples')

    # Concatenate once at the end instead of copying the growing array in every iteration
    pred_adj = np.minimum(np.maximum(np.concatenate(predicted_batches, axis=0), 0), 100)

    # Mean Squared Error
    return np.mean((pred_adj - output_data)**2, dtype=np.float64)

In [None]:
seed = 10

fold_models = [model_fold_1, model_fold_2, model_fold_3]

for i, model in enumerate(fold_models):
    filename = 'cross_validation_column_based_fold_%d'%(i+1)

    #Standardize according to the fold (select the training rows only once)
    train_inputs_raw = input_data[training_folds[i]]
    scaler.fit(train_inputs_raw)

#     # We save the scaling parameters in a file [only once]
#     seed_i = int(str(seed) + str(i))
#     with open(path_model+'/scaler_%d.txt'%seed_i, 'a') as file:
#         file.write('Standard Scaler mean values:\n')
#         file.write(str(scaler.mean_))
#         file.write('\nStandard Scaler standard deviation:\n')
#         file.write(str(np.sqrt(scaler.var_)))

#     # Define remove_fields
#     remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]

#     # Taken from preprocessing_narval
#     # (the layer loops use their own index so that the fold index i is not overwritten)
#     input_variables = []
#     variables = ['qv', 'qc', 'qi', 'temp', 'pres', 'zg']
#     for el in variables:
#         for layer in range(21, 48):
#             input_variables.append(el+'_%d'%layer)
#     input_variables.append('fr_land')

#     in_and_out_variables = input_variables.copy()
#     variables = [output_var]
#     for el in variables:
#         for layer in range(21, 48):
#             in_and_out_variables.append(el+'_%d'%layer)

#     in_and_out_variables = np.delete(in_and_out_variables, remove_fields)
#     input_variables = np.delete(input_variables, remove_fields)

#     # Write the accompanying info-file [only once]
#     with open(os.path.join(path_model, filename + '.txt'), 'a') as file:
#         write_infofile(file, str(in_and_out_variables), str(input_variables), path_model, path_data, seed_i)

    print(i)

    #Load the data for the respective fold
    input_train = scaler.transform(train_inputs_raw)
    del train_inputs_raw
    input_valid = scaler.transform(input_data[validation_folds[i]])
    output_train = output_data[training_folds[i]]
    output_valid = output_data[validation_folds[i]]

    train_loss = compute_bounded_loss(model, input_train, output_train, batch_size=2**17)
    valid_loss = compute_bounded_loss(model, input_valid, output_valid, batch_size=2**17)

    with open(os.path.join(path_model, filename+'.txt'), 'a') as file:
        file.write('Bounded training loss: %.4f\n'%(train_loss))
        file.write('Bounded validation loss: %.4f\n'%(valid_loss))

    # Release the arrays of this fold before the next fold allocates its own
    del input_train, input_valid, output_train, output_valid
    gc.collect()

0


### How often are the predictions of the model from split 2 outside [0, 100]?

In [55]:
# The remaining cells analyse the model that was loaded for the second fold
model = model_fold_2

In [56]:
#Standardize the input data.
# For 'cl_area' the mean and variance per input feature are hard-coded below
# (presumably the scaling of the second fold's training data -- TODO confirm).
# The entries listed in remove_fields are deleted so that the vectors match the reduced input_data.
# For any other output_var the values are read from the info file of fold 2.
if output_var == 'cl_area':
    mean = [2.57681365e-06, 2.60161901e-06, 2.86229890e-06, 3.49524686e-06,  6.32444387e-06, 1.62852938e-05, 4.26197236e-05, 1.00492283e-04,  2.10850387e-04, 3.96992495e-04, 6.62768743e-04, 1.00639902e-03,  1.42273038e-03, 1.89269379e-03, 2.42406883e-03, 2.97704256e-03,  3.52303812e-03, 4.15430913e-03, 4.89285256e-03, 5.71192194e-03,  6.58451740e-03, 7.47955824e-03, 8.42949837e-03, 9.18162558e-03,  9.58900058e-03, 9.80246788e-03, 9.98071441e-03, 1.99662055e-48,  1.97795858e-36, 2.80309683e-33, 1.17327341e-31, 1.33296743e-30,  1.45585956e-29, 2.57897497e-16, 1.24502901e-08, 5.43912468e-07,  1.97554777e-06, 2.10205332e-06, 3.45718981e-06, 4.17987790e-06,  4.89876027e-06, 6.03250921e-06, 6.71487544e-06, 7.71281746e-06,  9.96528417e-06, 1.40351017e-05, 1.87534642e-05, 2.15523809e-05,  1.77725032e-05, 1.10700238e-05, 6.98113679e-06, 5.98240074e-06,  8.03857856e-06, 1.55278994e-05, 1.98903187e-13, 1.45240003e-10,  2.39426913e-08, 5.63226688e-07, 3.10209365e-06, 6.64324795e-06,  8.83422658e-06, 9.89681102e-06, 9.97096463e-06, 7.74324652e-06,  4.95774608e-06, 2.61087000e-06, 1.29680563e-06, 7.46596833e-07,  4.94444102e-07, 3.51674311e-07, 2.61199355e-07, 2.03219747e-07,  1.66907845e-07, 1.42871199e-07, 1.25114261e-07, 1.11956533e-07,  1.02782118e-07, 9.86031894e-08, 9.95790399e-08, 1.06733810e-07,  1.26921172e-07, 2.10924633e+02, 2.07944695e+02, 2.05115507e+02,  2.03204784e+02, 2.06103772e+02, 2.12329817e+02, 2.19299382e+02,  2.26348890e+02, 2.33352039e+02, 2.40105681e+02, 2.46401637e+02,  2.52153555e+02, 2.57207037e+02, 2.61575645e+02, 2.65446543e+02,  2.68951996e+02, 2.72093136e+02, 2.74765728e+02, 2.76963041e+02,  2.78775116e+02, 2.80398659e+02, 2.81959850e+02, 2.83501227e+02,  2.84935364e+02, 2.86119192e+02, 2.86867707e+02, 2.87046277e+02,  4.78805278e+03, 6.25615004e+03, 8.06726288e+03, 1.03500805e+04,  1.30603494e+04, 1.61944127e+04, 1.97232230e+04, 2.36181577e+04,  2.78401230e+04, 3.23377105e+04, 3.70511232e+04, 4.19785078e+04,  4.70365400e+04, 5.21124420e+04, 
5.72512536e+04, 6.23517142e+04,  6.72989145e+04, 7.20972394e+04, 7.66740332e+04, 8.09510300e+04,  8.49437983e+04, 8.85136468e+04, 9.16490946e+04, 9.42529147e+04,  9.63348759e+04, 9.77633315e+04, 9.86144363e+04, 2.07846270e+04,  1.91533379e+04, 1.76039570e+04, 1.61343240e+04, 1.47416307e+04,  1.34230525e+04, 1.21768751e+04, 1.10012039e+04, 9.89431495e+03,  8.85470770e+03, 7.88104473e+03, 6.97198713e+03, 6.12617252e+03,  5.34218664e+03, 4.61854836e+03, 3.95376191e+03, 3.34629894e+03,  2.79465640e+03, 2.29750295e+03, 1.85381761e+03, 1.46282067e+03,  1.12390793e+03, 8.36771545e+02, 6.01482480e+02, 4.18667943e+02,  2.90324051e+02, 2.20122534e+02, 2.57179068e-01]
    var = [2.77480154e-14, 7.25973243e-14, 3.99634524e-13, 2.15710125e-12, 8.11907271e-12, 7.61775265e-11, 8.77316868e-10, 6.92865621e-09, 3.75024285e-08, 1.46188082e-07, 3.94244487e-07, 8.66497643e-07, 1.62353588e-06, 2.62130562e-06, 3.83192714e-06, 4.99991907e-06, 6.21865064e-06, 8.01241132e-06, 1.03693004e-05, 1.31320424e-05, 1.64074160e-05, 2.06036189e-05, 2.64319947e-05, 3.14889456e-05, 3.38882152e-05, 3.50739205e-05, 3.62542223e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.75468927e-24, 3.77859683e-14, 8.81399575e-12, 7.67332158e-11, 1.05524206e-10, 2.10047036e-10, 2.74443979e-10, 3.28792625e-10, 4.40183561e-10, 5.82035529e-10, 7.95968863e-10, 1.27540590e-09, 2.26932659e-09, 3.55040982e-09, 4.36412766e-09, 3.36921887e-09, 1.94898287e-09, 1.31932279e-09, 1.47840427e-09, 3.00631977e-09, 1.02475319e-08, 8.08985898e-20, 7.53000365e-15, 6.63133983e-12, 7.29001841e-11, 3.89494832e-10, 7.85358520e-10, 9.33067822e-10, 1.00578451e-09, 1.04125431e-09, 7.02788016e-10, 3.35540989e-10, 1.10604147e-10, 3.62540093e-11, 2.00711744e-11, 1.03744494e-11, 4.94660958e-12, 2.65516680e-12, 1.73694133e-12, 1.30232159e-12, 1.05309331e-12, 8.92800996e-13, 7.95007256e-13, 7.34063569e-13, 6.97679020e-13, 6.71116696e-13, 6.55904608e-13, 7.00761126e-13, 2.13386790e+01, 2.83620543e+01, 4.35373333e+01, 6.58895818e+01, 4.11327685e+01, 1.25161844e+01, 1.04050676e+01, 3.03990896e+01, 5.92444799e+01, 8.44920290e+01, 1.01262923e+02, 1.09040431e+02, 1.10571059e+02, 1.10489555e+02, 1.11241419e+02, 1.15661026e+02, 1.22910740e+02, 1.29881594e+02, 1.37074237e+02, 1.45675391e+02, 1.56059349e+02, 1.66116415e+02, 1.76086646e+02, 1.87903449e+02, 2.02713567e+02, 2.17913355e+02, 2.36656922e+02, 2.74458895e+04, 5.11391869e+04, 1.04446470e+05, 2.32219429e+05, 4.84596804e+05, 8.52038001e+05, 1.28343711e+06, 1.71290868e+06, 2.08304820e+06, 2.37089800e+06, 2.59898548e+06, 2.83694575e+06, 3.17998519e+06, 3.73075481e+06, 4.63958852e+06, 6.01537006e+06, 
7.86646627e+06, 1.02806323e+07, 1.32050071e+07, 1.65713281e+07, 2.04336800e+07, 2.44266133e+07, 2.84576052e+07, 3.20684854e+07, 3.51439430e+07, 3.73393052e+07, 3.88460516e+07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.78852050e+00, 2.02838259e+01, 5.57561156e+01, 1.42624190e+02, 3.40661029e+02, 7.63342522e+02, 1.66784615e+03, 3.61855509e+03, 7.50449371e+03, 1.44226070e+04, 2.53310370e+04, 4.07648391e+04, 6.09571406e+04, 8.56022800e+04, 1.13726811e+05, 1.44066944e+05, 1.75744678e+05, 2.07956727e+05, 2.39436769e+05, 2.68796829e+05, 2.94535011e+05, 3.15079095e+05, 3.29944182e+05, 3.36064654e+05, 1.78957540e-01]
    # The standard deviation is the square root of the stored variance
    std = np.sqrt(var)

    # Drop the entries of the input columns that were removed from input_data
    mean = np.delete(mean, remove_fields)
    std = np.delete(std, remove_fields)
else:
    # read_mean_and_std comes from my_classes; presumably it parses the scaler values
    # from the fold's info file -- verify against its implementation
    mean, std = read_mean_and_std(os.path.join(path_model, 
                                               'cross_validation_column_based_fold_2.txt'))

# Index 1 is the second fold, matching the model selected for this analysis
second_fold_train_rows = training_folds[1]
second_fold_valid_rows = validation_folds[1]

# Standardize the inputs with the mean and standard deviation of that fold
input_train = (input_data[second_fold_train_rows] - mean)/std
input_valid = (input_data[second_fold_valid_rows] - mean)/std
output_train = output_data[second_fold_train_rows]
output_valid = output_data[second_fold_valid_rows]

Validation data

In [57]:
# Need 170GB in total
batch_size = 2**25

# Predict batch by batch. Stepping through the start indices never passes an empty batch to
# the model, and the batches are concatenated once instead of copying the growing array
# in every iteration.
valid_batches = []
for start in range(0, input_valid.shape[0], batch_size):
    valid_batches.append(model.predict_on_batch(input_valid[start:start+batch_size]))
    K.clear_session()
    gc.collect()

image_valid = np.concatenate(valid_batches, axis=0)
del valid_batches
gc.collect()



In [58]:
# Fraction of validation predictions outside of the [0, 100] range
(np.sum(image_valid < 0) + np.sum(image_valid > 100))/np.size(image_valid)

0.2846731144255428

In [59]:
# Fraction of validation predictions outside of the [-1, 100] range
(np.sum(image_valid < -1) + np.sum(image_valid > 100))/np.size(image_valid)

0.011236769903596427

Training data

In [60]:
# Need 170GB in total
batch_size = 2**25

# Predict batch by batch. Stepping through the start indices never passes an empty batch to
# the model, and the batches are concatenated once instead of copying the growing array
# in every iteration.
train_batches = []
for start in range(0, input_train.shape[0], batch_size):
    train_batches.append(model.predict_on_batch(input_train[start:start+batch_size]))
    K.clear_session()
    gc.collect()

image_train = np.concatenate(train_batches, axis=0)
del train_batches
gc.collect()



In [61]:
# Fraction of training predictions outside of the [0, 100] range
(np.sum(image_train < 0) + np.sum(image_train > 100))/np.size(image_train)

0.28111301628816043

In [62]:
# Fraction of training predictions outside of the [-1, 100] range
(np.sum(image_train < -1) + np.sum(image_train > 100))/np.size(image_train)

0.011329645563319297

All data

In [63]:
# Predictions on training and validation samples of the second fold, stacked along the sample axis
image_all = np.concatenate([image_train, image_valid], axis=0)

In [64]:
# Fraction of all predictions outside of the [0, 100] range
(np.sum(image_all < 0) + np.sum(image_all > 100))/np.size(image_all)

0.28229971564616607

In [65]:
# Fraction of all predictions outside of the [-1, 100] range
(np.sum(image_all < -1) + np.sum(image_all > 100))/np.size(image_all)

0.011298687010629364