### Evaluate the SFS (sequential feature selection) NNs on ERA5 data

In [None]:
# Need 960GB!

In [1]:
# Import libraries
import os
import gc
import sys
import json
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow import nn 
from tensorflow.keras import backend as K 

# Make the project-local modules importable.
# Python does not expand '~' in sys.path entries, so the home directory has to
# be resolved explicitly before the path is registered.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/'))
import my_classes
from my_classes import read_mean_and_std
from my_classes import load_data

sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import append_dict_to_json

def _int_from_argv(position, name):
    '''
        Return the command-line argument at index `position` as an int.

        Raises a ValueError that names the expected argument if it is missing
        or not an integer (e.g. when the code runs inside a Jupyter kernel,
        where sys.argv holds the kernel's own options).
    '''
    try:
        return int(sys.argv[position])
    except (IndexError, ValueError) as err:
        raise ValueError(
            "Expected an integer for '%s' as command-line argument %d "
            "(usage: <script> <bs_exp> <SFS_MODEL>); got sys.argv = %r"
            % (name, position, sys.argv)) from err

# Which networks to evaluate:
#   'trained'  -> the 'SFS_NN_<n>_tl_<subset>_seed_<seed>' models from the ERA5 results folder
#   'original' -> the saved SFS NN itself
model_type = 'trained' # ['trained', 'original']
if model_type not in ('trained', 'original'):
    # Any other value would make the evaluation loop silently do nothing
    raise ValueError("model_type must be 'trained' or 'original', got %r" % model_type)
print(model_type)

# Good performance with bs_exp = 23 and on a gpu
# OOM when bs_exp too high, but possibly bs_exp > 23 would be better.
# NOTE(review): bs_exp is not referenced by any of the visible cells
bs_exp = _int_from_argv(1, 'bs_exp') # 23
print(bs_exp)

# num_cells = int(sys.argv[2]) #[1, 8, 32]
# Index of the SFS NN (= its number of input features) that is evaluated
SFS_MODEL = _int_from_argv(2, 'SFS_MODEL') #[1, ..., 10]

trained


**Load the data**

In [2]:
# Load the data
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'u', 'v', 'zg', 'fr_land', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars) #!

TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# fr_land gets a new axis 1 and is repeated VLAYERS times along it
# (presumably so that it has the same 3D shape as the other fields -- TODO confirm)
data_dict['fr_land'] = np.repeat(np.expand_dims(data_dict['fr_land'], axis=1), VLAYERS, axis=1)

# Add magnitude of horizontal wind
data_dict['U'] = np.sqrt(data_dict['u']**2 + data_dict['v']**2)
del data_dict['u']
del data_dict['v']

# Add RH
T0 = 273.15
r = 0.00263*data_dict['pa']*data_dict['q']*np.exp((17.67*(data_dict['t']-T0))/(data_dict['t']-29.65))**(-1)
data_dict['rh'] = r

# Add ps: pressure on the last vertical level, repeated over all levels
ps = np.repeat(np.expand_dims(data_dict['pa'][:, -1], axis=1), VLAYERS, axis=1)
data_dict['ps'] = ps

# Removing four upper-most levels
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output
data_output = data_dict['cc']
del data_dict['cc']

# ## LESS DATA ## #!
# for key in data_dict.keys():
#     data_dict[key] = data_dict[key][0::3]
# data_output = data_output[0::3]
# TIMESTEPS = TIMESTEPS//3

# Requires 15 - 20 minutes: Takes around one minute per folder. There are 14 folders
# Requires 133G: Requires 9.5G = 56G/(2*3) per folder

# Load derivatives
# '~' is not expanded by os.listdir/np.load, so resolve the home directory once
HVCG_DIR = os.path.expanduser('~/bd1179_work/ERA5/hvcg_data')

for folder in os.listdir(HVCG_DIR):
    if folder.endswith('z'):
        # Load all filenames in the folder containing the derivatives. The filenames are sorted chronologically.
        npy_file_names = sorted(os.listdir(os.path.join(HVCG_DIR, folder)))

        # Collect the pieces and concatenate once at the end; growing the array
        # file by file would re-copy everything loaded so far on every iteration.
        # The empty first piece fixes shape and dtype, also for an empty folder.
        npy_pieces = [np.zeros((0, VLAYERS-4, HFIELDS), dtype=np.float32)]

        for file_name in npy_file_names: #!
            # Load three-hourly data and convert directly to float32
            npy_file = np.load(os.path.join(HVCG_DIR, folder, file_name), mmap_mode='r')
            # np.array copies the selected time steps out of the memory map
            npy_pieces.append(np.array(npy_file[0::3], dtype=np.float32))

        data_dict[folder] = np.concatenate(npy_pieces, axis=0)

one
100.0


**All features**

In [3]:
# Names of all available input features; their position in this list is the
# column index that `loc` assigns to them.
features_nn = [
    'q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'q_z', 'q_zz', 'clwc_z', 'clwc_zz', 'ciwc_z', 'ciwc_zz',
    't_z', 't_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Feature name -> column index
loc = {name: column for column, name in enumerate(features_nn)}

**Cast dict into ndarray and reshape**

In [4]:
# Stack all features into one float32 array of shape (samples, features).
# NOTE: this cell is not idempotent -- it removes the features from data_dict
# and overwrites data_output with its flattened version.

# Fill a preallocated (features, samples) array one row at a time. Each source
# array is dropped from data_dict right after it has been copied, so not all
# of them have to be kept alive next to the stacked copy.
n_samples = data_dict[features_nn[0]].size
data_array_not_T = np.empty((len(features_nn), n_samples), dtype=np.float32)
for row, key in enumerate(features_nn):
    data_array_not_T[row] = np.reshape(data_dict[key], -1)
    del data_dict[key]

# Transpose (a view, no copy) so that rows are samples and columns are features
data_array = np.transpose(data_array_not_T)
data_output = np.reshape(data_output, -1)

del data_array_not_T
gc.collect()

370

**Loop through SFS NNs**

In [5]:
def which_features(sfs_ind):
    '''
        Extract the relevant feature names and their order for a given SFS NN.

        Reads the text file saved alongside SFS NN number `sfs_ind` and parses
        the line that follows the 'The (order of) input variables' header. That
        line is expected to look like ['clw' 'cli' 'ta'] (names in quotes,
        separated by whitespace, enclosed in brackets). If the header occurs
        several times, the last occurrence is used.

        Returns the list of feature names, renamed to the ERA5 naming where
        `conv` has an entry for them.
        Raises ValueError if the file contains no such header followed by a line.
    '''
    # Names that differ between the model file and ERA5.
    # NOTE(review): other derivative names (e.g. 'cli_z', 'clw_z') are not translated -- confirm they never occur.
    conv = {'cli': 'ciwc', 'clw': 'clwc', 'ta': 't', 'ta_z': 't_z'}

    # open() does not expand '~', so resolve the home directory explicitly
    model_file = os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models/neighborhood_based_sfs_cl_area_no_features_%d.txt'%sfs_ind)
    with open(model_file, 'r') as file:
        lines = file.readlines()

    variables_line = None
    for k in range(len(lines) - 1):
        if lines[k].startswith('The (order of) input variables'):
            variables_line = lines[k+1]
    if variables_line is None:
        raise ValueError("No 'The (order of) input variables' entry found in %s"%model_file)

    # Drop surrounding whitespace/newline and the enclosing brackets, then split
    # on any run of whitespace so that repeated spaces yield no empty names
    out_line = variables_line.strip()[1:-1].split()
    for ind in range(len(out_line)):
        # Remove the quotes around the name
        out_line[ind] = out_line[ind].strip('\'"')
        # Rename if the name is different in ERA5
        if out_line[ind] in conv:
            out_line[ind] = conv[out_line[ind]]
    return out_line

**Final cell**

In [6]:
def predict_on_small_batches(model, input_data, batch_size=2**20):
    '''
        Run `model` on `input_data` in consecutive chunks of `batch_size` rows.

        model: object providing predict_on_batch(array)
        input_data: array whose first axis enumerates the samples
        batch_size: maximum number of samples passed to the model at once

        Returns the predictions of all chunks concatenated along axis 0 and
        clipped to the interval [0, 100].
    '''
    # Using predict_on_batch on the entire dataset results in an OOM error
    # Curiously it works best if we use predict_on_batch on small subsets of the data instead of predict(..., batch_size=...)

    # Ceiling division, so that no empty trailing batch is sent to the model when
    # the number of samples is an exact multiple of batch_size. Keep at least one
    # call so that empty input still returns the model's (empty) output.
    n_samples = input_data.shape[0]
    n_batches = max(1, -(-n_samples // batch_size))

    # Collect the pieces and concatenate once instead of re-copying the growing result
    batch_predictions = []
    for i in range(n_batches):
        batch_predictions.append(model.predict_on_batch(input_data[i*batch_size:(i+1)*batch_size]))
        K.clear_session()
        gc.collect()

    a = np.concatenate(batch_predictions, axis=0)

    # Clip the predictions to [0, 100]
    pred_adj = np.minimum(np.maximum(a, 0), 100)

    return pred_adj

In [7]:
# Variance of the target values; it does not depend on the evaluated network,
# so it is computed a single time and reused for every R2-value
VAR = np.var(data_output)

# Objects that load_model needs in order to resolve names stored in the NN files
custom_objects = {'leaky_relu': nn.leaky_relu}

In [8]:
# '~' is not expanded by open()/load_model, so resolve the home directory explicitly.
# Folder with the saved SFS NNs and their text files (containing mean and std)
SFS_MODELS_DIR = os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models')
# Folder with the 'SFS_NN_<n>_tl_<subset>_seed_<seed>' models
TL_MODELS_DIR = os.path.expanduser('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/models')
# File that collects the results of all evaluated models
RESULTS_JSON = os.path.expanduser('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/sfs_based_nn.json')


def _evaluate_and_store(model, parent_key, inputs_scaled):
    '''
        Evaluate `model` on `inputs_scaled` against data_output and append the
        resulting MSE and R2-value under `parent_key` to the results json-file.
    '''
    ## Evaluate model on scaled data
    predictions = predict_on_small_batches(model, inputs_scaled)

    # Mean-squared error
    mse = np.mean((predictions[:, 0] - data_output)**2)
    print(mse)

    # R2-value
    r2 = 1 - mse/VAR

    ## Write results to json-file
    results = {parent_key: {'MSE': float(mse), 'R2': float(r2)}}
    append_dict_to_json(results, RESULTS_JSON)


for sfs_ind in range(SFS_MODEL, SFS_MODEL + 1):
    # Select the appropriate features: the first sfs_ind names listed in the model's text file.
    # A separate name is used so that the module-level features_nn list is not overwritten.
    selected_features = which_features(sfs_ind)
    features_inds = [loc[selected_features[k]] for k in range(sfs_ind)]
    data_array_sfs_nn = data_array[:, features_inds]

    # The NNs with 4-7 features are stored under 'hyperparameter_tests' with extra suffixes
    if sfs_ind in [4,5,6,7]:
        if sfs_ind == 4:
            thrd_lay = 'False'
        else:
            thrd_lay = 'True'
        model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_%d_False_%s_16'%(sfs_ind,thrd_lay)
        if sfs_ind == 7:
            model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_7_True_True_32'
    else:
        model_name = 'neighborhood_based_sfs_cl_area_no_features_%d'%sfs_ind

    ## Get mean and std from the model-file
    mean, std = read_mean_and_std(os.path.join(SFS_MODELS_DIR, model_name + '.txt'))

    ## Scale all data using this mean and std
    data_array_scaled = (data_array_sfs_nn - np.float32(mean))/np.float32(std)

    # Case 1
    if model_type == 'trained':
        for subset_exp in [2]:
            for seed in 10*np.arange(1, 7):
                parent_key = 'SFS_NN_%d_tl_%d_seed_%d'%(sfs_ind, subset_exp, seed)
                model_file = os.path.join(TL_MODELS_DIR, parent_key + '.h5')

                ## Load model
                # Best effort: a model that cannot be loaded is skipped, but the reason
                # is reported. Catching Exception (not a bare except) keeps
                # KeyboardInterrupt and SystemExit working.
                try:
                    model = load_model(model_file, custom_objects)
                except Exception as err:
                    print('Skipping %s: %s'%(model_file, err))
                    continue

                _evaluate_and_store(model, parent_key, data_array_scaled)

    # Case 2
    elif model_type == 'original':
        parent_key = 'SFS_NN_%d_no_tl'%(sfs_ind)

        model = load_model(os.path.join(SFS_MODELS_DIR, model_name + '.h5'), custom_objects)

        _evaluate_and_store(model, parent_key, data_array_scaled)

628.8782
New file created or first entry added
266.2604
261.9217
262.42874
253.12701
338.86075
289.49634
273.83743
257.10925


KeyboardInterrupt: 