### Evaluate the SFS NNs on higher-res DYAMOND data

- Data path: /home/b/b309170/bd1179_work/DYAMOND/hcg_data*

In [None]:
# Need 960GB!

In [2]:
# Import libraries
import os
import gc
import sys
import json
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow import nn 
from tensorflow.keras import backend as K 

sys.path.insert(0, os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec2_data/')
import my_classes
from my_classes import read_mean_and_std
from my_classes import load_data
from functions import append_dict_to_json

# Good performance with bs_exp = 23 and on a gpu
# OOM when bs_exp too high, but possibly bs_exp > 23 would be better.
# NOTE(review): presumably the base-2 exponent of the prediction batch size
# (batch size = 2**bs_exp) -- confirm against the code that consumes it.
bs_exp = 20
print(bs_exp)

# Number of the SFS NN to evaluate; the evaluation loop iterates over
# range(SFS_MODEL, SFS_MODEL + 1), i.e. over exactly this one model.
# num_cells = int(sys.argv[2]) #[1, 8, 32]
SFS_MODEL = 6

23


**Load the data**

In [None]:
# Load the data
order_of_vars = ['q', 'qc', 'qi', 't', 'pres', 'u', 'v', 'zg', 'fr_land', 'clc']

data_path = '/home/b/b309170/bd1179_work/DYAMOND/hcg_data_r2b7'
data_dict = load_data(source='split_by_var_name', days='all', vert_interp=False,
                      resolution='R02B07', order_of_vars=order_of_vars, path=data_path)

TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Expand the time-independent fields to three dimensions:
# fr_land is repeated along a new vertical axis and then along a new time axis,
# zg is transposed and then repeated along a new time axis.
fr_land_2d = np.repeat(np.expand_dims(data_dict['fr_land'], axis=0), VLAYERS, axis=0)
data_dict['fr_land'] = np.repeat(np.expand_dims(fr_land_2d, axis=0), TIMESTEPS, axis=0)
del fr_land_2d
data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'].T, axis=0), TIMESTEPS, axis=0)

# Only keep the lowest 60 levels (ensure that all fields have the same vertical grid)
# To avoid OOM I now only take every second entry!
print('Expecting around 650000 horizontal fields')
for key in data_dict:
    # Reassigning existing keys while iterating is safe (the key set is unchanged)
    data_dict[key] = data_dict[key][:, -60:, ::2].copy()
    print(data_dict[key].shape)

# Add magnitude of horizontal wind (the components are not needed afterwards)
data_dict['U'] = np.sqrt(data_dict['u']**2 + data_dict['v']**2)
del data_dict['u']
del data_dict['v']

# Add RH
# Stored directly in the dict (no extra module-level name), so that deleting
# the dict entry later actually releases the array.
T0 = 273.15
data_dict['rh'] = 0.00263*data_dict['pres']*data_dict['q']*np.exp((17.67*(data_dict['t']-T0))/(data_dict['t']-29.65))**(-1)

# Update (the vertical and horizontal sizes changed due to the subsetting above)
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Add ps: the pressure on the last vertical index, repeated over all levels
data_dict['ps'] = np.repeat(np.expand_dims(data_dict['pres'][:, -1], axis=1), VLAYERS, axis=1)


def _add_vertical_derivatives(fields, names):
    '''
        Add first ('<name>_z') and second ('<name>_zz') vertical finite
        differences w.r.t. fields['zg'] for every name in names.

        The first difference has one level fewer than the input and the
        second difference two levels fewer. The height differences are
        computed only once and shared by all variables.
    '''
    zg = fields['zg']
    dz_first = zg[:, :-1] - zg[:, 1:]
    dz_second = zg[:, 1:-1] - zg[:, 2:]
    for name in names:
        first = (fields[name][:, :-1] - fields[name][:, 1:])/dz_first
        fields[name + '_z'] = first
        fields[name + '_zz'] = (first[:, :-1] - first[:, 1:])/dz_second


# Add derivatives
_add_vertical_derivatives(data_dict, ['rh', 't', 'pres', 'qc', 'U', 'qi', 'q'])

# Only keep the lowest 58 levels (up to 21km)
# This also brings the fields and their derivatives to the same number of levels.
for key in data_dict:
    data_dict[key] = data_dict[key][:, -58:].copy()

# Data output (cloud cover scaled by a factor of 100)
data_output = 100*data_dict.pop('clc')

# ## LESS DATA ## #!
# for key in data_dict.keys():
#     data_dict[key] = data_dict[key][0::3]
# data_output = data_output[0::3]
# TIMESTEPS = TIMESTEPS//3

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  DS = xr.open_mfdataset(path+'/zg/zg*')
to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).
  from_openmfds=True,


q
qc
qi
t
pres
u
v
clc


**All features**

In [None]:
# Names of all candidate input features. The position of a name in this list
# is the index stored for it in `loc`.
features_nn = [
    'q', 'qc', 'qi', 't', 'pres', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'q_z', 'q_zz', 'qc_z', 'qc_zz', 'qi_z', 'qi_zz',
    't_z', 't_zz', 'pres_z', 'pres_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Map every feature name to its position in features_nn
loc = {name: position for position, name in enumerate(features_nn)}

**Cast dict into ndarray and reshape**

In [None]:
# Build the (samples, features) float32 array.
# The array is allocated once and filled feature by feature. Each source array
# is removed from data_dict as soon as it has been copied, so the inputs and
# the full output never have to be held in memory at the same time.
n_samples = data_dict[features_nn[0]].size
data_array_T = np.empty((len(features_nn), n_samples), dtype=np.float32)
for row, key in enumerate(features_nn):
    # The assignment flattens the field and casts it to float32
    data_array_T[row] = np.reshape(data_dict.pop(key), -1)

# Transpose (a view, no copy): one row per sample, one column per feature
data_array = np.transpose(data_array_T)
data_output = np.reshape(data_output, -1)

del data_array_T
gc.collect()

**Loop through SFS NNs**

In [None]:
def which_features(sfs_ind):
    '''
        Extract the relevant feature names and their order for a given SFS NN

        Reads the description file of the SFS NN with sfs_ind features and
        parses the line that follows the one starting with
        'The (order of) input variables'. That line is expected to look like
        ['name1' 'name2' ...]. If there are several such headers, the last
        one is used.

        Parameters:
            sfs_ind: Number inserted into the file name of the description file

        Returns:
            List of feature names, where 'clw', 'cli', 'ta' and 'pa_z' are
            replaced by 'qc', 'qi', 't' and 'pres_z'

        Raises:
            ValueError: If no header line followed by a line of names is found
    '''
    conv = {'clw': 'qc', 'cli': 'qi', 'ta': 't', 'pa_z': 'pres_z'}
    path = os.environ['HOME'] + '/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models/neighborhood_based_sfs_cl_area_no_features_%d.txt'%sfs_ind
    with open(path, 'r') as file:
        lines = file.readlines()

    # The names are on the line after the header, so the last line of the file
    # cannot be a usable header.
    names_line = None
    for k, line in enumerate(lines[:-1]):
        if line.startswith('The (order of) input variables'):
            names_line = lines[k+1]
    if names_line is None:
        raise ValueError("No 'The (order of) input variables' entry found in %s"%path)

    # Strip surrounding whitespace/brackets instead of slicing a fixed number of
    # characters, so that a missing trailing newline does not cut into a name.
    out_line = []
    for token in names_line.strip().strip('[]').split():
        name = token.strip('\'"')
        # Rename if the name differs from the one used in this notebook
        out_line.append(conv.get(name, name))
    return out_line

**Final cell**

In [None]:
def predict_on_small_batches(model, input_data, batch_size=2**20):
    '''
        Run the model on input_data in chunks and clip the predictions to [0, 100]

        Using predict_on_batch on the entire dataset results in an OOM error.
        Curiously it works best if we use predict_on_batch on small subsets of
        the data instead of predict(..., batch_size=...)

        Parameters:
            model: Object with a predict_on_batch method
            input_data: Array whose first axis runs over the samples
            batch_size: Maximum number of samples passed to one predict_on_batch call

        Returns:
            Predictions of all chunks, concatenated along the first axis and
            limited to the interval [0, 100]
    '''
    n_samples = input_data.shape[0]

    # Start index of every chunk. An empty input still results in a single
    # (empty) call, so that there is always something to concatenate.
    starts = range(0, n_samples, batch_size)
    if len(starts) == 0:
        starts = range(1)

    # Collect the chunks and concatenate once at the end; concatenating inside
    # the loop would copy the growing result array in every iteration.
    chunks = []
    for start in starts:
        chunks.append(model.predict_on_batch(input_data[start:start + batch_size]))
        # Clean up after every chunk (kept from the original implementation)
        K.clear_session()
        gc.collect()

    predictions = np.concatenate(chunks, axis=0)

    pred_adj = np.clip(predictions, 0, 100)

    return pred_adj

In [None]:
# Variance of the target values; computed a single time and reused for the R2-value
VAR = np.var(data_output)

# Custom objects passed to load_model when loading the NNs
custom_objects = {'leaky_relu': nn.leaky_relu}

In [None]:
# Directory with the saved SFS NNs (.h5) and their description files (.txt)
nn_path = os.environ['HOME'] + '/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models'

for sfs_ind in range(SFS_MODEL, SFS_MODEL + 1):
    # Select the appropriate features
    # A separate name is used for the features of this NN, so that the
    # notebook-wide list features_nn is not overwritten.
    sfs_features = which_features(sfs_ind)
    features_inds = [loc[sfs_features[k]] for k in range(sfs_ind)]
    data_array_sfs_nn = data_array[:, features_inds]

    # The NNs with 4 to 7 features are taken from the hyperparameter tests
    if sfs_ind == 7:
        model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_7_True_True_32'
    elif sfs_ind in (4, 5, 6):
        thrd_lay = 'False' if sfs_ind == 4 else 'True'
        model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_%d_False_%s_16'%(sfs_ind,thrd_lay)
    else:
        model_name = 'neighborhood_based_sfs_cl_area_no_features_%d'%sfs_ind

    ## Get mean and std from the model-file
    mean, std = read_mean_and_std(os.path.join(nn_path, model_name + '.txt'))

    ## Scale all data using this mean and std
    data_array_scaled = (data_array_sfs_nn - np.float32(mean))/np.float32(std)

    model = load_model(os.path.join(nn_path, model_name + '.h5'), custom_objects)

    ## Evaluate model on scaled data
    # Use the batch size configured via bs_exp at the top of the notebook
    predictions = predict_on_small_batches(model, data_array_scaled, batch_size=2**bs_exp)

    # Mean-squared error
    mse = np.mean((predictions[:, 0] - data_output)**2)
    print(mse)

    # R2-value
    r2 = 1 - mse/VAR

    parent_key = 'SFS_NN_%d_no_tl'%(sfs_ind)
    results = {parent_key: {'MSE': float(mse), 'R2': float(r2)}}

    ## Write results to json-file
    append_dict_to_json(results, os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec5_results/transfer_to_higher_resolutions/results/sfs6_nn_r2b7.json')