### Sundqvist Scheme

How about we fit the Sundqvist model where we fit the tuning parameters to the data? <br>
We let the parameters depend on whether they are taken over land or over the sea.

In this version, we find the optimal set of hyperparameters automatically!

In [5]:
# 1000 samples, grid_spacing of 0.2: 12 seconds
# 1000 samples, grid_spacing of 0.1: 130 seconds

# 100.000 samples, grid_spacing of 0.2: 850 seconds
# 100.000 samples, grid_spacing of 0.1: Should take 2-3 hours

In [6]:
import os
import gc
import sys
import time
import json
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import LambdaCallback

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/symbolic_regression/')
from functions import evaluate_sundqvist

# Added to the PDF name
ran = np.random.randint(10**3)
print(ran)

output_var = sys.argv[1] 

seed = 7
np.random.seed(seed)

175


In [7]:
# Load columns of data
folder_data = '/home/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/'

input_data = np.load(os.path.join(folder_data, 'cloud_cover_input_dyamond.npy'))
if output_var == 'cl_volume':
    output_data = np.load(os.path.join(folder_data, 'cloud_cover_output_dyamond.npy'))
elif output_var == 'cl_area':
    output_data = np.load(os.path.join(folder_data, 'cloud_area_output_dyamond.npy'))
    
vert_layers = np.load(os.path.join(folder_data, 'samples_vertical_layers_dyamond.npy'))

new_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',\
                'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

In [8]:
samples_total, no_of_features = input_data.shape

# Split into train/valid
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
    second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

    validation_folds.append(np.append(first_incr, second_incr))
    training_folds.append(np.arange(samples_total))
    training_folds[i] = np.delete(training_folds[i], validation_folds[i])

# The second fold yields the best model
input_train = input_data[training_folds[1]]
input_valid = input_data[validation_folds[1]]
output_train = output_data[training_folds[1]]
output_valid = output_data[validation_folds[1]]

# Remove input_data, output_data
del input_data, output_data, training_folds, validation_folds
gc.collect()

558

In [9]:
# To locate variables
loc = {}
for i in range(len(new_features)):
    loc[new_features[i]] = i
    
input_train.shape

(190119664, 24)

In [10]:
# We split the training data into cells over land vs sea
land_ind = np.where(input_train[:, loc['fr_land']] > 0.5)[0]
sea_ind = np.where(input_train[:, loc['fr_land']] <= 0.5)[0]

input_land = input_train[land_ind]
output_land = output_train[land_ind]
input_sea = input_train[sea_ind]
output_sea = output_train[sea_ind]

In [11]:
len(land_ind)/input_train.shape[0]

0.23899404745423913

### Fitting hyperparameters
Original ones: $r_{sat} = 1, r_{0, top} = 0.8, r_{0, surf} = 0.968, n = 2$

### Create custom layer

In [12]:
class Sundq_Layer(tf.keras.layers.Layer):

    # These are the output units
    def __init__(self, units=1):
        super(Sundq_Layer, self).__init__()
        self.units = units

    def build(self, input_shape):  # Create the state of the layer (weights)
        
        # Initializing with the original values
        # rsat must always be greater than r0_top and r0_surf! How could we enforce this? (*)
        rsat_init = tf.constant_initializer(1)
        r0_top_init = tf.constant_initializer(0.8)
        r0_surf_init = tf.constant_initializer(0.968)
        n_init = tf.constant_initializer(2)  
    
        self.rsat = tf.Variable(name='rsat', initial_value=rsat_init(shape=(1, self.units), dtype='float32'), trainable=True)
        self.r0_top = tf.Variable(name='r0_top', initial_value=r0_top_init(shape=(1, self.units), dtype='float32'), trainable=True)
        self.r0_surf = tf.Variable(name='r0_surf', initial_value=r0_surf_init(shape=(1, self.units), dtype='float32'), trainable=True)
        self.n = tf.Variable(name='n', initial_value=n_init(shape=(1, self.units), dtype='float32'), trainable=True)

    def call(self, inputs):  # Defines the computation from inputs to outputs
        ps = inputs[:, 0]
        p = inputs[:, 1]
        rh = inputs[:, 2]
        
        r0 = self.r0_top + (self.r0_surf - self.r0_top)*tf.exp(1-(ps/p)**self.n)
        
        # div < 0, only if rsat < r0. But this goes against (*)
        div = (tf.minimum(rh, self.rsat) - self.rsat)/(r0 - self.rsat)
        
        # tf.sqrt is tricky, because its gradient in 0 is infinite!
        c = 1 - tf.sqrt(tf.maximum(div, 1e-9)) # in [0,1]
        
        # If rh > r0 we return c, otherwise we set it to 0
        c_out = tf.maximum(tf.sign(rh - r0), 0)*c
        
        return 100*tf.transpose(c)

**Land**

In [13]:
# Best parameters from the hyperparameter search
epochs_opt = 10
batchsize_opt = 32
optimizer_opt = tf.keras.optimizers.Adagrad
lr_opt = 0.0523026

In [14]:
sundq_layer = Sundq_Layer()
model = tf.keras.models.Sequential(sundq_layer)
model.compile(optimizer=optimizer_opt(learning_rate=lr_opt), loss='mse')

2022-06-19 17:18:53.498872: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-19 17:18:53.542349: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-19 17:18:53.542663: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-19 17:18:53.543308: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (l40033.atos.local): /proc/driver/nvidia/version does not exist
2022-06-19 17:18:53.550318: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild

In [None]:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

print_weights = LambdaCallback(on_epoch_end=lambda batch, logs: print(model.layers[0].get_weights()))

inds = np.random.randint(0, input_land.shape[0], input_land.shape[0]//10)
history = model.fit(input_land[inds][:, [loc['ps'], loc['pa'], loc['rh']]], output_land[inds], epochs=epochs_opt, batch_size=batchsize_opt, verbose=2, \
                   callbacks = [print_weights])

Epoch 1/10
1419921/1419921 - 601s - loss: 969.3347
[array([[0.97819006]], dtype=float32), array([[0.05164935]], dtype=float32), array([[0.39053056]], dtype=float32), array([[1.3016738]], dtype=float32)]
Epoch 2/10
1419921/1419921 - 612s - loss: 969.3267
[array([[0.97544396]], dtype=float32), array([[0.05206378]], dtype=float32), array([[0.39572528]], dtype=float32), array([[1.3008726]], dtype=float32)]
Epoch 3/10


In [None]:
# rsat, r0_top, r0_surf, n
best_land = [model.weights[i].numpy()[0][0] for i in range(len(model.weights))]

In [None]:
print('Trained the land model')

**Sea**

In [None]:
sundq_layer = Sundq_Layer()
model = tf.keras.models.Sequential(sundq_layer)
model.compile(optimizer=optimizer_opt(learning_rate=lr_opt), loss='mse')

In [None]:
print_weights = LambdaCallback(on_epoch_end=lambda batch, logs: print(model.layers[0].get_weights()))

inds = np.random.randint(0, input_sea.shape[0], input_sea.shape[0]//10)
history = model.fit(input_sea[inds][:, [loc['ps'], loc['pa'], loc['rh']]], output_sea[inds], epochs=epochs_opt, batch_size=batchsize_opt, verbose=2, \
                   callbacks = [print_weights])

In [None]:
# rsat, r0_top, r0_surf, n
best_sea = [model.weights[i].numpy()[0][0] for i in range(len(model.weights))]

In [None]:
print('Trained the sea model')

### Performance with the best hyperparameter setting

In [None]:
# Write results to JSON. In case we don't make it to the second-next cell.
results = {}
results['Order of parameters'] = 'rsat, r0_top, r0_surf, n'
results['Best params land'] = str(best_land)
results['Best params sea'] = str(best_sea)

with open('/home/b/b309170/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/results_auto_tuned_%s.json'%output_var, 'w') as file:
    json.dump(results, file)

In [None]:
# Differentiate between original, manually and automatically tuned!
mse_train = evaluate_sundqvist(input_train, output_train, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
mse_train_land = evaluate_sundqvist(input_land, output_land, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
mse_train_sea = evaluate_sundqvist(input_sea, output_sea, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
mse_valid = evaluate_sundqvist(input_valid, output_valid, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)

In [None]:
# Write results to JSON
results = {}
results['Order of parameters'] = 'rsat, r0_top, r0_surf, n'
results['Best params land'] = str(best_land)
results['Best params sea'] = str(best_sea)
results['Training MSE'] = mse_train
results['Land MSE'] = mse_train_land
results['Sea MSE'] = mse_train_sea
results['Validation MSE'] = mse_valid

with open('/home/b/b309170/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/results_auto_tuned_%s.json'%output_var, 'w') as file:
    json.dump(results, file)

**Extra plot**

In [None]:
# plt.hist(output_land, bins=40, log=True)
# plt.hist(predictions, bins=40, log = True)
# plt.legend(['Truth: QUBICC over land', 'Original Sundqvist Scheme'])
# plt.title('Cloud Cover distributions')