### Sundqvist Scheme

Here we fit the tuning parameters of the Sundqvist scheme to the data. <br>
The parameters are tuned separately for grid cells over land and over sea.

In [36]:
# All samples (190119664): 5.5s per entry of the tensor

In [12]:
import os
import gc
import sys
import time
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Python does not expand '~' in sys.path entries, so resolve it to the home directory first
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import evaluate_sundqvist

# Grid search space of hyperparameters
grid_spacing = 0.05

# Shall we use the tuned hyperparameters?
tuned = False

# Added to the names of the saved files. The clock is read only once so that hour and
# minute always belong to the same instant (two separate now() calls can straddle a change of hour).
timestamp = datetime.datetime.now()
hour_min = '%d_%d'%(timestamp.hour, timestamp.minute)

# Name of the target variable, taken from the first command-line argument.
# The cells below handle 'cl_volume' and 'cl_area'.
output_var = sys.argv[1]

In [39]:
# Load columns of data
# np.load does not expand '~', so resolve the home directory explicitly
folder_data = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/')

# File holding the target values for each supported output variable
output_files = {'cl_volume': 'cloud_cover_output_dyamond.npy',
                'cl_area': 'cloud_area_output_dyamond.npy'}

# Fail early with a clear message; otherwise output_data would simply stay undefined
# and the notebook would only break further down with a NameError
if output_var not in output_files:
    raise ValueError('output_var must be one of %s, got %r'%(sorted(output_files), output_var))

input_data = np.load(os.path.join(folder_data, 'cloud_cover_input_dyamond.npy'))
output_data = np.load(os.path.join(folder_data, output_files[output_var]))

# Names used to address the columns of the input array (turned into the `loc` lookup below)
new_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',\
                'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

In [40]:
samples_total, no_of_features = input_data.shape

# Split into train/valid: the sample axis is cut into six equally long segments
two_week_incr = samples_total//6

# Note that this is a temporal split since time was the first dimension in the original tensor.
# Validation fold k holds segments k and k+3; the matching training fold holds all remaining samples.
validation_folds = [
    np.append(np.arange(two_week_incr*k, two_week_incr*(k+1)),
              np.arange(two_week_incr*(k+3), two_week_incr*(k+4)))
    for k in range(3)
]
training_folds = [np.delete(np.arange(samples_total), held_out) for held_out in validation_folds]

# The second fold yields the best model
train_idx, valid_idx = training_folds[1], validation_folds[1]
input_train, output_train = input_data[train_idx], output_data[train_idx]
input_valid, output_valid = input_data[valid_idx], output_data[valid_idx]

# Remove input_data, output_data (and the index arrays, which are no longer needed)
del input_data, output_data, training_folds, validation_folds, train_idx, valid_idx
gc.collect()

38

In [41]:
# To locate variables: feature name -> column index
loc = {feature: column for column, feature in enumerate(new_features)}

input_train.shape

(190119664, 24)

In [42]:
# We split the training data into cells over land vs sea
land_fraction = input_train[:, loc['fr_land']]

# A land fraction above 0.5 counts as land, a land fraction of at most 0.5 as sea
land_ind = np.where(land_fraction > 0.5)[0]
sea_ind = np.where(land_fraction <= 0.5)[0]

input_land, output_land = input_train[land_ind], output_train[land_ind]
input_sea, output_sea = input_train[sea_ind], output_train[sea_ind]

In [43]:
# Share of the training samples that were assigned to land
land_ind.size/input_train.shape[0]

0.23899404745423913

In [11]:
# rsat actually shouldn't really ever be smaller than r0_top or r0_surf

# # First try (0.08):
# rsat_range_land = np.arange(0.9, 1.2, grid_spacing)
# r0_top_range_land = np.arange(0.2, 0.9, grid_spacing)
# r0_surf_range_land = np.arange(0.6, 1, grid_spacing)
# n_range_land = np.arange(0.6, 2.2, grid_spacing)

# rsat_range_sea = np.arange(0.9, 1.2, grid_spacing)
# r0_top_range_sea = np.arange(0.2, 0.9, grid_spacing)
# r0_surf_range_sea = np.arange(0.6, 1, grid_spacing)
# n_range_sea = np.arange(0.6, 2.2, grid_spacing)

# --> Ran for six hours

# Second try (0.05):
# Only these two targets have search ranges. Fail right here instead of leaving
# rsat_range_land / r0_surf_range_sea undefined until the grid search starts.
if output_var not in ('cl_area', 'cl_volume'):
    raise ValueError("No search ranges defined for output_var=%r (expected 'cl_area' or 'cl_volume')"%output_var)

# NOTE: np.arange with a non-integer step can include or drop a value close to `stop`
# because of floating-point rounding; check len() of a range if its exact size matters.
if output_var == 'cl_area':
    rsat_range_land = np.arange(0.75, 0.9, grid_spacing)
elif output_var == 'cl_volume':
    rsat_range_land = np.arange(0.9, 1.05, grid_spacing)
r0_top_range_land = np.arange(0.01, 0.3, grid_spacing)
r0_surf_range_land = np.arange(0.4, 0.6, grid_spacing)
n_range_land = np.arange(2.12, 3, grid_spacing)

rsat_range_sea = np.arange(0.9, 1.05, grid_spacing)
r0_top_range_sea = np.arange(0.01, 0.3, grid_spacing)
if output_var == 'cl_area':
    r0_surf_range_sea = np.arange(0.4, 0.6, grid_spacing)
elif output_var == 'cl_volume':
    r0_surf_range_sea = np.arange(0.8, 0.95, grid_spacing)
n_range_sea = np.arange(2.12, 3, grid_spacing)

IndentationError: expected an indented block (<ipython-input-11-6ca57ad496a8>, line 31)

In [9]:
# Estimated required time to run the notebook in hours
seconds_per_entry = 5.5
grid_entries_sea = len(r0_top_range_sea)*len(r0_surf_range_sea)*len(n_range_sea)
grid_entries_land = len(r0_top_range_land)*len(r0_surf_range_land)*len(n_range_land)

# Factor of 2 instead of len(rsat_range_land) due to multiprocessing
estimated_hours = (seconds_per_entry*(2*grid_entries_sea + 2*grid_entries_land))/3600
print('Estimated required time to run the notebook: %.1f hours'%estimated_hours)

Estimated required time to run the notebook: 5.5 hours


In [46]:
def search_hyperparams_outer_land(rsat):
    """Run the land grid search for one fixed rsat over the module-level land ranges.

    Takes a single argument so that it can be handed to pool.map.
    """
    return search_hyperparams_inner_land(
        rsat,
        r0_top_range=r0_top_range_land,
        r0_surf_range=r0_surf_range_land,
        n_range=n_range_land,
    )

In [47]:
def search_hyperparams_outer_sea(rsat):
    """Run the sea grid search for one fixed rsat over the module-level sea ranges.

    Takes a single argument so that it can be handed to pool.map.
    """
    return search_hyperparams_inner_sea(
        rsat,
        r0_top_range=r0_top_range_sea,
        r0_surf_range=r0_surf_range_sea,
        n_range=n_range_sea,
    )

In [48]:
from contextlib import contextmanager
import multiprocessing as mlp
import gc

@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager that yields a multiprocessing pool and always terminates it.

    All arguments are forwarded unchanged to ``mlp.Pool``.
    """
    pool = mlp.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs even if the body of the `with` block raises, so that the worker
        # processes are not left behind after a failed search
        pool.terminate()

In [49]:
def _sundqvist_mse_grid(inputs, targets, rsat, r0_top_range, r0_surf_range, n_range):
    """Shared grid search behind the land and the sea variant.

    inputs is indexed as inputs[:, loc[name]] for the columns 'ps', 'pa' and 'rh';
    targets is compared against 100*c, so it is presumably cloud cover in percent
    (TODO confirm against the data files).

    Returns an array of shape (1, len(r0_top_range), len(r0_surf_range), len(n_range))
    holding the mean squared error for every (r0_top, r0_surf, n) combination at the given rsat.
    """
    mse_tensor = -np.ones((1, len(r0_top_range), len(r0_surf_range), len(n_range)))

    # The columns do not depend on the tuning parameters, so they are sliced only once
    ps = inputs[:, loc['ps']]
    p = inputs[:, loc['pa']]
    r = inputs[:, loc['rh']]

    for i2, r0_top in enumerate(r0_top_range):
        for i3, r0_surf in enumerate(r0_surf_range):
            for i4, n in enumerate(n_range):
                # What is the average error with this set of tuning parameters?
                try:
                    r0 = r0_top + (r0_surf - r0_top)*np.exp(1-(ps/p)**n)
                    c = np.where(r>r0, 1-np.sqrt((np.minimum(r, rsat) - rsat)/(r0 - rsat)), 0)
                except ArithmeticError:
                    # Only arithmetic failures are turned into the penalty value c = 2
                    # (NumPy raises FloatingPointError when its error handling is set to 'raise').
                    # KeyboardInterrupt, MemoryError etc. propagate instead of being
                    # recorded as if they were a real MSE.
                    c = 2

                mse_tensor[0, i2, i3, i4] = np.mean((100*c - targets)**2)

    return mse_tensor

def search_hyperparams_inner_land(rsat, r0_top_range, r0_surf_range, n_range):
    """MSE of the Sundqvist fit on the land samples (input_land/output_land) for a fixed rsat.

    Returns an array of shape (1, len(r0_top_range), len(r0_surf_range), len(n_range)).
    """
    return _sundqvist_mse_grid(input_land, output_land, rsat, r0_top_range, r0_surf_range, n_range)

def search_hyperparams_inner_sea(rsat, r0_top_range, r0_surf_range, n_range):
    """MSE of the Sundqvist fit on the sea samples (input_sea/output_sea) for a fixed rsat.

    Returns an array of shape (1, len(r0_top_range), len(r0_surf_range), len(n_range)).
    """
    return _sundqvist_mse_grid(input_sea, output_sea, rsat, r0_top_range, r0_surf_range, n_range)

### Fitting hyperparameters
Originally: $r_{sat} = 1, r_{0, top} = 0.8, r_{0, surf} = 0.968, n = 2$

**Land**

In [50]:
# Start of the wall-clock timer for the land search (read again when the best values are written out)
t0 = time.time()

In [None]:
procs = len(rsat_range_land)
with poolcontext(processes=procs) as pool:
    # The pool has one worker per rsat value; every call returns a tensor of shape
    # (1, len(r0_top_range_land), len(r0_surf_range_land), len(n_range_land))
    mse_tensor_land = pool.map(search_hyperparams_outer_land, rsat_range_land)

# Stack along the rsat axis and drop only the singleton axis that each call added.
# np.squeeze would also remove any parameter axis of length 1 and break the 4-D indexing below.
mse_tensor_land = np.array(mse_tensor_land)[:, 0]

# assert np.all(mse_tensor_land >= 0)
# '~' is not expanded when the file is opened, so resolve the home directory before saving
np.save(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/mse_tensor_land_%s.npy'%hour_min), mse_tensor_land)

In [None]:
# Position of the smallest MSE in the (rsat, r0_top, r0_surf, n) grid. NaN entries are ignored
# and the first minimum wins on ties, as in a strict '<' scan, but without a magic upper bound.
opt_ind = list(np.unravel_index(np.nanargmin(mse_tensor_land), mse_tensor_land.shape))
min_mse = mse_tensor_land[tuple(opt_ind)]

# Plain floats keep the written list readable regardless of how NumPy prints its scalars
best_land = [float(rsat_range_land[opt_ind[0]]), float(r0_top_range_land[opt_ind[1]]),
             float(r0_surf_range_land[opt_ind[2]]), float(n_range_land[opt_ind[3]])]

# open() does not expand '~', so resolve the home directory first
with open(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/best_results.txt'), 'a') as file:
    file.write('Time it took to get through land: %.3f\n'%(time.time() - t0))
    file.write('Output variable: %s\n'%output_var)
    file.write('Best values for the land part: %s\n'%str(best_land))

**Sea**

In [None]:
# Restart the wall-clock timer for the sea search (read again when the best values are written out)
t0 = time.time()

In [None]:
procs = len(rsat_range_sea)
with poolcontext(processes=procs) as pool:
    # The pool has one worker per rsat value; every call returns a tensor of shape
    # (1, len(r0_top_range_sea), len(r0_surf_range_sea), len(n_range_sea))
    mse_tensor_sea = pool.map(search_hyperparams_outer_sea, rsat_range_sea)

# Stack along the rsat axis and drop only the singleton axis that each call added.
# np.squeeze would also remove any parameter axis of length 1 and break the 4-D indexing below.
mse_tensor_sea = np.array(mse_tensor_sea)[:, 0]

# assert np.all(mse_tensor_sea >= 0)
# '~' is not expanded when the file is opened, so resolve the home directory before saving
np.save(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/mse_tensor_sea_%s.npy'%hour_min), mse_tensor_sea)

In [None]:
# Position of the smallest MSE in the (rsat, r0_top, r0_surf, n) grid. NaN entries are ignored
# and the first minimum wins on ties, as in a strict '<' scan, but without a magic upper bound.
opt_ind = list(np.unravel_index(np.nanargmin(mse_tensor_sea), mse_tensor_sea.shape))
min_mse = mse_tensor_sea[tuple(opt_ind)]

# Plain floats keep the written list readable regardless of how NumPy prints its scalars
best_sea = [float(rsat_range_sea[opt_ind[0]]), float(r0_top_range_sea[opt_ind[1]]),
            float(r0_surf_range_sea[opt_ind[2]]), float(n_range_sea[opt_ind[3]])]

# open() does not expand '~', so resolve the home directory first
with open(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/best_results.txt'), 'a') as file:
    file.write('Time it took to get through the sea: %.3f\n'%(time.time() - t0))
    file.write('Output variable: %s\n'%output_var)
    file.write('Best values for the sea part: %s\n'%str(best_sea))

### Plotting the results (To run)

In [None]:
# mse_tensor_land = np.load('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_narval_r2b4/mse_tensor_land_%d.npy'%ran)

In [None]:
# plt.figure(figsize=(10, 6))
# plt.subplots_adjust(hspace=0.4)
# plt.suptitle("Hyperparameter performance - land", fontsize=18, y=1)

# hyp_par = ['rsat', 'r0_top', 'r0_surf', 'n']
# hyp_par_range = [rsat_range_land, r0_top_range_land, r0_surf_range_land, n_range_land]

# axes = (0,1,2,3)
# for i, par in enumerate(hyp_par):
#     # Add new subplot iteratively
#     ax = plt.subplot(2, 2, i + 1)
    
#     axis = axes[:i] + axes[(i+1):]
#     ax.plot(hyp_par_range[i], np.min(mse_tensor_land, axis=axis))
    
#     ax.set_title(par)
#     if i in [0, 2]:
#         ax.set_ylabel('minimal MSE')

# plt.savefig('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_narval_r2b4/hyp_land_%s.pdf'%hour_min, \
#             bbox_inches='tight')

In [None]:
# mse_tensor_sea = np.load('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_narval_r2b4/mse_tensor_sea_%d.npy'%ran)

In [None]:
# plt.figure(figsize=(10, 6))
# plt.subplots_adjust(hspace=0.4)
# plt.suptitle("Hyperparameter performance - sea", fontsize=18, y=1)

# hyp_par = ['rsat', 'r0_top', 'r0_surf', 'n']
# hyp_par_range = [rsat_range_sea, r0_top_range_sea, r0_surf_range_sea, n_range_sea]

# axes = (0,1,2,3)
# for i, par in enumerate(hyp_par):
#     # Add new subplot iteratively
#     ax = plt.subplot(2, 2, i + 1)
    
#     axis = axes[:i] + axes[(i+1):]
#     ax.plot(hyp_par_range[i], np.min(mse_tensor_sea, axis=axis))
    
#     ax.set_title(par)
#     if i in [0, 2]:
#         ax.set_ylabel('minimal MSE')

# plt.savefig('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_narval_r2b4/hyp_sea_%s.pdf'%hour_min, \
#            bbox_inches='tight')

### Performance with the best hyperparameter setting (To run)

In [None]:
# tuned='manually'

In [None]:
# # Differentiate between original, manually and automatically tuned!
# mse_train = evaluate_sundqvist(input_train, output_train, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
# mse_train_land = evaluate_sundqvist(input_land, output_land, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
# mse_train_sea = evaluate_sundqvist(input_sea, output_sea, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)
# mse_valid = evaluate_sundqvist(input_valid, output_valid, loc, tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)

In [None]:
# with open('~/workspace_icon-ml/symbolic_regression/baselines/sundqvist_results/manual_gradient_descent/results.txt', 'a') as file:
#     file.write('With tuned hyperparameters: %s\n'%tuned)
#     file.write('Training score:\n')
#     file.write('MSE: %.3f, R2: %.3f\n'%(mse_train, r2_train))
#     file.write('Over land: \n')
#     file.write('MSE: %.3f, R2: %.3f\n'%(mse_train_land, r2_train_land))
#     file.write('Over sea:\n')
#     file.write('MSE: %.3f, R2: %.3f\n'%(mse_train_sea, r2_train_sea))
#     file.write('Validation score:\n')
#     file.write('MSE: %.3f, R2: %.3f\n\n'%(mse_valid, r2_valid))