### Wang Scheme

Actually, the values I found manually were ever so slightly better than the ones found by the optimizer (MSE of 336.007 vs. 336.014), and the two parameter sets are very close!

We had:
*par_opt = 0.9105,
alpha_opt = 913984.9624* (with an MSE of 336.007)

Now we found:
*par_opt = 0.9044,
alpha_opt = 886805.136* (with an MSE of 336.014)

In [50]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

# sys.path entries are used verbatim by the import system ('~' is not expanded),
# so resolve the home directory explicitly before registering the helper package.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression'))
from functions import append_dict_to_json
from sklearn import tree

# Number of training samples in the random subset (kept for double-checking with the csv).
# NOTE(review): the earlier remark stated that this number "was not really used", yet it
# sizes the random draw used for the coefficient fit further down - TODO confirm.
subset_size = 10_000

# Fix NumPy's global seed so that the random subset is reproducible
np.random.seed(10)

**Read data**

In [51]:
# All variables available in the stored input array
# (presumably listed in the column order of that array - it is used to look up columns by name)
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Variable name -> column index in the full input array
loc_all = {name: idx for idx, name in enumerate(all_possible_features)}

# Features
features = ['clw', 'cli', 'rh']
no_features = len(features)

# Feature name -> column index in the reduced (selected-features-only) input array
loc = {name: idx for idx, name in enumerate(features)}

In [52]:
# Directory with the DYAMOND input/output arrays.
# np.load opens the path as given (no '~' expansion), so resolve the home directory explicitly.
path_data = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Load all variables, then keep only the columns of the selected features (in the order of `features`)
input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))
input_data = np.stack([input_data[:, loc_all[sel_var]] for sel_var in features], axis=1)

output_data = np.load(os.path.join(path_data, 'cloud_area_output_dyamond.npy'))

In [53]:
# Number of samples (rows) and of selected features (columns) in the reduced input array
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(285179494, 3)

In [54]:
# Construct training and validation data
training_folds = []
validation_folds = []

# Length of one of the six consecutive chunks the samples are divided into
two_week_incr = samples_total//6

# Note that this is a temporal split since time was the first dimension in the original tensor
for i in range(3):
    # Fold i validates on the i-th and the (i+3)-th chunk ...
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))
    validation_folds.append(np.append(first_incr, second_incr))

    # ... and trains on every remaining sample index
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[i]))

In [55]:
# The second fold yields the best model
best_fold = 1

# Gather the training/validation samples of that fold (index arrays -> copies)
flattened_input_train = input_data[training_folds[best_fold]]
flattened_output_train = output_data[training_folds[best_fold]]
flattened_input_valid = input_data[validation_folds[best_fold]]
flattened_output_valid = output_data[validation_folds[best_fold]]

# Remove input_data, output_data (and the fold index arrays) to free memory
del input_data, output_data, training_folds, validation_folds
gc.collect()

525

**Already remove the regime with clw + cli = 0**

In [56]:
# Shape of the validation inputs before the clw + cli = 0 regime is removed
flattened_input_valid.shape

(95059830, 3)

In [57]:
# --- Training data: split by total condensate (clw + cli) and keep only the samples with condensate ---
condensate_train = flattened_input_train[:, loc['clw']] + flattened_input_train[:, loc['cli']]
reg_0_train = np.where(condensate_train <= 1e-20)[0]
reg_not_0_train = np.where(condensate_train > 1e-20)[0]
del condensate_train

flattened_input_train = flattened_input_train[reg_not_0_train]
flattened_output_train = flattened_output_train[reg_not_0_train]

# --- Validation data: same split ---
condensate_valid = flattened_input_valid[:, loc['clw']] + flattened_input_valid[:, loc['cli']]
reg_0_valid = np.where(condensate_valid <= 1e-20)[0]
reg_not_0_valid = np.where(condensate_valid > 1e-20)[0]
del condensate_valid

# Relevant values to compute final MSE/R2-scores.
# These must be taken from the full validation set, i.e. before it is filtered below.
# mse_reg_0 is the mean squared target in the condensate-free regime,
# which equals the MSE of a constant-zero prediction there.
mse_reg_0 = np.mean(flattened_output_valid[reg_0_valid]**2)
len_reg_0 = len(reg_0_valid)
len_reg_not_0 = len(reg_not_0_valid)
len_data_output = len(flattened_output_valid)
var_data_output = np.var(flattened_output_valid)

# Only now restrict the validation data to the samples with condensate
flattened_input_valid = flattened_input_valid[reg_not_0_valid]
flattened_output_valid = flattened_output_valid[reg_not_0_valid]

**Pick the subset**

In [58]:
# Random training-sample indices; randint draws independently, so indices may repeat
subset = np.random.randint(low=0, high=len(flattened_output_train), size=subset_size)

**Optimize coefficients**

In [59]:
def func(X, PAR, ALPHA):
    '''
        Cloud cover estimate of the scheme, scaled by 100.

        X: 2D array whose columns 0, 1, 2 hold clw, cli and RH
        PAR: Exponent applied to RH
        ALPHA: Rate in the exponential term of the total condensate (clw + cli)

        Returns 100 * min(RH**PAR * (1 - exp(-ALPHA*(cli + clw))), 1), one value per row of X.
    '''
    clw, cli, rh = X[:, 0], X[:, 1], X[:, 2]

    condensate_term = 1 - np.exp(-ALPHA*(cli + clw))
    cloud_fraction = rh**PAR*condensate_term

    # The fraction can exceed 1 very slightly, so cap it before scaling
    return 100*np.minimum(cloud_fraction, 1)

In [60]:
import scipy as sci
from scipy.optimize import minimize

In [61]:
def objective(P, X,Y):
    '''
        Mean squared error of the scheme for a given pair of coefficients.

        P: Pair (PAR, ALPHA) of coefficients passed on to func
        X: Input samples passed on to func
        Y: Target values the predictions are compared against

        Predictions are limited to [0, 100] before the error is computed.
        The mean is accumulated in float64.
    '''
    PAR, ALPHA = P
    clipped_preds = np.clip(func(X, PAR, ALPHA), 0, 100)
    squared_errors = (clipped_preds - Y)**2
    return np.mean(squared_errors, dtype=np.float64)

# Initial guess for the two coefficients
PAR, ALPHA = 0.25, 10**5

# Minimize the training-subset MSE with BFGS (no analytic gradient is supplied).
# NOTE(review): the two coefficients differ by several orders of magnitude -
# it is worth inspecting res.success / res.message before trusting res.x.
res = minimize(
    objective,
    x0=(PAR, ALPHA),
    args=(flattened_input_train[subset], flattened_output_train[subset]),
    method='BFGS',
    options={'disp': True},
)

         Current function value: 504.995694
         Iterations: 41
         Function evaluations: 249
         Gradient evaluations: 79


In [62]:
# Optimized coefficients in the order (PAR, ALPHA)
print(res.x)

[9.04354849e-01 8.86805136e+05]


**Original values** (the initial guess PAR = 0.25, ALPHA = 10**5)

In [63]:
# Training-subset MSE at the initial guess (PAR, ALPHA), i.e. before the optimization
objective(
    (PAR, ALPHA),
    flattened_input_train[subset],
    flattened_output_train[subset],
)

831.6921481117523

In [64]:
# MSE of the initial guess on the validation samples with condensate
mse_reg_1 = objective((PAR, ALPHA), flattened_input_valid, flattened_output_valid)

# Combine both regimes into the MSE over the whole validation set (weighted by sample count),
# then relate it to the variance of the full validation output to obtain R2
mse_orig_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
r2_orig_total = 1 - mse_orig_total/var_data_output

print(mse_orig_total, r2_orig_total)

545.9372757787937 0.6839930207008795


**New values** (the BFGS-optimized coefficients)

In [65]:
# Training-subset MSE with the optimized coefficients
objective(res.x, flattened_input_train[subset], flattened_output_train[subset])

504.99569398253533

In [66]:
# MSE with the optimized coefficients on the validation samples with condensate
# (this overwrites the mse_reg_1 computed for the original values)
mse_reg_1 = objective(res.x, flattened_input_valid, flattened_output_valid)

In [67]:
# MSE over the whole validation set: sample-count-weighted mean of both regimes
mse_new_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
# R2 relative to the variance of the full validation output
r2_new_total = 1 - mse_new_total/var_data_output

print(mse_new_total, r2_new_total)

# Scores that are written to disk later on
results = {'MSE': mse_new_total, 'R2': r2_new_total}

336.0143736008881 0.8055035039488188


In [71]:
# [0.9, 900000] is even slightly better!!
# NOTE(review): this cell carries execution count 71 although it sits between cells 67 and 68,
# i.e. it was run out of order - re-run the notebook top to bottom to confirm the output.
print('Best values: ')
res.x

Best values: 


array([9.04354849e-01, 8.86805136e+05])

**Save results**

In [68]:
# Dump results: write the scores dictionary as JSON into the working directory
with open('results_BFGS_tuned_cl_area.json', 'w') as json_file:
    json.dump(results, json_file)

In [None]:
def predict(P,X):
    '''
        Predictions of the scheme for a given pair of coefficients.

        P: Pair (PAR, ALPHA) of coefficients passed on to func
        X: Input samples passed on to func

        Returns the output of func limited to [0, 100].
    '''
    PAR, ALPHA = P
    return np.clip(func(X, PAR, ALPHA), 0, 100)

# Compare the distribution of the validation targets with the predictions of the tuned scheme.
# Each curve carries its own label, so the legend no longer depends on the order of the plot calls.
plt.hist(flattened_output_valid, bins=100, histtype='step', color='k', label='DYAMOND')
plt.hist(predict(res.x, flattened_input_valid), bins=100, histtype='step', label='Wang')

# Logarithmic count axis
plt.yscale('log')
# Axis labels so that the figure can be read on its own
# (predictions are limited to [0, 100]; the unit label assumes the targets are in percent as well)
plt.xlabel('Cloud area fraction [%]')
plt.ylabel('Number of samples')
plt.legend()
plt.savefig('results_BFGS_tuned_cl_area.pdf')