### Evaluate the Xu-Randall Scheme

Executed through ~scripts/run_era5_evalute_and_transfer_learn_2.sh

In [1]:
import os
import sys
import json
import numpy as np
import xarray as xr
import sympy as sp
import matplotlib.pyplot as plt

# Training samples to double-check with csv
# First command-line argument: exponent of the number of sampled horizontal
# locations (10**subset_exp locations are drawn).
subset_exp = int(sys.argv[1])
# subset_exp = 2
number_horizontal_locations = 10**subset_exp

# sys.path entries are used verbatim by the import system, so a leading '~'
# has to be expanded explicitly before the directory is added.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/'))
import my_classes
from my_classes import load_data

sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import append_dict_to_json

# If True the coefficients are re-fitted on the training subset; if False only
# the default coefficients are evaluated (see the optimization cell).
tl_bool = True
# Second command-line argument: seed for the NumPy random number generator
SEED = int(sys.argv[2])
# SEED = 20

# Third command-line argument: number of sampled grid cells used for training
num_cells = int(sys.argv[3])

**Load data**

In [2]:
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# Dimensions as loaded, i.e. VLAYERS still counts the levels removed below
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Removing four upper-most levels
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output
data_output = data_dict['cc']
del data_dict['cc']

# Every input field has to share the shape of the target field. (The previous
# check compared each array with itself and could therefore never fail.)
for key in data_dict.keys():
    print(data_dict[key].shape)
    assert data_dict[key].shape == data_output.shape

(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)


In [4]:
import time
t0 = time.time()

# Add rh: derived from pressure (pa), specific humidity (q) and temperature (t)
T0 = 273.15
temperature = data_dict['t']
exponent = (17.67*(temperature-T0))/(temperature-29.65)
inverse_exp_term = np.exp(exponent)**(-1)
r = 0.00263*data_dict['pa']*data_dict['q']*inverse_exp_term
data_dict['rh'] = r

**Reshaping and keeping only the relevant features**

In [5]:
# Keeping only the relevant features
features = ['clwc', 'ciwc', 'rh']

# q, pa and t were only needed to derive rh and are dropped
for unused_name in ('q', 'pa', 't'):
    del data_dict[unused_name]

# Flatten every retained field and the target into 1-D sample vectors
data_dict.update({name: np.reshape(data_dict[name], -1) for name in features})
data_output = np.reshape(data_output, -1)

no_features = len(data_dict)

**Cast dict into ndarray**

In [6]:
# Collect the feature vectors in the order given by `features`; each entry is
# removed from data_dict as soon as it has been picked up.
data_array_not_T = []
for key in features:
    print(key)
    data_array_not_T.append(data_dict.pop(key))

# Convert into np array and transpose, so that rows are samples and columns
# follow the order of `features`
data_array = np.transpose(np.array(data_array_not_T, dtype=np.float32))

clwc
ciwc
rh


In [7]:
# Update loc
loc = {}
for i in range(len(features)):
    loc[features[i]] = i

**Pick the subset to train on**

In [8]:
# Seed the global NumPy RNG so that the drawn locations are reproducible.
# (np.random.seed returns None, so its result is not worth keeping.)
np.random.seed(SEED)
subset = np.random.randint(0, HFIELDS, number_horizontal_locations)
# Convert to regular int to make check_sum JSON serializable
check_sum = int(np.sum(subset))

# Collecting all grid cell indices for the horizontal fields given by subset.
# The samples were flattened in C order with the horizontal field as the last
# axis, so field s occupies the flat positions s, s + HFIELDS, s + 2*HFIELDS, ...
# Computing these directly gives the same indices, in the same order, as
# scanning a full-size field-index array once per sampled location.
rows_per_field = data_array.shape[0] // HFIELDS
# Guards against running this cell after data_array has been filtered
assert rows_per_field * HFIELDS == data_array.shape[0]
field_offsets = np.arange(rows_per_field) * HFIELDS
subset_inds = (subset[:, None] + field_offsets[None, :]).reshape(-1)

In [9]:
# The first num_cells sampled grid cells make up the training set
train_inds = subset_inds[:num_cells] #num_hours*27
train_input = data_array[train_inds]
train_output = data_output[train_inds]

**Remove the condensate-free regime (clwc + ciwc ≤ 1e-20) beforehand**

In [10]:
# Total condensate per sample; the 1e-20 threshold separates the two regimes
condensate = data_array[:, loc['clwc']] + data_array[:, loc['ciwc']]
reg_0 = np.where(condensate <= 1e-20)[0]
reg_not_0 = np.where(condensate > 1e-20)[0]

# Relevant values to compute final MSE/R2-scores
len_reg_0, len_reg_not_0 = len(reg_0), len(reg_not_0)
len_data_output = len(data_output)
var_data_output = np.var(data_output)
# Mean squared target in the condensate-free regime, i.e. the MSE of a
# zero prediction there
mse_reg_0 = np.mean(data_output[reg_0]**2)

# From here on only the samples with condensate are kept
data_array = data_array[reg_not_0].copy()
data_output = data_output[reg_not_0].copy()

In [11]:
# MSE of the condensate-free regime and shapes of the remaining samples
print(mse_reg_0)
print(data_array.shape)
print(data_output.shape)

# NOTE(review): this note used to read "Should be 338023", but the recorded
# run shows 23475135 -- confirm which data set the expected count referred to.
len_reg_0

4.1247884e-05
(19717305, 3)
(19717305,)


23475135

Optimize coefficients

In [12]:
# Shape of the samples left after the condensate-free regime was removed
data_array.shape

(19717305, 3)

In [13]:
def func(X, PAR, ALPHA):
    '''
        Cloud cover (in percent) of the Xu-Randall-type scheme evaluated here.

        X is a 2-D array whose columns are clw, cli and RH (in that order).
        PAR is the exponent applied to RH, ALPHA the rate inside the
        exponential of the condensate term. Returns one value per row of X,
        bounded above by 100.
    '''
    cloud_water = X[:, 0] # clw
    cloud_ice = X[:, 1] # cli
    rel_hum = X[:, 2] # RH

    # RH can be slightly negative, which is problematic for the power,
    # so it is bounded below by zero first
    rh_term = np.maximum(0, rel_hum)**PAR
    condensate_term = 1-np.exp(-ALPHA*(cloud_ice+cloud_water))

    # The product can exceed 1 very slightly, hence the upper bound
    fraction = np.minimum(rh_term*condensate_term, 1)

    return 100*fraction

In [14]:
import scipy as sci
from scipy.optimize import minimize

In [15]:
def objective(P, X, Y):
    '''
        Mean squared error between the scheme's predictions and the targets.

        P holds the two coefficients (PAR, ALPHA) passed on to func, X the
        input samples and Y the target values. Predictions are limited to
        the range [0, 100] before the error is computed; the mean is
        accumulated in float64.
    '''
    exponent, rate = P
    clipped_preds = np.clip(func(X, exponent, rate), 0, 100)
    squared_errors = (clipped_preds - Y)**2

    return np.mean(squared_errors, dtype=np.float64)

# Default coefficients; they serve as the starting point of the fit
(PAR, ALPHA) = 0.9105, 913984.9624

# No iterations with CG or BFGS
if tl_bool:
    # Re-fit both coefficients on the training subset
    res = minimize(objective, (PAR, ALPHA), args=(train_input, train_output),
                   method='Nelder-Mead', options={'disp': True})
else:
    # Compute the MSE and terminate if not tl_bool
    P = (PAR, ALPHA)
    mse_reg_1 = objective(P, data_array, data_output)

    results = {}

    # Sample-weighted combination of the two regimes
    mse_new_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
    r2_new_total = 1 - mse_new_total/var_data_output

    print(mse_new_total, r2_new_total)

    parent_key = 'xu_randall_no_tl'
    results[parent_key] = {}
    results[parent_key]['MSE_era5_tuned'] = mse_new_total
    results[parent_key]['R2_era5_tuned'] = r2_new_total
    # No fit is run in this branch, so `res` does not exist; store the
    # coefficients that were actually evaluated.
    results[parent_key]['Coefficients'] = list(P)
    # Should be the same for all runs
    results[parent_key]['Check_sum'] = check_sum

    # NOTE(review): `results` is only printed here; execution stops before
    # the cell that dumps it to JSON -- confirm whether it should be saved.
    # Finish the code
    raise SystemExit('tl_bool is False: default coefficients evaluated, stopping here.')

Optimization terminated successfully.
         Current function value: 46.769210
         Iterations: 14
         Function evaluations: 45
         Gradient evaluations: 15


In [16]:
# Coefficients (PAR, ALPHA) returned by the optimizer
print(res.x)

[1.03745879e+01 9.13984962e+05]


In [17]:
# Same coefficients, rounded to two decimals for display
list(np.round(res.x, 2))

[10.37, 913984.96]

In [18]:
# Collects the metrics that are dumped to JSON at the end of the notebook
results = {}

Original values

In [19]:
# MSE of the default coefficients on the samples that contain condensate
mse_reg_1 = objective((PAR, ALPHA), data_array, data_output)

In [20]:
# Sample-weighted combination of the per-regime MSEs, and the resulting R2
weighted_sq_err = mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0
mse_orig_total = weighted_sq_err/len_data_output
r2_orig_total = 1 - mse_orig_total/var_data_output

print(mse_orig_total, r2_orig_total)

# results['MSE_dya_tuned'] = mse_orig_total
# results['R2_dya_tuned'] = r2_orig_total

791.96438699287 -1.4353001255213025


New values

In [21]:
# MSE of the fitted coefficients on the samples that contain condensate
mse_reg_1 = objective(res.x, data_array, data_output)

In [22]:
# Sample-weighted combination of the per-regime MSEs, and the resulting R2
weighted_sq_err = mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0
mse_new_total = weighted_sq_err/len_data_output
r2_new_total = 1 - mse_new_total/var_data_output

print(mse_new_total, r2_new_total)

# One entry per (subset exponent, number of cells, seed) configuration
parent_key = 'xu_randall_tl_%d_num_cells_%d_seed_%d'%(subset_exp,num_cells,SEED)
results[parent_key] = {
    'MSE_era5_tuned': mse_new_total,
    'R2_era5_tuned': r2_new_total,
    'Coefficients': list(res.x),
    # Should be the same for all runs
    'Check_sum': check_sum,
}

272.31244846409373 0.1626359078547921


**Save results**

In [23]:
# Dump results
# Python's file APIs do not expand a leading '~', so resolve it explicitly.
# (expanduser leaves an already absolute path untouched.)
results_path = os.path.expanduser('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/era5_xu_randall.json')
append_dict_to_json(results, results_path)

In [24]:
# def predict(P,X):
#     '''
#         The objective function.
#     '''
#     PAR, ALPHA = P
#     preds = [np.minimum(np.maximum(func(X[k_ind], PAR, ALPHA), 0), 100) for k_ind in range(X.shape[0])]

#     return preds

# predict(res.x, data_array)

# plt.hist(data_output,bins=100, histtype='step', color='k')
# plt.hist(predict(res.x, data_array),bins=100, histtype='step')

# plt.yscale('log')
# plt.legend(['ERA5', 'Xu_randall Scheme'])
# plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_xu_randall.pdf')