### Check some of the most promising equations and possibly optimize their coefficients

Executed through /home/b/b309170/scripts/run_era5_evalute_and_transfer_learn_5.sh

In [37]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

import scipy as sci
from scipy.optimize import minimize

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/cloud_cover_parameterization/')
import my_classes
from my_classes import load_data

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/symbolic_regression/')
from functions import append_dict_to_json

# Command-line arguments (all parsed as integers):
#   argv[1] -> exponent: 10**subset_exp horizontal locations are sampled
#   argv[2] -> seed for the NumPy random number generator
#   argv[3] -> number of sampled grid cells used to fit the coefficients
subset_exp = int(sys.argv[1])
SEED = int(sys.argv[2])
num_cells = int(sys.argv[3])

number_horizontal_locations = 10 ** subset_exp

# True: re-fit the coefficients on the sampled cells; False: evaluate the default coefficients
tl_bool = True

**Read data**

In [38]:
# Variables requested from the loader; 'cc' is split off below as the regression target
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# Shape before trimming: VLAYERS still includes the four levels removed below
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Removing four upper-most levels
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output
data_output = data_dict.pop('cc')

# Every input field has to match the shape of the target; otherwise the
# flattened samples further below would not line up with data_output
for key in data_dict.keys():
    print(data_dict[key].shape)
    assert data_dict[key].shape == data_output.shape, \
        'Shape mismatch for %s: %s vs. %s'%(key, data_dict[key].shape, data_output.shape)

(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)
(24, 27, 66655)


In [39]:
# Add rh, derived from pressure (pa), specific humidity (q) and temperature (t)
T0 = 273.15
temperature = data_dict['t']
inverse_exp_term = np.exp((17.67*(temperature - T0))/(temperature - 29.65))**(-1)
r = 0.00263*data_dict['pa']*data_dict['q']*inverse_exp_term
data_dict['rh'] = r

**Reshaping and keeping only the relevant features**

In [40]:
# Input features of the equations. The order matters: it fixes the column
# order of data_array and therefore which feature x0, x1, x2, x3 refer to in eq_1-eq_4.
features = ['rh', 't', 'clwc', 'ciwc']

In [41]:
# Flatten every selected feature and the target into 1-D sample vectors
for name in features:
    data_dict[name] = data_dict[name].reshape(-1)
data_output = data_output.reshape(-1)

# Drop the fields that are not part of the feature set
for unused in ('q', 'pa', 'zg'):
    del data_dict[unused]

no_features = len(data_dict)

**Cast dict into ndarray**

In [42]:
# Gather the feature vectors in the order given by `features`; each entry is
# removed from data_dict as soon as it has been collected
data_array_not_T = []
for name in features:
    print(name)
    data_array_not_T.append(data_dict.pop(name))
k = len(data_array_not_T)

# float32 array with one row per sample and one column per feature
data_array = np.array(data_array_not_T, dtype=np.float32).T

rh
t
clwc
ciwc


In [43]:
# Map each feature name to its column index in data_array
loc = {name: column for column, name in enumerate(features)}

**Pick the subset**

In [44]:
# Seed the global NumPy RNG so that the drawn locations are reproducible for a given SEED.
# (np.random.seed returns None, so there is nothing worth storing.)
np.random.seed(SEED)
# Horizontal locations are drawn with replacement from [0, HFIELDS)
subset = np.random.randint(0, HFIELDS, number_horizontal_locations)
# Convert to regular int to make check_sum JSON serializable
check_sum = int(np.sum(subset))

# Collecting all grid cell indices for the horizontal fields given by subset.
# The samples were flattened in C order from (time, level, horizontal), so the
# horizontal location s sits at offset s inside every consecutive block of
# HFIELDS entries. The indices can therefore be computed directly instead of
# building and scanning a full (time, level, horizontal) index array once per
# sampled location. Order is unchanged: location by location (in the order of
# subset), ascending within each location.
profiles_per_location = data_array.shape[0] // HFIELDS
assert profiles_per_location * HFIELDS == data_array.shape[0]
block_starts = np.arange(profiles_per_location) * HFIELDS
subset_inds = (subset[:, None] + block_starts[None, :]).reshape(-1)

In [45]:
# The first num_cells sampled indices define the fitting set (inputs and matching targets)
train_inds = subset_inds[:num_cells]
train_input = data_array[train_inds]
train_output = data_output[train_inds]

**Remove the condensate-free regime (clwc + ciwc = 0) up front**

In [46]:
# Total condensate (clwc + ciwc) per sample decides the regime
condensate = data_array[:, loc['clwc']] + data_array[:, loc['ciwc']]
reg_0 = np.flatnonzero(condensate <= 1e-20)
reg_not_0 = np.flatnonzero(condensate > 1e-20)

# Relevant values to compute final MSE/R2-scores.
# mse_reg_0 is the MSE of a constant zero prediction in the condensate-free regime.
len_data_output = len(data_output)
var_data_output = np.var(data_output)
len_reg_0, len_reg_not_0 = len(reg_0), len(reg_not_0)
mse_reg_0 = np.mean(data_output[reg_0]**2)

# From here on only the samples that contain condensate are kept
data_array, data_output = data_array[reg_not_0], data_output[reg_not_0]

In [47]:
# Sanity output: MSE of the condensate-free regime and the shapes of the remaining samples
print(mse_reg_0)
print(data_array.shape)
print(data_output.shape)

# Number of condensate-free samples removed above; it depends on the loaded
# data (23475135 in the recorded run)
len_reg_0

4.1247884e-05
(19717305, 4)
(19717305,)


23475135

**Normalize the features**

In [48]:
# Names belonging to the entries of mean_all / std_all (same order)
all_possible_features = [
    'hus', 'clwc', 'ciwc', 't', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]
loc_all = {name: position for position, name in enumerate(all_possible_features)}

# Scale the data
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Pick the statistics of the selected features, in the column order of data_array
mean = np.array([mean_all[loc_all[sel_var]] for sel_var in features])
std = np.array([std_all[loc_all[sel_var]] for sel_var in features])

# Standardize the evaluation samples and the fitting samples with the same statistics
data_scaled = (data_array - mean)/std
train_input = (train_input - mean)/std

# The unscaled array is no longer needed
del data_array

**Define and optimize equations**

In [49]:
# Complexity = 11
def eq_1(X, a=52.6708, b=23.4291, c=1, d=1, e=1, f=119.675, g=1, h=1, i=1, j=1, k=1):
    '''
        Candidate equation 1 with tunable coefficients a-k.

        X is a 2-D array of which the columns 0-3 are used (in this notebook:
        the scaled rh, t, clwc, ciwc). Returns one value per row of X.
    '''
    rh, ta, clw, cli = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    exp_term = -c*np.exp(d*rh)
    sin_term = e*np.sin(f + g*ta)
    ratio = (np.sin(np.sqrt(np.abs(h*cli))))/(i*clw + j*cli + np.cos(k*clw))
    return a - b*(exp_term + sin_term + ratio)

# Complexity = 9
def eq_2(X, a=1531.13, b=2.53774, c=-89.99, d=1, e=60.204, f=1, g=4.60663, h=-0.89465, i=1):
    '''
        Candidate equation 2 with tunable coefficients a-i.

        X is a 2-D array of which the columns 0-3 are used (in this notebook:
        the scaled rh, t, clwc, ciwc). Returns one value per row of X.
    '''
    rh, ta, clw, cli = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    root = np.sqrt(np.abs(c + d*ta))
    linear = e - f*rh + ta
    fraction = g/(h - i*clw - cli)
    return a + b*(-root*linear + fraction)

# Complexity = 7
def eq_3(X, a=49.9941, b=13.4357, c=1, d=1, e=1, f=1, g=2):
    '''
        Candidate equation 3 with tunable coefficients a-g.

        X is a 2-D array of which the columns 0, 2 and 3 are used (in this
        notebook: the scaled rh, clwc, ciwc). Returns one value per row of X.
    '''
    rh, clw, cli = X[:, 0], X[:, 2], X[:, 3]
    exp_term = np.exp(c*rh)
    log_term = e*np.log(np.abs(np.exp(f*clw) + g*cli))
    return a + b*(exp_term - d*clw + log_term)

# Complexity = 10
def eq_4(X, a=91.3875, b=1.7767, c=9.03952, d=1, e=2, f=1, g=1, h=1, i=1, j=1):
    '''
        Candidate equation 4 with tunable coefficients a-j.

        X is a 2-D array of which the columns 0-3 are used (in this notebook:
        the scaled rh, t, clwc, ciwc). Returns one value per row of X.
    '''
    rh, ta, clw, cli = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    numerator = b*(c - d*np.exp(e*rh) + f*rh + ta)
    denominator = g*np.exp(h*clw + i*cli) + j*cli
    return a - numerator/denominator

In [50]:
# # Equation 1
# def objective_eq_1(P, X,Y):
#     '''
#         The objective function.
#     '''
#     a,b,c,d,e,f,g,h,i,j,k = P
#     train_preds = np.minimum(np.maximum(eq_1(X, a,b,c,d,e,f,g,h,i,j,k), 0), 100)
#     train_mse = np.mean((train_preds - Y)**2, dtype=np.float64)
#     return train_mse

# (a,b,c,d,e,f,g,h,i,j,k) = (52.6708, 23.4291, 1, 1, 1, 119.675, 1, 1, 1, 1, 1)
# res_1 = minimize(objective_eq_1, (a,b,c,d,e,f,g,h,i,j,k), args=(train_data_scaled[subset_inds], flattened_output_train[subset_inds]), \
#                method='BFGS', options={'disp': True})
# res_1

In [51]:
# # Equation 2
# def objective_eq_2(P, X,Y):
#     '''
#         The objective function.
#     '''
#     a,b,c,d,e,f,g,h,i = P
#     train_preds = np.minimum(np.maximum(eq_2(X, a,b,c,d,e,f,g,h,i), 0), 100)
#     train_mse = np.mean((train_preds - Y)**2, dtype=np.float64)
#     return train_mse

# (a,b,c,d,e,f,g,h,i) = (1531.13, 2.53774, -89.99, 1, 60.204, 1, 4.60663, -0.89465, 1)
# res_2 = minimize(objective_eq_2, (a,b,c,d,e,f,g,h,i), args=(train_data_scaled[subset_inds], flattened_output_train[subset_inds]), \
#                method='BFGS', options={'disp': True})
# res_2

In [52]:
# Equation 3
def objective_eq_3(P, X,Y):
    '''
        Mean squared error between the targets Y and the eq_3 predictions on X,
        using the seven coefficients in P. Predictions are limited to [0, 100]
        before the error is computed; the mean is accumulated in float64.
    '''
    a,b,c,d,e,f,g = P
    clipped_preds = np.clip(eq_3(X, a,b,c,d,e,f,g), 0, 100)
    return np.mean((clipped_preds - Y)**2, dtype=np.float64)

# Starting point of the optimization: the default coefficients of eq_3
(a,b,c,d,e,f,g) = (49.9941,13.4357,1,1,1,1,2)

if tl_bool:
    # Re-fit the coefficients on the sampled training cells
    res_3 = minimize(objective_eq_3, (a,b,c,d,e,f,g), args=(train_input, train_output), \
                   method='Nelder-Mead', options={'disp': True})
    # A bare expression inside an if-block is not displayed, so print explicitly
    print(res_3)
else:
    # Compute the MSE with the default coefficients if not tl_bool
    # NOTE(review): this branch neither saves `results` nor stops the run, and
    # the evaluation cells further below expect `res_3` to exist - confirm the
    # intended handling before setting tl_bool = False.
    P = (a,b,c,d,e,f,g)
    mse_reg_1 = objective_eq_3(P, data_scaled, data_output)
    
    results = {}

    # Sample-weighted MSE over both regimes and the corresponding R2
    mse_new_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
    r2_new_total = 1 - mse_new_total/var_data_output

    print(mse_new_total, r2_new_total)

    parent_key = 'gpgomea_eq_3_no_tl'
    results[parent_key] = {}
    results[parent_key]['MSE'] = mse_new_total
    results[parent_key]['R2'] = r2_new_total
    results[parent_key]['Coefficients'] = [a,b,c,d,e,f,g]
    # Should be the same for all runs
    results[parent_key]['Check_sum'] = check_sum

         Current function value: 8.970821
         Iterations: 327
         Function evaluations: 3812
         Gradient evaluations: 475


      fun: 8.970821087014626
 hess_inv: array([[ 1.80923127e+00, -2.99749886e-02,  1.29410103e+00,
        -7.95576994e+00,  3.64161954e+00,  1.10436514e-01,
         1.18793619e-01],
       [-2.99749886e-02,  7.75291414e-04, -3.41574079e-02,
         2.08787266e-01, -9.21310803e-02, -2.84480372e-03,
        -2.96906573e-03],
       [ 1.29410103e+00, -3.41574079e-02,  1.51628447e+00,
        -9.47973906e+00,  4.23076498e+00,  1.24865745e-01,
         1.30869867e-01],
       [-7.95576995e+00,  2.08787266e-01, -9.47973906e+00,
         7.80123019e+01, -3.90134451e+01, -7.60369638e-01,
        -8.08442894e-01],
       [ 3.64161955e+00, -9.21310803e-02,  4.23076498e+00,
        -3.90134450e+01,  2.02699849e+01,  3.34737042e-01,
         3.59398233e-01],
       [ 1.10436514e-01, -2.84480372e-03,  1.24865745e-01,
        -7.60369639e-01,  3.34737043e-01,  1.04644651e-02,
         1.08838284e-02],
       [ 1.18793619e-01, -2.96906573e-03,  1.30869867e-01,
        -8.08442894e-01,  3.59398233e

In [53]:
# # Equation 4
# def objective_eq_4(P, X,Y):
#     '''
#         The objective function.
#     '''
#     a,b,c,d,e,f,g,h,i,j = P
#     train_preds = np.minimum(np.maximum(eq_4(X, a,b,c,d,e,f,g,h,i,j), 0), 100)
#     train_mse = np.mean((train_preds - Y)**2, dtype=np.float64)
#     return train_mse

# (a,b,c,d,e,f,g,h,i,j) = (91.3875,1.7767,9.03952,1,2,1,1,1,1,1)
# res_4 = minimize(objective_eq_4, (a,b,c,d,e,f,g,h,i,j), args=(train_data_scaled[subset_inds], flattened_output_train[subset_inds]), \
#                method='BFGS', options={'disp': True})
# res_4

**Evaluate optimized equations**

In [54]:
# MSE of eq_3 with the optimized coefficients on all scaled samples that contain condensate
mse_reg_1 = objective_eq_3(res_3.x, X=data_scaled, Y=data_output)

In [55]:
# Sample-weighted MSE over both regimes and the corresponding R2
mse_new_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
r2_new_total = 1 - mse_new_total/var_data_output

print(mse_new_total, r2_new_total)

# One entry per (subset exponent, number of cells, seed) configuration
parent_key = 'gpgomea_eq_3_tl_%d_num_cells_%d_seed_%d'%(subset_exp,num_cells,SEED)
results = {
    parent_key: {
        'MSE': mse_new_total,
        'R2': r2_new_total,
        'Coefficients': list(res_3.x),
        # Should be the same for all runs
        'Check_sum': check_sum,
    }
}

108.17831548662839 0.6673503637157349


**Save results**

In [56]:
# Append this run's entry to the shared JSON file of results
results_path = '/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/gpgomea_equations.json'
append_dict_to_json(results, results_path)