### Check some of the most promising equations and possibly optimize their coefficients

In [1]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

np.random.seed(10)

**Read data**

In [25]:
# Every variable stored in the input array, listed in column order
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz', 'ta_z', 'ta_zz',
    'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Variable name -> column index in the full input array
loc_all = {name: col for col, name in enumerate(all_possible_features)}

# Features
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
no_features = len(features)

# Feature name -> column index in the reduced (selected-features) array
loc = {name: col for col, name in enumerate(features)}

In [4]:
# Directory holding the preprocessed arrays. The default is the original location;
# set the CLOUD_COVER_DATA_DIR environment variable to run the notebook elsewhere.
path_data = os.environ.get(
    'CLOUD_COVER_DATA_DIR',
    '/home/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Load the input data and pick the five best features (rh, ta, clw, cli, rh_z)
input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))

# The columns are selected by position, so the array must contain exactly the
# variables listed in all_possible_features.
assert input_data.shape[1] == len(all_possible_features), \
    'input array does not match all_possible_features'

# Keep only the selected columns, in the order given by `features`
input_data = np.stack([input_data[:, loc_all[sel_var]] for sel_var in features], axis=1)

output_data = np.load(os.path.join(path_data, 'cloud_area_output_dyamond.npy'))

In [5]:
# Number of samples and number of selected feature columns
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(285179494, 5)

In [6]:
# Construct training and validation data
# Three-fold split: the samples are cut into six consecutive chunks of length
# two_week_incr; fold i validates on chunks i and i+3 and trains on everything else.
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_start, first_stop = two_week_incr*i, two_week_incr*(i+1)
    second_start, second_stop = two_week_incr*(i+3), two_week_incr*(i+4)

    validation_folds.append(np.concatenate([np.arange(first_start, first_stop),
                                            np.arange(second_start, second_stop)]))

    # The training indices are the complement of the two validation chunks. Building
    # them directly from ranges yields the same sorted index array as deleting the
    # validation indices from np.arange(samples_total), without the full-length
    # temporary. Samples beyond 6*two_week_incr (the division remainder) always end
    # up in the training set.
    training_folds.append(np.concatenate([np.arange(0, first_start),
                                          np.arange(first_stop, second_start),
                                          np.arange(second_stop, samples_total)]))

In [7]:
# The second fold yields the best model
best_fold = 1
train_idx, valid_idx = training_folds[best_fold], validation_folds[best_fold]

flattened_input_train, flattened_input_valid = input_data[train_idx], input_data[valid_idx]
flattened_output_train, flattened_output_valid = output_data[train_idx], output_data[valid_idx]

# Remove input_data, output_data
# (the index aliases are dropped too, otherwise the fold arrays would stay alive)
del train_idx, valid_idx
del input_data, output_data, training_folds, validation_folds
gc.collect()

0

**Remove the condensate-free regime (clw + cli = 0) up front**

In [26]:
# Split the samples by condensate content. Samples with clw + cli == 0 form regime 0;
# the equations below are only fitted and evaluated on the remaining samples.
#
# NOTE: this cell overwrites its own inputs (flattened_*_train / flattened_*_valid).
# Re-running it without re-running the cells above leaves reg_0_valid empty and
# shrinks len_data_output, which corrupts the total MSE/R2 computed later.
cond_train = flattened_input_train[:, loc['clw']] + flattened_input_train[:, loc['cli']]
reg_not_0_train = np.where(cond_train != 0)[0]
flattened_input_train = flattened_input_train[reg_not_0_train]
flattened_output_train = flattened_output_train[reg_not_0_train]

cond_valid = flattened_input_valid[:, loc['clw']] + flattened_input_valid[:, loc['cli']]
reg_not_0_valid = np.where(cond_valid != 0)[0]
reg_0_valid = np.where(cond_valid == 0)[0]

# The condensate sums are only needed to build the index arrays
del cond_train, cond_valid

# Relevant values to compute final MSE/R2-scores
# mse_reg_0 is the mean squared target in regime 0. Both statistics are accumulated in
# float64, like every other MSE in this notebook, so the combined scores do not mix
# accumulator precisions.
mse_reg_0 = np.mean(flattened_output_valid[reg_0_valid]**2, dtype=np.float64)
len_reg_0 = len(reg_0_valid)
len_reg_not_0 = len(reg_not_0_valid)
len_data_output = len(flattened_output_valid)
var_data_output = np.var(flattened_output_valid, dtype=np.float64)

flattened_input_valid = flattened_input_valid[reg_not_0_valid]
flattened_output_valid = flattened_output_valid[reg_not_0_valid]

**Normalize the features**

In [28]:
# Scale the data
# Per-variable mean and standard deviation, indexed like all_possible_features
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Statistics of the selected features, in the same order as the data columns
mean = np.array([mean_all[loc_all[sel_var]] for sel_var in features])
std = np.array([std_all[loc_all[sel_var]] for sel_var in features])

# Work with scaled training folds
train_data_scaled, valid_data_scaled = (
    (unscaled - mean)/std for unscaled in (flattened_input_train, flattened_input_valid))

**Define and evaluate equations**

In [35]:
def eq_1(X):
    """Evaluate candidate equation 1 with its fixed coefficients.

    X is a 2-D array with at least five columns; in this notebook it is the
    normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    humidity_term = -np.exp(rh)
    temperature_term = np.sin(119.675 + ta)
    condensate_term = np.sin(np.sqrt(np.abs(cli)))/(clw + cli + np.cos(clw))
    return 52.6708 - 23.4291*(humidity_term + temperature_term + condensate_term)

def eq_2(X):
    """Evaluate candidate equation 2 with its fixed coefficients.

    X is a 2-D array with at least five columns; in this notebook it is the
    normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    root_term = -np.sqrt(np.abs(-89.99 + ta))*(60.204 - rh + ta)
    condensate_term = 4.60663/(-0.89465 - clw - cli)
    return 1531.13 + 2.53774*(root_term + condensate_term)

def eq_3(X):
    """Evaluate candidate equation 3 with its fixed coefficients.

    X is a 2-D array with at least five columns; in this notebook it is the
    normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # ta and rh_z are unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    log_term = np.log(np.exp(clw) + 2*cli)
    return 49.9941 + 13.4357*(np.exp(rh) - clw + log_term)

def eq_4(X):
    """Evaluate candidate equation 4 with its fixed coefficients.

    X is a 2-D array with at least five columns; in this notebook it is the
    normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    numerator = 1.7767*(9.03952 - np.exp(2*rh) + rh + ta)
    denominator = np.exp(clw + cli) + cli
    return 91.3875 - numerator/denominator

In [47]:
# Evaluate on validation data -- unbounded validation loss
preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4 = (
    eq(valid_data_scaled) for eq in (eq_1, eq_2, eq_3, eq_4))

# MSE on the samples with clw + cli != 0 (float64 accumulator)
mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2, mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4 = (
    np.mean((flattened_output_valid - preds)**2, dtype=np.float64)
    for preds in (preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4))

# Sample-weighted combination with the regime where clw + cli == 0
mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4 = (
    (mse_reg_0*len_reg_0 + mse_in_regime*len_reg_not_0)/len_data_output
    for mse_in_regime in (mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2,
                          mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4))

# R2 relative to the variance of the full validation output
r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4 = (
    1 - mse_total/var_data_output
    for mse_total in (mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4))

print(mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4)
print(r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4)

150.33498795650283 171.60863073784284 179.24163023825918 177.589459368471
0.9129810190019828 0.9006671142875085 0.8962488757422884 0.8972052081801183


In [50]:
# Evaluate on validation data -- bounded validation loss
# Predictions are clipped to the interval [0, 100] before scoring
preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4 = (
    np.clip(eq(valid_data_scaled), 0, 100) for eq in (eq_1, eq_2, eq_3, eq_4))

# MSE on the samples with clw + cli != 0 (float64 accumulator)
mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2, mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4 = (
    np.mean((flattened_output_valid - preds)**2, dtype=np.float64)
    for preds in (preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4))

# Sample-weighted combination with the regime where clw + cli == 0
mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4 = (
    (mse_reg_0*len_reg_0 + mse_in_regime*len_reg_not_0)/len_data_output
    for mse_in_regime in (mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2,
                          mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4))

# R2 relative to the variance of the full validation output
r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4 = (
    1 - mse_total/var_data_output
    for mse_total in (mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4))

print(mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4)
print(r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4)

144.82318624317864 168.31594233476454 173.4763427725344 173.85727224500943
0.9161714364495525 0.9025730338174502 0.8995860193257447 0.8993655244497516


In [51]:
# Test generalization skill on regime 1: The skill is even better on the validation set!
preds_train_1, preds_train_2, preds_train_3, preds_train_4 = (
    np.clip(eq(train_data_scaled), 0, 100) for eq in (eq_1, eq_2, eq_3, eq_4))

mse_reg_1_train_eq_1, mse_reg_1_train_eq_2, mse_reg_1_train_eq_3, mse_reg_1_train_eq_4 = (
    np.mean((flattened_output_train - preds)**2, dtype=np.float64)
    for preds in (preds_train_1, preds_train_2, preds_train_3, preds_train_4))

# One line per equation: training MSE followed by validation MSE
for train_mse, valid_mse in ((mse_reg_1_train_eq_1, mse_reg_1_valid_eq_1),
                             (mse_reg_1_train_eq_2, mse_reg_1_valid_eq_2),
                             (mse_reg_1_train_eq_3, mse_reg_1_valid_eq_3),
                             (mse_reg_1_train_eq_4, mse_reg_1_valid_eq_4)):
    print(train_mse, valid_mse)

221.96726618493997 219.75643642813347
258.2413318887508 255.40759306220806
266.1967115099247 263.23869825839114
267.3748143037159 263.81677331424237


**Define and optimize equations**

In [63]:
# Complexity = 11
def eq_1(X, a=52.6708, b=23.4291, c=1, d=1, e=1, f=119.675, g=1, h=1, i=1, j=1, k=1):
    """Equation 1 with tunable coefficients a..k.

    With the default coefficients this reduces to the fixed-coefficient form of
    equation 1. X is a 2-D array with at least five columns; in this notebook it
    is the normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    humidity_term = -c*np.exp(d*rh)
    temperature_term = e*np.sin(f + g*ta)
    condensate_term = np.sin(np.sqrt(np.abs(h*cli)))/(i*clw + j*cli + np.cos(k*clw))
    return a - b*(humidity_term + temperature_term + condensate_term)

# Complexity = 9
def eq_2(X, a=1531.13, b=2.53774, c=-89.99, d=1, e=60.204, f=1, g=4.60663, h=-0.89465, i=1):
    """Equation 2 with tunable coefficients a..i.

    With the default coefficients this reduces to the fixed-coefficient form of
    equation 2. X is a 2-D array with at least five columns; in this notebook it
    is the normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    root_term = -np.sqrt(np.abs(c + d*ta))*(e - f*rh + ta)
    condensate_term = g/(h - i*clw - cli)
    return a + b*(root_term + condensate_term)

# Complexity = 7
def eq_3(X, a=49.9941, b=13.4357, c=1, d=1, e=1, f=1, g=2):
    """Equation 3 with tunable coefficients a..g.

    The logarithm is applied to the absolute value of its argument, so the
    expression stays defined when the argument becomes negative. X is a 2-D
    array with at least five columns; in this notebook it is the normalised
    feature matrix with columns ordered as `features` (rh, ta, clw, cli, rh_z).
    Returns one (unclipped) value per row.
    """
    # ta and rh_z are unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    log_term = e*np.log(np.abs(np.exp(f*clw) + g*cli))
    return a + b*(np.exp(c*rh) - d*clw + log_term)

# Complexity = 10
def eq_4(X, a=91.3875, b=1.7767, c=9.03952, d=1, e=2, f=1, g=1, h=1, i=1, j=1):
    """Equation 4 with tunable coefficients a..j.

    With the default coefficients this reduces to the fixed-coefficient form of
    equation 4. X is a 2-D array with at least five columns; in this notebook it
    is the normalised feature matrix with columns ordered as `features`
    (rh, ta, clw, cli, rh_z). Returns one (unclipped) value per row.
    """
    # rh_z is unpacked for symmetry with the other equations but not used here
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))
    numerator = b*(c - d*np.exp(e*rh) + f*rh + ta)
    denominator = g*np.exp(h*clw + i*cli) + j*cli
    return a - numerator/denominator

In [None]:
import scipy as sci
from scipy.optimize import minimize

# Size of the random training subset used to fit the coefficients
T_subset = 10**6
# Row indices drawn uniformly (with replacement) from the training samples
inds = np.random.randint(low=0, high=train_data_scaled.shape[0], size=T_subset)

In [60]:
# Equation 1
def objective_eq_1(P, X, Y):
    """Training loss for equation 1.

    P holds the eleven coefficients a..k of eq_1, X the input rows and Y the
    corresponding targets. Returns the mean squared error (float64 accumulator)
    of the predictions clipped to [0, 100].
    """
    bounded_preds = np.clip(eq_1(X, *P), 0, 100)
    return np.mean((bounded_preds - Y)**2, dtype=np.float64)

# Start the optimisation from the original coefficients
a, b, c, d, e, f, g, h, i, j, k = 52.6708, 23.4291, 1, 1, 1, 119.675, 1, 1, 1, 1, 1
res_1 = minimize(objective_eq_1, (a, b, c, d, e, f, g, h, i, j, k),
                 args=(train_data_scaled[inds], flattened_output_train[inds]),
                 method='BFGS', options={'disp': True})
res_1

         Current function value: 186.649310
         Iterations: 498
         Function evaluations: 6912
         Gradient evaluations: 576


      fun: 186.6493100195604
 hess_inv: array([[ 2.59399036e+01, -2.52961623e+00, -4.56969841e+00,
         9.34943876e-01,  9.81607048e+00, -1.61350996e-01,
        -2.00669976e-01,  4.09999616e+01,  2.08767247e-01,
         1.70902315e-01, -8.54107708e+00],
       [-2.52961623e+00,  4.47559664e-01,  9.23442243e-02,
        -9.88884058e-02, -1.38847775e+00,  2.17086083e-02,
         1.98590890e-02, -4.42317239e+00, -3.94037350e-02,
        -5.24199437e-03,  7.16956938e-01],
       [-4.56969841e+00,  9.23442243e-02,  1.61879747e+00,
        -1.69193143e-01, -9.31704331e-01,  3.18137950e-02,
         1.85628125e-02, -6.40609765e+00,  4.86409147e-03,
        -5.77384393e-02,  1.64308702e+00],
       [ 9.34943876e-01, -9.88884058e-02, -1.69193143e-01,
         3.63170482e-02,  3.77196768e-01, -6.79319069e-03,
        -6.18768587e-03,  1.49054388e+00,  7.55250556e-03,
         6.44307554e-03, -2.96436760e-01],
       [ 9.81607048e+00, -1.38847775e+00, -9.31704331e-01,
         3.77196768e-

In [61]:
# Equation 2
def objective_eq_2(P, X, Y):
    """Training loss for equation 2.

    P holds the nine coefficients a..i of eq_2, X the input rows and Y the
    corresponding targets. Returns the mean squared error (float64 accumulator)
    of the predictions clipped to [0, 100].
    """
    bounded_preds = np.clip(eq_2(X, *P), 0, 100)
    return np.mean((bounded_preds - Y)**2, dtype=np.float64)

# Start the optimisation from the original coefficients
a, b, c, d, e, f, g, h, i = 1531.13, 2.53774, -89.99, 1, 60.204, 1, 4.60663, -0.89465, 1
res_2 = minimize(objective_eq_2, (a, b, c, d, e, f, g, h, i),
                 args=(train_data_scaled[inds], flattened_output_train[inds]),
                 method='BFGS', options={'disp': True})
res_2

         Current function value: 208.608058
         Iterations: 302
         Function evaluations: 3972
         Gradient evaluations: 396


      fun: 208.60805773930016
 hess_inv: array([[ 1.04051879e+04,  3.20832160e+01, -1.44030201e+03,
         1.62352310e+02, -6.67591145e+01, -1.12124844e-01,
        -6.51084719e+00, -2.84596568e+00,  8.39402733e+00],
       [ 3.20832160e+01,  9.92079482e-02, -4.42088311e+00,
         5.03855750e-01, -2.06062435e-01, -5.71472167e-04,
        -1.98755559e-02, -8.46270522e-03,  2.50209224e-02],
       [-1.44030201e+03, -4.42088311e+00,  2.00875360e+02,
        -2.22251397e+01,  9.22879852e+00, -1.48880859e-03,
         9.13971354e-01,  4.23663096e-01, -1.24204878e+00],
       [ 1.62352310e+02,  5.03855750e-01, -2.22251397e+01,
         2.71254057e+00, -1.04768516e+00, -1.14777896e-02,
        -1.00435231e-01, -1.64511882e-02,  5.86969652e-02],
       [-6.67591145e+01, -2.06062435e-01,  9.22879852e+00,
        -1.04768516e+00,  4.29076883e-01,  1.32833201e-03,
         4.04442415e-02,  1.81231962e-02, -5.32239855e-02],
       [-1.12124844e-01, -5.71472168e-04, -1.48880859e-03,
        -1

In [64]:
# Equation 3
def objective_eq_3(P, X, Y):
    """Training loss for equation 3.

    P holds the seven coefficients a..g of eq_3, X the input rows and Y the
    corresponding targets. Returns the mean squared error (float64 accumulator)
    of the predictions clipped to [0, 100].
    """
    bounded_preds = np.clip(eq_3(X, *P), 0, 100)
    return np.mean((bounded_preds - Y)**2, dtype=np.float64)

# Start the optimisation from the original coefficients
a, b, c, d, e, f, g = 49.9941, 13.4357, 1, 1, 1, 1, 2
res_3 = minimize(objective_eq_3, (a, b, c, d, e, f, g),
                 args=(train_data_scaled[inds], flattened_output_train[inds]),
                 method='BFGS', options={'disp': True})
res_3

         Current function value: 246.206770
         Iterations: 105
         Function evaluations: 1314
         Gradient evaluations: 163


      fun: 246.20676969429084
 hess_inv: array([[ 2.13587976, -0.79515861,  0.66920225, -1.18455357,  2.54482588,
        -0.64070437,  0.58766217],
       [-0.79515861,  0.31542437, -0.26570797,  0.47750374, -0.99824192,
         0.25347294, -0.23251238],
       [ 0.66920225, -0.26570797,  0.22645227, -0.40821399,  0.84567481,
        -0.21600032,  0.19811717],
       [-1.18455357,  0.47750374, -0.40821399,  0.74230866, -1.52568572,
         0.3907416 , -0.3583636 ],
       [ 2.54482588, -0.99824192,  0.84567481, -1.52568572,  3.21272803,
        -0.80727422,  0.74035752],
       [-0.64070437,  0.25347294, -0.21600032,  0.3907416 , -0.80727422,
         0.20755857, -0.19036537],
       [ 0.58766217, -0.23251238,  0.19811717, -0.3583636 ,  0.74035752,
        -0.19036537,  0.17460174]])
      jac: array([-5.72204590e-06,  4.38690186e-05,  4.19616699e-05,  1.14440918e-05,
       -3.81469727e-06, -8.20159912e-05,  8.01086426e-05])
  message: 'Desired error not necessarily achieved due to

In [65]:
# Equation 4
def objective_eq_4(P, X, Y):
    """Training loss for equation 4.

    P holds the ten coefficients a..j of eq_4, X the input rows and Y the
    corresponding targets. Returns the mean squared error (float64 accumulator)
    of the predictions clipped to [0, 100].
    """
    bounded_preds = np.clip(eq_4(X, *P), 0, 100)
    return np.mean((bounded_preds - Y)**2, dtype=np.float64)

# Start the optimisation from the original coefficients
a, b, c, d, e, f, g, h, i, j = 91.3875, 1.7767, 9.03952, 1, 2, 1, 1, 1, 1, 1
res_4 = minimize(objective_eq_4, (a, b, c, d, e, f, g, h, i, j),
                 args=(train_data_scaled[inds], flattened_output_train[inds]),
                 method='BFGS', options={'disp': True})
res_4

         Current function value: 247.874392
         Iterations: 68
         Function evaluations: 1056
         Gradient evaluations: 95


      fun: 247.8743917072119
 hess_inv: array([[ 1.34077360e-01,  2.85608723e-02, -2.05675025e-01,
        -1.45826836e-01,  1.79249696e-01, -2.16533076e-01,
         9.36637407e-04, -1.29440378e-02, -1.11233936e-01,
         9.27057658e-02],
       [ 2.85608723e-02,  2.97353157e-01, -6.41928471e-01,
        -1.71115889e-01,  1.33822676e-01, -3.19847390e-01,
         2.20973872e-03,  3.03630669e-02,  1.24667451e-01,
        -1.23155455e-01],
       [-2.05675025e-01, -6.41928471e-01,  1.58285316e+00,
         4.48034672e-01, -3.65611613e-01,  8.11218527e-01,
         4.28779793e-03, -5.11970748e-02, -1.32998281e-01,
         1.78145959e-01],
       [-1.45826836e-01, -1.71115889e-01,  4.48034672e-01,
         4.32898527e-01, -5.81771835e-01,  6.06354614e-01,
        -4.00171630e-02, -6.54497673e-03, -1.39411366e-02,
        -9.16399942e-02],
       [ 1.79249696e-01,  1.33822676e-01, -3.65611613e-01,
        -5.81771835e-01,  8.41100310e-01, -7.48344022e-01,
         6.30310759e-02, -1.49

In [70]:
# Evaluate optimized equations on validation data -- bounded validation loss
# Each equation is called with its fitted coefficients and clipped to [0, 100]
preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4 = (
    np.clip(eq(valid_data_scaled, *res.x), 0, 100)
    for eq, res in ((eq_1, res_1), (eq_2, res_2), (eq_3, res_3), (eq_4, res_4)))

# MSE on the samples with clw + cli != 0 (float64 accumulator)
mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2, mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4 = (
    np.mean((flattened_output_valid - preds)**2, dtype=np.float64)
    for preds in (preds_valid_1, preds_valid_2, preds_valid_3, preds_valid_4))

# Sample-weighted combination with the regime where clw + cli == 0
mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4 = (
    (mse_reg_0*len_reg_0 + mse_in_regime*len_reg_not_0)/len_data_output
    for mse_in_regime in (mse_reg_1_valid_eq_1, mse_reg_1_valid_eq_2,
                          mse_reg_1_valid_eq_3, mse_reg_1_valid_eq_4))

# R2 relative to the variance of the full validation output
r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4 = (
    1 - mse_total/var_data_output
    for mse_total in (mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4))

print(mse_total_eq_1, mse_total_eq_2, mse_total_eq_3, mse_total_eq_4)
print(r2_total_eq_1, r2_total_eq_2, r2_total_eq_3, r2_total_eq_4)

121.88511471084269 136.63732578874766 159.804892162015 161.45029573528936
0.9294487688785211 0.920909689633453 0.9074995178204286 0.9065471025229447


In [80]:
# Test generalization skill on regime 1: The skill is even better on the validation set!
preds_train_1, preds_train_2, preds_train_3, preds_train_4 = (
    np.clip(eq(train_data_scaled, *res.x), 0, 100)
    for eq, res in ((eq_1, res_1), (eq_2, res_2), (eq_3, res_3), (eq_4, res_4)))

mse_reg_1_train_eq_1, mse_reg_1_train_eq_2, mse_reg_1_train_eq_3, mse_reg_1_train_eq_4 = (
    np.mean((flattened_output_train - preds)**2, dtype=np.float64)
    for preds in (preds_train_1, preds_train_2, preds_train_3, preds_train_4))

# One line per equation: training MSE followed by validation MSE
for train_mse, valid_mse in ((mse_reg_1_train_eq_1, mse_reg_1_valid_eq_1),
                             (mse_reg_1_train_eq_2, mse_reg_1_valid_eq_2),
                             (mse_reg_1_train_eq_3, mse_reg_1_valid_eq_3),
                             (mse_reg_1_train_eq_4, mse_reg_1_valid_eq_4)):
    print(train_mse, valid_mse)

186.4366625924015 184.94703484515756
208.6343839832529 207.33407984690123
246.5238147978788 242.49174863568558
248.0013052965759 244.98871154154406


**Save results**

In [82]:
# Every variable stored in the input array, listed in column order
all_possible_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',
            'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

# Variable name -> column index in the full input array. This is deliberately stored
# as `loc_all` and not `loc`: `loc` maps the *selected* features to the columns of the
# reduced data, and rebinding it here would make the regime-split cell read the wrong
# columns if it is re-run afterwards.
loc_all = {name: col for col, name in enumerate(all_possible_features)}

# Features
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
no_features = len(features)

# mean and std
# Per-variable statistics, indexed like all_possible_features
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Statistics of the selected features, in the order given by `features`
mean = np.array([mean_all[loc_all[sel_var]] for sel_var in features])
std = np.array([std_all[loc_all[sel_var]] for sel_var in features])

# Introduce variables
# Coefficients, normalised inputs and physical inputs as sympy symbols
a, b, c, d, e, f, g, h, i, j, k = sp.symbols('a, b, c, d, e, f, g, h, i, j, k')
x0, x1, x2, x3, x4 = sp.symbols('x:5')
rh, ta, clw, cli, rh_z = sp.symbols('rh, ta, clw, cli, rh_z')

# Normalised inputs expressed through the physical variables: (value - mean)/std
X0, X1, X2, X3, X4 = (
    (phys_var - mu)/sigma
    for phys_var, mu, sigma in zip((rh, ta, clw, cli, rh_z), mean, std))

def round_expr(expr, num_digits):
    """Return `expr` with each of its sympy Number atoms rounded to `num_digits` digits."""
    rounded = {}
    for number in expr.atoms(sp.Number):
        rounded[number] = round(number, num_digits)
    return expr.xreplace(rounded)

In [86]:
optimized_eqns = {}

# Symbolic counterparts of the parameterised eq_1 ... eq_4 (free coefficients a, b, ...)
exp_1 = a - b*(-c*sp.exp(d*x0) + e*sp.sin(f + g*x1) + (sp.sin(sp.sqrt(sp.Abs(h*x3))))/(i*x2 + j*x3 + sp.cos(k*x2)))
exp_2 = a + b*(-sp.sqrt(sp.Abs(c + d*x1))*(e - f*x0 + x1) + g/(h - i*x2 - x3))
exp_3 = a + b*(sp.exp(c*x0) - d*x2 + e*sp.log(sp.Abs(sp.exp(f*x2) + g*x3)))
exp_4 = a - (b*(c - d*sp.exp(e*x0) + f*x0 + x1))/(g*sp.exp(h*x2 + i*x3) + j*x3)

eq_1_sp, eq_2_sp, eq_3_sp, eq_4_sp = map(sp.sympify, (exp_1, exp_2, exp_3, exp_4))

# Callable versions, created before the coefficients are substituted
norm_inputs = (x0, x1, x2, x3, x4)
eq_1_lb, eq_2_lb, eq_3_lb, eq_4_lb = (
    sp.lambdify(norm_inputs, expr) for expr in (eq_1_sp, eq_2_sp, eq_3_sp, eq_4_sp))

# Coefficient symbol -> fitted value, rounded to three decimals
coeff_symbols = [a, b, c, d, e, f, g, h, i, j, k]
subs_dict_1 = dict(zip(coeff_symbols[:11], np.round(res_1.x, 3)))
subs_dict_2 = dict(zip(coeff_symbols[:9], np.round(res_2.x, 3)))
subs_dict_3 = dict(zip(coeff_symbols[:7], np.round(res_3.x, 3)))
subs_dict_4 = dict(zip(coeff_symbols[:10], np.round(res_4.x, 3)))

eq_1_sp = eq_1_sp.subs(subs_dict_1)
eq_2_sp = eq_2_sp.subs(subs_dict_2)
eq_3_sp = eq_3_sp.subs(subs_dict_3)
eq_4_sp = eq_4_sp.subs(subs_dict_4)

# Replace the normalised inputs by their definition in terms of the physical variables
to_physical = {x0: X0, x1: X1, x2: X2, x3: X3, x4: X4}
eq_1_sp_orig_inputs = eq_1_sp.subs(to_physical)
eq_2_sp_orig_inputs = eq_2_sp.subs(to_physical)
eq_3_sp_orig_inputs = eq_3_sp.subs(to_physical)
eq_4_sp_orig_inputs = eq_4_sp.subs(to_physical)

# Write to dict
def _equation_record(expr_norm, expr_phys, train_mse, valid_mse, total_mse, n_params, pcs):
    """Collect the saved fields for one equation.

    expr_norm / expr_phys are the sympy expressions in normalised and physical
    variables; the remaining arguments are stored unchanged under their keys.
    """
    return {
        'Equation w.r.t. normalized vars': str(round_expr(expr_norm, 2)),
        'Equation w.r.t. physical vars': str(round_expr(expr_phys, 3)),
        '$df/dclw$': str(expr_phys.diff('clw')),
        '$df/dcli$': str(expr_phys.diff('cli')),
        '$df/dT$': str(expr_phys.diff('ta')),
        'Train MSE in regime': train_mse,
        'Valid MSE in regime': valid_mse,
        'Valid MSE': total_mse,
        'Number of parameters (for reg 1)': n_params,
        'PCs': pcs,
    }

optimized_eqns['Equation 1'] = _equation_record(
    eq_1_sp, eq_1_sp_orig_inputs,
    mse_reg_1_train_eq_1, mse_reg_1_valid_eq_1, mse_total_eq_1,
    12, 'w.r.t. clw')

optimized_eqns['Equation 2'] = _equation_record(
    eq_2_sp, eq_2_sp_orig_inputs,
    mse_reg_1_train_eq_2, mse_reg_1_valid_eq_2, mse_total_eq_2,
    10, 'w.r.t. T, cli')

optimized_eqns['Equation 3'] = _equation_record(
    eq_3_sp, eq_3_sp_orig_inputs,
    mse_reg_1_train_eq_3, mse_reg_1_valid_eq_3, mse_total_eq_3,
    8, 'w.r.t. T, cli')

optimized_eqns['Equation 4'] = _equation_record(
    eq_4_sp, eq_4_sp_orig_inputs,
    mse_reg_1_train_eq_4, mse_reg_1_valid_eq_4, mse_total_eq_4,
    11, 'w.r.t. cli')

# Save output
# Written to the current working directory as JSON
with open('optimized_eqns.json', 'w') as json_file:
    json.dump(optimized_eqns, json_file)