**tmpw0h7p9s5, equation 19**

**Modified to satisfy RH-constraint**

By enforcing the RH-constraint, we reduce the valid_mse from 106.98907 to 106.94799. <br>
By further optimizing the coefficients we can decrease the valid_mse to 106.93606.

→ Because the further improvement of the MSE is very small, I think we can communicate the first set of optimized coefficients (valid_mse 106.94799).

In [1]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/symbolic_regression')
from functions import append_dict_to_json
from sklearn import tree

np.random.seed(10)

In [2]:
# Regime setup: with more than one regime, the samples with clw + cli = 0 are removed further below
no_of_regimes, regime = 2, 1

**Read data**

In [3]:
# All features stored in the input file; the position in this list is used as the column index
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Feature name -> column index among all possible features
loc_all = {name: column for column, name in enumerate(all_possible_features)}

# Features
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
no_features = len(features)

# Feature name -> column index among the selected features
loc = {name: column for column, name in enumerate(features)}

In [4]:
path_data = '/home/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND'

# Load the input data and pick the five best features (rh, ta, clw, cli, rh_z)
input_data = np.load(path_data + '/cloud_cover_input_dyamond.npy')
# Re-assemble the array from single-column slices so that only the columns named in
# `features` remain, in that order. Everything stays in one expression so that no
# reference to the full array survives the re-assignment.
input_data = np.concatenate([input_data[:, loc_all[name], np.newaxis] for name in features], axis=1)

output_data = np.load(path_data + '/cloud_area_output_dyamond.npy')

In [5]:
# Rows are samples, columns are the selected features
samples_total, no_of_features = input_data.shape
input_data.shape

(285179494, 5)

In [6]:
# Construct training and validation data:
# the sample axis is cut into six equally sized chunks; fold i validates on chunks i and i+3
# and trains on all remaining samples.
training_folds = []
validation_folds = []
two_week_incr = samples_total // 6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(two_week_incr * i, two_week_incr * (i + 1))
    second_incr = np.arange(two_week_incr * (i + 3), two_week_incr * (i + 4))

    validation_folds.append(np.concatenate((first_incr, second_incr)))
    # Training indices = all sample indices minus the validation indices of this fold
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[i]))

In [7]:
# The second fold yields the best model
best_fold = 1
flattened_input_train, flattened_output_train = (
    input_data[training_folds[best_fold]],
    output_data[training_folds[best_fold]],
)
flattened_input_valid, flattened_output_valid = (
    input_data[validation_folds[best_fold]],
    output_data[validation_folds[best_fold]],
)

# The full arrays and the fold indices are no longer needed; drop them and collect garbage
del input_data, output_data
del training_folds, validation_folds
gc.collect()

24

In [8]:
if no_of_regimes > 1:
    # Already remove the regime with clw + cli = 0
    # (a sample is kept only if its clw + cli exceeds the threshold below)
    clw_col, cli_col = loc['clw'], loc['cli']
    condensate_threshold = 1e-20

    reg_not_0_train = np.where(
        flattened_input_train[:, clw_col] + flattened_input_train[:, cli_col] > condensate_threshold
    )[0]
    flattened_input_train, flattened_output_train = (
        flattened_input_train[reg_not_0_train],
        flattened_output_train[reg_not_0_train],
    )

    reg_not_0_valid = np.where(
        flattened_input_valid[:, clw_col] + flattened_input_valid[:, cli_col] > condensate_threshold
    )[0]
    flattened_input_valid, flattened_output_valid = (
        flattened_input_valid[reg_not_0_valid],
        flattened_output_valid[reg_not_0_valid],
    )

**Normalize the features**

In [9]:
# Scale the data
# Per-feature statistics, ordered like `all_possible_features` (presumably precomputed on the full dataset -- TODO confirm)
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Keep only the statistics of the selected features, in the order of `features`
mean = np.array([mean_all[loc_all[name]] for name in features])
std = np.array([std_all[loc_all[name]] for name in features])

# Work with scaled training folds
train_data_scaled = (flattened_input_train - mean) / std
valid_data_scaled = (flattened_input_valid - mean) / std

Optimize coefficients

In [10]:
# See ~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/notes.txt
def func(X, a,b,c,d,e,f,g,h,i):
    """
    Evaluate the equation with coefficients a..i on the first five columns of X.

    X is indexed as X[:, k] for k = 0..4. In this notebook the columns follow
    `features`, i.e. (rh, ta, clw, cli, rh_z) in scaled form.

    Before evaluation, column 0 is raised to the lower bound
    (-c*x1**2 - a) / (2*c*d). This is where the derivative of the
    column-0-dependent terms a*x0 + c*x0*(d*x0 + x1**2) with respect to x0
    vanishes, so any input below the bound is evaluated at the bound.

    Returns one value per row of X; X itself is not modified.
    """
    rh, ta, clw, cli, rh_z = (X[:, k] for k in range(5))

    # Modified to always satisfy the RH-constraint
    rh_lower_bound = 1/(2*c*d)*(-c*ta**2-a)
    rh = np.maximum(rh, rh_lower_bound)

    linear_part = a*rh - b*ta
    mixed_part = c*rh*(d*rh + ta**2)
    gradient_part = e*rh_z**2
    condensate_part = g/(clw + h*cli + i)

    return linear_part + mixed_part + gradient_part + f - condensate_part

In [11]:
import scipy as sci
from scipy.optimize import minimize

In [12]:
def objective(P, X,Y):
    '''
        The objective function: mean squared error between the predictions of
        `func`, limited to the interval [0, 100], and the targets Y.

        P holds the nine coefficients (a, ..., i) passed on to `func`,
        X is the feature array and Y the array of targets.
    '''
    a,b,c,d,e,f,g,h,i = P
    # Restrict the raw predictions to the interval [0, 100] before comparing with Y
    clipped_preds = np.clip(func(X, a,b,c,d,e,f,g,h,i), 0, 100)
    residuals = clipped_preds - Y

    # Accumulate the mean in float64 regardless of the input dtype
    return np.mean(residuals**2, dtype=np.float64)

# Fit on a random subsample of the training rows (indices are drawn independently, so repeats are possible)
T_subset = 10**6
inds = np.random.randint(low=0, high=train_data_scaled.shape[0], size=T_subset)

# Starting point of the optimization
initial_guess = (37.8, 37.8, 16.54, 1, 1.91, 49.57, 2.66, 1, 0.76)
a, b, c, d, e, f, g, h, i = initial_guess

res = minimize(
    objective,
    initial_guess,
    args=(train_data_scaled[inds], flattened_output_train[inds]),
    method='BFGS',
    options={'disp': True},
)

         Current function value: 163.932873
         Iterations: 29
         Function evaluations: 592
         Gradient evaluations: 58


In [13]:
# Coefficients (a, ..., i) returned by the optimizer
print(res.x)

[38.32793014 42.68906959 19.33085921  1.14265465  2.36366234 45.02410441
  1.89923988  0.65398285  0.63478314]


New values

In [14]:
# MSE of the fitted coefficients on the complete scaled training data (not only the subsample used for the fit)
objective(res.x, train_data_scaled, flattened_output_train)

163.49384344973103

In [15]:
# MSE of the fitted coefficients on the scaled validation data
objective(res.x, valid_data_scaled, flattened_output_valid)

162.2612913972502

In [18]:
# From finding_symmetries/pysr_results_dyamond_on_regimes/save_optimized_eqns.ipynb
# (presumably n_0 / n_21 are the sample counts of the two regimes and mse_reg_0 the MSE on the first one -- TODO confirm)
mse_reg_0, n_0, n_21 = 0.0353, 32419018, 62640812
N = n_0 + n_21

# Validation MSE of the newly optimized coefficients, copied from the cell output above
valid_reg_mse = 162.2612913972502

# Average of the two per-regime MSEs, weighted by n_0 and n_21
print('On the entire dataset')
print('Valid MSE: %.5f'%((n_0*mse_reg_0 + n_21*valid_reg_mse)/N))

On the entire dataset
Valid MSE: 106.93606


Old values

In [21]:
# Validation MSE of the previous coefficient set; its first four entries are the parameters
# labelled "without the RH-modification" further below
objective([38.85954116, 42.70818472, 19.34746465, 1.11321032, 2.36741444, 44.99763015, 1.90033063, 0.65718667, 0.63587944], valid_data_scaled, flattened_output_valid)

162.27939688193675

In [22]:
# From finding_symmetries/pysr_results_dyamond_on_regimes/save_optimized_eqns.ipynb
# (presumably n_0 / n_21 are the sample counts of the two regimes and mse_reg_0 the MSE on the first one -- TODO confirm)
mse_reg_0, n_0, n_21 = 0.0353, 32419018, 62640812
N = n_0 + n_21

# Validation MSE of the old coefficients, copied from the cell output above
valid_reg_mse = 162.27939688193675

# Average of the two per-regime MSEs, weighted by n_0 and n_21
print('On the entire dataset')
print('Valid MSE: %.5f'%((n_0*mse_reg_0 + n_21*valid_reg_mse)/N))

On the entire dataset
Valid MSE: 106.94799


How often do we need to artificially increase the RH?

In [19]:
# Optimal params without the RH-modification
a, b, c, d = 38.85954116, 42.70818472, 19.34746465, 1.11321032

# Fraction of training samples whose column 0 lies below the lower bound used in `func`:
# in roughly 1% of all cases
np.count_nonzero(
    train_data_scaled[:, 0] < 1/(2*c*d)*(-c*train_data_scaled[:, 1]**2-a)
) / train_data_scaled.shape[0]

0.00844691127213774

In [20]:
# Optimal params with the RH-modification
a, b, c, d = 38.32793014, 42.68906959, 19.33085921, 1.14265465

# Fraction of training samples whose column 0 lies below the lower bound used in `func`:
# in roughly 1% of all cases
np.count_nonzero(
    train_data_scaled[:, 0] < 1/(2*c*d)*(-c*train_data_scaled[:, 1]**2-a)
) / train_data_scaled.shape[0]

0.009379558645668763