**tmpw0h7p9s5, equation 22**

Ablation study on the DYAMOND data -- setting parameters to 0 and re-tuning

--> Here in the physical form of the equation (I hope Nelder-Mead and BFGS won't fail because of that)

In [2]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

sys.path.insert(0, os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec2_data/')
from functions import append_dict_to_json
from sklearn import tree

# sys.argv[1] = 10
SEED = int(sys.argv[1])
np.random.seed(SEED)

In [3]:
def round_expr(expr):
    d = {}
    for n in expr.atoms(sp.Number):
        d[n] = sp.Number('%.6g'%n)
    return expr.xreplace(d)

In [4]:
no_of_regimes = 2
regime = 1

**Read data**

In [5]:
all_possible_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',\
            'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

loc_all = {}
for i in range(len(all_possible_features)):
    loc_all[all_possible_features[i]] = i
    
# Features
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
no_features = len(features)

loc = {}
for i in range(len(features)):
    loc[features[i]] = i

In [6]:
path_data = os.path.join(os.environ['HOME'] + '/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Load the input data and pick the five best features (rh, ta, clw, cli, rh_z)
input_data = np.load(path_data + '/cloud_cover_input_dyamond.npy')
input_data = np.concatenate([np.expand_dims(input_data[:, loc_all[sel_var]], axis=1) for sel_var in features], axis = 1)

output_data = np.load(path_data + '/cloud_area_output_dyamond.npy')

In [7]:
(samples_total, no_of_features) = input_data.shape
(samples_total, no_of_features)

(285179494, 5)

In [8]:
# Construct training and validation data
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
    second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

    validation_folds.append(np.append(first_incr, second_incr))
    training_folds.append(np.arange(samples_total))
    training_folds[i] = np.delete(training_folds[i], validation_folds[i])

In [9]:
# The second fold yields the best model
flattened_input_train = input_data[training_folds[1]]
flattened_input_valid = input_data[validation_folds[1]]
flattened_output_train = output_data[training_folds[1]]
flattened_output_valid = output_data[validation_folds[1]]
    
# Remove input_data, output_data
del input_data, output_data, training_folds, validation_folds
gc.collect()

499

In [10]:
if no_of_regimes > 1:
    # Already remove the regime with clw + cli = 0
    reg_not_0_train = np.where(flattened_input_train[:, loc['clw']] + flattened_input_train[:, loc['cli']] > 1e-20)[0]
    flattened_input_train = flattened_input_train[reg_not_0_train]
    flattened_output_train = flattened_output_train[reg_not_0_train]

    reg_not_0_valid = np.where(flattened_input_valid[:, loc['clw']] + flattened_input_valid[:, loc['cli']] > 1e-20)[0]
    flattened_input_valid = flattened_input_valid[reg_not_0_valid]
    flattened_output_valid = flattened_output_valid[reg_not_0_valid]

**Optimize coefficients in physical equation**

In [11]:
# See ~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/notes.txt
def func(X, a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps):
    rh = X[:, loc['rh']] 
    ta = X[:, loc['ta']] 
    clw = X[:, loc['clw']] 
    cli = X[:, loc['cli']]
    rh_z = X[:, loc['rh_z']]
    
    rh0 = 0.6025
    ta0 = 257.06
    
    if np.abs(a_4) > 1e-5:
        rh = np.maximum(rh, (rh0-a_2/a_4) - a_5/(2.*a_4)*(ta-ta0)**2)
        
    I1 = a_1 + a_2*(rh-rh0) + a_3*(ta-ta0) + a_4/2.*(rh-rh0)**2 + a_5/2.*(ta-ta0)**2*(rh-rh0)
    I2 = a_6**3*(rh_z + 3.0/2*a_7)*rh_z**2
    I3 = -1/(clw/a_8 + cli/a_9 + eps)
    
    return 100*(I1 + I2 + I3)

In [12]:
import scipy as sci
from scipy.optimize import minimize

In [13]:
def objective(P, X,Y,force_zero=None):
    '''
        The objective function.
    '''
    a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps = P
    
    # A way to remove terms using a list. Somehow I cannot use locals or exec here...
    if force_zero == 'a_1': a_1 = 0
    elif force_zero == 'a_2': a_2 = 0
    elif force_zero == 'a_3': a_3 = 0
    elif force_zero == 'a_4': a_4 = 0
    elif force_zero == 'a_5': a_5 = 0
    elif force_zero == 'a_6': a_6 = 0
    elif force_zero == 'a_7': a_7 = 0
    elif force_zero == 'a_8': a_8 = 0
    elif force_zero == 'a_9': a_9 = 0
    elif force_zero == 'eps': eps = 0
            
    train_preds = np.minimum(np.maximum(func(X, a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps), 0), 100) 
    train_mse = np.mean((train_preds - Y)**2, dtype=np.float64)

    return train_mse

T_subset = 10**6
inds = np.random.randint(0, flattened_input_train.shape[0], T_subset)

(a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps) = (0.4435, 1.1593, -0.0145, 4.06, 0.0013176, 584.8036, 0.002, 1.1573e-6, 3.073e-7, 1.06)

**Evaluate reduced equations**

In [14]:
valid_mses = {}

In [15]:
parameters = [None,'a_1','a_2','a_3','a_4','a_5','a_6','a_7','a_8','a_9']

for par_ind in range(len(parameters)):
    force_zero = parameters[par_ind]

    # Nelder-Mead gives me the same result
    res_bfgs = minimize(objective, (a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps), args=(flattened_input_train[inds], flattened_output_train[inds], force_zero), \
                   method='BFGS', options={'disp': True})

    res_nm = minimize(objective, (a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps), args=(flattened_input_train[inds], flattened_output_train[inds], force_zero), \
                   method='Nelder-Mead', options={'disp': True})

    # Compute objective for both minima
    valid_reg_mse_bfgs = objective(res_bfgs.x, flattened_input_valid, flattened_output_valid, force_zero)
    valid_reg_mse_nm = objective(res_nm.x, flattened_input_valid, flattened_output_valid, force_zero)

    # Compute full valid mse
    mse_reg_0 = 0.0353
    n_0 = 32419018
    n_21 = 62640812
    N = n_0 + n_21

    valid_reg_mse = np.minimum(valid_reg_mse_bfgs, valid_reg_mse_nm)

    print('On the entire dataset')
    print('Valid MSE: %.5f'%((n_0*mse_reg_0 + n_21*valid_reg_mse)/N))

    # Add to dictionary
    if force_zero == None:
        valid_mses['full_eq'] = ((n_0*mse_reg_0 + n_21*valid_reg_mse)/N)
    else:      
        valid_mses[force_zero] = ((n_0*mse_reg_0 + n_21*valid_reg_mse)/N)

         Current function value: 343.846216
         Iterations: 223
         Function evaluations: 3476
         Gradient evaluations: 316
On the entire dataset
Valid MSE: 224.15841
         Current function value: 351.869987
         Iterations: 903
         Function evaluations: 15264
         Gradient evaluations: 1387
Optimization terminated successfully.
         Current function value: 1079.085830
         Iterations: 1091
         Function evaluations: 1887
On the entire dataset
Valid MSE: 229.65288
         Current function value: 538.300334
         Iterations: 97
         Function evaluations: 1958
         Gradient evaluations: 178
Optimization terminated successfully.
         Current function value: 2070.813766
         Iterations: 440
         Function evaluations: 868
On the entire dataset
Valid MSE: 351.43575
         Current function value: 720.540238
         Iterations: 45
         Function evaluations: 1530
         Gradient evaluations: 138
On the entire dataset
V

  I3 = -1/(clw/a_8 + cli/a_9 + eps)
  I3 = -1/(clw/a_8 + cli/a_9 + eps)


         Current function value: nan
         Iterations: 0
         Function evaluations: 11
         Gradient evaluations: 1
On the entire dataset
Valid MSE: nan
         Current function value: nan
         Iterations: 0
         Function evaluations: 11
         Gradient evaluations: 1
On the entire dataset
Valid MSE: nan


In [16]:
with open('/home/b/b309170/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec6_physical_interpretation/ablation_study_new/ablation_study_dyamond_phys_seed_%d.json'%SEED, 'w') as file:
    json.dump(valid_mses, file)