**tmpw0h7p9s5, equation 22**

Ablation study on the ERA5 data (the code below loads `source='era5'`) -- setting one parameter at a time to 0 and re-tuning the remaining ones

--> Here the equation is used in its physical form, so the coefficients differ by many orders of magnitude (I hope Nelder-Mead and BFGS won't fail because of that)

In [None]:
# Python 3 module
import os
import sys
import json
import numpy as np
import xarray as xr
import sympy as sp
import matplotlib.pyplot as plt

sys.path.insert(0, os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec2_data/')
from functions import append_dict_to_json
from sklearn import tree
import my_classes
from my_classes import load_data

# The seed is taken from the first command-line argument.
# NOTE(review): in an interactive kernel sys.argv[1] may not be an integer
# string; the commented line below looks like a manual override -- confirm.
# sys.argv[1] = 10
SEED = int(sys.argv[1])
# Seeds NumPy's global RNG, which the np.random.randint draws further down use.
np.random.seed(SEED)

### Load data

In [3]:
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# Dimensions are recorded BEFORE the levels are removed below, which is why
# later code uses VLAYERS-4.
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Removing four upper-most levels (the first four entries along axis 1)
for key in data_dict:
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output: 'cc' is the target and must not remain among the inputs
data_output = data_dict.pop('cc')

# Every input must be aligned with the target. The previous check compared
# each array's shape with itself and therefore could never fail.
for key, values in data_dict.items():
    print(values.shape)
    assert values.shape == data_output.shape, \
        "Shape of '%s' %s differs from the output shape %s" % (key, values.shape, data_output.shape)

q
clwc
ciwc
t
pa
cc
all
100.000015
(1368, 27, 66655)
(1368, 27, 66655)
(1368, 27, 66655)
(1368, 27, 66655)
(1368, 27, 66655)
(1368, 27, 66655)


In [4]:
import time
t0 = time.time()

# Add rh (computed from 'pa', 'q' and 't'; T0 is the offset subtracted from 't')
T0 = 273.15
r = 0.00263*data_dict['pa']*data_dict['q']*np.exp((17.67*(data_dict['t']-T0))/(data_dict['t']-29.65))**(-1)
data_dict['rh'] = r

# Add rh_z
folder = 'rh_z'
folder_path = os.path.join(os.environ['HOME'] + '/bd1179_work/ERA5/hvcg_data', folder)

# Load all filenames in the folder containing the derivatives. Sorting by name
# is expected to give chronological order.
npy_file_names = sorted(os.listdir(folder_path))

# Collect the per-file arrays and concatenate ONCE at the end. Concatenating
# inside the loop re-copies everything loaded so far for each file, and seeding
# it with a float64 array upcast the whole result to float64.
# The empty float32 seed keeps the result well-defined when the folder is empty
# and makes a mismatch in the trailing dimensions fail loudly.
npy_chunks = [np.zeros((0, VLAYERS-4, HFIELDS), dtype=np.float32)]

for file in npy_file_names:
    # Memory-map the file, keep every third entry along axis 0 and convert directly to float32
    npy_file = np.load(os.path.join(folder_path, file), mmap_mode='r')
    npy_file = np.float32(npy_file[0::3].copy())
    npy_chunks.append(npy_file)

all_npy_files = np.concatenate(npy_chunks, axis=0)
data_dict[folder] = all_npy_files

FileNotFoundError: [Errno 2] No such file or directory: '~/bd1179_work/ERA5/hvcg_data/rh_z'

**Reshaping and keeping only the relevant features**

In [None]:
# Inputs of the equation, in the column order used from here on
features = ['rh', 't', 'clwc', 'ciwc', 'rh_z']

# Flatten every feature array and the target to one dimension
data_dict.update({name: np.reshape(data_dict[name], -1) for name in features})
data_output = np.reshape(data_output, -1)

# Drop the variables that are not used as features
for dropped in ('q', 'pa', 'zg'):
    del data_dict[dropped]

no_features = len(data_dict)

**Cast dict into ndarray**

In [None]:
# Collect the feature columns in the order given by `features`. Each array is
# popped from data_dict as it is collected, so the dict no longer holds it.
data_array_not_T = []
for key in features:
    print(key)
    data_array_not_T.append(np.reshape(data_dict.pop(key), -1))

# Convert into np array and transpose -> shape (no. of samples, len(features))
data_array = np.transpose(np.array(data_array_not_T, dtype=np.float32))

In [None]:
# Column index of every feature inside data_array
loc = {name: column for column, name in enumerate(features)}

**Remove condensate-free cells**

In [None]:
# Keep only the samples whose total condensate (clwc + ciwc) exceeds 1e-20
total_condensate = data_array[:, loc['clwc']] + data_array[:, loc['ciwc']]
reg_not_0 = np.flatnonzero(total_condensate > 1e-20)
data_array, data_output = data_array[reg_not_0], data_output[reg_not_0]

**Define the training/validation sets**

In [None]:
# Training set: T_subset_train row indices drawn uniformly with replacement
# NOTE(review): the validation indices below are drawn independently from the
# same rows, so the two sets may overlap -- confirm this is intended.
T_subset_train = 1000000
inds_train = np.random.randint(0, len(data_array), size=T_subset_train)
flattened_input_train, flattened_output_train = data_array[inds_train], data_output[inds_train]

# Validation set: drawn the same way, after the training draw
T_subset_valid = 1000000
inds_valid = np.random.randint(0, len(data_array), size=T_subset_valid)
flattened_input_valid, flattened_output_valid = data_array[inds_valid], data_output[inds_valid]

### Ablation Study

**Optimize coefficients in physical equation**

In [None]:
# See ~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/notes.txt
def func(X, a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps):
    '''
        Evaluate the equation row-wise and return 100*(I1 + I2 + I3).

        X: 2D array whose columns are looked up through the module-level
           `loc` mapping (keys 'rh', 't', 'clwc', 'ciwc', 'rh_z').
        a_1..a_9, eps: coefficients of the equation.

        The result is NOT clipped here.

        NOTE(review): a_8 == 0 or a_9 == 0 divides by zero in the third term
        (inf/nan for the affected rows) -- confirm how ablating these two
        coefficients is supposed to behave.
    '''
    hum, temp, liq, ice, hum_z = (X[:, loc[name]] for name in ('rh', 't', 'clwc', 'ciwc', 'rh_z'))

    # Reference values around which the first term is expanded
    rh0 = 0.6025
    ta0 = 257.06
    d_temp = temp - ta0

    # For a non-negligible a_4, raise rh to the point where the derivative of
    # the first term with respect to rh vanishes.
    if np.abs(a_4) > 1e-5:
        hum_floor = (rh0-a_2/a_4) - a_5/(2.*a_4)*d_temp**2
        hum = np.maximum(hum, hum_floor)
    d_hum = hum - rh0

    # First term: polynomial in the rh and temperature deviations
    term_1 = a_1 + a_2*d_hum + a_3*d_temp + a_4/2.*d_hum**2 + a_5/2.*d_temp**2*d_hum
    # Second term: cubic in rh_z
    term_2 = a_6**3*(hum_z + 3.0/2*a_7)*hum_z**2
    # Third term: negative reciprocal of the scaled condensate sum plus eps
    term_3 = -1/(liq/a_8 + ice/a_9 + eps)

    return 100*(term_1 + term_2 + term_3)

In [None]:
import scipy as sci
from scipy.optimize import minimize

In [None]:
def objective(P, X,Y,force_zero=None):
    '''
        Mean squared error between the clipped prediction of func and Y.

        P: the ten coefficients (a_1, ..., a_9, eps), in that order.
        X, Y: inputs passed on to func and the targets to compare against.
        force_zero: optional coefficient name; that coefficient is replaced by
                    0 before func is called, whatever value P holds for it.
    '''
    a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps = P
    coefficients = [a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps]
    coefficient_names = ['a_1','a_2','a_3','a_4','a_5','a_6','a_7','a_8','a_9','eps']

    # Remove at most one term: zero the coefficient whose name matches force_zero
    for position, name in enumerate(coefficient_names):
        if force_zero == name:
            coefficients[position] = 0
            break

    # Predictions are limited to the interval [0, 100] before scoring
    raw_preds = func(X, *coefficients)
    floored_preds = np.maximum(raw_preds, 0)
    train_preds = np.minimum(floored_preds, 100)

    # The mean is accumulated in float64
    squared_errors = (train_preds - Y)**2
    return np.mean(squared_errors, dtype=np.float64)

# Starting values of the ten coefficients for the optimizers
a_1, a_2, a_3, a_4, a_5 = 0.4435, 1.1593, -0.0145, 4.06, 0.0013176
a_6, a_7, a_8, a_9, eps = 584.8036, 0.002, 1.1573e-6, 3.073e-7, 1.06

**Evaluate reduced equations**

In [None]:
# Validation MSE per ablation run, keyed by the zeroed parameter name ('full_eq' when nothing is zeroed)
valid_mses = {}

In [None]:
# None stands for the full equation; every other entry names the coefficient forced to 0.
# NOTE(review): objective also accepts 'eps', which is not ablated here -- presumably on purpose.
parameters = [None,'a_1','a_2','a_3','a_4','a_5','a_6','a_7','a_8','a_9']

# Every run starts from the same initial coefficients
initial_guess = (a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,eps)

for force_zero in parameters:
    train_args = (flattened_input_train, flattened_output_train, force_zero)

    # Re-tune with two different optimizers; the better one on the validation set is kept below
    res_bfgs = minimize(objective, initial_guess, args=train_args,
                        method='BFGS', options={'disp': True})

    res_nm = minimize(objective, initial_guess, args=train_args,
                      method='Nelder-Mead', options={'disp': True})

    # Compute objective for both minima
    valid_reg_mse_bfgs = objective(res_bfgs.x, flattened_input_valid, flattened_output_valid, force_zero)
    valid_reg_mse_nm = objective(res_nm.x, flattened_input_valid, flattened_output_valid, force_zero)

    # Plain Python float so the value is unambiguously JSON-serializable
    valid_reg_mse = float(np.minimum(valid_reg_mse_bfgs, valid_reg_mse_nm))

    print('On the entire dataset')
    print('Valid MSE: %.5f'%valid_reg_mse)

    # Add to dictionary
    result_key = 'full_eq' if force_zero is None else force_zero
    valid_mses[result_key] = valid_reg_mse

In [None]:
# Store the validation MSEs of this seed in a seed-specific JSON file
output_path = '/home/b/b309170/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec6_physical_interpretation/ablation_study_new/ablation_study_era5_phys_seed_%d.json'%SEED
with open(output_path, 'w') as json_file:
    json.dump(valid_mses, json_file)