### Evaluate the PySR equation on higher-res DYAMOND data

- Data path: /home/b/b309170/bd1179_work/DYAMOND/hcg_data*

In [1]:
import os
import sys
import json
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

# Leftover switches from a script variant of this analysis (kept for reference).
# NOTE(review): none of them appears to be read in the visible cells of this notebook.
# subset_exp = int(sys.argv[1])
# subset_exp = 2
# number_horizontal_locations = 10**subset_exp
tl_bool = True

# Put the project's sec2_data directory first on the module search path so that
# the helper modules `my_classes` and `functions` imported below can be found.
sys.path.insert(0, os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec2_data/')
import my_classes
from my_classes import read_mean_and_std
from my_classes import load_data
from functions import append_dict_to_json

**Load data**

In [2]:
# Variables requested from the loader, in this order.
order_of_vars = ['q', 'qc', 'qi', 't', 'pres', 'zg', 'clc']

data_path = '/home/b/b309170/bd1179_work/DYAMOND/hcg_data_r2b6'
data_dict = load_data(source='split_by_var_name', days='all', vert_interp=False,
                      resolution='R02B06', order_of_vars=order_of_vars, path=data_path)

TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# zg is loaded without a time axis: transpose it, add a leading axis and repeat it
# TIMESTEPS times so that it lines up with the other (time, vertical, horizontal) fields.
# NOTE(review): the transpose implies zg arrives as (horizontal, vertical) - confirm.
data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'].T, axis=0), TIMESTEPS, axis=0)

# Only keep the last 60 entries along the vertical axis (the lowest 60 levels),
# so that all fields share the same vertical grid.
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, -60:].copy()
    print(data_dict[key].shape)

# Sanity check: every field must now have exactly the same shape as 'q'.
# (The previous check compared each shape with itself and could never fail.)
reference_shape = data_dict['q'].shape
for key in data_dict.keys():
    print(data_dict[key].shape)
    assert data_dict[key].shape == reference_shape, \
        'Shape mismatch for %s: %s != %s' % (key, data_dict[key].shape, reference_shape)

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  DS = xr.open_mfdataset(path+'/zg/zg*')
to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).
  from_openmfds=True,


q
qc
qi
t
pres
clc
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)
(80, 60, 327680)


In [3]:
import time
t0 = time.time()

# --- Relative humidity -------------------------------------------------------
# Approximated from pressure, specific humidity and temperature.
T0 = 273.15
pressure, spec_hum, temperature = data_dict['pres'], data_dict['q'], data_dict['t']
inverse_exp_term = np.exp((17.67*(temperature-T0))/(temperature-29.65))**(-1)
r = 0.00263*pressure*spec_hum*inverse_exp_term
data_dict['rh'] = r

# Refresh the dimension sizes
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# --- Vertical derivative of rh ------------------------------------------------
# Finite difference between vertically adjacent levels; the result has one
# level fewer than the input fields.
height = data_dict['zg']
delta_rh = r[:, :-1] - r[:, 1:]
delta_z = height[:, :-1] - height[:, 1:]
data_dict['rh_z'] = delta_rh/delta_z

# Trim every field (including the new ones) to its last 58 vertical entries so
# that all of them live on one common vertical grid again.
for key, field in data_dict.items():
    data_dict[key] = field[:, -58:].copy()
    print(data_dict[key].shape)

# Target: cloud cover scaled by 100; it is removed from the input dictionary.
data_output = 100*data_dict.pop('clc')

(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)
(80, 58, 327680)


**Reshaping and keeping only the relevant features**

In [4]:
# Input features of the equation, in the column order used later on
features = ['rh', 't', 'qc', 'qi', 'rh_z']

# Collapse every feature field into a 1D vector of samples
data_dict.update({name: np.reshape(data_dict[name], -1) for name in features})

# The target is flattened the same way so samples stay aligned with the features
data_output = np.reshape(data_output, -1)

# Drop the fields that were only needed to derive rh and rh_z
for obsolete_key in ('q', 'pres', 'zg'):
    del data_dict[obsolete_key]

no_features = len(data_dict)

**Cast dict into ndarray**

In [5]:
# Collect the flattened feature vectors in the order given by `features`.
# Each entry is removed from data_dict as soon as it has been collected, so
# the dictionary does not keep a second reference to the data.
data_array_not_T = []
for key in features:
    print(key)
    data_array_not_T.append(np.reshape(data_dict.pop(key), -1))

# Stack to (no_features, no_samples) in float32 and transpose to
# (no_samples, no_features), i.e. one column per feature.
data_array = np.transpose(np.array(data_array_not_T, dtype=np.float32))

rh
t
qc
qi
rh_z


In [6]:
# Map every feature name to the index of its column in data_array
loc = dict(zip(features, range(len(features))))

**Normalize the features**

In [7]:
# Names of all variables for which normalization statistics are listed below;
# the position of a name is the index of its entry in mean_all / std_all.
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]
loc = {name: position for position, name in enumerate(all_possible_features)}

# Same five inputs as the columns of data_array, spelled with the names used in
# all_possible_features. The order must match the column order of data_array.
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']

# Per-variable mean and standard deviation used for scaling
# NOTE(review): presumably the statistics of the data the equation was fitted on - confirm.
mean_all = [
    4.12205844e-03, 2.25493498e-05, 3.38180032e-06, 2.57065512e+02,
    6.00030443e+04, 5.64080139e+03, 2.35046400e-01, 1.32776682e+01,
    6.02512234e-01, 9.86270417e+04, -1.27545273e-06, -4.02484958e-10,
    1.65204582e-08, -4.34660202e-11, 4.29441131e-10, -1.82817316e-12,
    -4.68742483e-03, -7.54899040e-07, -7.51544542e+00, -1.06989723e-04,
    1.65615172e-03, -9.27604679e-06, -4.76200071e-05, -1.32246548e-07,
]
std_all = [
    5.07648249e-03, 5.69702638e-05, 1.01308124e-05, 3.00533874e+01,
    3.12514292e+04, 5.66963918e+03, 4.11184302e-01, 1.11389888e+01,
    3.32494615e-01, 6.24039256e+03, 2.03179260e-06, 1.17041141e-08,
    1.33311867e-07, 1.42840744e-09, 6.73384546e-09, 5.07424672e-11,
    5.82875686e-03, 6.34826092e-05, 3.53136052e+00, 1.13215264e-02,
    6.62892130e-03, 6.08144307e-05, 2.58065098e-04, 2.49552692e-06,
]

# Pick the statistics of the selected features, in feature order
selected_positions = [loc[sel_var] for sel_var in features]
mean = np.array([mean_all[position] for position in selected_positions])
std = np.array([std_all[position] for position in selected_positions])

# Standardize column-wise (mean and std broadcast over the sample axis)
data_scaled = (data_array - mean)/std

**Evaluate the equation with pre-optimized coefficients**

In [8]:
# See ~/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/optimize_coefs_EQ4.ipynb
def func(X, a,b,c,d,e,f,g,h,i,j):
    '''
        Evaluate the symbolic equation for every row of X.

        X: 2D array with at least five columns. In this notebook the columns
           are the normalized features rh, t, qc, qi and rh_z (in that order).
        a, ..., j: The ten coefficients of the equation.

        Returns a 1D array with one (unclipped) prediction per row of X.
    '''
    rh, temp, cloud_water, cloud_ice, rh_z = (X[:, col] for col in range(5))

    # RH-constraint: the derivative of the expression w.r.t. rh is
    # a + 2*c*d*rh + c*temp**2, which vanishes at the value computed here.
    # Raising rh to at least that value keeps the output from decreasing
    # with rh (for c*d > 0).
    rh_lower_bound = 1/(2*c*d)*(-c*temp**2-a)
    rh = np.maximum(rh, rh_lower_bound)

    linear_part = a*rh - b*temp
    quadratic_part = c*rh*(d*rh + temp**2)
    rh_z_part = rh_z**2*(e*rh_z + f)
    condensate_part = h/(cloud_water + i*cloud_ice + j)

    return linear_part + quadratic_part + rh_z_part + g - condensate_part

In [9]:
def objective(P, X,Y):
    '''
        Mean squared error of the equation `func` with coefficients P.

        P: Sequence with the ten coefficients (a, ..., j) of `func`.
        X: Input features, passed to `func` unchanged.
        Y: Target values, one per row of X.

        The predictions are limited to the interval [0, 100] before they are
        compared with Y. The mean is accumulated in float64.
    '''
    a,b,c,d,e,f,g,h,i,j = P
    raw_preds = func(X, a,b,c,d,e,f,g,h,i,j)
    clipped_preds = np.clip(raw_preds, 0, 100)
    squared_errors = (clipped_preds - Y)**2

    return np.mean(squared_errors, dtype=np.float64)

# Coefficients of the equation. They are fixed here and not re-fitted in this notebook.
(a,b,c,d,e,f,g,h,i,j) = (38.6562122, 43.53500518, 19.78403208, 1.13637902, 0.35299939,
                         4.04888686, 44.21730274, 2.03128527, 0.66971589, 0.6409019)

# Compute the MSE of the equation on the full data set
P = (a,b,c,d,e,f,g,h,i,j)
mse = objective(P, data_scaled, data_output)

# Accumulate the variance in float64, like the MSE: with a float32 target and
# this many samples a native-precision reduction can lose accuracy, which
# would make the two terms of R2 inconsistent.
var = np.var(data_output, dtype=np.float64)
r2 = 1-mse/var

# Collect the metrics under one key per evaluated equation
results = {}
parent_key = 'pysr_EQ4'
results[parent_key] = {}
results[parent_key]['MSE'] = mse
results[parent_key]['R2'] = r2

**Save results**

In [10]:
# Append the metrics to the JSON file that collects the results for this resolution
results_json_path = os.environ['HOME'] + '/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec5_results/transfer_to_higher_resolutions/results/pysr_eq4_r2b6.json'
append_dict_to_json(results, results_json_path)

New file created or first entry added
