Executed through ~scripts/run_era5_evalute_and_transfer_learn_1.sh

In [1]:
# Ran with 240GB (see evaluate_and_optimize_EQ1_mod-checkmem.ipynb)

In [2]:
import os
import sys
import json
import numpy as np
import xarray as xr
import sympy as sp
import matplotlib.pyplot as plt

# --- Run configuration (positional command-line arguments) ---
#   sys.argv[1]: exponent; 10**argv[1] horizontal locations are sampled
#   sys.argv[2]: seed for the random choice of locations
#   sys.argv[3]: number of grid cells used to tune the coefficients
# Inside an interactive Jupyter kernel sys.argv[1] is the kernel's '-f' flag,
# so int() raises ValueError there; uncomment the manual override in that case.
subset_exp = int(sys.argv[1])
# subset_exp = 2
number_horizontal_locations = 10**subset_exp
# True: re-tune the coefficients on the training subset; False: only evaluate
tl_bool = True

# sys.path entries are used verbatim by the import system ('~' is never
# expanded), so resolve the home directory explicitly before inserting.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/'))
import my_classes
from my_classes import load_data

sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import add_derivatives
from functions import append_dict_to_json

# era5
evaluate_on = 'era5'

SEED = int(sys.argv[2])

num_cells = int(sys.argv[3])

ValueError: invalid literal for int() with base 10: '-f'

**Load data**

In [None]:
# Load every ERA5 variable; 'cc' is the regression target, the rest are
# inputs (or used to derive inputs).
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# Dimensions BEFORE the upper levels are dropped
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Removing four upper-most levels
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output
data_output = data_dict['cc']
del data_dict['cc']

# Every remaining variable must share one grid with the target; the later
# flattening and index bookkeeping rely on it. (The previous check compared
# each array with itself and therefore could never fail.)
expected_shape = (TIMESTEPS, VLAYERS - 4, HFIELDS)
assert data_output.shape == expected_shape, \
    'cc has shape %s, expected %s' % (data_output.shape, expected_shape)
for key in data_dict.keys():
    print(data_dict[key].shape)
    assert data_dict[key].shape == expected_shape, \
        '%s has shape %s, expected %s' % (key, data_dict[key].shape, expected_shape)

In [None]:
import time
t0 = time.time()

# Add rh (computed from pressure, specific humidity and temperature)
T0 = 273.15
r = 0.00263*data_dict['pa']*data_dict['q']*np.exp((17.67*(data_dict['t']-T0))/(data_dict['t']-29.65))**(-1)
data_dict['rh'] = r

# Add rh_z
from contextlib import contextmanager
import multiprocessing as mlp
import gc

# Add rh_z (precomputed derivative stored as one .npy file per period)
folder = 'rh_z'

# Neither os.listdir nor np.load expands '~', so resolve it explicitly.
rh_z_dir = os.path.join(os.path.expanduser('~/bd1179_work/ERA5/hvcg_data'), folder)

# Load all filenames in the folder containing the derivatives. They are sorted
# lexicographically, which is assumed to coincide with chronological order.
npy_file_names = sorted(os.listdir(rh_z_dir))

# Collect the per-file chunks and concatenate ONCE at the end. Growing the
# array inside the loop re-copies all previous data on every iteration, and
# concatenating onto a float64 seed array silently upcast the float32 chunks.
rh_z_chunks = []
for file in npy_file_names:
    # Memory-map the file, keep every third entry along the first axis
    # (three-hourly data) and materialize it directly as float32.
    npy_file = np.load(os.path.join(rh_z_dir, file), mmap_mode='r')
    rh_z_chunks.append(np.array(npy_file[0::3], dtype=np.float32))

if rh_z_chunks:
    all_npy_files = np.concatenate(rh_z_chunks, axis=0)
else:
    # No files found: keep the empty tensor of the expected trailing shape
    all_npy_files = np.zeros((0, VLAYERS-4, HFIELDS), dtype=np.float32)
del rh_z_chunks

data_dict[folder] = all_npy_files

**Reshaping and keeping only the relevant features**

In [None]:
# Features that enter the equation, in the column order used later on
features = ['rh', 't', 'clwc', 'ciwc', 'rh_z']

# Flatten every selected feature (and the target) to one dimension
data_dict.update({name: np.reshape(data_dict[name], -1) for name in features})
data_output = np.reshape(data_output, -1)

# Drop the variables that were only needed to derive rh
for helper_var in ('q', 'pa', 'zg'):
    del data_dict[helper_var]

no_features = len(data_dict)

**Cast dict into ndarray**

In [None]:
# data_array = np.zeros((data_dict['q'].size, len(data_dict.keys())), dtype=np.float32)

# Move the features out of the dict one by one (freeing each dict entry as we
# go) and stack them as rows; k counts the features processed.
k = 0
data_array_not_T = []
for k, key in enumerate(features, start=1):
    print(key)
    data_array_not_T.append(data_dict.pop(key).reshape(-1))

# Rows -> columns: final layout is (samples, features) in float32
data_array = np.array(data_array_not_T, dtype=np.float32).T

In [None]:
# Map each feature name to its column index in data_array
loc = dict(zip(features, range(len(features))))

**Pick the subset**

In [None]:
# Seed numpy's global RNG so the sampled locations are reproducible.
# (np.random.seed returns None, so its result is not stored.)
np.random.seed(SEED)
subset = np.random.randint(0, HFIELDS, number_horizontal_locations)
# Convert to regular int to make check_sum JSON serializable
check_sum = int(np.sum(subset))

# Collecting all grid cell indices for the horizontal fields given by subset.
# The data were flattened from (TIMESTEPS, levels, HFIELDS) in C order, so
# horizontal location s sits at flat positions s, s + HFIELDS, s + 2*HFIELDS, ...
# This closed form yields the same indices, in the same order, as building a
# full-size index tensor and scanning it once per location, without the
# memory and time cost.
n_levels = VLAYERS - 4  # levels left after dropping the four upper-most ones
profile_offsets = HFIELDS * np.arange(TIMESTEPS * n_levels, dtype=np.int64)
subset_inds = np.concatenate([s + profile_offsets for s in subset])

In [None]:
# Training samples: the first num_cells flat indices of the sampled locations
train_inds = subset_inds[:num_cells]
train_input = data_array[train_inds]
train_output = data_output[train_inds]

**Remove the regime with clw + cli = 0 up front**

In [None]:
# Total condensate (cloud water + cloud ice) per sample
condensate = data_array[:, loc['clwc']] + data_array[:, loc['ciwc']]
reg_0 = np.flatnonzero(condensate <= 1e-20)
reg_not_0 = np.flatnonzero(condensate > 1e-20)

# Relevant values to compute final MSE/R2-scores.
# mse_reg_0 is the error of predicting 0 for every condensate-free sample.
mse_reg_0 = np.mean(data_output[reg_0]**2)
len_reg_0 = reg_0.size
len_reg_not_0 = reg_not_0.size
len_data_output = len(data_output)
var_data_output = np.var(data_output)

# From here on only the samples with condensate are kept
data_array = data_array[reg_not_0].copy()
data_output = data_output[reg_not_0].copy()

In [None]:
# Quick sanity output after the regime split
for diagnostic in (mse_reg_0, data_array.shape, data_output.shape):
    print(diagnostic)

# Should be 338023
len_reg_0

**Normalize the features**

In [None]:
# Order in which the statistics in mean_all / std_all are stored
all_possible_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',
            'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']
loc = {name: idx for idx, name in enumerate(all_possible_features)}
# Same column order as data_array, written with the naming of the statistics
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']

# Scale the data
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Pick the statistics of the selected features, in column order
selected_cols = [loc[sel_var] for sel_var in features]
mean = np.array([mean_all[col] for col in selected_cols])
std = np.array([std_all[col] for col in selected_cols])

# Work with scaled training folds
data_scaled = (data_array - mean)/std
train_input = (train_input - mean)/std

# The unscaled array is no longer needed
del data_array

Optimize coefficients

In [None]:
# symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/notes.txt
def func(X, a,b,c,d,e,f,g,h,i):
    '''
        Evaluate the symbolic cloud-cover equation on a batch of samples.

        X is a 2-D array whose columns are, in order,
        ['rh', 't', 'clwc', 'ciwc', 'rh_z']; a..i are the scalar
        coefficients of the equation. Returns one value per row of X
        (not yet clipped to a valid range).
    '''
    rh, temp, clwc, ciwc, rh_z = (X[:, col] for col in range(5))

    # Modified to always satisfy the RH-constraint: rh is bounded from below
    # by the point where the derivative of the expression w.r.t. rh vanishes.
    rh_lower_bound = 1/(2*c*d)*(-c*temp**2-a)
    rh = np.maximum(rh, rh_lower_bound)

    linear_part = a*rh - b*temp
    mixed_part = c*rh*(d*rh + temp**2)
    gradient_part = e*rh_z**2
    condensate_part = g/(clwc + h*ciwc + i)

    return linear_part + mixed_part + gradient_part + f - condensate_part

In [None]:
import scipy as sci
from scipy.optimize import minimize

In [None]:
def objective(P, X,Y):
    '''
        Mean squared error of the equation with coefficients P.

        P holds the nine coefficients (a..i), X the input samples and Y the
        targets. Predictions are limited to the range [0, 100] before the
        error is computed; the mean is accumulated in float64.
    '''
    a,b,c,d,e,f,g,h,i = P
    raw_preds = func(X, a,b,c,d,e,f,g,h,i)
    non_negative_preds = np.maximum(raw_preds, 0)
    train_preds = np.minimum(non_negative_preds, 100)
    squared_errors = (train_preds - Y)**2
    return np.mean(squared_errors, dtype=np.float64)

# Starting point of the optimization (also the coefficients that are
# evaluated as-is when no tuning is requested)
(a,b,c,d,e,f,g,h,i) = [38.85954116, 42.70818472, 19.34746465, 1.11321032, 2.36741444,
                       44.99763015, 1.90033063, 0.65718667, 0.63587944]
if tl_bool:
    # Tune the coefficients on the training subset
    res = minimize(objective, (a,b,c,d,e,f,g,h,i), args=(train_input, train_output),
                   method='Nelder-Mead', options={'disp': True})
else:
    # Compute the MSE and terminate if not tl_bool
    P = (a,b,c,d,e,f,g,h,i)
    mse_reg_1 = objective(P, data_scaled, data_output)

    results = {}

    # Combine the error on the condensate-free samples with the error on the
    # remaining samples, weighted by the number of samples in each group
    mse_new_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
    r2_new_total = 1 - mse_new_total/var_data_output

    print(mse_new_total, r2_new_total)

    parent_key = 'pysr_EQ1_no_tl'
    results[parent_key] = {}
    results[parent_key]['MSE'] = mse_new_total
    results[parent_key]['R2'] = r2_new_total
    # No optimizer ran in this branch, so `res` does not exist; the evaluated
    # coefficients are the untuned ones in P.
    results[parent_key]['Coefficients'] = list(P)
    # Should be the same for all runs
    results[parent_key]['Check_sum'] = check_sum

In [16]:
# Coefficient vector found by the optimizer (res only exists if tl_bool is True)
print(res.x)

[ 1.93925530e+00  1.40195085e-02  9.24687637e-01  6.07912348e+01
  1.38495251e+02  2.76480640e+01 -4.04303333e-01  1.07719122e+00
  1.32919169e-01 -1.77971834e+00  1.58883122e+02  2.48770600e+01]


In [17]:
# Same coefficients, rounded to two decimals for readability
list(np.round(res.x, 2))

[1.94, 0.01, 0.92, 60.79, 138.5, 27.65, -0.4, 1.08, 0.13, -1.78, 158.88, 24.88]

New values

In [18]:
# MSE of the tuned equation on all scaled samples that contain condensate
mse_reg_1 = objective(res.x, data_scaled, data_output)

In [19]:
# Combine the error on the condensate-free samples with the error on the
# remaining samples, weighted by the number of samples in each group
weighted_error_sum = mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0
mse_new_total = (weighted_error_sum)/len_data_output
r2_new_total = 1 - mse_new_total/var_data_output

print(mse_new_total, r2_new_total)

# One entry per (subset size, number of training cells, seed) combination
parent_key = 'pysr_EQ1_tl_%d_num_cells_%d_seed_%d'%(subset_exp,num_cells,SEED)
results = {
    parent_key: {
        'MSE': mse_new_total,
        'R2': r2_new_total,
        'Coefficients': list(res.x),
        # Should be the same for all runs
        'Check_sum': check_sum,
    }
}

688.7440801766825 -1.1178964262207782


**Save results**

In [None]:
# Dump results: hand the results dict of this run to the project helper together with the target JSON path.
# NOTE(review): the path starts with '~'; confirm append_dict_to_json expands it (plain open() would not).
append_dict_to_json(results, '~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/era5_tuned_pysr_EQ1_mod.json')

In [21]:
# def predict(P,X):
#     '''
#         The objective function.
#     '''
#     a,b,c,d,e,f,g,h,i = P
#     preds = [np.minimum(np.maximum(func(X[k_ind], a,b,c,d,e,f,g,h,i), 0), 100) for k_ind in range(X.shape[0])]

#     return preds

# predict(res.x, data_scaled)

# plt.hist(data_output,bins=100, histtype='step', color='k')
# plt.hist(predict(res.x, data_scaled),bins=100, histtype='step')

# plt.yscale('log')
# plt.legend(['ERA5', 'Eq. 1'])
# plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/era5_tuned_pysr_EQ1_mod.pdf')

Original values

In [22]:
# mse_reg_1 = objective((1,1,0.77,41.39,20.69,20.69,0.66,0.53,0.35,0.23,60.6,3.44), data_scaled, data_output)

In [23]:
# mse_orig_total = (mse_reg_0*len_reg_0 + mse_reg_1*len_reg_not_0)/len_data_output
# r2_orig_total = 1 - mse_orig_total/var_data_output

# print(mse_orig_total, r2_orig_total)

**Is the output of the objective function the same as in evaluate_pysr_schemes.ipynb? [objective]**

In [24]:
# # The output should be 52.99172553
# X = np.zeros((1, 5))
# [np.minimum(np.maximum(func(X[k_ind], a,b,c,d,e,f,g,h,i), 0), 100) for k_ind in range(X.shape[0])]