In [19]:
import numpy as np
import sympy as sp
import pandas as pd
# import matplotlib.pyplot as plt
import json
import sys
import os
import gc

# Entries of sys.path are used verbatim by the import system, so '~' has to be
# expanded explicitly for the local `functions` module to be found.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression'))
from functions import append_dict_to_json
from sklearn import tree

np.random.seed(10)

In [20]:
# Total number of regimes. For values > 1 the regime with clw + cli = 0 is removed
# from the data further below; for values > 2 the remaining samples are split by a
# decision tree with (no_of_regimes - 1) leaves.
no_of_regimes = 2
# Regime to evaluate. It is only used for selecting a decision-tree leaf,
# i.e. when no_of_regimes > 2.
regime = 1

# Alternative: read both settings from the command line
# no_of_regimes = int(sys.argv[1])
# regime = int(sys.argv[2])

In [21]:
# Can be used to quickly compute the mse on all regimes!
def mse_all_regimes(M):
    '''
        Computes the validation error on all regimes knowing the ones on the single regimes.

        M: Contains [mse_reg_1, mse_reg_2, ...] depending on no_regimes, i.e. one, two or
           three entries for two, three or four regimes. The MSE and the weight of
           regime 0 are hard-coded below.

        Returns: The weighted mean of the per-regime MSEs, using the hard-coded n_* values
                 as weights (presumably the per-regime sample counts -- TODO confirm).
        Raises: ValueError if len(M) is not 1, 2 or 3. Previously None was returned
                silently in that case.
    '''

    # Known parameters
    mse_reg_0 = 0.0353
    n_0 = 32419018

    n_21 = 62640812

    # Weights of the regimes 1, 2, ... keyed by len(M)
    weights_by_len = {
        1: (n_21,),                          # Two regimes
        2: (5742663, 56898149),              # Three regimes
        3: (5742663, 18367245, 38530904),    # Four regimes
    }

    N = n_0 + n_21

    if len(M) not in weights_by_len:
        raise ValueError('M must contain 1, 2 or 3 per-regime MSEs, got %d' % len(M))

    # Accumulate left to right, starting with the regime-0 term
    weighted_sum = n_0*mse_reg_0
    for weight, mse in zip(weights_by_len[len(M)], M):
        weighted_sum = weighted_sum + weight*mse

    return weighted_sum/N
    

**Read data**

In [22]:
# Names of all columns of the input array, in column order
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Feature name -> column index in the full input array
loc_all = {name: col for col, name in enumerate(all_possible_features)}

# For the five-feature equations only the first five features are used
features = ['rh', 'ta', 'clw', 'cli', 'rh_z', 'rh_zz', 'pa_z', 'pa_zz']
# elif regime_1_improving_eq_16:
#     features = ['rh_zz', 'pa_z', 'pa_zz']

no_features = len(features)

# Feature name -> column index within the selected features
loc_sel = {name: col for col, name in enumerate(features)}

In [23]:
# '~' is not expanded when a path is opened, so expand it explicitly
path_data = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Load the input data and keep only the columns listed in `features` (in that order)
input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))
input_data = np.concatenate([np.expand_dims(input_data[:, loc_all[sel_var]], axis=1) for sel_var in features], axis = 1)

output_data = np.load(os.path.join(path_data, 'cloud_area_output_dyamond.npy'))

In [24]:
# Number of rows (samples) and columns (selected features) of the input array
samples_total, no_of_features = input_data.shape
input_data.shape

(285179494, 8)

In [25]:
# Construct training and validation data
two_week_incr = samples_total//6

training_folds = []
validation_folds = []

for fold in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor.
    # Fold `fold` validates on the sixths number `fold` and `fold + 3` and trains on the rest.
    first_incr = np.arange(two_week_incr*fold, two_week_incr*(fold+1))
    second_incr = np.arange(two_week_incr*(fold+3), two_week_incr*(fold+4))

    held_out = np.concatenate((first_incr, second_incr))
    validation_folds.append(held_out)
    training_folds.append(np.delete(np.arange(samples_total), held_out))

In [26]:
# The second fold yields the best model
# (index 1 picks the second of the three folds constructed above).
# Indexing with an integer array copies the selected rows, so the full arrays
# are no longer needed afterwards.
flattened_input_train = input_data[training_folds[1]]
flattened_input_valid = input_data[validation_folds[1]]
flattened_output_train = output_data[training_folds[1]]
flattened_output_valid = output_data[validation_folds[1]]
    
# Remove input_data, output_data
# (and the fold index arrays) to free memory before the next steps
del input_data, output_data, training_folds, validation_folds
gc.collect()

250

In [27]:
if no_of_regimes > 1:
    # Already remove the regime with clw + cli = 0
    reg_not_0_train = np.where(flattened_input_train[:, loc_sel['clw']] + flattened_input_train[:, loc_sel['cli']] > 1e-20)[0]
    flattened_input_train = flattened_input_train[reg_not_0_train]
    flattened_output_train = flattened_output_train[reg_not_0_train]

    reg_not_0_valid = np.where(flattened_input_valid[:, loc_sel['clw']] + flattened_input_valid[:, loc_sel['cli']] > 1e-20)[0]
    flattened_input_valid = flattened_input_valid[reg_not_0_valid]
    flattened_output_valid = flattened_output_valid[reg_not_0_valid]

    # We only need to split the regimes further if no_of_regimes > 2
    if no_of_regimes > 2:
        # Take a subset of the data to train the decision tree on
        subset_size = 10**7 # or 10**6

        # Random rows, drawn with replacement
        inds = np.random.randint(0, flattened_input_train.shape[0], subset_size)
        input_subset = flattened_input_train[inds]
        output_subset = flattened_output_train[inds]

        # The leaves of this tree define the remaining (no_of_regimes - 1) regimes
        classification_tree = tree.DecisionTreeRegressor(max_depth=3, max_leaf_nodes=(no_of_regimes-1)) # set max_depth to [2,3]
        classification_tree.fit(input_subset, output_subset)
        text_representation = tree.export_text(classification_tree, feature_names=features)
        print(text_representation)

        # Leaf index of every sample; computed once and reused by the fallback below
        leaf_ids_train = classification_tree.apply(flattened_input_train)
        leaf_ids_valid = classification_tree.apply(flattened_input_valid)

        ind_reg_train = np.where(leaf_ids_train == regime)
        ind_reg_valid = np.where(leaf_ids_valid == regime)

        # Sometimes, the regime is called differently...
        # Test for "no sample found" via the number of indices. Summing the indices
        # (as done before) is also 0 if the only match is row 0.
        if ind_reg_train[0].size == 0:
            print('The regime %d does not exist, switching to regime %d instead.'%(regime, no_of_regimes))
            ind_reg_train = np.where(leaf_ids_train == no_of_regimes)
            ind_reg_valid = np.where(leaf_ids_valid == no_of_regimes)

        # The leaf indices are large arrays that are not needed any longer
        del leaf_ids_train, leaf_ids_valid

        flattened_input_train = flattened_input_train[ind_reg_train]
        flattened_input_valid = flattened_input_valid[ind_reg_valid]

        flattened_output_train = flattened_output_train[ind_reg_train]
        flattened_output_valid = flattened_output_valid[ind_reg_valid]

**Normalize the features**

In [28]:
# Scale the data
# Normalization constants for all possible features, in the order of all_possible_features
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Pick the constants of the selected features, in the order of `features`
mean = np.array([mean_all[loc_all[sel_var]] for sel_var in features])
std = np.array([std_all[loc_all[sel_var]] for sel_var in features])

# Work with scaled training folds
train_data_scaled = (flattened_input_train - mean)/std
valid_data_scaled = (flattened_input_valid - mean)/std

**Read and evaluate the equations**

In [29]:
def round_expr(expr, num_digits):
    '''
        Return expr with every numeric atom rounded to num_digits digits.
    '''
    rounded = {}
    for number in expr.atoms(sp.Number):
        rounded[number] = round(number, num_digits)
    return expr.xreplace(rounded)

In [30]:
def cube(x):
    # Third power of x
    return pow(x, 3)

def pow_abs(x, y):
    # abs(x) raised to the power y
    magnitude = np.abs(x)
    return magnitude**y

def sqrt_abs(x):
    # sqrt(abs(x)), written as the fourth root of x squared
    squared = x**2
    return squared**(1/4)

def relu(x):
    # max(0, x), written arithmetically via the sign x/abs(x).
    # Note: for a numeric x that is exactly 0 this evaluates 0/0.
    sign = x/np.abs(x)
    return (sign + 1)/2*x

def neg(x):
    # Additive inverse of x
    return -x

In [31]:
# Symbols for the normalized inputs, numbered in the order of `features`
x0, x1, x2, x3, x4, x5, x6, x7 = sp.symbols('x0 x1 x2 x3 x4 x5 x6 x7')
# Symbols for the physical (unnormalized) inputs
rh, ta, clw, cli, rh_z, rh_zz, pa_z, pa_zz = sp.symbols('rh ta clw cli rh_z rh_zz pa_z pa_zz')

# Normalized inputs expressed in terms of the physical variables
X0, X1, X2, X3, X4, X5, X6, X7 = [
    (phys_var - mean[k])/std[k]
    for k, phys_var in enumerate((rh, ta, clw, cli, rh_z, rh_zz, pa_z, pa_zz))
]

#### Evaluate on a specified number of regimes and a specified regime

In [32]:
def b(x):
    # Bound x to the interval [0, 100]
    return np.clip(x, 0, 100)
    
def process_hof_file(folder_abs_path, regime=regime, features=features, train_data_scaled=train_data_scaled, \
                     flattened_output_train=flattened_output_train, valid_data_scaled=valid_data_scaled, flattened_output_valid=flattened_output_valid):
    '''
        Evaluates all equations of a hall of fame on the training and validation data and
        passes the results to append_dict_to_json for <grandparent folder>/combined_results.json.

        folder_abs_path: Abs path of parent folder of the Hall of fame CSV ('~' is expanded)
                E.g.: '~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/regime_1/tmp5t8dshps'
        regime: Not used inside this function
        features: Names of the input features; column k of the scaled data belongs to the symbol x<k>
        train_data_scaled, valid_data_scaled: Normalized inputs with one column per feature
        flattened_output_train, flattened_output_valid: Corresponding target values

        The defaults are bound to the notebook variables at definition time.
        The symbols x<k> and the expressions X<k> are looked up in the global namespace.

        Returns: 1 if the folder contains no hall_of_fame.csv, otherwise 0
    '''
    # os.listdir and pd.read_csv do not expand '~' themselves
    folder_abs_path = os.path.expanduser(folder_abs_path)

    if 'hall_of_fame.csv' not in os.listdir(folder_abs_path):
        return 1

    # Read hall_of_fame.csv
    file = os.path.join(folder_abs_path, 'hall_of_fame.csv')
    hof = pd.read_csv(file, sep=',')

    folder = folder_abs_path.split('/')[-1]
    results_file = os.path.join(folder_abs_path.rsplit('/', maxsplit=2)[0], 'combined_results.json')

    # Dictionary to save the results in
    d = {}
    d[folder] = {}

    # The following objects do not depend on the equation, so they are built only once
    custom_ops = {'cube': cube, 'pow_abs': pow_abs, 'sqrt_abs': sqrt_abs, 'relu': relu, 'neg': neg}
    input_tuple = tuple(globals()['x%d'%k] for k in range(len(features)))
    # Cannot always plug in symbols into lambdified function,
    # so the normalized symbols are substituted by their physical expressions instead
    subs_set = set((input_tuple[k], globals()['X%d'%k]) for k in range(len(features)))
    derivatives = (('$df/drh$', 'rh'), ('$df/dclw$', 'clw'), ('$df/dcli$', 'cli'), ('$df/dT$', 'ta'))

    for eq_num in range(len(hof)):
        entry = {}
        d[folder]['Equation %d'%eq_num] = entry

        eq = hof.iloc[eq_num]['Equation']
        eq_sp = sp.sympify(eq, locals=custom_ops)
        eq_lb = sp.lambdify(input_tuple, eq_sp)

        ## Evaluate the equations
        train_preds = b(eq_lb(*train_data_scaled.T))
        valid_preds = b(eq_lb(*valid_data_scaled.T))

        ## Compute MSE
        train_mse = np.mean((train_preds - flattened_output_train)**2, dtype=np.float64)
        valid_mse = np.mean((valid_preds - flattened_output_valid)**2, dtype=np.float64)

        # Write output to json
        entry['Equation w.r.t. normalized vars'] = str(round_expr(sp.simplify(eq), 2))

        eq_sp_orig_inputs = eq_sp.subs(subs_set)

        # Troublesome rounding! It would probably have been better to use the rounding-function from 
        # symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/save_optimized_eqns.ipynb
        entry['Equation w.r.t. physical vars'] = str(round_expr(sp.simplify(eq_sp_orig_inputs), 2))

        # The derivatives are best effort: a failing one is reported and skipped,
        # without dropping the remaining ones or hiding a KeyboardInterrupt.
        for key, var in derivatives:
            try:
                entry[key] = '%s'%round_expr(sp.simplify(eq_sp_orig_inputs.diff(var)), 2)
            except Exception as err:
                print('Could not compute %s for equation %d in %s: %s'%(key, eq_num, folder, err))

        # '%d' would truncate the MSEs to integers, so they are stored as decimals
        entry['Train MSE in regime'] = '%.4f'%train_mse
        entry['Valid MSE in regime'] = '%.4f'%valid_mse
        if no_of_regimes == 2:
            entry['Train MSE'] = '%.4f'%mse_all_regimes([train_mse])
            entry['Valid MSE'] = '%.4f'%mse_all_regimes([valid_mse])

        # Called after every equation with all results collected so far
        append_dict_to_json(d, results_file)

    return 0

**Process folders**

In [33]:
# '~' is not expanded by os.listdir, so expand it explicitly
abspath = os.path.expanduser('~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_%d'%no_of_regimes)

ppfolders = os.listdir(abspath)

for ppfolder in ppfolders:
    # Only descend into directories: abspath may also contain plain files, e.g. the
    # combined_results.json that process_hof_file addresses in this folder, and
    # os.listdir fails on those.
    # We would have to adjust process_hof_file for the '*_improving_*' output
    if os.path.isdir(os.path.join(abspath, ppfolder)):
        pfolders = os.listdir(os.path.join(abspath, ppfolder))
        for folder in pfolders:
            if folder.startswith('tmp'):
                folder_abs_path=os.path.join(abspath, ppfolder, folder)
                process_hof_file(folder_abs_path)

### Validation MSE on the entire dataset

Requires all regimes to be present in combined_results.json

In [34]:
# no_of_regimes = 4 # Choose out of [3, 4]

In [35]:
# mse_reg_0 = 0.0353
# n_0 = 32419018

# n_21 = 62640812

# n_31 = 5742663
# n_32 = 56898149

# n_41 = 5742663
# n_42 = 18367245
# n_43 = 38530904

# N = n_0 + n_21
# print(N)

In [36]:
# abspath = '~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_%d'%no_of_regimes
# combined_results_file = os.path.join(abspath, 'combined_results.json')

# reg_folders = {}
# reg_folders['regime_1'] = {}
# reg_folders['regime_2'] = {}
# reg_folders['regime_3'] = {}

# print(os.listdir(abspath))
# for ppfolder in os.listdir(abspath):
#     if ppfolder.startswith('regime'):
#         pfolders = os.listdir(os.path.join(abspath, ppfolder))
#         reg_folders[ppfolder] = [folder for folder in pfolders if folder.startswith('tmp')]          
            
# valid_mses_all_data = {}

# with open(combined_results_file, 'r') as file:
#     d = json.load(file)

# #     ## Extract MSEs depending on no_of_regimes
# #     if no_of_regimes == 2:
# #         # Loop through folders
# #         for key_1 in d['regime_1'].keys():
# #             # Loop through equation numbers
# #             for eq_num_1 in d['regime_1'][key_1].keys():
# #                 mse_total = n_0*mse_reg_0
# #                 mse_total += float(d['regime_1'][key_1][eq_num_1]['Valid MSE in regime'])*n_21

# #                 new_key = (key_1 + '_' + eq_num_1).replace(' ', '_')
# #                 valid_mses_all_data[new_key] = mse_total/N
    
#     if no_of_regimes == 3:
#         # Loop through folders
#         for key_1 in reg_folders['regime_1']:
#             for key_2 in reg_folders['regime_2']:
#                 # Loop through equation numbers
#                 for eq_num_1 in d[key_1].keys():
#                     for eq_num_2 in d[key_2].keys():
#                         mse_total = n_0*mse_reg_0
#                         mse_total += float(d[key_1][eq_num_1]['Valid MSE in regime'])*n_31    
#                         mse_total += float(d[key_2][eq_num_2]['Valid MSE in regime'])*n_32

#                         new_key = (key_1 + '_' + eq_num_1 + '_' + key_2 + '_' + eq_num_2).replace(' ', '_')
#                         valid_mses_all_data[new_key] = mse_total/N
                        
#     if no_of_regimes == 4:
#         # Loop through folders
#         for key_1 in reg_folders['regime_1']:
#             for key_2 in reg_folders['regime_2']:
#                 for key_3 in reg_folders['regime_3']:
#                     # Loop through equation numbers
#                     for eq_num_1 in d[key_1].keys():
#                         for eq_num_2 in d[key_2].keys():
#                             for eq_num_3 in d[key_3].keys():
#                                 mse_total = n_0*mse_reg_0
#                                 mse_total += float(d[key_1][eq_num_1]['Valid MSE in regime'])*n_41    
#                                 mse_total += float(d[key_2][eq_num_2]['Valid MSE in regime'])*n_42
#                                 mse_total += float(d[key_3][eq_num_3]['Valid MSE in regime'])*n_43

#                                 new_key = (key_1 + '_' + eq_num_1 + '_' + key_2 + '_' + eq_num_2 + '_' + key_3 + '_' + eq_num_3).replace(' ', '_')
#                                 valid_mses_all_data[new_key] = mse_total/N

# all_valid_mses_file = os.path.join(abspath, 'valid_mses_entire_dataset.json')
# with open(all_valid_mses_file, 'w') as file:
#     json.dump(valid_mses_all_data, file)