**tmp6jn1vyyl, equation 24**

In [1]:
import numpy as np
import sympy as sp
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
import os
import gc

# Entries of sys.path are used verbatim by the import system ("~" is not expanded),
# so resolve the home directory explicitly before registering the project folder.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression'))
from functions import append_dict_to_json
from sklearn import tree

# Seed NumPy's global random number generator so that random draws are repeatable
np.random.seed(10)

In [2]:
# Regime configuration. no_of_regimes > 1 switches on the removal of the
# clw + cli = 0 samples further down.
# NOTE(review): `regime` presumably names the regime treated here — confirm.
no_of_regimes, regime = 2, 1

**Read data**

In [3]:
# All variables contained in the input file; the position in this list is the
# column index used to address the loaded input array.
all_possible_features = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z', 'cli_zz',
    'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Variable name -> column index among all possible features
loc_all = {name: col for col, name in enumerate(all_possible_features)}

# Features
features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
no_features = len(features)

# Variable name -> column index among the selected features
loc = {name: col for col, name in enumerate(features)}

In [4]:
# np.load hands the path to open(), which does not expand "~"; resolve it up front.
path_data = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Load the input data and pick the five best features (rh, ta, clw, cli, rh_z).
# The selected columns are stacked in the order given by `features`.
input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))
input_data = np.stack([input_data[:, loc_all[sel_var]] for sel_var in features], axis=1)

output_data = np.load(os.path.join(path_data, 'cloud_area_output_dyamond.npy'))

In [5]:
# Number of rows (samples) and columns (selected features) of the input array
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(285179494, 5)

In [6]:
# Construct training and validation data
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

# Note that this is a temporal split since time was the first dimension in the original tensor
for fold in range(3):
    # Validation fold `fold` joins block `fold` and block `fold + 3` of the six equal blocks
    lower_start = two_week_incr*fold
    upper_start = two_week_incr*(fold + 3)
    validation_folds.append(np.concatenate([
        np.arange(lower_start, lower_start + two_week_incr),
        np.arange(upper_start, upper_start + two_week_incr),
    ]))

    # Every sample index that is not used for validation is used for training
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[-1]))

In [7]:
# The second fold yields the best model
best_fold = 1
idx_train = training_folds[best_fold]
idx_valid = validation_folds[best_fold]

flattened_input_train, flattened_output_train = input_data[idx_train], output_data[idx_train]
flattened_input_valid, flattened_output_valid = input_data[idx_valid], output_data[idx_valid]

# Remove input_data, output_data (and every reference to the fold indices) to free memory
del input_data, output_data, training_folds, validation_folds, idx_train, idx_valid
gc.collect()

109

In [8]:
if no_of_regimes > 1:
    # Already remove the regime with clw + cli = 0:
    # keep only the rows whose clw + cli exceeds 1e-20, for inputs and outputs alike
    reg_not_0_train = np.flatnonzero(
        flattened_input_train[:, loc['clw']] + flattened_input_train[:, loc['cli']] > 1e-20
    )
    flattened_input_train = flattened_input_train[reg_not_0_train]
    flattened_output_train = flattened_output_train[reg_not_0_train]

    reg_not_0_valid = np.flatnonzero(
        flattened_input_valid[:, loc['clw']] + flattened_input_valid[:, loc['cli']] > 1e-20
    )
    flattened_input_valid = flattened_input_valid[reg_not_0_valid]
    flattened_output_valid = flattened_output_valid[reg_not_0_valid]

**Normalize the features**

In [9]:
# Scale the data
# Per-variable mean and standard deviation used for the standardization below.
# Both lists hold 24 entries and are addressed through loc_all, i.e. they follow
# the order of all_possible_features.
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Pick the statistics of the selected features, in the order of `features`
selected_cols = [loc_all[sel_var] for sel_var in features]
mean = np.array([mean_all[col] for col in selected_cols])
std = np.array([std_all[col] for col in selected_cols])

# Work with scaled training folds (mean and std broadcast over the feature axis)
train_data_scaled = (flattened_input_train - mean)/std
valid_data_scaled = (flattened_input_valid - mean)/std

**Optimize coefficients**

In [10]:
# See ~/workspace_icon-ml/symbolic_regression/finding_symmetries/pysr_results_dyamond_on_regimes/no_of_regimes_2/notes.txt
def func(X, a,b,c,d,e,f,g,h,i,j,k,l,m,n):
    '''
        Evaluate the equation with the coefficients a, ..., n.

        X is a 2D array of which the first five columns are read. In this notebook
        these are the scaled features in the order of `features`
        (rh, ta, clw, cli, rh_z). One value is returned per row of X.
    '''
    rh, ta, clw, cli, rh_z = (X[:, col] for col in range(5))

    # The terms are added in the same order as in the written-out equation
    linear_part = a*rh - b*ta
    interaction = c*(rh - d)*(e*ta + f)*(g*ta + h*rh_z + i)
    inverse_term = l/(clw + m*cli + n)

    return linear_part + interaction + j*cli + k - inverse_term

In [11]:
import scipy as sci
from scipy.optimize import minimize

In [12]:
def objective(P, X,Y):
    '''
        The objective function: mean squared error between the predictions of
        func (coefficients P, inputs X) and the targets Y. The predictions are
        limited to the interval [0, 100] before the error is computed.
    '''
    a,b,c,d,e,f,g,h,i,j,k,l,m,n = P
    clipped_preds = np.clip(func(X, a,b,c,d,e,f,g,h,i,j,k,l,m,n), 0, 100)
    squared_errors = (clipped_preds - Y)**2

    # Accumulate in float64 regardless of the dtype of the inputs
    return np.mean(squared_errors, dtype=np.float64)

# Fit the 14 coefficients on a random subset of the training data (drawn with
# replacement by np.random.randint). This code must stay active: the following
# cells read `res` as well as the initial guess a, ..., n, and otherwise fail
# with a NameError on a fresh kernel.
T_subset = 10**6
inds = np.random.randint(0, train_data_scaled.shape[0], T_subset)

# Initial guess for the coefficients a, ..., n of func
(a,b,c,d,e,f,g,h,i,j,k,l,m,n) = (32.59, 16.3, 1, 0.89, 1, 2.1, 10.53, 4.05, 5.94, 1, 63.03, 2.84, 1, 0.77)

res = minimize(objective, (a,b,c,d,e,f,g,h,i,j,k,l,m,n), args=(train_data_scaled[inds], flattened_output_train[inds]),
               method='BFGS', options={'disp': True})

In [13]:
# Show the coefficient vector stored in the optimization result `res`
print(res.x)

[32.44329895 19.70440805  1.0861006   0.91481989  1.30598435  2.1434675
  9.05590254  4.14792458  8.26124985  1.2544264  65.70232142  1.78147668
  0.66383166  0.63743728]


**New values** (coefficients after the optimization, `res.x`)

In [16]:
# MSE on the scaled training data with the coefficients res.x
objective(res.x, train_data_scaled, flattened_output_train)

170.91627901190438

In [15]:
# MSE on the scaled validation data with the coefficients res.x
objective(res.x, valid_data_scaled, flattened_output_valid)

169.58606278270565

In [18]:
# From finding_symmetries/pysr_results_dyamond_on_regimes/save_optimized_eqns.ipynb
mse_reg_0 = 0.0353
n_0 = 32419018
n_21 = 62640812
N = n_0 + n_21

# Same value as the objective(...) output on the validation data with res.x
valid_reg_mse = 169.58606278270565

# Sample-weighted average of the two MSE values
weighted_valid_mse = (n_0*mse_reg_0 + n_21*valid_reg_mse)/N

print('On the entire dataset')
print('Valid MSE: %.5f'%weighted_valid_mse)

On the entire dataset
Valid MSE: 111.76280


**Original values** (initial coefficients `a, ..., n` passed to the optimizer)

In [16]:
# MSE on the scaled training data for the coefficient values held in a, ..., n
objective((a,b,c,d,e,f,g,h,i,j,k,l,m,n), train_data_scaled, flattened_output_train)

176.34490790728015

In [17]:
# MSE on the scaled validation data for the coefficient values held in a, ..., n
objective((a,b,c,d,e,f,g,h,i,j,k,l,m,n), valid_data_scaled, flattened_output_valid)

174.94523968583943