### Teixeira Scheme

Do not spawn more than 20 processes! The job will just hang.

In [1]:
# All samples (190119664): 5.5s per entry of the tensor

In [2]:
import os
import gc
import sys
import time
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt

# sys.path entries are used verbatim, so '~' has to be expanded explicitly
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import evaluate_sundqvist

# Shall we use the tuned hyperparameters?
tuned = False

# Time stamp added to output file names.
# A single clock reading is used so that hour and minute always belong together.
_now = datetime.datetime.now()
hour_min = '%d_%d'%(_now.hour, _now.minute)

matplotlib.use('PDF')
# Output variable to tune for, taken from the first command-line argument.
# The cells below handle 'cl_volume' and 'cl_area'.
output_var = sys.argv[1]

In [None]:
# Load columns of data
# np.load does not expand '~', so the home directory is resolved up front
folder_data = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/')

input_data = np.load(os.path.join(folder_data, 'cloud_cover_input_dyamond.npy'))
if output_var == 'cl_volume':
    output_data = np.load(os.path.join(folder_data, 'cloud_cover_output_dyamond.npy'))
elif output_var == 'cl_area':
    output_data = np.load(os.path.join(folder_data, 'cloud_area_output_dyamond.npy'))
else:
    # Fail here with a clear message instead of a NameError on output_data further down
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r"%output_var)

# Feature names; their position in this list is used as the column index into the input array
new_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',\
                'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

In [None]:
samples_total, no_of_features = input_data.shape

# Length of one increment: a sixth of all samples
two_week_incr = samples_total//6

def temporal_fold_indices(fold):
    """Return (training, validation) sample indices for one of the three folds (fold in 0, 1, 2).

    The validation set consists of increment `fold` and increment `fold + 3`;
    the training set is every remaining sample index.
    """
    # Note that this is a temporal split since time was the first dimension in the original tensor
    first_incr = np.arange(two_week_incr*fold, two_week_incr*(fold+1))
    second_incr = np.arange(two_week_incr*(fold+3), two_week_incr*(fold+4))

    validation = np.append(first_incr, second_incr)
    training = np.delete(np.arange(samples_total), validation)
    return training, validation

# The second fold yields the best model.
# Only this fold is built: the index arrays are as long as the data set,
# so materialising the two unused folds would only cost time and memory.
training_fold, validation_fold = temporal_fold_indices(1)

input_train = input_data[training_fold]
input_valid = input_data[validation_fold]
output_train = output_data[training_fold]
output_valid = output_data[validation_fold]

# Remove input_data, output_data and the index arrays to free memory
del input_data, output_data, training_fold, validation_fold
gc.collect()

In [None]:
# Lookup table from feature name to its column index
loc = {name: column for column, name in enumerate(new_features)}

input_train.shape

In [None]:
# First try (20/400): 20 candidates for D and 400 for K,
# both evenly spaced between 10**(-9) and 10**(-2)
D_range = np.linspace(start=10**(-9), stop=10**(-2), num=20)
K_range = np.linspace(start=10**(-9), stop=10**(-2), num=400)

# Time for len(D_range)/len(K_range) = 3/2: 49s
# Time for len(D_range)/len(K_range) = 20/2: 82s
# Time for len(D_range)/len(K_range) = 3/100: 2192s

# Time for len(D_range)/len(K_range) = 50/2: Doesn't terminate
# Time for len(D_range)/len(K_range) = 100/2: Doesn't terminate
# Time for len(D_range)/len(K_range) = 30/2: Doesn't terminate
# Time for len(D_range)/len(K_range) = 40/2: Doesn't terminate

In [None]:
# Estimated required time to run the notebook in hours:
# 5.5 seconds per tensor entry, len(K_range) entries per outer value, and a factor of 10
# standing in for the length of the outer range because that loop runs in parallel.
# NOTE(review): the original comment named `rsat_range_land` as the outer range, which
# does not exist in this notebook - presumably D_range is meant; confirm the factor of 10.
estimated_seconds = 5.5*(10*len(K_range))
print('Estimated required time to run the notebook: %.1f hours'%(estimated_seconds/3600))

In [None]:
def search_hyperparams_outer(D):
    """Evaluate a single value of D against every K in the module-level K_range.

    Single-argument wrapper around search_hyperparams_inner so that it can be
    passed to pool.map; returns whatever search_hyperparams_inner returns.
    """
    result_for_this_D = search_hyperparams_inner(D, K_range)
    return result_for_this_D

In [None]:
from contextlib import contextmanager
import multiprocessing as mlp
import gc

@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager that yields a multiprocessing pool and always tears it down.

    All positional and keyword arguments are forwarded unchanged to mlp.Pool.
    The pool is terminated when the with-block is left, also when the block
    raises, so that no worker processes are left behind after an error.
    """
    pool = mlp.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs on normal exit and on exceptions raised inside the with-block
        pool.terminate()

In [None]:
def search_hyperparams_inner(D, K_range):
    """Training-set MSE of the Teixeira cloud cover scheme for one D and every K in K_range.

    Reads the module-level input_train, output_train and loc.

    Parameters
    ----------
    D : float
        First tuning parameter of the scheme.
    K_range : sequence of float
        Values of the second tuning parameter to evaluate.

    Returns
    -------
    numpy.ndarray of shape (1, len(K_range))
        Mean squared error between 100*c and output_train, one entry per K.
    """
    # Everything below that does not depend on K is computed once, outside the loop
    clw = input_train[:, loc['clw']]
    ta = input_train[:, loc['ta']]
    r = input_train[:, loc['rh']]
    p = input_train[:, loc['pa']]

    # Clausius-Clapeyron assuming a constant latent heat of vaporization and the ideal gas law (Lohmann, eq. 2.60)
    e0 = 611.2
    Lv = 2.5*10**6
    Rv = 461.5
    T0 = 273.15
    e = e0*np.exp(Lv/Rv*(1/T0-1/ta))

    # q_s (Saturation specific humidity): The specific humidity of water vapor corresponding to the saturation mixing ratio
    # Assuming the water pressure to be much smaller than the atmospheric pressure (Lohmann, eq. 2.80)
    eps = 0.622
    qs = eps*e/p

    # According to Teixeira, for qs/clw values of 50, 100 and 200 are realistic
    # We have: np.mean(qs)/np.mean(clw) = 241, np.max(qs)/np.max(clw) = 36 which fits nicely

    # Small threshold to avoid division by zero
    thr = 1e-9

    # qs*(1-r) with r capped just below 1, so that the term stays strictly positive
    deficit = qs*(1-np.minimum(r, 1-thr))
    # Only the product above is needed from here on; release the large intermediates
    del e, qs

    D_clw = D*clw
    D_clw_floored = D*np.maximum(clw, thr)

    mse_tensor = -np.ones((1, len(K_range)))
    for i2, K in enumerate(K_range):
        c = D_clw/(2*deficit*K)*(np.sqrt(np.maximum(0, 1 + (4*deficit*K)/D_clw_floored)) - 1)

        # c can exceed 1 very slightly
        c = np.minimum(c, 1)

        # What is the average error with this set of tuning parameters?
        mse_tensor[0, i2] = np.mean((100*c - output_train)**2)

    return mse_tensor

### Fitting hyperparameters
Originally: $D = 4 \times 10^{-6}$, $K = 10 \times 10^{-6}$

In [None]:
# Start of the hyperparameter search; the elapsed time since t0 is written to best_results.txt later
t0 = time.time()

In [None]:
# More than 20 worker processes make the job hang (see the note at the top of the notebook),
# so the pool size is capped; pool.map simply queues the remaining values of D.
MAX_PROCS = 20
procs = min(len(D_range), MAX_PROCS)
with poolcontext(processes=procs) as pool:
    # Every task handles one value of D and scans all of K_range for it
    mse_tensor = pool.map(search_hyperparams_outer, D_range)

# One row per D and one column per K. An explicit reshape (instead of squeeze) keeps the
# array two-dimensional even if one of the ranges has a single entry.
mse_tensor = np.array(mse_tensor).reshape(len(D_range), len(K_range))

# assert np.all(mse_tensor >= 0)
# np.save does not expand '~', so the home directory is resolved explicitly
np.save(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/teixeira_tuning_dyamond/mse_tensor_%s.npy'%hour_min), mse_tensor)

In [None]:
# Locate the smallest MSE on the (D, K) grid. NaN entries are ignored and the first minimum
# in row-major order wins, which matches a strict "<" scan over rows and then columns.
# Unlike a scan starting from a fixed upper bound, this always defines opt_ind
# (np.nanargmin raises a ValueError if every entry is NaN).
opt_ind = [int(ind) for ind in np.unravel_index(np.nanargmin(mse_tensor), mse_tensor.shape)]
min_mse = mse_tensor[opt_ind[0], opt_ind[1]]

# open() does not expand '~', so the home directory is resolved explicitly
with open(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/teixeira_tuning_dyamond/best_results.txt'), 'a') as file:
    file.write('Time it took: %.3f\n'%(time.time() - t0))
    file.write('Output variable: %s\n'%output_var)
    file.write('Best values: %s\n'%str([D_range[opt_ind[0]], K_range[opt_ind[1]]]))

### Plotting the results (To run)

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

# np.load does not expand '~', so the home directory is resolved explicitly.
# NOTE(review): the file name is hard-coded; the '9_42' suffix presumably is the
# hour_minute stamp of one particular tuning run - adjust it to the run to be plotted.
mse_tensor = np.load(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/teixeira_tuning_dyamond/mse_tensor_9_42.npy'))
plt.imshow(mse_tensor)

### Performance with the best hyperparameter setting (To run)

In [22]:
def teixeira_eval(input_data, output_data, loc, D, K):
    """Score the Teixeira cloud cover scheme for one (D, K) pair.

    Parameters
    ----------
    input_data : numpy.ndarray
        Two-dimensional array, indexed as input_data[:, column].
    output_data : numpy.ndarray
        Reference values that 100*c is compared against.
    loc : dict
        Maps feature names to column indices of input_data. Only the entries
        'clw', 'ta', 'rh' and 'pa' are read.
    D, K : float
        Tuning parameters of the scheme.

    Returns
    -------
    (mse, r2)
        Mean squared error of 100*c against output_data, and 1 - mse/var(output_data).
        If output_data is constant its variance is zero and r2 is not finite.
    """
    clw = input_data[:, loc['clw']]
    ta = input_data[:, loc['ta']]
    r = input_data[:, loc['rh']]
    p = input_data[:, loc['pa']]

    # Clausius-Clapeyron assuming a constant latent heat of vaporization and the ideal gas law (Lohmann, eq. 2.60)
    e0 = 611.2
    Lv = 2.5*10**6
    Rv = 461.5
    T0 = 273.15
    e = e0*np.exp(Lv/Rv*(1/T0-1/ta))

    # q_s (Saturation specific humidity): The specific humidity of water vapor corresponding to the saturation mixing ratio
    # Assuming the water pressure to be much smaller than the atmospheric pressure (Lohmann, eq. 2.80)
    eps = 0.622
    qs = eps*e/p

    # Small threshold to avoid division by zero
    thr = 1e-9

    # qs*(1-r) with r capped just below 1; it appears twice in the formula, so compute it once
    deficit = qs*(1-np.minimum(r, 1-thr))
    c = D*clw/(2*deficit*K)*(np.sqrt(np.maximum(0, 1 + (4*deficit*K)/(D*np.maximum(clw, thr)))) - 1)

    # c can exceed 1 very slightly
    c = np.minimum(c, 1)

    # What is the average error with this set of tuning parameters?
    mse = np.mean((100*c - output_data)**2)
    var = np.var(output_data)
    r2 = 1-mse/var

    return mse, r2

In [19]:
# Selected (D, K) per output variable.
# NOTE(review): D_opt for 'cl_area' equals the upper end of D_range (10**(-2)), so the
# optimum may lie outside the searched range - consider widening the range.
if output_var == 'cl_volume':
    D_opt = 0.0031578954210526315
    K_opt = 0.00012531427067669174
elif output_var == 'cl_area':
    D_opt = 0.01
    K_opt = 0.0001002516165413534
else:
    # Fail here with a clear message instead of a NameError on D_opt/K_opt further down
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r"%output_var)

In [24]:
# Score the scheme with the selected (D_opt, K_opt) on the training and on the validation split
mse_train, r2_train = teixeira_eval(input_train, output_train, loc, D_opt, K_opt)
mse_valid, r2_valid = teixeira_eval(input_valid, output_valid, loc, D_opt, K_opt)

In [25]:
# Append the training and validation scores to the results log.
# open() does not expand '~', so the home directory is resolved explicitly.
with open(os.path.expanduser('~/workspace_icon-ml/symbolic_regression/baselines/teixeira_tuning_dyamond/best_results.txt'), 'a') as file:
    file.write('Output variable: %s\n'%output_var)
    file.write('Training score:\n')
    file.write('MSE: %.3f, R2: %.3f\n'%(mse_train, r2_train))
    file.write('Validation score:\n')
    file.write('MSE: %.3f, R2: %.3f\n\n'%(mse_valid, r2_valid))