### Sundqvist Scheme

Here we fit the tuning parameters of the Sundqvist scheme to the data. <br>
Separate parameter sets are fitted for grid cells over land and for grid cells over the sea.

In [1]:
# All samples (190119664): 5.5s per entry of the tensor

In [2]:
import os
import gc
import sys
import time
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/symbolic_regression/')
from functions import evaluate_sundqvist

# The target variable is taken from the first command-line argument.
# Only 'cl_volume' and 'cl_area' are handled by the cells below; any other value
# would leave output_data / best_land / best_sea undefined and surface much later
# as a NameError, so we fail early with an explicit message instead.
# NOTE(review): inside an interactive Jupyter kernel sys.argv typically holds the
# kernel launcher's own arguments, so this is presumably meant to be run as a script.
_valid_output_vars = ('cl_volume', 'cl_area')
if len(sys.argv) < 2 or sys.argv[1] not in _valid_output_vars:
    raise ValueError(
        "Expected the output variable as first command-line argument (one of %s), got argv=%r"
        % (', '.join(_valid_output_vars), sys.argv[1:])
    )

output_var = sys.argv[1]

In [4]:
# Load columns of data
folder_data = '/home/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/'

# Resolve the target file first, so that an unsupported output_var fails
# immediately instead of leaving output_data undefined after the (large) input
# array has already been loaded.
if output_var == 'cl_volume':
    output_file = 'cloud_cover_output_dyamond.npy'
elif output_var == 'cl_area':
    output_file = 'cloud_area_output_dyamond.npy'
else:
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

input_data = np.load(os.path.join(folder_data, 'cloud_cover_input_dyamond.npy'))
output_data = np.load(os.path.join(folder_data, output_file))

# Feature names; the position of a name in this list is used further down as the
# column index into the input arrays.
new_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',
                'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

In [5]:
samples_total, no_of_features = input_data.shape

# Length of one of the six equally sized blocks the sample axis is cut into
two_week_incr = samples_total//6

# Build three train/validation folds.
# Time was the first dimension of the original tensor, so contiguous index
# ranges are contiguous in time and this is a temporal split.
training_folds = []
validation_folds = []
for fold in range(3):
    # Validation = block `fold` together with block `fold + 3`
    early_block = np.arange(two_week_incr*fold, two_week_incr*(fold+1))
    late_block = np.arange(two_week_incr*(fold+3), two_week_incr*(fold+4))
    validation_folds.append(np.concatenate((early_block, late_block)))

    # Training = every remaining sample index
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[fold]))

# The second fold yields the best model
best_fold = 1
input_train = input_data[training_folds[best_fold]]
output_train = output_data[training_folds[best_fold]]
input_valid = input_data[validation_folds[best_fold]]
output_valid = output_data[validation_folds[best_fold]]

# Free the full arrays and the index lists; only the selected fold is kept
del input_data, output_data, training_folds, validation_folds
gc.collect()

38

In [6]:
# Lookup table: feature name -> column index in the input arrays
loc = dict(zip(new_features, range(len(new_features))))

input_train.shape

(190119664, 24)

In [7]:
# Partition the training samples by land fraction:
# fr_land > 0.5 counts as land, fr_land <= 0.5 counts as sea
land_fraction = input_train[:, loc['fr_land']]
land_ind = np.flatnonzero(land_fraction > 0.5)
sea_ind = np.flatnonzero(land_fraction <= 0.5)

input_land, output_land = input_train[land_ind], output_train[land_ind]
input_sea, output_sea = input_train[sea_ind], output_train[sea_ind]

### Performance with the best hyperparameter setting (To run)

In [7]:
# From best_results.txt
# Each list is ordered as: rsat, r0_top, r0_surf, n
if output_var == 'cl_volume':
    best_land = [0.95, 0.16, 0.5, 2.12]
    best_sea = [0.95, 0.11, 0.9, 2.12]
elif output_var == 'cl_area':
    best_land = [0.9, 0.01, 0.55, 2.12]
    best_sea = [0.95, 0.01, 0.55, 2.12]
else:
    # Without this branch best_land/best_sea would stay undefined and the
    # evaluation below would fail with an unrelated NameError.
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

In [24]:
# Differentiate between original, manually and automatically tuned! I expect this code needs an hour to run
# All four evaluations share the same tuning options; only the data subset differs.
custom_tuning = dict(tuned='custom', best_land=best_land, best_sea=best_sea, compute_r2=False)

mse_train = evaluate_sundqvist(input_train, output_train, loc, **custom_tuning)
mse_train_land = evaluate_sundqvist(input_land, output_land, loc, **custom_tuning)
mse_train_sea = evaluate_sundqvist(input_sea, output_sea, loc, **custom_tuning)
mse_valid = evaluate_sundqvist(input_valid, output_valid, loc, **custom_tuning)

In [31]:
# Collect the parameters and the resulting errors, then write them to JSON
results = {
    'Order of parameters': 'rsat, r0_top, r0_surf, n',
    'Best params land': str(best_land),
    'Best params sea': str(best_sea),
    'Training MSE': mse_train,
    'Land MSE': mse_train_land,
    'Sea MSE': mse_train_sea,
    'Validation MSE': mse_valid,
}

# One results file per output variable
results_path = '/home/b/b309170/workspace_icon-ml/symbolic_regression/baselines/sundqvist_tuning_dyamond/results_grid_search_%s.json'%output_var
with open(results_path, 'w') as json_file:
    json.dump(results, json_file)

Quick test of old parameters (found on QUBICC)

In [8]:
def _mse_manually_tuned(inputs, outputs):
    """Evaluate the Sundqvist scheme with tuned='manually' on the given samples (no R2)."""
    return evaluate_sundqvist(inputs, outputs, loc, tuned='manually', compute_r2=False)

# Note: this overwrites the mse_* values from the custom-parameter evaluation
mse_train = _mse_manually_tuned(input_train, output_train)
mse_train_land = _mse_manually_tuned(input_land, output_land)
mse_train_sea = _mse_manually_tuned(input_sea, output_sea)
mse_valid = _mse_manually_tuned(input_valid, output_valid)

In [9]:
# It is slightly better than a constant output model...
# Printed one per line, in the order: training, training (land), training (sea), validation
print(mse_train, mse_train_land, mse_train_sea, mse_valid, sep='\n')

1373.151097511083
1510.730314937607
1329.9443211258374
1382.5448965542282
