### SR NNs

Executed through ~scripts/run_era5_evalute_and_transfer_learn_8.sh

In [1]:
# Should be run with 650 GB of memory

In [None]:
import os
import gc
import sys
import json
import time
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib
# Select the PDF backend: figures in this notebook are written to .pdf files with
# plt.savefig rather than being displayed.
matplotlib.use('PDF')

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow import nn 

## Transfer learn? ##
# tl_bool = bool(int(sys.argv[1]))
tl_bool = True

# Command-line arguments read in this cell:
#   sys.argv[1] -> subset_exp  (exponent for the number of horizontal locations)
#   sys.argv[2] -> SEED        (seed for drawing the horizontal locations)
#   sys.argv[3] -> num_cells   (number of grid cell indices kept for training)

# Try 0,1,2 first!!
subset_exp = int(sys.argv[1])

# Number of horizontal locations that are drawn for the TL training set.
# Original note: the number of samples in the TL training set cannot be less than 27.
number_horizontal_locations = 10**subset_exp
print(subset_exp)
# How long to re-train? Setting it to 40 minutes.
timeout = 40
# The original LR_INIT was 4.33e-4.
LR_INIT = 4.33e-4

# Run with SEED=10,20,30
SEED = int(sys.argv[2])

# Entries of sys.path are used verbatim by the import system, i.e. a leading '~'
# is NOT expanded to the home directory. Expand it explicitly so that the
# project modules below can actually be found.
sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/'))
import my_classes
from my_classes import read_mean_and_std
from my_classes import load_data
from my_classes import TimeOut

sys.path.insert(0, os.path.expanduser('~/workspace_icon-ml/symbolic_regression/'))
from functions import add_derivatives
from functions import append_dict_to_json

num_cells = int(sys.argv[3])

In [None]:
# Base directory of this evaluation; result paths are built from it with os.path.join.
# The leading '~' is expanded here because neither os.path.join nor open() expands it.
PWD = os.path.expanduser('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/')

**Load data**

-- Takes 10 seconds per day <br>
-- 28 minutes for 172 days

In [None]:
# Takes 10-15 minutes. 150 GB suffice here.
# Variables requested from load_data; 'cc' is split off as the output further below.
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'u', 'v', 'zg', 'fr_land', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# Sizes of the three axes of the 'q' field
# (going by the names: time steps, vertical layers, horizontal fields).
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Insert a new axis at position 1 into the land fraction and repeat it VLAYERS times along it.
fr_land_expanded = np.expand_dims(data_dict['fr_land'], axis=1)
data_dict['fr_land'] = np.repeat(fr_land_expanded, VLAYERS, axis=1)
del fr_land_expanded

In [30]:
# Replace the two horizontal wind components by the magnitude of the horizontal wind
data_dict['U'] = np.sqrt(data_dict['u']**2 + data_dict['v']**2)
del data_dict['u'], data_dict['v']

# Add RH, computed from pressure (pa), specific humidity (q) and temperature (t)
T0 = 273.15
temperature = data_dict['t']
r = 0.00263*data_dict['pa']*data_dict['q']*np.exp((17.67*(temperature-T0))/(temperature-29.65))**(-1)
data_dict['rh'] = r
# Drop the alias so that it does not keep the full temperature array alive later on
del temperature

# Add ps: the pressure at the last vertical index, repeated over all VLAYERS layers
ps = np.repeat(
    np.expand_dims(data_dict['pa'][:, -1], axis=1),
    VLAYERS,
    axis=1,
)
data_dict['ps'] = ps

# Removing four upper-most levels (indices 0-3 of axis 1) from every variable
for key in list(data_dict):
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output: cloud cover is taken out of the inputs and kept as the target
data_output = data_dict.pop('cc')

In [None]:
# Requires another 133G: Requires 9.5G = 56G/(2*3) per folder
# Requires 15 - 20 minutes: Takes around one minute per folder. There are 14 folders
# --> Actually it runs much longer than expected! (> 45 minutes)

# Load derivatives
# os.listdir/np.load do not expand '~', so the directory is expanded once up front.
deriv_root = os.path.expanduser('~/bd1179_work/ERA5/hvcg_data')
for folder in os.listdir(deriv_root):
    # Only folders whose name ends in 'z' (e.g. 'q_z', 'q_zz') are read
    if folder.endswith('z'):
        folder_path = os.path.join(deriv_root, folder)

        # Start with an empty float32 array so that an empty folder still yields an entry
        # with the expected shape. It's important to specify the dtype here!
        chunks = [np.zeros((0, VLAYERS-4, HFIELDS), dtype=np.float32)]

        # The file names are sorted; this is expected to put them in chronological order.
        for file in sorted(os.listdir(folder_path)):
            # Memory-map the file, keep every third entry along the first axis
            # (three-hourly data) and copy it exactly once, directly as float32.
            npy_file = np.load(os.path.join(folder_path, file), mmap_mode='r')
            chunks.append(np.array(npy_file[0::3], dtype=np.float32))

        # Concatenate once per folder. Concatenating inside the file loop would copy
        # the accumulated array again for every file (quadratic in the number of files).
        data_dict[folder] = np.concatenate(chunks, axis=0)
        # Release the per-file pieces before the next folder is read
        del chunks

In [None]:
# Sanity check: print the shape of every array that is stored in data_dict
for name in data_dict:
    print(data_dict[name].shape)

In [33]:
# Number of input features = number of entries that are left in data_dict
no_features = len(data_dict)
# Show the available feature names as the cell output
data_dict.keys()

odict_keys(['q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'pa_z', 'pa_zz', 'ciwc_z', 'clwc_zz', 'clwc_z', 'U_zz', 'rh_z', 'q_zz', 'U_z', 'q_z', 't_z', 't_zz', 'rh_zz', 'ciwc_zz'])

**SR NN**

In [None]:
# Directory with the saved models. '~' is expanded explicitly since the file-based
# loaders below receive the path as a plain string.
nn_path = os.path.expanduser('~/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/saved_models')

# Objects that are handed to load_model under the name 'leaky_relu'
# (presumably the saved model refers to this activation by name -- TODO confirm).
custom_objects = {'leaky_relu': nn.leaky_relu}

model_name = 'cross_validation_neighborhood_based_sr_cl_area_fold_2.h5'
model = load_model(os.path.join(nn_path, model_name), custom_objects=custom_objects)

# The mean/std file has the same name as the model file, with '.txt' instead of '.h5'.
# Deriving it from model_name keeps the two in sync when a different fold is chosen.
stats_name = os.path.splitext(model_name)[0] + '.txt'
mean, std = read_mean_and_std(os.path.join(nn_path, stats_name))

# To ensure that data_array_scaled will be in float32. 
# The difference between values is < 1e-5
mean = np.float32(mean)
std = np.float32(std)

**All features**

In [9]:
# Names of the 24 input features, in the order in which they are stacked into the input array
features_nn = [
    'q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'q_z', 'q_zz', 'clwc_z', 'clwc_zz', 'ciwc_z', 'ciwc_zz',
    't_z', 't_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Map every feature name to its position in features_nn
loc = dict(zip(features_nn, range(len(features_nn))))
# The counter ends at the number of features, exactly as after the explicit loop
k = len(features_nn)

**Cast dict into ndarray and reshape**

In [8]:
# data_array = np.zeros((data_dict['q'].size, len(data_dict.keys())), dtype=np.float32)

# Flatten every feature in the order given by features_nn. pop() removes the entry
# from data_dict right away, so data_dict no longer holds a reference to the array.
data_array_scaled = [np.reshape(data_dict.pop(key), -1) for key in features_nn]

# Stack into a float32 array of shape (features, samples) and view it as (samples, features)
data_array_scaled = np.transpose(np.array(data_array_scaled, dtype=np.float32))

# Standardize in place: same values as (x - mean)/std, but without allocating
# two additional arrays of the full input size.
data_array_scaled -= mean
data_array_scaled /= std

data_output = np.reshape(data_output, -1)

**Pick the subset to train on. Only relevant if tl_bool is True**

In [10]:
# Seed NumPy's global random number generator so that the subset is reproducible for a given SEED.
# (np.random.seed returns None, so there is nothing worth storing from this call.)
np.random.seed(SEED)
# Horizontal locations are drawn with replacement, i.e. subset may contain duplicates.
subset = np.random.randint(0, HFIELDS, number_horizontal_locations)
# Convert to regular int to make check_sum JSON serializable
check_sum = int(np.sum(subset))

# Collecting all grid cell indices for the horizontal fields given by subset.
# The samples are flattened in C order from arrays of shape (TIMESTEPS, layers, HFIELDS),
# so the sample (t, v, h) sits at flat index (t*layers + v)*HFIELDS + h. The indices that
# belong to horizontal field h are therefore h, h + HFIELDS, h + 2*HFIELDS, ...
# Computing them directly avoids a TIMESTEPS x layers x HFIELDS helper array and one
# full scan over it per selected location.
n_layers = VLAYERS - 4  # layers left after the four upper-most ones were removed (27 for 31 layers)
column_offsets = np.arange(TIMESTEPS * n_layers, dtype=np.intp) * HFIELDS
# Row i holds all indices of subset[i] in ascending order; flattening row by row gives the
# same ordering as concatenating the per-location index arrays one after another.
subset_inds = (subset[:, np.newaxis] + column_offsets[np.newaxis, :]).reshape(-1)

In [11]:
# Keep only the first num_cells of the selected grid cell indices for training
train_cells = subset_inds[:num_cells]
train_input_scaled = data_array_scaled[train_cells] #num_hours*27
train_output = data_output[train_cells] #num_hours*27

**1) Evaluate SR NN**

In [12]:
results = {}

# If tl_bool, we transfer learn to a subset first before evaluating the model!
if tl_bool:
    # The key encodes the subset exponent, the number of training cells and the seed of this run
    parent_key = 'SFS_NN_24_tl_%d_num_cells_%d_seed_%d'%(subset_exp, num_cells, SEED)
    results[parent_key] = {}
    results['number_horizontal_locations'] = number_horizontal_locations

    # Directory for the re-trained model and its training curve. '~' is expanded
    # explicitly (open() does not do that) and the directory is created up front,
    # so that a path problem shows up before the training and not after it.
    model_dir = os.path.expanduser(os.path.join(PWD, 'results/era5_1979-2021/models'))
    os.makedirs(model_dir, exist_ok=True)

    ## Training the model ##
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR_INIT, epsilon=0.1),
        loss=tf.keras.losses.MeanSquaredError()
    )

    # Callback built from the start time and the timeout that is set in the configuration cell
    t0 = time.time()
    time_callback = TimeOut(t0, timeout)

    print('Should be 3693600')
    print(train_input_scaled.shape)
    print(train_output.shape)

    # 20 mins per epoch
    history = model.fit(x=train_input_scaled, y=train_output,
                        epochs=50, verbose=2, callbacks=[time_callback])

    #Save the model
    #Serialize the model architecture to JSON
    model_json = model.to_json()
    with open(os.path.join(model_dir, parent_key+".json"), "w") as json_file:
        json_file.write(model_json)
    #Serialize model and weights to a single HDF5-file.
    #The second argument of save() is the overwrite flag, not a file mode.
    model.save(os.path.join(model_dir, parent_key+'.h5'), overwrite=True)
    print('Saved model to disk')

    #Plot the training history
    # if len(history.history['loss']) > len(history.history['val_loss']):
    #     del history.history['loss'][-1]
    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Number of epochs')
    plt.savefig(os.path.join(model_dir, parent_key+'.pdf'))
else:
    # Without transfer learning the results are stored under the plain model name
    parent_key = 'SFS_NN_24'
    results[parent_key] = {}
    
## Looks like we have to outsource the predictions... ##

# # Reduces memory requirement by roughly 200GB    
# del train_input_scaled, train_output
# gc.collect()
    
# predictions = model.predict_on_batch(data_array_scaled)
# predictions = np.minimum(np.maximum(predictions, 0), 100)

# # Mean-squared error
# mse = np.mean((predictions[:, 0] - data_output)**2)
# results[parent_key]['MSE'] = float(mse)
# print(mse)

# # R2-value
# r2 = 1 - mse/np.var(data_output)
# results[parent_key]['R2'] = float(r2)

# # # Plot results
# # plt.hist(predictions,bins=100)
# # plt.hist(data_output,bins=100,alpha=0.7)
# # plt.yscale('log')
# # plt.legend(['NN', 'ERA5'])
# # plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SR_NN.pdf')
# # plt.clf()

# results[parent_key]['Check_sum'] = check_sum

# # Dump results
# append_dict_to_json(results, '~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/sr_based_nn.json')

**Extra plots**

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(0,3):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_1', 'SFS_NN_2', 'SFS_NN_3'])
# plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_1-3.pdf')
# plt.clf()

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(3,6):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_4', 'SFS_NN_5', 'SFS_NN_6'])
# plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_4-6.pdf')
# plt.clf()

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(6, 10):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_7', 'SFS_NN_8', 'SFS_NN_9', 'SFS_NN_10'])
# plt.savefig('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_7-10.pdf')
# plt.clf()

**Plot R2 vs number of features**

In [None]:
# with open('~/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/sfs_based_nn.json', 'r') as file:
#     d = json.load(file)

In [None]:
# # %matplotlib inline
# plt.plot(np.arange(1,11), [d['SFS_NN_%d'%k]['R2'] for k in range(1,11)], 'bo')
# plt.xlabel('Number of features')
# plt.ylabel('R2 score')