### SFS NNs

Executed through /home/b/b309170/scripts/run_era5_evalute_and_transfer_learn_4.sh

In [None]:
# Run with 960GB!

In [2]:
import os
import gc
import sys
import json
import time
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib
matplotlib.use('PDF')

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow import nn 

## Run configuration ##
# Transfer learning is always switched on in this version of the script
# (it used to be a command-line switch: tl_bool = bool(int(sys.argv[1]))).
tl_bool = True
# Set to True if the model was already transfer-learned
already_trained = False

# Command-line arguments. For interactive use, hard-code e.g. subset_exp = 2 and SEED = 10 instead.
subset_exp = int(sys.argv[1])
SEED = int(sys.argv[2])

# The training subset consists of 10**subset_exp horizontal locations
number_horizontal_locations = 10**subset_exp

# How long to re-train? The author set this to 40 minutes.
timeout = 40
# Same value as the original LR_INIT (4.33e-4)
LR_INIT = 4.33e-4

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/cloud_cover_parameterization/')
import my_classes
from my_classes import read_mean_and_std
from my_classes import load_data
from my_classes import TimeOut

sys.path.insert(0, '/home/b/b309170/workspace_icon-ml/symbolic_regression/')
from functions import add_derivatives
from functions import append_dict_to_json

# Third command-line argument: how many of the flattened subset indices are kept for training
# (used further down as subset_inds[:num_cells])
num_cells = int(sys.argv[3])

In [3]:
# Timestamp for measuring how long the data loading takes
print('Starting the script:', time.time(), sep='\n')

Starting the script:
1674464690.6535661


In [4]:
# Base directory of this evaluation. The transfer-learned models and the training curves
# are written to PWD/results/era5_1979-2021/models further down.
PWD = '/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/'

**Load data**

In [5]:
# Variables requested from the ERA5 loader, in this order
order_of_vars = ['q', 'clwc', 'ciwc', 't', 'pa', 'u', 'v', 'zg', 'fr_land', 'cc']
data_dict = load_data(source='era5', days='all', order_of_vars=order_of_vars)

# 'q' serves as the reference for the three dimensions of the data
TIMESTEPS, VLAYERS, HFIELDS = data_dict['q'].shape

# Insert a new axis at position 1 into fr_land and copy the field VLAYERS times along it
# (presumably fr_land comes without a vertical axis -- verify against load_data)
data_dict['fr_land'] = np.repeat(
    np.expand_dims(data_dict['fr_land'], axis=1),
    repeats=VLAYERS,
    axis=1,
)

q
clwc
ciwc
t
pa
u
v
cc
all
100.000015


In [6]:
# Add magnitude of horizontal wind; the individual components are dropped afterwards
data_dict['U'] = np.sqrt(data_dict['u']**2 + data_dict['v']**2)
del data_dict['u']
del data_dict['v']

# Add RH, computed from pressure (pa), specific humidity (q) and temperature (t).
# The result is stored directly in the dict: a separate module-level name would keep the
# full-size (untrimmed) array alive after the upper levels are removed below.
T0 = 273.15
data_dict['rh'] = 0.00263*data_dict['pa']*data_dict['q']*np.exp((17.67*(data_dict['t']-T0))/(data_dict['t']-29.65))**(-1)

# Add ps: the value of pa on the last vertical layer, copied to all VLAYERS layers
# (presumably the last layer is the one closest to the surface -- verify against load_data).
# Also stored directly to avoid a lingering reference to the untrimmed array.
data_dict['ps'] = np.repeat(np.expand_dims(data_dict['pa'][:, -1], axis=1), VLAYERS, axis=1)

# Removing four upper-most levels (only values are replaced, so iterating over the keys is safe).
# The copy makes sure the untrimmed array can be freed.
for key in data_dict.keys():
    data_dict[key] = data_dict[key][:, 4:].copy()

# Data output: cloud cover is the target and is removed from the input dict
data_output = data_dict.pop('cc')

In [None]:
# Requires 15 - 20 minutes: Takes around one minute per folder. There are 14 folders
# Requires 133G: Requires 9.5G = 56G/(2*3) per folder

# Load derivatives
derivatives_dir = '/home/b/b309170/bd1179_work/ERA5/hvcg_data'
for folder in os.listdir(derivatives_dir):
    # Only the folders whose name ends with 'z' are loaded; the folder name becomes the feature name
    if folder.endswith('z'):
        # Start from an empty float32 tensor so that a folder without files still yields
        # an array of shape (0, VLAYERS-4, HFIELDS)
        npy_chunks = [np.zeros((0, VLAYERS-4, HFIELDS), dtype=np.float32)]

        # Load all filenames in the folder containing the derivatives.
        # Sorting by name is meant to order them chronologically.
        npy_file_names = sorted(os.listdir(os.path.join(derivatives_dir, folder)))

        for file in npy_file_names:
            # Memory-map the file and keep every third entry along the first axis
            # (three-hourly data), converted directly to float32 in a single copy
            npy_file = np.load(os.path.join(derivatives_dir, folder, file), mmap_mode='r')
            npy_chunks.append(np.array(npy_file[0::3], dtype=np.float32))

        # Concatenate once per folder instead of once per file (avoids quadratic copying)
        data_dict[folder] = np.concatenate(npy_chunks, axis=0)
        del npy_chunks

In [None]:
# Number of features currently held in the dict, followed by their names as the cell output
no_features = len(data_dict)
data_dict.keys()

In [None]:
# Data loading takes ~3 hours, 15 minutes
print('End of data loading:', time.time(), sep='\n')

**All features**

In [None]:
# All features; the order fixes the column order of the input array
features_nn = [
    'q', 'clwc', 'ciwc', 't', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps',
    'q_z', 'q_zz', 'clwc_z', 'clwc_zz', 'ciwc_z', 'ciwc_zz',
    't_z', 't_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz',
]

# Maps each feature name to its position in features_nn
loc = {feat: position for position, feat in enumerate(features_nn)}

**Cast dict into ndarray and reshape**

In [None]:
# Flatten every feature to 1D, in the order given by features_nn,
# and remove it from the dict at the same time
data_array_not_T = [np.reshape(data_dict.pop(key), -1) for key in features_nn]

# Stack into a float32 array with one row per feature, then transpose to one row per sample
data_array = np.transpose(np.array(data_array_not_T, dtype=np.float32))
# The target is flattened in the same way
data_output = np.reshape(data_output, -1)

# Release the list of flattened features
del data_array_not_T
gc.collect()

In [None]:
# I think this takes around 3.5 hours
print('End of casting:', time.time(), sep='\n')

**Pick the subset to train on. Only relevant if tl_bool is True**

In [None]:
# Seed NumPy's global RNG so that the subset is reproducible for a given SEED
# (np.random.seed returns None, so there is nothing worth storing)
np.random.seed(SEED)
# Horizontal locations are drawn with replacement, so the subset may contain duplicates
subset = np.random.randint(0, HFIELDS, number_horizontal_locations)
# Convert to regular int to make check_sum JSON serializable
check_sum = int(np.sum(subset))

# Collecting all grid cell indices for the horizontal fields given by subset.
# The samples were flattened from (time, layer, horizontal field) with the horizontal field as the
# fastest-varying axis, so the samples of location s sit at s, s + HFIELDS, s + 2*HFIELDS, ...
# Computing this stride directly avoids allocating a full-size integer index array and scanning
# it once per location. The number of samples per location is read off data_array instead of
# hard-coding the number of vertical layers.
assert data_array.shape[0] % HFIELDS == 0, 'number of samples is not a multiple of HFIELDS'
samples_per_location = data_array.shape[0] // HFIELDS
location_offsets = HFIELDS * np.arange(samples_per_location)
# One row per subset entry (ascending within the row), flattened in the order of subset
subset_inds = (subset[:, np.newaxis] + location_offsets[np.newaxis, :]).reshape(-1)

In [None]:
# Only the first num_cells indices of the subset are used for training (num_hours*27)
train_inds = subset_inds[:num_cells]
train_input = data_array[train_inds]
train_output = data_output[train_inds]

**2) Loop through SFS NNs**

In [None]:
def which_features(sfs_ind, models_dir='/home/b/b309170/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models'):
    '''
        Extract the relevant feature names and their order for a given SFS NN.

        The names are read from the text file
        <models_dir>/neighborhood_based_sfs_cl_area_no_features_<sfs_ind>.txt: the line following the one
        that starts with 'The (order of) input variables' is expected to look like ['clw' 'cli' 'ta'],
        i.e. quoted names separated by whitespace inside square brackets.

        Parameters:
            sfs_ind: index of the SFS NN (it is part of the file name)
            models_dir: directory containing the text files; defaults to the original location

        Returns:
            List of feature names, renamed to their ERA5 counterparts where the names differ.

        Raises:
            ValueError if the file has no line with the input variables.
    '''
    # Names that differ in ERA5
    conv = {'cli': 'ciwc', 'clw': 'clwc', 'ta': 't', 'ta_z': 't_z'}
    file_path = os.path.join(models_dir, 'neighborhood_based_sfs_cl_area_no_features_%d.txt'%sfs_ind)
    with open(file_path, 'r') as file:
        lines = file.readlines()

    out_line = None
    for k, line in enumerate(lines):
        # If the marker appears several times, the last occurrence wins (as before)
        if line.startswith('The (order of) input variables') and k + 1 < len(lines):
            # Drop surrounding whitespace (incl. the newline) and the enclosing brackets
            out_line = lines[k+1].strip()[1:-1].split()
    if out_line is None:
        raise ValueError('Could not find the input variables in %s'%file_path)

    features = []
    for entry in out_line:
        # Remove the quotes around the name
        name = entry[1:-1]
        # Rename if the name is different in ERA5
        features.append(conv.get(name, name))
    return features

In [None]:
# Timestamp marking the start of the training loop
print('Start training:', time.time(), sep='\n')

In [19]:
results = {}
all_preds = []

# Loop-invariant settings
# The saved networks use leaky_relu as a custom activation, which load_model has to be told about
custom_objects = {'leaky_relu': nn.leaky_relu}
# Directory into which the transfer-learned models and the training curves are written
tl_models_dir = os.path.join(PWD, 'results/era5_1979-2021/models')

for sfs_ind in range(1, 11):
    # Feature names of this SFS NN. Kept under a separate name so that the global
    # features_nn list (all 24 features) is not overwritten.
    sfs_features = which_features(sfs_ind)

    nn_path = '/home/b/b309170/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_on_seq_feat_sel_DYAMOND/saved_models'

    # Select the appropriate features: the SFS NN with index sfs_ind uses its first sfs_ind features
    features_inds = [loc[sfs_features[k]] for k in range(sfs_ind)]
    train_input_sfs_nn = train_input[:, features_inds]

    # The models for 4-7 features are taken from the hyperparameter tests
    if sfs_ind == 7:
        model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_7_True_True_32'
    elif sfs_ind in (4, 5, 6):
        thrd_lay = 'False' if sfs_ind == 4 else 'True'
        model_name = 'hyperparameter_tests/neighborhood_based_sfs_cl_area_no_features_%d_False_%s_16'%(sfs_ind,thrd_lay)
    else:
        model_name = 'neighborhood_based_sfs_cl_area_no_features_%d'%sfs_ind

    # Override model name and model path if the model was already trained
    if already_trained:
        nn_path = tl_models_dir
        # No file extension here: '.h5' and '.txt' are appended below
        # NOTE(review): the models saved below are named after parent_key, which also contains
        # num_cells, and no '.txt' file with mean/std is written by this cell -- confirm that
        # files with this name exist before setting already_trained = True.
        model_name = 'SFS_NN_%d_tl_%d_seed_%d'%(sfs_ind, subset_exp, SEED)

    model = load_model(os.path.join(nn_path, model_name + '.h5'), custom_objects)

    mean, std = read_mean_and_std(os.path.join(nn_path, model_name + '.txt'))

    # To ensure that the matrices stay in float32
    mean = np.float32(mean)
    std = np.float32(std)

    # Only the training subset is scaled. Scaling the complete data_array in every iteration
    # cost a lot of memory and time although the predictions on the full data are outsourced.
    # If they are needed again: np.float32((data_array[:, features_inds] - mean)/std)
    train_input_scaled = np.float32((train_input_sfs_nn - mean)/std)

    del train_input_sfs_nn
    gc.collect()

    # If tl_bool, we transfer learn to a subset first before evaluating the model!
    if tl_bool:
        parent_key = 'SFS_NN_%d_tl_%d_num_cells_%d_seed_%d'%(sfs_ind, subset_exp, num_cells, SEED)
        results[parent_key] = {}
        results['number_horizontal_locations'] = number_horizontal_locations

        ## Training the model ##
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=LR_INIT, epsilon=0.1),
            loss=tf.keras.losses.MeanSquaredError()
        )

        # The TimeOut callback receives the start time and the time budget
        # (presumably it stops the training once the budget is used up -- see my_classes.TimeOut)
        t0 = time.time()
        time_callback = TimeOut(t0, timeout)

        print('Should be 3693600')
        print(train_input_scaled.shape)
        print(train_output.shape)

        # 20 mins per epoch
        history = model.fit(x=train_input_scaled, y=train_output,
                            epochs=50, verbose=2, callbacks=[time_callback])

        # Serialize the model architecture to JSON
        with open(os.path.join(tl_models_dir, parent_key+".json"), "w") as json_file:
            json_file.write(model.to_json())
        # Serialize model and weights to a single HDF5-file, replacing an existing file
        model.save(os.path.join(tl_models_dir, parent_key+'.h5'), overwrite=True)
        print('Saved model to disk')

        # Plot the training history
        ax = pd.DataFrame(history.history).plot(figsize=(8,5))
        ax.grid(True)
        ax.set_ylabel('Mean Squared Error')
        ax.set_xlabel('Number of epochs')
        ax.figure.savefig(os.path.join(tl_models_dir, parent_key+'.pdf'))
        # Close the figure; otherwise one figure per SFS NN stays open
        plt.close(ax.figure)
    else:
        parent_key = 'SFS_NN_%d'%sfs_ind
        results[parent_key] = {}

## Looks like we have to outsource the predictions... ##

    # del train_input_scaled
    # gc.collect()

#                 predictions = model.predict_on_batch(data_array_scaled)
#                 predictions = np.minimum(np.maximum(predictions, 0), 100)

#                 # Mean-squared error
#                 mse = np.mean((predictions[:, 0] - data_output)**2)
#                 results[parent_key]['MSE'] = float(mse)
#                 print(mse)

#                 # R2-value
#                 r2 = 1 - mse/np.var(data_output)
#                 results[parent_key]['R2'] = float(r2)

#                 all_preds.append(predictions)

#                 # ## Save plot
#                 # plt.hist(predictions,bins=100)
#                 # plt.hist(data_output,bins=100,alpha=0.7)
#                 # plt.yscale('log')
#                 # plt.legend(['NN', 'ERA5'])
#                 # plt.savefig('/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_%d.pdf'%sfs_ind)
#                 # plt.clf()

#             # Dump all SFS NN results    
#             append_dict_to_json(results, '/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/sfs_based_nn.json')

Starting training
Epoch 1/5
4/4 - 6s - loss: 934.1410 - val_loss: 716.8510
Epoch 2/5
4/4 - 5s - loss: 886.0610 - val_loss: 714.3365
Epoch 3/5
4/4 - 5s - loss: 875.3347 - val_loss: 709.9414
Epoch 4/5
4/4 - 5s - loss: 864.7082 - val_loss: 704.7211
Epoch 5/5
4/4 - 5s - loss: 814.8870 - val_loss: 708.5840
Restore model weights from the end of the best epoch
Saved model to disk
700.57574
Starting training
Epoch 1/5
4/4 - 6s - loss: 714.5229 - val_loss: 557.0905
Epoch 2/5
4/4 - 5s - loss: 501.7715 - val_loss: 559.8063
Epoch 3/5
4/4 - 5s - loss: 518.5615 - val_loss: 547.6897
Epoch 4/5
4/4 - 5s - loss: 398.9344 - val_loss: 529.5034
Epoch 5/5
4/4 - 5s - loss: 346.3876 - val_loss: 508.9203
Restore model weights from the end of the best epoch
Saved model to disk
492.85507
Starting training
Epoch 1/5
4/4 - 6s - loss: 2715.5110 - val_loss: 404.7828
Epoch 2/5
4/4 - 5s - loss: 2323.5691 - val_loss: 391.5742
Epoch 3/5
4/4 - 5s - loss: 2110.9302 - val_loss: 398.5379
Epoch 4/5
4/4 - 5s - loss: 1868.7275

**Extra plots**

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(0,3):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_1', 'SFS_NN_2', 'SFS_NN_3'])
# plt.savefig('/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_1-3.pdf')
# plt.clf()

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(3,6):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_4', 'SFS_NN_5', 'SFS_NN_6'])
# plt.savefig('/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_4-6.pdf')
# plt.clf()

In [None]:
# # All predictions in one plot instead of 10 plots?
# plt.hist(data_output,bins=100,histtype='step',color='k')
# for k in range(6, 10):
#     plt.hist(all_preds[k],bins=100,histtype='step')
# plt.yscale('log')
# plt.legend(['ERA5', 'SFS_NN_7', 'SFS_NN_8', 'SFS_NN_9', 'SFS_NN_10'])
# plt.savefig('/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/SFS_NN_7-10.pdf')
# plt.clf()

**Plot R2 vs number of features**

In [None]:
# with open('/home/b/b309170/workspace_icon-ml/symbolic_regression/evaluate_schemes/on_era5/results/era5_1979-2021/sfs_based_nn.json', 'r') as file:
#     d = json.load(file)

In [None]:
# # %matplotlib inline
# plt.plot(np.arange(1,11), [d['SFS_NN_%d'%k]['R2'] for k in range(1,11)], 'bo')
# plt.xlabel('Number of features')
# plt.ylabel('R2 score')