## Commence training

In [2]:
import sys
import numpy as np
import time
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
# import importlib
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Base directory; the code workspace and the data directory are both located below it.
path = '/pf/b/b309170'
# Add path with my_classes to sys.path.
# This has to happen before the import below, otherwise my_classes cannot be found.
sys.path.insert(0, path + '/workspace_icon-ml/cloud_cover_parameterization/')

# TimeOut is passed to model.fit as a Keras callback further down.
from my_classes import TimeOut

# Directory the train/valid/test .npy arrays are read from
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/region_based/based_on_var_interpolated_data'
# Directory the trained models and the text file with the losses are written to
path_model = path + '/workspace_icon-ml/cloud_cover_parameterization/region_based/saved_models'
# Directory the training-progress figures are written to
path_fig = path + '/workspace_icon-ml/cloud_cover_parameterization/region_based/figures'

# Number that is substituted into the names of the data files that are read
# and of the model/figure/log files that are written.
NUM = 1

# How many NNs do we train (usually 10). We can't train all of them in a
# single batch job. The data is split into 27 parts in total.
# no_NNs = 27
# no_NNs = 2
no_NNs = 9

# Stop after how many minutes (per NN) (usually 210).
# timeout = 15
timeout = 220

# gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

# Names of the input features, in column order of the input arrays:
# five entries (i-2 ... i+2) for each of seven quantities, followed by two
# single-valued features.
# NOTE(review): the i-2 ... i+2 suffixes presumably refer to neighbouring
# vertical layers - confirm against the preprocessing code.
ALL_INPUT_VARIABLES = np.array([
    'qv_i-2', 'qv_i-1', 'qv_i', 'qv_i+1', 'qv_i+2',
    'qc_i-2', 'qc_i-1', 'qc_i', 'qc_i+1', 'qc_i+2',
    'qi_i-2', 'qi_i-1', 'qi_i', 'qi_i+1', 'qi_i+2',
    'temp_i-2', 'temp_i-1', 'temp_i', 'temp_i+1', 'temp_i+2',
    'pres_i-2', 'pres_i-1', 'pres_i', 'pres_i+1', 'pres_i+2',
    'rho_i-2', 'rho_i-1', 'rho_i', 'rho_i+1', 'rho_i+2',
    'zg_i-2', 'zg_i-1', 'zg_i', 'zg_i+1', 'zg_i+2',
    'fr_lake', 'clc_prev',
])

In [71]:
# Load data
def _load_split(file_pattern):
    """Load the array stored in path_data under file_pattern (a %-template that takes NUM)."""
    return np.load(path_data + file_pattern % NUM)


# Inputs first, then the corresponding outputs, each for train/valid/test.
input_train = _load_split('/cloud_cover_input_train_%d.npy')
input_valid = _load_split('/cloud_cover_input_valid_%d.npy')
input_test = _load_split('/cloud_cover_input_test_%d.npy')
output_train = _load_split('/cloud_cover_output_train_%d.npy')
output_valid = _load_split('/cloud_cover_output_valid_%d.npy')
output_test = _load_split('/cloud_cover_output_test_%d.npy')

In [None]:
def _chunk_slice(n_samples, index, n_chunks=27):
    """Return the slice selecting the index-th of n_chunks equally sized contiguous chunks.

    The chunk size is n_samples // n_chunks, so up to n_chunks - 1 samples
    at the very end of the array do not belong to any chunk.
    """
    size = n_samples // n_chunks
    return slice(size * index, size * (index + 1))


# Number of candidate input features; the columns of the input arrays are
# matched by position against ALL_INPUT_VARIABLES.
n_features = len(ALL_INPUT_VARIABLES)

# Train one NN per data chunk. Only the first no_NNs chunks are handled here.
for i in range(no_NNs):
    # Start of the time budget for this NN (it is handed to TimeOut below).
    t0 = time.time()

    # Training/validation/test data for the i-th NN
    train_sl = _chunk_slice(output_train.shape[0], i)
    valid_sl = _chunk_slice(output_valid.shape[0], i)
    test_sl = _chunk_slice(output_test.shape[0], i)

    input_train_NN = input_train[train_sl]
    output_train_NN = output_train[train_sl]
    input_valid_NN = input_valid[valid_sl]
    output_valid_NN = output_valid[valid_sl]
    input_test_NN = input_test[test_sl]
    output_test_NN = output_test[test_sl]

    # We remove the input variables with zero (or NaN) variance on the
    # training chunk. The variance is computed once per column.
    column_variances = [np.var(input_train_NN[:, j]) for j in range(n_features)]
    vars_to_remove = [j for j, variance in enumerate(column_variances)
                      if variance == 0 or np.isnan(variance)]
    # Resulting input dimension for the NN
    input_dim = n_features - len(vars_to_remove)

    input_train_NN = np.delete(input_train_NN, vars_to_remove, 1)
    input_valid_NN = np.delete(input_valid_NN, vars_to_remove, 1)
    input_test_NN = np.delete(input_test_NN, vars_to_remove, 1)

    # Defining the model: two hidden layers with 256 ReLU units, one linear output
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=input_dim))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='linear'))

    # Nadam is used with its default learning rate.
    # NOTE(review): an earlier note stated that the default of 0.001 gave a
    # jumpy loss for the first two NNs, but that a lower learning rate did
    # not really improve training - confirm whether this still applies.
    model.compile(loss='mse', optimizer=Nadam())
    time_callback = TimeOut(t0, timeout)
    history = model.fit(input_train_NN, output_train_NN, batch_size=32, epochs=70,
                        validation_data=(input_valid_NN, output_valid_NN), verbose=2,
                        callbacks=[time_callback])

    filename = "model_clc_all_days_final_%d_%d"%(NUM,i)

    # Serialize model and weights to a single HDF5-file. This is done first so
    # that the trained weights are on disk before anything else can fail.
    model.save(os.path.join(path_model, filename+'.h5'))

    # Serialize the architecture to YAML. Newer TensorFlow/Keras versions no
    # longer provide to_yaml(), so JSON is written in that case instead of
    # aborting the whole run.
    try:
        architecture = model.to_yaml()
        extension = ".yaml"
    except (RuntimeError, AttributeError, ImportError):
        architecture = model.to_json()
        extension = ".json"
    with open(os.path.join(path_model, filename+extension), "w") as architecture_file:
        architecture_file.write(architecture)
    print('Saved model to disk')

    # Plotting the training progress.
    # 'loss' can have one entry more than 'val_loss' (presumably when the
    # TimeOut callback stops training within an epoch - confirm); the surplus
    # entry is dropped so that both curves have the same length.
    if len(history.history['loss']) > len(history.history['val_loss']):
        del history.history['loss'][-1]
    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Number of epochs')
    plt.title('Training of the NN parameterization for the %d. layer'%(i+1))
    plt.savefig(os.path.join(path_fig, filename+'.pdf'))

    # Final losses of the trained model on its train/valid/test chunk
    train_loss = model.evaluate(input_train_NN, output_train_NN, verbose=2, batch_size=300)
    valid_loss = model.evaluate(input_valid_NN, output_valid_NN, verbose=2, batch_size=30)
    test_loss = model.evaluate(input_test_NN, output_test_NN, verbose=2, batch_size=30)

    # Append a summary for this NN to the shared results file
    with open(os.path.join(path_model, 'model_region_based_final_%d.txt'%NUM), 'a') as log_file:
        log_file.write('\nRemoved input variables %d: %s\n'%(i, np.array(ALL_INPUT_VARIABLES)[vars_to_remove]))
        log_file.write('Training loss %d: %.4f\n'%(i, train_loss))
        log_file.write('Validation loss %d: %.4f\n'%(i, valid_loss))
        log_file.write('Test loss %d: %.4f\n'%(i, test_loss))
        log_file.write('Training epochs %d: %d'%(i, len(history.history['val_loss'])))