**Preparations for Figure 3 of the paper**

**We need an ML-created clc file to compare to int_var_clc_R02B04_NARVALII_2016072800_cloud_DOM01_0017**

The output file has np.nans outside the NARVAL region. <br>
Inside the NARVAL region, where topography led to issues, we have set those points to -10. <br>
This allows us to create a 3D visualization in Paraview and, within that visualization, to focus on the region that has non-negative cloud cover values.

It would have been easiest to set both the outside of the NARVAL region and the topography-induced troublesome columns to nan, but this leads to issues inside Paraview.

In [57]:
import os
import sys
import numpy as np
import xarray as xr
import pandas as pd

from tensorflow.keras.models import load_model

# Add path with my_classes to sys.path
sys.path.insert(0, '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/')
from my_classes import read_mean_and_std

# Root directory of the NARVAL data and the vertically interpolated fields below it
path = '/pf/b/b309170/my_work/NARVAL'
path_vertinterp = os.path.join(path, 'data_var_vertinterp')

# Directory with the trained networks (the mean/std file used for scaling is read from here too)
model_path = '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/grid_column_based/saved_models'

# Number of vertical layers per column that remain after the upper-most layers are removed
VERT_LAYERS = 27

# Trained grid-column-based network used for all predictions in this notebook
model_file = os.path.join(model_path, 'model_grid_column_based_final_1.h5')
model = load_model(model_file)

In [58]:
def _read_field(file_path, var_name):
    '''
        Read one variable of a netcdf file into memory and close the file again.

        file_path: Path of the netcdf file
        var_name: Name of the variable inside the file

        Returns the values of the variable as a numpy array.
    '''
    # .values loads the data into memory, so the file can safely be closed afterwards
    with xr.open_dataset(file_path) as dataset:
        return dataset[var_name].values


def load_data(date, hour):
    '''
        Load all fields for one output step and drop the grid columns that contain nans.

        date: YYYYMMDD00
        hour: 00HH

        Returns (data_dict, notnan):
            data_dict: Dictionary with the keys qv, qc, qi, temp, pres, rho, zg and fr_lake
                       (in this order). The four upper-most layers are removed from all
                       fields but fr_lake, and only the grid cells selected by notnan are kept.
            notnan: Boolean array over all grid cells of the input files. It is True where
                    the pressure of the first time step is not nan in the last vertical layer.
    '''
    # Number of upper-most layers that are removed from every vertically resolved field
    upper_layers = 4

    # The time-dependent fields share one file name pattern: (variable, date, hour)
    file_pattern = 'int_var_%s_R02B04_NARVALII_%s_fg_DOM01_%s.nc'

    # Output. The insertion order is kept as it is, because model_predict builds
    # the columns of the model input from the order of this dictionary.
    data_dict = {}

    # Load data and remove upper-most layers
    for var in ['qv', 'qc', 'qi', 'temp', 'pres', 'rho']:
        var_path = os.path.join(path_vertinterp, var, file_pattern%(var, date, hour))
        data_dict[var] = _read_field(var_path, var)[:, upper_layers:]

    zg_path = os.path.join(path_vertinterp, 'zg', 'zg_icon-a_capped.nc')
    fr_lake_path = os.path.join(path, 'grid_extpar', 'fr_lake_R02B04_NARVAL_fg_DOM01.nc')

    # zg and fr_lake come without a leading axis, so we add one of length one
    data_dict['zg'] = np.expand_dims(_read_field(zg_path, 'zg')[upper_layers:], 0)
    data_dict['fr_lake'] = np.expand_dims(_read_field(fr_lake_path, 'FR_LAKE'), 0)

    # Remove nans: Keep the grid cells with a valid pressure in the last vertical layer
    notnan = ~np.isnan(data_dict['pres'][0,-1,:])

    for key in data_dict.keys():
        if data_dict[key].shape[1] == VERT_LAYERS:
            # Vertically resolved field, the grid cells are on the third axis
            data_dict[key] = data_dict[key][:, :, notnan]
        else:
            # Field without a vertical axis, the grid cells are on the second axis
            data_dict[key] = data_dict[key][:, notnan]

    # After the filtering there should be no nan left in the pressure field
    assert not np.isnan(data_dict['pres']).any()

    return data_dict, notnan

In [59]:
def model_predict(model, date, hour):
    '''
        Predict the cloud cover of all grid columns for one output step.

        model: neural network
        date: YYYYMMDD00
        hour: 00HH

        Returns an array with one row per grid cell of the input files and VERT_LAYERS
        columns. The rows of the grid cells that load_data filtered out are filled
        with -10. All other rows hold the model predictions, clipped to [0, 100].
    '''

    data_dict, notnan = load_data(date, hour)

    # Transform data into viable input for the model
    # One sample should contain a column of information
    data_dict_reshaped = {}
    for key, field in data_dict.items():
        if field.shape[1] == VERT_LAYERS:
            # One feature per layer. The suffix of the key is the layer index
            # in the files, i.e. before the four upper-most layers were removed.
            for layer in range(VERT_LAYERS):
                new_key = '{}_{:d}'.format(key, layer + 4)
                data_dict_reshaped[new_key] = np.reshape(field[:, layer, :], -1)
        else:
            data_dict_reshaped[key] = np.reshape(field, -1)

    # Remove constant fields
    for constant_key in ('zg_4', 'zg_5', 'zg_6', 'qc_4'):
        del data_dict_reshaped[constant_key]

    # Converting dict into a DataFrame-object
    df = pd.DataFrame.from_dict(data_dict_reshaped)

    # Scale input data
    mean, std = read_mean_and_std(os.path.join(model_path, 'model_grid_column_based_final_1.txt'))
    df_scaled = ((df - mean)/std).to_numpy()

    # Predict the output and restrict it to the range [0, 100]
    clc = np.clip(model.predict(df_scaled), 0, 100)

    # We need exactly one predicted column per grid cell that was kept by load_data
    assert clc.shape == (np.count_nonzero(notnan), VERT_LAYERS)

    # Reinsert the removed grid cells. Actually, the amount of nans makes it impossible
    # for paraview to visualize the data in 3D. So we insert -10 instead of np.nan.
    clc_with_nans = np.full((len(notnan), VERT_LAYERS), -10.0)
    clc_with_nans[notnan, :] = clc

    return clc_with_nans

**Predict clc**

In [60]:
# Date and hour stamp of the file to predict
day = '2016072800'
hour = '0012'

# Prepend an axis of length one, so that the prediction has the axes (time, cell, layer)
clc_output = model_predict(model, day, hour)[np.newaxis, ...]

**Create a netcdf-file with the clc-predictions** <br>
The easiest way is to load an nc-file with the correct dimensions, coordinates and variable names

In [61]:
# The clc file of the same day and hour serves as the template for the output file
template_pattern = '/pf/b/b309170/my_work/NARVAL/data_var_vertinterp/clc/int_var_clc_R02B04_NARVALII_%s_cloud_DOM01_%s.nc'
template_file_path = template_pattern%(day, hour)
template_file = xr.open_dataset(template_file_path)

In [62]:
# Add layers above to make the dimensionality equal to the one expected in the file.
# The number of cells is taken from the prediction instead of being hard-coded.
add_upper_layers = np.zeros((1, clc_output.shape[1], 4))
clc_with_upper_layers = np.concatenate((add_upper_layers, clc_output), axis=2)

# Reorder the axes from (time, cell, height) to (time, height, cell)
clc_with_upper_layers = np.swapaxes(clc_with_upper_layers, 1, 2)

# Set the cells to nan again, for which the template file has a nan in its last layer
# (according to the introduction of this notebook, this is the area outside of the NARVAL region).
# The mask is computed once for all cells instead of once per cell.
outside_narval = np.isnan(template_file['clc'][0, -1].values)
clc_with_upper_layers[:, :, outside_narval] = np.nan

template_file['clc'] = (['time', 'height', 'cell'], clc_with_upper_layers)

In [63]:
# Write the dataset with the replaced clc variable to a netcdf file for Paraview
output_pattern = '/pf/b/b309170/my_work/NARVAL/for_paraview/clc_pred_R02B04_NARVALII_%s_%s.nc'
output_file = output_pattern%(day, hour)
template_file.to_netcdf(output_file)

The same steps in one cell, looping over multiple files

In [65]:
# Day of the files to predict
day = '2016072800'

template_pattern = '/pf/b/b309170/my_work/NARVAL/data_var_vertinterp/clc/int_var_clc_R02B04_NARVALII_%s_cloud_DOM01_%s.nc'
output_pattern = '/pf/b/b309170/my_work/NARVAL/for_paraview/clc_pred_R02B04_NARVALII_%s_%s.nc'

for i in range(12, 37):
    # Four-digit hour stamp of the file to predict (0012, ..., 0036)
    hour = '%04d'%i

    clc_output = model_predict(model, day, hour)
    clc_output = np.expand_dims(clc_output, 0)

    # Add layers above to make the dimensionality equal to the one expected in the file.
    # The number of cells is taken from the prediction instead of being hard-coded.
    add_upper_layers = np.zeros((1, clc_output.shape[1], 4))
    clc_with_upper_layers = np.concatenate((add_upper_layers, clc_output), axis=2)

    # Reorder the axes from (time, cell, height) to (time, height, cell)
    clc_with_upper_layers = np.swapaxes(clc_with_upper_layers, 1, 2)

    # The clc file of the same day and hour provides dimensions, coordinates and variable names.
    # It is closed again as soon as the output file is written.
    template_file_path = template_pattern%(day, hour)
    with xr.open_dataset(template_file_path) as template_file:
        # Set the cells to nan again, for which the template file has a nan in its last layer.
        # The mask is computed once for all cells instead of once per cell.
        outside_narval = np.isnan(template_file['clc'][0, -1].values)
        clc_with_upper_layers[:, :, outside_narval] = np.nan

        template_file['clc'] = (['time', 'height', 'cell'], clc_with_upper_layers)

        # Save it in a netcdf file
        output_file = output_pattern%(day, hour)
        template_file.to_netcdf(output_file)