## Preprocessing Narval

**This data was not used to train the NNs**

Converting the data into npy makes it possible for us to work with it efficiently; originally we required 500GB of RAM, which is always difficult to guarantee. We preprocess QUBICC in another ipynb notebook precisely because of this issue.

1) We read the data
2) Reshape variables so that they have equal dimensionality
3) Reshape into data samples fit for the NN and convert into a DataFrame
4) Downsample the data: Remove data above 21 km, remove condensate-free clouds, combat class imbalance
5) Split into input and output
6) Save as npy

Note: We neither scale nor split the data into training/validation/test sets. <br>
The reason is that i) in order to scale we need the entire dataset, but this can only be done in conjunction with the QUBICC dataset. Also, for cross-validation, different scalings will be necessary based on different subsets of the data; ii) the split into subsets will be done by the cross-validation procedure, or not at all when training the final model.

In [1]:
import sys
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
# import importlib
# importlib.reload(my_classes)

# Base directory from which the output location and the source location are derived
base_path = '/pf/b/b309170'
output_path = '%s/my_work/icon-ml_data/cloud_cover_parameterization/region_based_one_nn_R02B05/based_on_var_interpolated_data' % base_path

# Put the directory containing my_classes first on sys.path so that it can be imported below
sys.path.insert(0, '%s/workspace_icon-ml/cloud_cover_parameterization/' % base_path)

# Which days to load
days_narval = 'all'

from my_classes import load_data

VERT_LAYERS = 31

# Seed numpy's global random state so that the permutation used later on is reproducible
np.random.seed(10)

# Set output_var to one of {'clc', 'cl_area'}
output_var = 'cl_area'

## 1) Reading the data
### Input:
- coriolis: Coriolis parameter
- zg: Geometric height at full levels (3D)
- qv: Specific water vapor content (3D)
- qc: Specific cloud water content (3D)
- qi: Specific cloud ice content (3D)
- temp: Temperature (3D)
- pres: Pressure (3D)
- u: Zonal wind (3D)
- v: Meridional wind (3D)

$10$ input nodes

### Output:
- clc: Cloud Cover

$1$ output node

The data above 21km is capped.

In [None]:
# When the output is cl_area, only the output has to be produced; the input already exists.
# 'qc', 'qi' and 'clc' are still required to identify condensate-free clouds: using 'cl_area'
# for that purpose would give a slightly different estimate due to coarse-graining.
# NOTE(review): with output_var == 'cl_area' the list below does not contain 'clc' --
# confirm that load_data still provides it, since later cells read the 'clc' column.

# The order matters: later cells slice off the last two entries (coriolis and the output variable).
order_of_vars_narval = [
    'qv',
    'qc',
    'qi',
    'temp',
    'pres',
    'u',
    'v',
    'zg',
    'coriolis',
    output_var,
]

In [2]:
# Read the requested NARVAL variables; the result is indexed by variable name
data_dict = load_data(
    source='narval',
    days=days_narval,
    resolution='R02B05',
    order_of_vars=order_of_vars_narval,
)

qv
qc
qi
temp
pres
u
v


In [3]:
# Show the name and array shape of every loaded variable
for name, values in data_dict.items():
    print(name, values.shape)

qv (1721, 31, 4450)
qc (1721, 31, 4450)
qi (1721, 31, 4450)
temp (1721, 31, 4450)
pres (1721, 31, 4450)
u (1721, 31, 4450)
v (1721, 31, 4450)
zg (31, 4450)
coriolis (4450,)
clc (1721, 31, 4450)


In [4]:
# Take the three dimension sizes from the output variable (this overrides the VERT_LAYERS set earlier)
TIME_STEPS, VERT_LAYERS, HORIZ_FIELDS = data_dict[output_var].shape

In [5]:
# Reshape into nd-arrays of equal shape (don't reshape in the vertical):
# zg gains a leading time axis; coriolis gains a time axis and a vertical axis.
data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'], 0), TIME_STEPS, axis=0)
try:
    # Only the lookup is guarded: a missing variable is expected, any other error is not
    coriolis = data_dict['coriolis']
except KeyError:
    print('There is probably no coriolis in order_of_vars_narval')
else:
    coriolis = np.repeat(np.expand_dims(coriolis, 0), TIME_STEPS, axis=0)
    data_dict['coriolis'] = np.repeat(np.expand_dims(coriolis, 1), VERT_LAYERS, axis=1)

In [6]:
# Surface temperature (Try without?)
# The temperature of the last vertical index is copied to every vertical layer.
try:
    # Only the lookup is guarded: a missing variable is expected, any other error is not
    temp_last_layer = data_dict['temp'][:, -1, :]
except KeyError:
    print('There is probably no temperature in order_of_vars_narval')
else:
    temp_sfc = np.repeat(np.expand_dims(temp_last_layer, axis=1), VERT_LAYERS, axis=1)

In [7]:
# Keep track of the vertical layer (numbered 1..VERT_LAYERS) that every grid cell belongs to.
# int16 is sufficient for < 1000 layers.
layer_numbers = np.arange(1, VERT_LAYERS + 1, dtype=np.int16)
# Replicate the layer numbers along the time axis and the horizontal axis
vert_layers = np.tile(layer_numbers[np.newaxis, :, np.newaxis], (TIME_STEPS, 1, HORIZ_FIELDS))
vert_layers.shape

(1721, 31, 4450)

In [8]:
# Add variables below and above
def add_above_and_below(var_array, key):
    '''
        Build, for every grid cell, the value of the cell directly above and directly below it.

        var_array: 3D tensor indexed as (time step, vertical layer, horizontal field);
                   vertical index 0 is the top layer, the last index is the surface-closest layer
        key: name of the variable. For 'pres' a nan-replacement is scaled by 3/4, otherwise by 1

        Returns the tuple (above, below), both of the same shape and dtype as var_array:
          above[:, k, :] is var_array[:, k-1, :]; the top layer is filled with the marker value 1000
          below[:, k, :] is var_array[:, k+1, :]; the last layer repeats its own values
    '''
    var_array = np.asarray(var_array)
    # Take the sizes from the argument itself so that the function does not depend on module globals
    n_time, _, n_horiz = var_array.shape
    factor = 3/4 if key == 'pres' else 1

    # Shift everything down by one layer and put the marker value 1000 into the top layer
    top_layer = np.full((n_time, 1, n_horiz), 1000, dtype=var_array.dtype)
    above = np.concatenate((top_layer, var_array[:, :-1, :]), axis=1)

    # Replace by the entry from the same cell if the one above is nan.
    # It is a bit suboptimal that the grid cells above can be nan in NARVAL. At least decrease pressure by 3/4.
    # Reading the cell's own value from var_array (instead of from above[:, k+1, :]) yields the
    # same number and cannot run past the last layer.
    nan_indices = np.where(np.isnan(above))
    above[nan_indices] = factor*var_array[nan_indices]

    # Below is the same value as the grid cell for surface-closest layer
    below = np.concatenate((var_array[:, 1:, :], var_array[:, -1:, :]), axis=1)
    return above, below

# Shifted copies of the variables, indexed by variable name
above, below = {}, {}

# 1000 is a value that cannot be attained physically and serves as our way of checking whether the grid cell is at the model top.
# The NN could get around values that are not really physical by weighing the influence from below with a zg-factor.
# Alternatively we would have to remove the variable from below altogether.

# The last two entries of order_of_vars_narval (coriolis and the output variable) are skipped
for key in order_of_vars_narval[:-2]:
    shifted_from_above, shifted_from_below = add_above_and_below(data_dict[key], key)
    above[key] = shifted_from_above
    below[key] = shifted_from_below

In [9]:
# Reshaping into 1D-arrays and converting dict into a DataFrame-object (the following is based on Aurelien Geron)
for key in data_dict:
    data_dict[key] = np.reshape(data_dict[key], -1)

# Flatten the layer numbers once; doing it inside the loop repeated the same reshape per variable
vert_layers = np.reshape(vert_layers, -1)

# Add the flattened neighbours from above and below as additional columns
for key in order_of_vars_narval[:-2]:
    data_dict['%s_below'%key] = np.reshape(below[key], -1)
    data_dict['%s_above'%key] = np.reshape(above[key], -1)

# temp_sfc only exists if temperature was loaded; any other failure should surface
try:
    data_dict['temp_sfc'] = np.reshape(temp_sfc, -1)
except NameError:
    print('There is probably no temperature in order_of_vars_narval')

df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,qv,qc,qi,temp,pres,u,v,zg,coriolis,clc,...,temp_above,pres_below,pres_above,u_below,u_above,v_below,v_above,zg_below,zg_above,temp_sfc
0,,,,,,,,28193.783203,2.1e-05,,...,1000.0,,1000.0,,1000.0,,1000.0,26201.697266,1000.0,301.795624
1,,,,,,,,28193.783203,2.1e-05,,...,1000.0,,1000.0,,1000.0,,1000.0,26201.697266,1000.0,299.384064
2,,,,,,,,28193.783203,2.6e-05,,...,1000.0,,1000.0,,1000.0,,1000.0,26201.697266,1000.0,295.616211
3,,,,,,,,28193.783203,1.6e-05,,...,1000.0,,1000.0,,1000.0,,1000.0,26201.697266,1000.0,300.337311
4,,,,,,,,28193.783203,1.4e-05,,...,1000.0,,1000.0,,1000.0,,1000.0,26201.697266,1000.0,300.02356


**Downsampling the data (majority class: clc = 0)**

In [10]:
# Keep only the samples whose height zg lies below 21000 (i.e. drop data above 21 km)
is_below_cap = df['zg'] < 21000
df = df.loc[is_below_cap]

In [11]:
# After the cut-off not a single nan may remain in the DataFrame
assert not np.isnan(df).values.any()

In [12]:
if output_var == 'clc':
    # The upper levels have been cut off, so the model-top marker 1000 must not occur anymore
    assert not (df['temp_above'] == 1000).any()
    # Quick sanity checks on the input data: temperature and pressure both exceed 150 everywhere
    assert (df['temp'] > 150).all() and (df['pres'] > 150).all()

In [14]:
# Remove condensate-free clouds (7.3% of clouds):
# samples with positive cloud cover although both qc and qi are exactly zero
is_condensate_free_cloud = (df['clc'] > 0) & (df['qc'] == 0) & (df['qi'] == 0)
df = df.loc[~is_condensate_free_cloud]

In [15]:
# Balance the classes: keep all samples with clc != 0 and at most equally many samples with clc = 0,
# while keeping the original order intact.
# The mask is computed once; no copy of the clc = 0 rows is made just to count them.
is_noclc = (df['clc'] == 0).values
n_noclc = int(is_noclc.sum())
n_cloudy = len(df) - n_noclc
print(n_noclc)

# Ratio of cloudy to cloud-free samples (0.0 if there are no cloud-free samples at all)
downsample_ratio = n_cloudy/n_noclc if n_noclc else 0.0

# Number of clc = 0 samples that remain. Taken directly from the integer counts, because
# int(n_noclc*downsample_ratio) can fall one short of n_cloudy due to floating point rounding.
size_noclc = min(n_cloudy, n_noclc)
shuffled_indices = np.random.permutation(df.index[is_noclc])
downsample_indices = shuffled_indices[:size_noclc]

# Concatenate the retained clc = 0 labels with the labels of all clc != 0 samples
final_indices = np.concatenate((downsample_indices, df.index[~is_noclc]))

# Sort final_indices so that we can more or less recover the timesteps
final_indices = np.sort(final_indices)

# Label-based (loc) not positional-based
df = df.loc[final_indices]

138399774


In [16]:
# Row count of the DataFrame once downsampling is done
df.shape[0]

126853676

In [17]:
#Modifies df as well
def split_input_output(dataset, output_name=None):
    '''
        Remove the output column from dataset and return it.

        dataset: DataFrame holding input and output columns; it is modified in place
        output_name: name of the output column. Defaults to the module-level output_var

        Returns the removed output column.
        Raises KeyError if dataset has no column of that name.
    '''
    if output_name is None:
        # Fall back to the notebook-wide setting so that existing calls keep working
        output_name = output_var
    # pop returns the column and deletes it from dataset in one step
    return dataset.pop(output_name)

In [18]:
# Separate the output column from the inputs; df is modified in place and afterwards holds only the remaining columns
output_df = split_input_output(df)

In [19]:
# Save the data as float32 npy files.
# For 'clc' the input, the output and the vertical layers of the samples are written;
# for 'cl_area' only the output is written.
if output_var == 'clc':
    np.save(output_path + '/cloud_cover_input_narval.npy', np.asarray(df, dtype=np.float32))
    np.save(output_path + '/cloud_cover_output_narval.npy', np.asarray(output_df, dtype=np.float32))
    # Save the corresponding vertical layers (int16 is sufficient for layers < 1000)
    np.save(output_path + '/samples_vertical_layers_narval.npy', vert_layers[df.index])
elif output_var == 'cl_area':
    np.save(output_path + '/cloud_area_output_narval.npy', np.asarray(output_df, dtype=np.float32))

In [None]:
# Compare the previously saved input file with the current DataFrame via the qi column
# (qi is the third variable in order_of_vars_narval, hence column index 2).
# NOTE(review): the file was written as float32; if df['qi'] has a wider dtype an exact
# comparison may fail because of rounding -- confirm the dtype.
old_input = np.load(output_path + '/cloud_cover_input_narval.npy')
qi_matches = old_input[:, 2] == df['qi']
# If this yields True then we're done
print(np.all(qi_matches))

In [None]:
# Writes the current df as float32 to the input file, regardless of the value of output_var.
# NOTE(review): np.save returns None, so clc_in_narv is None after this line. If the intent
# was to read the saved input back, np.load would be needed -- confirm.
clc_in_narv = np.save(output_path + '/cloud_cover_input_narval.npy', np.float32(df))